Diffstat (limited to 'db')
-rw-r--r--  db/background.h  12
-rw-r--r--  db/btree.cpp  1242
-rw-r--r--  db/btree.h  585
-rw-r--r--  db/btreecursor.cpp  145
-rw-r--r--  db/cap.cpp  239
-rw-r--r--  db/client.cpp  396
-rw-r--r--  db/client.h  277
-rw-r--r--  db/clientcursor.cpp  310
-rw-r--r--  db/clientcursor.h  296
-rw-r--r--  db/cloner.cpp  169
-rw-r--r--  db/cmdline.cpp  191
-rw-r--r--  db/cmdline.h  114
-rw-r--r--  db/commands.cpp  26
-rw-r--r--  db/commands.h  20
-rw-r--r--  db/commands/distinct.cpp  150
-rw-r--r--  db/commands/group.cpp  202
-rw-r--r--  db/commands/isself.cpp  220
-rw-r--r--  db/commands/mr.cpp  1074
-rw-r--r--  db/commands/mr.h  291
-rw-r--r--  db/common.cpp  4
-rw-r--r--  db/compact.cpp  199
-rw-r--r--  db/concurrency.h  249
-rw-r--r--  db/curop-inl.h  42
-rw-r--r--  db/curop.h  358
-rw-r--r--  db/cursor.cpp  23
-rw-r--r--  db/cursor.h  98
-rw-r--r--  db/database.cpp  211
-rw-r--r--  db/database.h  214
-rw-r--r--  db/db.cpp  747
-rw-r--r--  db/db.h  119
-rw-r--r--  db/db.sln  86
-rw-r--r--  db/db.vcproj  1885
-rw-r--r--  db/db.vcxproj  72
-rwxr-xr-x  db/db.vcxproj.filters  1229
-rwxr-xr-x [-rw-r--r--]  db/db_10.sln  8
-rw-r--r--  db/dbcommands.cpp  936
-rw-r--r--  db/dbcommands_admin.cpp  233
-rw-r--r--  db/dbcommands_generic.cpp  198
-rw-r--r--  db/dbeval.cpp  23
-rw-r--r--  db/dbhelpers.cpp  166
-rw-r--r--  db/dbhelpers.h  38
-rw-r--r--  db/dbmessage.h  39
-rw-r--r--  db/dbwebserver.cpp  316
-rw-r--r--  db/dbwebserver.h  31
-rw-r--r--  db/diskloc.h  101
-rw-r--r--  db/driverHelpers.cpp  16
-rw-r--r--  db/dur.cpp  635
-rw-r--r--  db/dur.h  201
-rw-r--r--  db/dur_commitjob.cpp  210
-rw-r--r--  db/dur_commitjob.h  221
-rw-r--r--  db/dur_journal.cpp  576
-rw-r--r--  db/dur_journal.h  68
-rw-r--r--  db/dur_journalformat.h  166
-rw-r--r--  db/dur_journalimpl.h  101
-rw-r--r--  db/dur_preplogbuffer.cpp  192
-rw-r--r--  db/dur_recover.cpp  457
-rw-r--r--  db/dur_recover.h  45
-rw-r--r--  db/dur_stats.h  46
-rw-r--r--  db/dur_writetodatafiles.cpp  99
-rw-r--r--  db/durop.cpp  160
-rw-r--r--  db/durop.h  111
-rw-r--r--  db/extsort.cpp  147
-rw-r--r--  db/extsort.h  50
-rw-r--r--  db/filever.h  8
-rw-r--r--  db/geo/2d.cpp  949
-rw-r--r--  db/geo/core.h  153
-rw-r--r--  db/geo/haystack.cpp  146
-rw-r--r--  db/helpers/dblogger.h  4
-rw-r--r--  db/index.cpp  148
-rw-r--r--  db/index.h  46
-rw-r--r--  db/indexkey.cpp  161
-rw-r--r--  db/indexkey.h  67
-rw-r--r--  db/instance.cpp  557
-rw-r--r--  db/instance.h  50
-rw-r--r--  db/introspect.cpp  3
-rw-r--r--  db/jsobj.cpp  409
-rw-r--r--  db/jsobj.h  4
-rw-r--r--  db/jsobjmanipulator.h  44
-rw-r--r--  db/json.cpp  69
-rw-r--r--  db/lasterror.cpp  109
-rw-r--r--  db/lasterror.h  40
-rw-r--r--  db/matcher.cpp  495
-rw-r--r--  db/matcher.h  71
-rw-r--r--  db/matcher_covered.cpp  53
-rw-r--r--  db/minilex.h  190
-rw-r--r--  db/module.cpp  16
-rw-r--r--  db/module.h  10
-rw-r--r--  db/modules/mms.cpp  88
-rw-r--r--  db/mongommf.cpp  391
-rw-r--r--  db/mongommf.h  140
-rw-r--r--  db/mongomutex.h  239
-rw-r--r--  db/mr.cpp  721
-rw-r--r--  db/namespace-inl.h  130
-rw-r--r--  db/namespace.cpp  398
-rw-r--r--  db/namespace.h  517
-rw-r--r--  db/nonce.cpp  54
-rw-r--r--  db/nonce.h  22
-rw-r--r--  db/oplog.cpp  228
-rw-r--r--  db/oplog.h  133
-rw-r--r--  db/oplogreader.h  46
-rw-r--r--  db/pdfile.cpp  800
-rw-r--r--  db/pdfile.h  236
-rw-r--r--  db/projection.cpp  301
-rw-r--r--  db/projection.h  127
-rw-r--r--  db/query.cpp  560
-rw-r--r--  db/query.h  106
-rw-r--r--  db/queryoptimizer.cpp  657
-rw-r--r--  db/queryoptimizer.h  224
-rw-r--r--  db/queryutil.cpp  840
-rw-r--r--  db/queryutil.h  209
-rw-r--r--  db/rec.h  137
-rw-r--r--  db/reccache.cpp  419
-rw-r--r--  db/reccache.h  262
-rw-r--r--  db/reci.h  64
-rw-r--r--  db/recstore.h  126
-rw-r--r--  db/repl.cpp  631
-rw-r--r--  db/repl.h  70
-rw-r--r--  db/repl/connections.h  49
-rw-r--r--  db/repl/consensus.cpp  124
-rw-r--r--  db/repl/health.cpp  161
-rw-r--r--  db/repl/health.h  8
-rw-r--r--  db/repl/heartbeat.cpp  71
-rw-r--r--  db/repl/manager.cpp  70
-rw-r--r--  db/repl/multicmd.h  29
-rw-r--r--  db/repl/replset_commands.cpp  106
-rw-r--r--  db/repl/rs.cpp  282
-rw-r--r--  db/repl/rs.h  115
-rw-r--r--  db/repl/rs_config.cpp  174
-rw-r--r--  db/repl/rs_config.h  20
-rw-r--r-- [-rwxr-xr-x]  db/repl/rs_exception.h  18
-rw-r--r--  db/repl/rs_initialsync.cpp  205
-rw-r--r--  db/repl/rs_initiate.cpp  66
-rw-r--r--  db/repl/rs_member.h  35
-rw-r--r--  db/repl/rs_optime.h  114
-rw-r--r--  db/repl/rs_rollback.cpp  661
-rw-r--r--  db/repl/rs_sync.cpp  368
-rw-r--r--  db/repl_block.cpp  92
-rw-r--r--  db/repl_block.h  10
-rw-r--r--  db/replpair.h  30
-rw-r--r-- [-rwxr-xr-x]  db/resource.h  32
-rw-r--r--  db/restapi.cpp  60
-rw-r--r--  db/restapi.h  34
-rw-r--r--  db/scanandorder.h  36
-rw-r--r--  db/security.cpp  20
-rw-r--r--  db/security.h  28
-rw-r--r--  db/security_commands.cpp  89
-rw-r--r--  db/security_key.cpp  105
-rw-r--r--  db/security_key.h  47
-rw-r--r--  db/stats/counters.cpp  110
-rw-r--r--  db/stats/counters.h  77
-rw-r--r--  db/stats/fine_clock.h  13
-rw-r--r--  db/stats/service_stats.cpp  6
-rw-r--r--  db/stats/snapshots.cpp  121
-rw-r--r--  db/stats/snapshots.h  20
-rw-r--r--  db/stats/top.cpp  115
-rw-r--r--  db/stats/top.h  57
-rw-r--r--  db/storage.cpp  81
-rw-r--r--  db/taskqueue.h  106
-rw-r--r--  db/tests.cpp  2
-rw-r--r--  db/update.cpp  617
-rw-r--r--  db/update.h  276
161 files changed, 20100 insertions, 14523 deletions
diff --git a/db/background.h b/db/background.h
index 24ea1cb..ea424c9 100644
--- a/db/background.h
+++ b/db/background.h
@@ -21,16 +21,16 @@
#pragma once
-namespace mongo {
+namespace mongo {
- /* these are administrative operations / jobs
- for a namespace running in the background, and that only one
+ /* these are administrative operations / jobs
+ for a namespace running in the background, and that only one
at a time per namespace is permitted, and that if in progress,
you aren't allowed to do other NamespaceDetails major manipulations
- (such as dropping ns or db) even in the foreground and must
- instead uassert.
+ (such as dropping ns or db) even in the foreground and must
+ instead uassert.
- It's assumed this is not for super-high RPS things, so we don't do
+ It's assumed this is not for super-high RPS things, so we don't do
anything special in the implementation here to be fast.
*/
class BackgroundOperation : public boost::noncopyable {
diff --git a/db/btree.cpp b/db/btree.cpp
index d646de8..d547a1b 100644
--- a/db/btree.cpp
+++ b/db/btree.cpp
@@ -24,48 +24,92 @@
#include "clientcursor.h"
#include "client.h"
#include "dbhelpers.h"
-#include "curop.h"
+#include "curop-inl.h"
#include "stats/counters.h"
+#include "dur_commitjob.h"
namespace mongo {
#define VERIFYTHISLOC dassert( thisLoc.btree() == this );
+ /**
+ * give us a writable version of the btree bucket (declares write intent).
+ * note it is likely more efficient to declare write intent on something smaller when you can.
+ */
+ BtreeBucket* DiskLoc::btreemod() const {
+ assert( _a != -1 );
+ BtreeBucket *b = const_cast< BtreeBucket * >( btree() );
+ return static_cast< BtreeBucket* >( getDur().writingPtr( b, BucketSize ) );
+ }
+
+ _KeyNode& _KeyNode::writing() const {
+ return *getDur().writing( const_cast< _KeyNode* >( this ) );
+ }
+
KeyNode::KeyNode(const BucketBasics& bb, const _KeyNode &k) :
- prevChildBucket(k.prevChildBucket),
- recordLoc(k.recordLoc), key(bb.data+k.keyDataOfs())
+ prevChildBucket(k.prevChildBucket),
+ recordLoc(k.recordLoc), key(bb.data+k.keyDataOfs())
{ }
- const int KeyMax = BucketSize / 10;
+ // largest key size we allow. note we very much need to support bigger keys (somehow) in the future.
+ static const int KeyMax = BucketSize / 10;
+
+ // We define this value as the maximum number of bytes such that, if we have
+ // fewer than this many bytes, we must be able to either merge with or receive
+ // keys from any neighboring node. If our utilization goes below this value we
+ // know we can bring up the utilization with a simple operation. Ignoring the
+ // 90/10 split policy which is sometimes employed and our 'unused' nodes, this
+ // is a lower bound on bucket utilization for non root buckets.
+ //
+ // Note that the exact value here depends on the implementation of
+ // rebalancedSeparatorPos(). The conditions for lowWaterMark - 1 are as
+ // follows: We know we cannot merge with the neighbor, so the total data size
+ // for us, the neighbor, and the separator must be at least
+ // BtreeBucket::bodySize() + 1. We must be able to accept one key of any
+ // allowed size, so our size plus storage for that additional key must be
+ // <= BtreeBucket::bodySize() / 2. This way, with the extra key we'll have a
+ // new bucket data size < half the total data size and by the implementation
+ // of rebalancedSeparatorPos() the key must be added.
+ static const int lowWaterMark = BtreeBucket::bodySize() / 2 - KeyMax - sizeof( _KeyNode ) + 1;
+
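    // Illustrative sketch of the lowWaterMark bound above (the sizes here are
    // assumed for the example, not the actual on-disk layout):
    //
    //     const int BucketSizeEx = 8192;                 // assumed bucket size
    //     const int BodyEx       = BucketSizeEx - 80;    // assumed bodySize() == 8112
    //     const int KeyNodeEx    = 16;                   // assumed sizeof(_KeyNode)
    //     const int KeyMaxEx     = BucketSizeEx / 10;    // 819
    //     const int lowWaterEx   = BodyEx / 2 - KeyMaxEx - KeyNodeEx + 1;   // 3222
    //
    // A bucket holding at most lowWaterEx - 1 == 3221 bytes can always receive
    // one more key of the maximum allowed size:
    //     3221 + 819 + 16 == 4056 == BodyEx / 2
    // so a bucket that falls below lowWaterMark can always be brought back up
    // by merging with or borrowing from a neighbor.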
+ static const int split_debug = 0;
+ static const int insert_debug = 0;
extern int otherTraceLevel;
- const int split_debug = 0;
- const int insert_debug = 0;
- static void alreadyInIndex() {
+ /**
+ * this error is ok/benign when doing a background indexing -- that logic in pdfile checks explicitly
+ * for the 10287 error code.
+ */
+ static void alreadyInIndex() {
// we don't use massert() here as that does logging and this is 'benign' - see catches in _indexRecord()
throw MsgAssertionException(10287, "btree: key+recloc already in index");
}
/* BucketBasics --------------------------------------------------- */
- inline void BucketBasics::modified(const DiskLoc& thisLoc) {
- VERIFYTHISLOC
- btreeStore->modified(thisLoc);
+ void BucketBasics::assertWritable() {
+ if( cmdLine.dur )
+ dur::assertAlreadyDeclared(this, sizeof(*this));
+ }
+
+ string BtreeBucket::bucketSummary() const {
+ stringstream ss;
+ ss << " Bucket info:" << endl;
+ ss << " n: " << n << endl;
+ ss << " parent: " << parent.toString() << endl;
+ ss << " nextChild: " << parent.toString() << endl;
+ ss << " flags:" << flags << endl;
+ ss << " emptySize: " << emptySize << " topSize: " << topSize << endl;
+ return ss.str();
}
int BucketBasics::Size() const {
assert( _wasSize == BucketSize );
return BucketSize;
}
- inline void BucketBasics::setNotPacked() {
- flags &= ~Packed;
- }
- inline void BucketBasics::setPacked() {
- flags |= Packed;
- }
- void BucketBasics::_shape(int level, stringstream& ss) {
+ void BucketBasics::_shape(int level, stringstream& ss) const {
for ( int i = 0; i < level; i++ ) ss << ' ';
ss << "*\n";
for ( int i = 0; i < n; i++ )
@@ -78,13 +122,13 @@ namespace mongo {
int bt_fv=0;
int bt_dmp=0;
- void BucketBasics::dumpTree(DiskLoc thisLoc, const BSONObj &order) {
+ void BtreeBucket::dumpTree(const DiskLoc &thisLoc, const BSONObj &order) const {
bt_dmp=1;
fullValidate(thisLoc, order);
bt_dmp=0;
}
- int BucketBasics::fullValidate(const DiskLoc& thisLoc, const BSONObj &order, int *unusedCount) {
+ int BtreeBucket::fullValidate(const DiskLoc& thisLoc, const BSONObj &order, int *unusedCount, bool strict) const {
{
bool f = false;
assert( f = true );
@@ -93,8 +137,6 @@ namespace mongo {
killCurrentOp.checkForInterrupt();
assertValid(order, true);
-// if( bt_fv==0 )
-// return;
if ( bt_dmp ) {
out() << thisLoc.toString() << ' ';
@@ -105,26 +147,37 @@ namespace mongo {
int kc = 0;
for ( int i = 0; i < n; i++ ) {
- _KeyNode& kn = k(i);
+ const _KeyNode& kn = k(i);
if ( kn.isUsed() ) {
kc++;
- } else {
+ }
+ else {
if ( unusedCount ) {
++( *unusedCount );
}
}
if ( !kn.prevChildBucket.isNull() ) {
DiskLoc left = kn.prevChildBucket;
- BtreeBucket *b = left.btree();
- wassert( b->parent == thisLoc );
- kc += b->fullValidate(kn.prevChildBucket, order, unusedCount);
+ const BtreeBucket *b = left.btree();
+ if ( strict ) {
+ assert( b->parent == thisLoc );
+ }
+ else {
+ wassert( b->parent == thisLoc );
+ }
+ kc += b->fullValidate(kn.prevChildBucket, order, unusedCount, strict);
}
}
if ( !nextChild.isNull() ) {
- BtreeBucket *b = nextChild.btree();
- wassert( b->parent == thisLoc );
- kc += b->fullValidate(nextChild, order, unusedCount);
+ const BtreeBucket *b = nextChild.btree();
+ if ( strict ) {
+ assert( b->parent == thisLoc );
+ }
+ else {
+ wassert( b->parent == thisLoc );
+ }
+ kc += b->fullValidate(nextChild, order, unusedCount, strict);
}
return kc;
@@ -132,12 +185,20 @@ namespace mongo {
int nDumped = 0;
- void BucketBasics::assertValid(const Ordering &order, bool force) {
+ void BucketBasics::assertValid(const Ordering &order, bool force) const {
if ( !debug && !force )
return;
wassert( n >= 0 && n < Size() );
wassert( emptySize >= 0 && emptySize < BucketSize );
wassert( topSize >= n && topSize <= BucketSize );
+
+ // this is very slow so don't do often
+ {
+ static int _k;
+ if( ++_k % 128 )
+ return;
+ }
+
DEV {
// slow:
for ( int i = 0; i < n-1; i++ ) {
@@ -204,15 +265,16 @@ namespace mongo {
reserved = 0;
}
- /* see _alloc */
+ /** see _alloc */
inline void BucketBasics::_unalloc(int bytes) {
topSize -= bytes;
emptySize += bytes;
}
- /* we allocate space from the end of the buffer for data.
- the keynodes grow from the front.
- */
+ /**
+ * we allocate space from the end of the buffer for data.
+ * the keynodes grow from the front.
+ */
inline int BucketBasics::_alloc(int bytes) {
topSize += bytes;
emptySize -= bytes;
@@ -221,21 +283,23 @@ namespace mongo {
return ofs;
}
- void BucketBasics::_delKeyAtPos(int keypos) {
+ void BucketBasics::_delKeyAtPos(int keypos, bool mayEmpty) {
assert( keypos >= 0 && keypos <= n );
assert( childForPos(keypos).isNull() );
+ // TODO audit cases where nextChild is null
+ assert( ( mayEmpty && n > 0 ) || n > 1 || nextChild.isNull() );
+ emptySize += sizeof(_KeyNode);
n--;
- assert( n > 0 || nextChild.isNull() );
for ( int j = keypos; j < n; j++ )
k(j) = k(j+1);
- emptySize += sizeof(_KeyNode);
setNotPacked();
}
- /* pull rightmost key from the bucket. this version requires its right child to be null so it
- does not bother returning that value.
- */
- void BucketBasics::popBack(DiskLoc& recLoc, BSONObj& key) {
+ /**
+ * pull rightmost key from the bucket. this version requires its right child to be null so it
+ * does not bother returning that value.
+ */
+ void BucketBasics::popBack(DiskLoc& recLoc, BSONObj& key) {
massert( 10282 , "n==0 in btree popBack()", n > 0 );
assert( k(n-1).isUsed() ); // no unused skipping in this function at this point - btreebuilder doesn't require that
KeyNode kn = keyNode(n-1);
@@ -243,18 +307,18 @@ namespace mongo {
key = kn.key;
int keysize = kn.key.objsize();
- massert( 10283 , "rchild not null in btree popBack()", nextChild.isNull());
+ massert( 10283 , "rchild not null in btree popBack()", nextChild.isNull());
- /* weirdly, we also put the rightmost down pointer in nextchild, even when bucket isn't full. */
- nextChild = kn.prevChildBucket;
+ // weirdly, we also put the rightmost down pointer in nextchild, even when bucket isn't full.
+ nextChild = kn.prevChildBucket;
n--;
emptySize += sizeof(_KeyNode);
_unalloc(keysize);
}
- /* add a key. must be > all existing. be careful to set next ptr right. */
- bool BucketBasics::_pushBack(const DiskLoc& recordLoc, BSONObj& key, const Ordering &order, DiskLoc prevChild) {
+ /** add a key. must be > all existing. be careful to set next ptr right. */
+ bool BucketBasics::_pushBack(const DiskLoc recordLoc, const BSONObj& key, const Ordering &order, const DiskLoc prevChild) {
int bytesNeeded = key.objsize() + sizeof(_KeyNode);
if ( bytesNeeded > emptySize )
return false;
@@ -269,38 +333,96 @@ namespace mongo {
memcpy(p, key.objdata(), key.objsize());
return true;
}
- /*void BucketBasics::pushBack(const DiskLoc& recordLoc, BSONObj& key, const BSONObj &order, DiskLoc prevChild, DiskLoc nextChild) {
- pushBack(recordLoc, key, order, prevChild);
- childForPos(n) = nextChild;
- }*/
- /* insert a key in a bucket with no complexity -- no splits required */
- bool BucketBasics::basicInsert(const DiskLoc& thisLoc, int &keypos, const DiskLoc& recordLoc, const BSONObj& key, const Ordering &order) {
- modified(thisLoc);
+ /* durability note
+ we do separate intent declarations herein. arguably one could just declare
+ the whole bucket given we do group commits. this is something we could investigate
+ later as to what is faster under what situations.
+ */
+ /** insert a key in a bucket with no complexity -- no splits required
+ @return false if a split is required.
+ */
+ bool BucketBasics::basicInsert(const DiskLoc thisLoc, int &keypos, const DiskLoc recordLoc, const BSONObj& key, const Ordering &order) const {
assert( keypos >= 0 && keypos <= n );
int bytesNeeded = key.objsize() + sizeof(_KeyNode);
if ( bytesNeeded > emptySize ) {
- pack( order, keypos );
+ _pack(thisLoc, order, keypos);
if ( bytesNeeded > emptySize )
return false;
}
- for ( int j = n; j > keypos; j-- ) // make room
- k(j) = k(j-1);
- n++;
- emptySize -= sizeof(_KeyNode);
- _KeyNode& kn = k(keypos);
+
+ BucketBasics *b;
+ {
+ const char *p = (const char *) &k(keypos);
+ const char *q = (const char *) &k(n+1);
+ // declare that we will write to [k(keypos),k(n)]
+ // todo: this writes a medium amount to the journal. we may want to add a verb "shift" to the redo log so
+ // we can log a very small amount.
+ b = (BucketBasics*) getDur().writingAtOffset((void *) this, p-(char*)this, q-p);
+
+ // e.g. n==3, keypos==2
+ // 1 4 9
+ // ->
+ // 1 4 _ 9
+ for ( int j = n; j > keypos; j-- ) // make room
+ b->k(j) = b->k(j-1);
+ }
+
+ getDur().declareWriteIntent(&b->emptySize, 12); // [b->emptySize..b->n] is 12 bytes and we are going to write those
+ b->emptySize -= sizeof(_KeyNode);
+ b->n++;
+
+ _KeyNode& kn = b->k(keypos);
kn.prevChildBucket.Null();
kn.recordLoc = recordLoc;
- kn.setKeyDataOfs((short) _alloc(key.objsize()) );
- char *p = dataAt(kn.keyDataOfs());
+ kn.setKeyDataOfs((short) b->_alloc(key.objsize()) );
+ char *p = b->dataAt(kn.keyDataOfs());
+ getDur().declareWriteIntent(p, key.objsize());
memcpy(p, key.objdata(), key.objsize());
return true;
}
- /* when we delete things we just leave empty space until the node is
- full and then we repack it.
- */
- void BucketBasics::pack( const Ordering &order, int &refPos ) {
+ /** with this implementation, refPos == 0 disregards effect of refPos */
+ bool BucketBasics::mayDropKey( int index, int refPos ) const {
+ return index > 0 && ( index != refPos ) && k( index ).isUnused() && k( index ).prevChildBucket.isNull();
+ }
+
+ int BucketBasics::packedDataSize( int refPos ) const {
+ if ( flags & Packed ) {
+ return BucketSize - emptySize - headerSize();
+ }
+ int size = 0;
+ for( int j = 0; j < n; ++j ) {
+ if ( mayDropKey( j, refPos ) ) {
+ continue;
+ }
+ size += keyNode( j ).key.objsize() + sizeof( _KeyNode );
+ }
+ return size;
+ }
+
+ /**
+ * when we delete things we just leave empty space until the node is
+ * full and then we repack it.
+ */
+ void BucketBasics::_pack(const DiskLoc thisLoc, const Ordering &order, int &refPos) const {
+ if ( flags & Packed )
+ return;
+
+ VERIFYTHISLOC
+
+ /** TODO perhaps this can be optimized. for example if packing does no write, we can skip intent decl.
+ an empirical approach is probably best than just adding new code : perhaps the bucket would need
+ declaration anyway within the group commit interval, in which case we would just be adding
+ code and complexity without benefit.
+ */
+ thisLoc.btreemod()->_packReadyForMod(order, refPos);
+ }
+
+ /** version when write intent already declared */
+ void BucketBasics::_packReadyForMod( const Ordering &order, int &refPos ) {
+ assertWritable();
+
if ( flags & Packed )
return;
@@ -310,7 +432,7 @@ namespace mongo {
topSize = 0;
int i = 0;
for ( int j = 0; j < n; j++ ) {
- if( j > 0 && ( j != refPos ) && k( j ).isUnused() && k( j ).prevChildBucket.isNull() ) {
+ if( mayDropKey( j, refPos ) ) {
continue; // key is unused and has no children - drop it
}
if( i != j ) {
@@ -333,26 +455,104 @@ namespace mongo {
n = i;
int dataUsed = tdz - ofs;
memcpy(data + ofs, temp + ofs, dataUsed);
+
+ // assertWritable();
+ // TEMP TEST getDur().declareWriteIntent(this, sizeof(*this));
+
emptySize = tdz - dataUsed - n * sizeof(_KeyNode);
assert( emptySize >= 0 );
setPacked();
+
assertValid( order );
}
inline void BucketBasics::truncateTo(int N, const Ordering &order, int &refPos) {
+ dbMutex.assertWriteLocked();
+ assertWritable();
+
n = N;
setNotPacked();
- pack( order, refPos );
+ _packReadyForMod( order, refPos );
+ }
+
+ /**
+ * In the standard btree algorithm, we would split based on the
+ * existing keys _and_ the new key. But that's more work to
+ * implement, so we split the existing keys and then add the new key.
+ *
+ * There are several published heuristic algorithms for doing splits,
+ * but basically what you want are (1) even balancing between the two
+ * sides and (2) a small split key so the parent can have a larger
+ * branching factor.
+ *
+ * We just have a simple algorithm right now: if a key includes the
+ * halfway point (or 10% way point) in terms of bytes, split on that key;
+ * otherwise split on the key immediately to the left of the halfway
+ * point.
+ *
+ * This function is expected to be called on a packed bucket.
+ */
+ int BucketBasics::splitPos( int keypos ) const {
+ assert( n > 2 );
+ int split = 0;
+ int rightSize = 0;
+ // when splitting a btree node, if the new key is greater than all the other keys, we should not do an even split, but a 90/10 split.
+ // see SERVER-983
+ int rightSizeLimit = ( topSize + sizeof( _KeyNode ) * n ) / ( keypos == n ? 10 : 2 );
+ for( int i = n - 1; i > -1; --i ) {
+ rightSize += keyNode( i ).key.objsize() + sizeof( _KeyNode );
+ if ( rightSize > rightSizeLimit ) {
+ split = i;
+ break;
+ }
+ }
+ // safeguards - we must not create an empty bucket
+ if ( split < 1 ) {
+ split = 1;
+ }
+ else if ( split > n - 2 ) {
+ split = n - 2;
+ }
+
+ return split;
+ }
+
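    // Illustrative walk-through of splitPos() (key sizes are made up): suppose a
    // packed bucket holds n == 5 keys of 100 bytes each with sizeof(_KeyNode) == 16,
    // so topSize + n * sizeof(_KeyNode) == 580.
    //  - interior insert (keypos < n): rightSizeLimit == 580 / 2 == 290; scanning
    //    from the right, rightSize reaches 348 at i == 2, so split == 2 (roughly
    //    even halves).
    //  - rightmost insert (keypos == n): rightSizeLimit == 580 / 10 == 58; the first
    //    key scanned already exceeds it, giving split == 4, clamped to n - 2 == 3,
    //    i.e. the 90/10 split described in SERVER-983.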
+ void BucketBasics::reserveKeysFront( int nAdd ) {
+ assert( emptySize >= int( sizeof( _KeyNode ) * nAdd ) );
+ emptySize -= sizeof( _KeyNode ) * nAdd;
+ for( int i = n - 1; i > -1; --i ) {
+ k( i + nAdd ) = k( i );
+ }
+ n += nAdd;
+ }
+
+ void BucketBasics::setKey( int i, const DiskLoc recordLoc, const BSONObj &key, const DiskLoc prevChildBucket ) {
+ _KeyNode &kn = k( i );
+ kn.recordLoc = recordLoc;
+ kn.prevChildBucket = prevChildBucket;
+ short ofs = (short) _alloc( key.objsize() );
+ kn.setKeyDataOfs( ofs );
+ char *p = dataAt( ofs );
+ memcpy( p, key.objdata(), key.objsize() );
+ }
+
+ void BucketBasics::dropFront( int nDrop, const Ordering &order, int &refpos ) {
+ for( int i = nDrop; i < n; ++i ) {
+ k( i - nDrop ) = k( i );
+ }
+ n -= nDrop;
+ setNotPacked();
+ _packReadyForMod( order, refpos );
}
/* - BtreeBucket --------------------------------------------------- */
- /* return largest key in the subtree. */
+ /** @return largest key in the subtree. */
void BtreeBucket::findLargestKey(const DiskLoc& thisLoc, DiskLoc& largestLoc, int& largestKey) {
DiskLoc loc = thisLoc;
while ( 1 ) {
- BtreeBucket *b = loc.btree();
+ const BtreeBucket *b = loc.btree();
if ( !b->nextChild.isNull() ) {
loc = b->nextChild;
continue;
@@ -365,23 +565,34 @@ namespace mongo {
break;
}
}
-
- int BtreeBucket::customBSONCmp( const BSONObj &l, const BSONObj &rBegin, int rBeginLen, const vector< const BSONElement * > &rEnd, const Ordering &o ) {
+
+ /**
+ * NOTE Currently the Ordering implementation assumes a compound index will
+ * not have more keys than an unsigned variable has bits. The same
+ * assumption is used in the implementation below with respect to the 'mask'
+ * variable.
+ */
+ int BtreeBucket::customBSONCmp( const BSONObj &l, const BSONObj &rBegin, int rBeginLen, bool rSup, const vector< const BSONElement * > &rEnd, const vector< bool > &rEndInclusive, const Ordering &o, int direction ) {
BSONObjIterator ll( l );
BSONObjIterator rr( rBegin );
vector< const BSONElement * >::const_iterator rr2 = rEnd.begin();
+ vector< bool >::const_iterator inc = rEndInclusive.begin();
unsigned mask = 1;
for( int i = 0; i < rBeginLen; ++i, mask <<= 1 ) {
BSONElement lll = ll.next();
BSONElement rrr = rr.next();
++rr2;
-
+ ++inc;
+
int x = lll.woCompare( rrr, false );
if ( o.descending( mask ) )
x = -x;
if ( x != 0 )
return x;
}
+ if ( rSup ) {
+ return -direction;
+ }
for( ; ll.more(); mask <<= 1 ) {
BSONElement lll = ll.next();
BSONElement rrr = **rr2;
@@ -391,11 +602,15 @@ namespace mongo {
x = -x;
if ( x != 0 )
return x;
+ if ( !*inc ) {
+ return -direction;
+ }
+ ++inc;
}
return 0;
}
- bool BtreeBucket::exists(const IndexDetails& idx, DiskLoc thisLoc, const BSONObj& key, const Ordering& order) {
+ bool BtreeBucket::exists(const IndexDetails& idx, const DiskLoc &thisLoc, const BSONObj& key, const Ordering& order) const {
int pos;
bool found;
DiskLoc b = locate(idx, thisLoc, key, order, pos, found, minDiskLoc);
@@ -404,8 +619,8 @@ namespace mongo {
while ( 1 ) {
if( b.isNull() )
break;
- BtreeBucket *bucket = b.btree();
- _KeyNode& kn = bucket->k(pos);
+ const BtreeBucket *bucket = b.btree();
+ const _KeyNode& kn = bucket->k(pos);
if ( kn.isUsed() )
return bucket->keyAt(pos).woEqual(key);
b = bucket->advance(b, pos, 1, "BtreeBucket::exists");
@@ -413,22 +628,22 @@ namespace mongo {
return false;
}
- /* @param self - don't complain about ourself already being in the index case.
- @return true = there is a duplicate.
- */
+ /**
+ * @param self - don't complain about ourself already being in the index case.
+ * @return true = there is a duplicate.
+ */
bool BtreeBucket::wouldCreateDup(
- const IndexDetails& idx, DiskLoc thisLoc,
+ const IndexDetails& idx, const DiskLoc &thisLoc,
const BSONObj& key, const Ordering& order,
- DiskLoc self)
- {
+ const DiskLoc &self) const {
int pos;
bool found;
DiskLoc b = locate(idx, thisLoc, key, order, pos, found, minDiskLoc);
while ( !b.isNull() ) {
// we skip unused keys
- BtreeBucket *bucket = b.btree();
- _KeyNode& kn = bucket->k(pos);
+ const BtreeBucket *bucket = b.btree();
+ const _KeyNode& kn = bucket->k(pos);
if ( kn.isUsed() ) {
if( bucket->keyAt(pos).woEqual(key) )
return kn.recordLoc != self;
@@ -440,7 +655,7 @@ namespace mongo {
return false;
}
- string BtreeBucket::dupKeyError( const IndexDetails& idx , const BSONObj& key ){
+ string BtreeBucket::dupKeyError( const IndexDetails& idx , const BSONObj& key ) {
stringstream ss;
ss << "E11000 duplicate key error ";
ss << "index: " << idx.indexNamespace() << " ";
@@ -448,37 +663,38 @@ namespace mongo {
return ss.str();
}
- /* Find a key withing this btree bucket.
-
- When duplicate keys are allowed, we use the DiskLoc of the record as if it were part of the
- key. That assures that even when there are many duplicates (e.g., 1 million) for a key,
- our performance is still good.
-
- assertIfDup: if the key exists (ignoring the recordLoc), uassert
-
- pos: for existing keys k0...kn-1.
- returns # it goes BEFORE. so key[pos-1] < key < key[pos]
- returns n if it goes after the last existing key.
- note result might be an Unused location!
- */
- char foo;
- bool BtreeBucket::find(const IndexDetails& idx, const BSONObj& key, DiskLoc recordLoc, const Ordering &order, int& pos, bool assertIfDup) {
+ /**
+     * Find a key within this btree bucket.
+ *
+ * When duplicate keys are allowed, we use the DiskLoc of the record as if it were part of the
+ * key. That assures that even when there are many duplicates (e.g., 1 million) for a key,
+ * our performance is still good.
+ *
+ * assertIfDup: if the key exists (ignoring the recordLoc), uassert
+ *
+ * pos: for existing keys k0...kn-1.
+ * returns # it goes BEFORE. so key[pos-1] < key < key[pos]
+ * returns n if it goes after the last existing key.
+ * note result might be an Unused location!
+ */
+ char foo;
+ bool BtreeBucket::find(const IndexDetails& idx, const BSONObj& key, const DiskLoc &recordLoc, const Ordering &order, int& pos, bool assertIfDup) const {
#if defined(_EXPERIMENT1)
- {
- char *z = (char *) this;
- int i = 0;
- while( 1 ) {
- i += 4096;
- if( i >= BucketSize )
- break;
- foo += z[i];
- }
- }
+ {
+ char *z = (char *) this;
+ int i = 0;
+ while( 1 ) {
+ i += 4096;
+ if( i >= BucketSize )
+ break;
+ foo += z[i];
+ }
+ }
#endif
-
+
globalIndexCounters.btree( (char*)this );
-
- /* binary search for this key */
+
+ // binary search for this key
bool dupsChecked = false;
int l=0;
int h=n-1;
@@ -486,13 +702,13 @@ namespace mongo {
int m = (l+h)/2;
KeyNode M = keyNode(m);
int x = key.woCompare(M.key, order);
- if ( x == 0 ) {
+ if ( x == 0 ) {
if( assertIfDup ) {
- if( k(m).isUnused() ) {
- // ok that key is there if unused. but we need to check that there aren't other
- // entries for the key then. as it is very rare that we get here, we don't put any
+ if( k(m).isUnused() ) {
+ // ok that key is there if unused. but we need to check that there aren't other
+ // entries for the key then. as it is very rare that we get here, we don't put any
// coding effort in here to make this particularly fast
- if( !dupsChecked ) {
+ if( !dupsChecked ) {
dupsChecked = true;
if( idx.head.btree()->exists(idx, idx.head, key, order) ) {
if( idx.head.btree()->wouldCreateDup(idx, idx.head, key, order, recordLoc) )
@@ -503,7 +719,7 @@ namespace mongo {
}
}
else {
- if( M.recordLoc == recordLoc )
+ if( M.recordLoc == recordLoc )
alreadyInIndex();
uasserted( ASSERT_ID_DUPKEY , dupKeyError( idx , key ) );
}
@@ -537,86 +753,378 @@ namespace mongo {
return false;
}
- void BtreeBucket::delBucket(const DiskLoc& thisLoc, IndexDetails& id) {
+ void BtreeBucket::delBucket(const DiskLoc thisLoc, const IndexDetails& id) {
ClientCursor::informAboutToDeleteBucket(thisLoc); // slow...
assert( !isHead() );
- BtreeBucket *p = parent.btreemod();
- if ( p->nextChild == thisLoc ) {
- p->nextChild.Null();
- }
- else {
- for ( int i = 0; i < p->n; i++ ) {
- if ( p->k(i).prevChildBucket == thisLoc ) {
- p->k(i).prevChildBucket.Null();
- goto found;
- }
- }
- out() << "ERROR: can't find ref to deleted bucket.\n";
- out() << "To delete:\n";
- dump();
- out() << "Parent:\n";
- p->dump();
- assert(false);
- }
-found:
+ const BtreeBucket *p = parent.btree();
+ int parentIdx = indexInParent( thisLoc );
+ p->childForPos( parentIdx ).writing().Null();
deallocBucket( thisLoc, id );
}
-
- void BtreeBucket::deallocBucket(const DiskLoc &thisLoc, IndexDetails &id) {
+
+ void BtreeBucket::deallocBucket(const DiskLoc thisLoc, const IndexDetails &id) {
#if 0
- /* as a temporary defensive measure, we zap the whole bucket, AND don't truly delete
- it (meaning it is ineligible for reuse).
- */
+ // as a temporary defensive measure, we zap the whole bucket, AND don't truly delete
+ // it (meaning it is ineligible for reuse).
memset(this, 0, Size());
- modified(thisLoc);
#else
- //defensive:
+ // defensive:
n = -1;
parent.Null();
string ns = id.indexNamespace();
- btreeStore->deleteRecord(ns.c_str(), thisLoc);
+ theDataFileMgr._deleteRecord(nsdetails(ns.c_str()), ns.c_str(), thisLoc.rec(), thisLoc);
#endif
}
- /* note: may delete the entire bucket! this invalid upon return sometimes. */
- void BtreeBucket::delKeyAtPos(const DiskLoc& thisLoc, IndexDetails& id, int p) {
- modified(thisLoc);
+ /** note: may delete the entire bucket! this invalid upon return sometimes. */
+ void BtreeBucket::delKeyAtPos( const DiskLoc thisLoc, IndexDetails& id, int p, const Ordering &order) {
assert(n>0);
DiskLoc left = childForPos(p);
if ( n == 1 ) {
if ( left.isNull() && nextChild.isNull() ) {
- if ( isHead() )
- _delKeyAtPos(p); // we don't delete the top bucket ever
- else
- delBucket(thisLoc, id);
+ _delKeyAtPos(p);
+ if ( isHead() ) {
+ // we don't delete the top bucket ever
+ }
+ else {
+ if ( !mayBalanceWithNeighbors( thisLoc, id, order ) ) {
+ // An empty bucket is only allowed as a transient state. If
+ // there are no neighbors to balance with, we delete ourself.
+ // This condition is only expected in legacy btrees.
+ delBucket(thisLoc, id);
+ }
+ }
return;
}
- markUnused(p);
+ deleteInternalKey( thisLoc, p, id, order );
return;
}
- if ( left.isNull() )
+ if ( left.isNull() ) {
_delKeyAtPos(p);
- else
- markUnused(p);
+ mayBalanceWithNeighbors( thisLoc, id, order );
+ }
+ else {
+ deleteInternalKey( thisLoc, p, id, order );
+ }
}
- int qqq = 0;
+ /**
+ * This function replaces the specified key (k) by either the prev or next
+ * key in the btree (k'). We require that k have either a left or right
+ * child. If k has a left child, we set k' to the prev key of k, which must
+ * be a leaf present in the left child. If k does not have a left child, we
+ * set k' to the next key of k, which must be a leaf present in the right
+ * child. When we replace k with k', we copy k' over k (which may cause a
+ * split) and then remove k' from its original location. Because k' is
+ * stored in a descendent of k, replacing k by k' will not modify the
+ * storage location of the original k', and we can easily remove k' from
+ * its original location.
+ *
+ * This function is only needed in cases where k has a left or right child;
+ * in other cases a simpler key removal implementation is possible.
+ *
+ * NOTE on legacy btree structures:
+ * In legacy btrees, k' can be a nonleaf. In such a case we 'delete' k by
+ * marking it as an unused node rather than replacing it with k'. Also, k'
+ * may be a leaf but marked as an unused node. In such a case we replace
+ * k by k', preserving the key's unused marking. This function is only
+ * expected to mark a key as unused when handling a legacy btree.
+ */
+ void BtreeBucket::deleteInternalKey( const DiskLoc thisLoc, int keypos, IndexDetails &id, const Ordering &order ) {
+ DiskLoc lchild = childForPos( keypos );
+ DiskLoc rchild = childForPos( keypos + 1 );
+ assert( !lchild.isNull() || !rchild.isNull() );
+ int advanceDirection = lchild.isNull() ? 1 : -1;
+ int advanceKeyOfs = keypos;
+ DiskLoc advanceLoc = advance( thisLoc, advanceKeyOfs, advanceDirection, __FUNCTION__ );
+
+ if ( !advanceLoc.btree()->childForPos( advanceKeyOfs ).isNull() ||
+ !advanceLoc.btree()->childForPos( advanceKeyOfs + 1 ).isNull() ) {
+ // only expected with legacy btrees, see note above
+ markUnused( keypos );
+ return;
+ }
- /* remove a key from the index */
- bool BtreeBucket::unindex(const DiskLoc& thisLoc, IndexDetails& id, BSONObj& key, const DiskLoc& recordLoc ) {
- if ( key.objsize() > KeyMax ) {
- OCCASIONALLY problem() << "unindex: key too large to index, skipping " << id.indexNamespace() << /* ' ' << key.toString() << */ endl;
+ KeyNode kn = advanceLoc.btree()->keyNode( advanceKeyOfs );
+ setInternalKey( thisLoc, keypos, kn.recordLoc, kn.key, order, childForPos( keypos ), childForPos( keypos + 1 ), id );
+ advanceLoc.btreemod()->delKeyAtPos( advanceLoc, id, advanceKeyOfs, order );
+ }
+
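    // Illustrative example of deleteInternalKey() (hypothetical keys): deleting the
    // internal key 40 from
    //
    //        [ 20 | 40 | 60 ]
    //              /    \
    //        [30 35]    [45 50]
    //
    // 40 has a left child, so its predecessor k' == 35 (a leaf key) is located via
    // advance(), copied over 40 with setInternalKey() -- giving [ 20 | 35 | 60 ] --
    // and then 35 is removed from the leaf [30 35], which may in turn trigger the
    // balance/merge logic via delKeyAtPos().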
+ void BtreeBucket::replaceWithNextChild( const DiskLoc thisLoc, IndexDetails &id ) {
+ assert( n == 0 && !nextChild.isNull() );
+ if ( parent.isNull() ) {
+ assert( id.head == thisLoc );
+ id.head.writing() = nextChild;
+ }
+ else {
+ parent.btree()->childForPos( indexInParent( thisLoc ) ).writing() = nextChild;
+ }
+ nextChild.btree()->parent.writing() = parent;
+ ClientCursor::informAboutToDeleteBucket( thisLoc );
+ deallocBucket( thisLoc, id );
+ }
+
+ bool BtreeBucket::canMergeChildren( const DiskLoc &thisLoc, int leftIndex ) const {
+ assert( leftIndex >= 0 && leftIndex < n );
+ DiskLoc leftNodeLoc = childForPos( leftIndex );
+ DiskLoc rightNodeLoc = childForPos( leftIndex + 1 );
+ if ( leftNodeLoc.isNull() || rightNodeLoc.isNull() ) {
+ // TODO if this situation is possible in long term implementation, maybe we should compact somehow anyway
return false;
}
+ int pos = 0;
+ {
+ const BtreeBucket *l = leftNodeLoc.btree();
+ const BtreeBucket *r = rightNodeLoc.btree();
+ if ( ( headerSize() + l->packedDataSize( pos ) + r->packedDataSize( pos ) + keyNode( leftIndex ).key.objsize() + sizeof(_KeyNode) > unsigned( BucketSize ) ) ) {
+ return false;
+ }
+ }
+ return true;
+ }
+ /**
+ * This implementation must respect the meaning and value of lowWaterMark.
+ * Also see comments in splitPos().
+ */
+ int BtreeBucket::rebalancedSeparatorPos( const DiskLoc &thisLoc, int leftIndex ) const {
+ int split = -1;
+ int rightSize = 0;
+ const BtreeBucket *l = childForPos( leftIndex ).btree();
+ const BtreeBucket *r = childForPos( leftIndex + 1 ).btree();
+
+ int KNS = sizeof( _KeyNode );
+ int rightSizeLimit = ( l->topSize + l->n * KNS + keyNode( leftIndex ).key.objsize() + KNS + r->topSize + r->n * KNS ) / 2;
+ // This constraint should be ensured by only calling this function
+ // if we go below the low water mark.
+ assert( rightSizeLimit < BtreeBucket::bodySize() );
+ for( int i = r->n - 1; i > -1; --i ) {
+ rightSize += r->keyNode( i ).key.objsize() + KNS;
+ if ( rightSize > rightSizeLimit ) {
+ split = l->n + 1 + i;
+ break;
+ }
+ }
+ if ( split == -1 ) {
+ rightSize += keyNode( leftIndex ).key.objsize() + KNS;
+ if ( rightSize > rightSizeLimit ) {
+ split = l->n;
+ }
+ }
+ if ( split == -1 ) {
+ for( int i = l->n - 1; i > -1; --i ) {
+ rightSize += l->keyNode( i ).key.objsize() + KNS;
+ if ( rightSize > rightSizeLimit ) {
+ split = i;
+ break;
+ }
+ }
+ }
+ // safeguards - we must not create an empty bucket
+ if ( split < 1 ) {
+ split = 1;
+ }
+ else if ( split > l->n + 1 + r->n - 2 ) {
+ split = l->n + 1 + r->n - 2;
+ }
+
+ return split;
+ }
+
+ void BtreeBucket::doMergeChildren( const DiskLoc thisLoc, int leftIndex, IndexDetails &id, const Ordering &order ) {
+ DiskLoc leftNodeLoc = childForPos( leftIndex );
+ DiskLoc rightNodeLoc = childForPos( leftIndex + 1 );
+ BtreeBucket *l = leftNodeLoc.btreemod();
+ BtreeBucket *r = rightNodeLoc.btreemod();
+ int pos = 0;
+ l->_packReadyForMod( order, pos );
+ r->_packReadyForMod( order, pos ); // pack r in case there are droppable keys
+
+ int oldLNum = l->n;
+ {
+ KeyNode kn = keyNode( leftIndex );
+ l->pushBack( kn.recordLoc, kn.key, order, l->nextChild ); // left child's right child becomes old parent key's left child
+ }
+ for( int i = 0; i < r->n; ++i ) {
+ KeyNode kn = r->keyNode( i );
+ l->pushBack( kn.recordLoc, kn.key, order, kn.prevChildBucket );
+ }
+ l->nextChild = r->nextChild;
+ l->fixParentPtrs( leftNodeLoc, oldLNum );
+ r->delBucket( rightNodeLoc, id );
+ childForPos( leftIndex + 1 ) = leftNodeLoc;
+ childForPos( leftIndex ) = DiskLoc();
+ _delKeyAtPos( leftIndex, true );
+ if ( n == 0 ) {
+ // will trash this and thisLoc
+ // TODO To ensure all leaves are of equal height, we should ensure
+ // this is only called on the root.
+ replaceWithNextChild( thisLoc, id );
+ }
+ else {
+ // balance recursively - maybe we should do this even when n == 0?
+ mayBalanceWithNeighbors( thisLoc, id, order );
+ }
+ }
+
+ int BtreeBucket::indexInParent( const DiskLoc &thisLoc ) const {
+ assert( !parent.isNull() );
+ const BtreeBucket *p = parent.btree();
+ if ( p->nextChild == thisLoc ) {
+ return p->n;
+ }
+ else {
+ for( int i = 0; i < p->n; ++i ) {
+ if ( p->k( i ).prevChildBucket == thisLoc ) {
+ return i;
+ }
+ }
+ }
+ out() << "ERROR: can't find ref to child bucket.\n";
+ out() << "child: " << thisLoc << "\n";
+ dump();
+ out() << "Parent: " << parent << "\n";
+ p->dump();
+ assert(false);
+ return -1; // just to compile
+ }
+
+ bool BtreeBucket::tryBalanceChildren( const DiskLoc thisLoc, int leftIndex, IndexDetails &id, const Ordering &order ) const {
+ // If we can merge, then we must merge rather than balance to preserve
+ // bucket utilization constraints.
+ if ( canMergeChildren( thisLoc, leftIndex ) ) {
+ return false;
+ }
+ thisLoc.btreemod()->doBalanceChildren( thisLoc, leftIndex, id, order );
+ return true;
+ }
+
+ void BtreeBucket::doBalanceLeftToRight( const DiskLoc thisLoc, int leftIndex, int split,
+ BtreeBucket *l, const DiskLoc lchild,
+ BtreeBucket *r, const DiskLoc rchild,
+ IndexDetails &id, const Ordering &order ) {
+ // TODO maybe do some audits the same way pushBack() does?
+ int rAdd = l->n - split;
+ r->reserveKeysFront( rAdd );
+ for( int i = split + 1, j = 0; i < l->n; ++i, ++j ) {
+ KeyNode kn = l->keyNode( i );
+ r->setKey( j, kn.recordLoc, kn.key, kn.prevChildBucket );
+ }
+ {
+ KeyNode kn = keyNode( leftIndex );
+ r->setKey( rAdd - 1, kn.recordLoc, kn.key, l->nextChild ); // left child's right child becomes old parent key's left child
+ }
+ r->fixParentPtrs( rchild, 0, rAdd - 1 );
+ {
+ KeyNode kn = l->keyNode( split );
+ l->nextChild = kn.prevChildBucket;
+ setInternalKey( thisLoc, leftIndex, kn.recordLoc, kn.key, order, lchild, rchild, id );
+ }
+ int zeropos = 0;
+ l->truncateTo( split, order, zeropos );
+ }
+
+ void BtreeBucket::doBalanceRightToLeft( const DiskLoc thisLoc, int leftIndex, int split,
+ BtreeBucket *l, const DiskLoc lchild,
+ BtreeBucket *r, const DiskLoc rchild,
+ IndexDetails &id, const Ordering &order ) {
+ int lN = l->n;
+ {
+ KeyNode kn = keyNode( leftIndex );
+ l->pushBack( kn.recordLoc, kn.key, order, l->nextChild ); // left child's right child becomes old parent key's left child
+ }
+ for( int i = 0; i < split - lN - 1; ++i ) {
+ KeyNode kn = r->keyNode( i );
+ l->pushBack( kn.recordLoc, kn.key, order, kn.prevChildBucket );
+ }
+ {
+ KeyNode kn = r->keyNode( split - lN - 1 );
+ l->nextChild = kn.prevChildBucket;
+ l->fixParentPtrs( lchild, lN + 1, l->n );
+ setInternalKey( thisLoc, leftIndex, kn.recordLoc, kn.key, order, lchild, rchild, id );
+ }
+ int zeropos = 0;
+ r->dropFront( split - lN, order, zeropos );
+ }
+
+ void BtreeBucket::doBalanceChildren( const DiskLoc thisLoc, int leftIndex, IndexDetails &id, const Ordering &order ) {
+ DiskLoc lchild = childForPos( leftIndex );
+ DiskLoc rchild = childForPos( leftIndex + 1 );
+ int zeropos = 0;
+ BtreeBucket *l = lchild.btreemod();
+ l->_packReadyForMod( order, zeropos );
+ BtreeBucket *r = rchild.btreemod();
+ r->_packReadyForMod( order, zeropos );
+ int split = rebalancedSeparatorPos( thisLoc, leftIndex );
+
+ // By definition, if we are below the low water mark and cannot merge
+ // then we must actively balance.
+ assert( split != l->n );
+ if ( split < l->n ) {
+ doBalanceLeftToRight( thisLoc, leftIndex, split, l, lchild, r, rchild, id, order );
+ }
+ else {
+ doBalanceRightToLeft( thisLoc, leftIndex, split, l, lchild, r, rchild, id, order );
+ }
+ }
+
+ bool BtreeBucket::mayBalanceWithNeighbors( const DiskLoc thisLoc, IndexDetails &id, const Ordering &order ) const {
+ if ( parent.isNull() ) { // we are root, there are no neighbors
+ return false;
+ }
+
+ if ( packedDataSize( 0 ) >= lowWaterMark ) {
+ return false;
+ }
+
+ const BtreeBucket *p = parent.btree();
+ int parentIdx = indexInParent( thisLoc );
+
+ // TODO will missing neighbor case be possible long term? Should we try to merge/balance somehow in that case if so?
+ bool mayBalanceRight = ( ( parentIdx < p->n ) && !p->childForPos( parentIdx + 1 ).isNull() );
+ bool mayBalanceLeft = ( ( parentIdx > 0 ) && !p->childForPos( parentIdx - 1 ).isNull() );
+
+ // Balance if possible on one side - we merge only if absolutely necessary
+ // to preserve btree bucket utilization constraints since that's a more
+ // heavy duty operation (especially if we must re-split later).
+ if ( mayBalanceRight &&
+ p->tryBalanceChildren( parent, parentIdx, id, order ) ) {
+ return true;
+ }
+ if ( mayBalanceLeft &&
+ p->tryBalanceChildren( parent, parentIdx - 1, id, order ) ) {
+ return true;
+ }
+
+ BtreeBucket *pm = parent.btreemod();
+ if ( mayBalanceRight ) {
+ pm->doMergeChildren( parent, parentIdx, id, order );
+ return true;
+ }
+ else if ( mayBalanceLeft ) {
+ pm->doMergeChildren( parent, parentIdx - 1, id, order );
+ return true;
+ }
+
+ return false;
+ }
+
+ /** remove a key from the index */
+ bool BtreeBucket::unindex(const DiskLoc thisLoc, IndexDetails& id, const BSONObj& key, const DiskLoc recordLoc ) const {
int pos;
bool found;
DiskLoc loc = locate(id, thisLoc, key, Ordering::make(id.keyPattern()), pos, found, recordLoc, 1);
if ( found ) {
- loc.btree()->delKeyAtPos(loc, id, pos);
+
+ if ( key.objsize() > KeyMax ) {
+ OCCASIONALLY problem() << "unindex: key too large to index but was found for " << id.indexNamespace() << " reIndex suggested" << endl;
+ }
+
+ loc.btreemod()->delKeyAtPos(loc, id, pos, Ordering::make(id.keyPattern()));
+
return true;
}
return false;
@@ -628,40 +1136,68 @@ found:
return b;
}
- inline void fix(const DiskLoc& thisLoc, const DiskLoc& child) {
+ inline void BtreeBucket::fix(const DiskLoc thisLoc, const DiskLoc child) {
if ( !child.isNull() ) {
if ( insert_debug )
out() << " " << child.toString() << ".parent=" << thisLoc.toString() << endl;
- child.btreemod()->parent = thisLoc;
+ child.btree()->parent.writing() = thisLoc;
}
}
- /* this sucks. maybe get rid of parent ptrs. */
- void BtreeBucket::fixParentPtrs(const DiskLoc& thisLoc) {
+ /** this sucks. maybe get rid of parent ptrs. */
+ void BtreeBucket::fixParentPtrs(const DiskLoc thisLoc, int firstIndex, int lastIndex) const {
VERIFYTHISLOC
- fix(thisLoc, nextChild);
- for ( int i = 0; i < n; i++ )
- fix(thisLoc, k(i).prevChildBucket);
+ if ( lastIndex == -1 ) {
+ lastIndex = n;
+ }
+ for ( int i = firstIndex; i <= lastIndex; i++ ) {
+ fix(thisLoc, childForPos(i));
+ }
}
- /* insert a key in this bucket, splitting if necessary.
- keypos - where to insert the key i3n range 0..n. 0=make leftmost, n=make rightmost.
- NOTE this function may free some data, and as a result the value passed for keypos may
- be invalid after calling insertHere()
- */
- void BtreeBucket::insertHere(DiskLoc thisLoc, int keypos,
- DiskLoc recordLoc, const BSONObj& key, const Ordering& order,
- DiskLoc lchild, DiskLoc rchild, IndexDetails& idx)
- {
- modified(thisLoc);
+ void BtreeBucket::setInternalKey( const DiskLoc thisLoc, int keypos,
+ const DiskLoc recordLoc, const BSONObj &key, const Ordering &order,
+ const DiskLoc lchild, const DiskLoc rchild, IndexDetails &idx ) {
+ childForPos( keypos ).Null();
+
+ // This may leave the bucket empty (n == 0) which is ok only as a
+ // transient state. In the instant case, the implementation of
+ // insertHere behaves correctly when n == 0 and as a side effect
+ // increments n.
+ _delKeyAtPos( keypos, true );
+
+ // Ensure we do not orphan neighbor's old child.
+ assert( childForPos( keypos ) == rchild );
+
+ // Just set temporarily - required to pass validation in insertHere()
+ childForPos( keypos ) = lchild;
+
+ insertHere( thisLoc, keypos, recordLoc, key, order, lchild, rchild, idx );
+ }
+
+ /**
+ * insert a key in this bucket, splitting if necessary.
+ * @keypos - where to insert the key in range 0..n. 0=make leftmost, n=make rightmost.
+ * NOTE this function may free some data, and as a result the value passed for keypos may
+ * be invalid after calling insertHere()
+ */
+ void BtreeBucket::insertHere( const DiskLoc thisLoc, int keypos,
+ const DiskLoc recordLoc, const BSONObj& key, const Ordering& order,
+ const DiskLoc lchild, const DiskLoc rchild, IndexDetails& idx) const {
if ( insert_debug )
out() << " " << thisLoc.toString() << ".insertHere " << key.toString() << '/' << recordLoc.toString() << ' '
- << lchild.toString() << ' ' << rchild.toString() << " keypos:" << keypos << endl;
+ << lchild.toString() << ' ' << rchild.toString() << " keypos:" << keypos << endl;
DiskLoc oldLoc = thisLoc;
- if ( basicInsert(thisLoc, keypos, recordLoc, key, order) ) {
- _KeyNode& kn = k(keypos);
+ if ( !basicInsert(thisLoc, keypos, recordLoc, key, order) ) {
+ thisLoc.btreemod()->split(thisLoc, keypos, recordLoc, key, order, lchild, rchild, idx);
+ return;
+ }
+
+ {
+ const _KeyNode *_kn = &k(keypos);
+ _KeyNode *kn = (_KeyNode *) getDur().alreadyDeclared((_KeyNode*) _kn); // already declared intent in basicInsert()
if ( keypos+1 == n ) { // last key
if ( nextChild != lchild ) {
out() << "ERROR nextChild != lchild" << endl;
@@ -671,22 +1207,16 @@ found:
out() << " recordLoc: " << recordLoc.toString() << " rchild: " << rchild.toString() << endl;
out() << " key: " << key.toString() << endl;
dump();
-#if 0
- out() << "\n\nDUMPING FULL INDEX" << endl;
- bt_dmp=1;
- bt_fv=1;
- idx.head.btree()->fullValidate(idx.head);
-#endif
assert(false);
}
- kn.prevChildBucket = nextChild;
- assert( kn.prevChildBucket == lchild );
- nextChild = rchild;
+ kn->prevChildBucket = nextChild;
+ assert( kn->prevChildBucket == lchild );
+ nextChild.writing() = rchild;
if ( !rchild.isNull() )
- rchild.btreemod()->parent = thisLoc;
+ rchild.btree()->parent.writing() = thisLoc;
}
else {
- k(keypos).prevChildBucket = lchild;
+ kn->prevChildBucket = lchild;
if ( k(keypos+1).prevChildBucket != lchild ) {
out() << "ERROR k(keypos+1).prevChildBucket != lchild" << endl;
out() << " thisLoc: " << thisLoc.toString() << ' ' << idx.indexNamespace() << endl;
@@ -695,33 +1225,24 @@ found:
out() << " recordLoc: " << recordLoc.toString() << " rchild: " << rchild.toString() << endl;
out() << " key: " << key.toString() << endl;
dump();
-#if 0
- out() << "\n\nDUMPING FULL INDEX" << endl;
- bt_dmp=1;
- bt_fv=1;
- idx.head.btree()->fullValidate(idx.head);
-#endif
assert(false);
}
- k(keypos+1).prevChildBucket = rchild;
+ const DiskLoc *pc = &k(keypos+1).prevChildBucket;
+ *getDur().alreadyDeclared((DiskLoc*) pc) = rchild; // declared in basicInsert()
if ( !rchild.isNull() )
- rchild.btreemod()->parent = thisLoc;
+ rchild.btree()->parent.writing() = thisLoc;
}
return;
}
+ }
- /* ---------- split ---------------- */
+ void BtreeBucket::split(const DiskLoc thisLoc, int keypos, const DiskLoc recordLoc, const BSONObj& key, const Ordering& order, const DiskLoc lchild, const DiskLoc rchild, IndexDetails& idx) {
+ assertWritable();
if ( split_debug )
out() << " " << thisLoc.toString() << ".split" << endl;
- int split = n / 2;
- if ( keypos == n ) { // see SERVER-983
- split = (int) (0.9 * n);
- if ( split > n - 2 )
- split = n - 2;
- }
-
+ int split = splitPos( keypos );
DiskLoc rLoc = addBucket(idx);
BtreeBucket *r = rLoc.btreemod();
if ( split_debug )
@@ -753,15 +1274,14 @@ found:
p->pushBack(splitkey.recordLoc, splitkey.key, order, thisLoc);
p->nextChild = rLoc;
p->assertValid( order );
- parent = idx.head = L;
+ parent = idx.head.writing() = L;
if ( split_debug )
out() << " we were root, making new root:" << hex << parent.getOfs() << dec << endl;
- rLoc.btreemod()->parent = parent;
+ rLoc.btree()->parent.writing() = parent;
}
else {
- /* set this before calling _insert - if it splits it will do fixParent() logic and change the value.
- */
- rLoc.btreemod()->parent = parent;
+ // set this before calling _insert - if it splits it will do fixParent() logic and change the value.
+ rLoc.btree()->parent.writing() = parent;
if ( split_debug )
out() << " promoting splitkey key " << splitkey.key.toString() << endl;
parent.btree()->_insert(parent, splitkey.recordLoc, splitkey.key, order, /*dupsallowed*/true, thisLoc, rLoc, idx);
@@ -769,16 +1289,17 @@ found:
}
int newpos = keypos;
+ // note this may trash splitkey.key. thus we had to promote it before finishing up here.
truncateTo(split, order, newpos); // note this may trash splitkey.key. thus we had to promote it before finishing up here.
// add our new key, there is room now
{
-
if ( keypos <= split ) {
if ( split_debug )
out() << " keypos<split, insertHere() the new key" << endl;
insertHere(thisLoc, newpos, recordLoc, key, order, lchild, rchild, idx);
- } else {
+ }
+ else {
int kp = keypos-split-1;
assert(kp>=0);
rLoc.btree()->insertHere(rLoc, kp, recordLoc, key, order, lchild, rchild, idx);
@@ -789,26 +1310,27 @@ found:
out() << " split end " << hex << thisLoc.getOfs() << dec << endl;
}
- /* start a new index off, empty */
- DiskLoc BtreeBucket::addBucket(IndexDetails& id) {
- DiskLoc loc = btreeStore->insert(id.indexNamespace().c_str(), 0, BucketSize, true);
+ /** start a new index off, empty */
+ DiskLoc BtreeBucket::addBucket(const IndexDetails& id) {
+ string ns = id.indexNamespace();
+ DiskLoc loc = theDataFileMgr.insert(ns.c_str(), 0, BucketSize, true);
BtreeBucket *b = loc.btreemod();
b->init();
return loc;
}
void BtreeBucket::renameIndexNamespace(const char *oldNs, const char *newNs) {
- btreeStore->rename( oldNs, newNs );
+ renameNamespace( oldNs, newNs );
}
- DiskLoc BtreeBucket::getHead(const DiskLoc& thisLoc) {
+ const DiskLoc BtreeBucket::getHead(const DiskLoc& thisLoc) const {
DiskLoc p = thisLoc;
while ( !p.btree()->isHead() )
p = p.btree()->parent;
return p;
}
- DiskLoc BtreeBucket::advance(const DiskLoc& thisLoc, int& keyOfs, int direction, const char *caller) {
+ DiskLoc BtreeBucket::advance(const DiskLoc& thisLoc, int& keyOfs, int direction, const char *caller) const {
if ( keyOfs < 0 || keyOfs >= n ) {
out() << "ASSERT failure BtreeBucket::advance, caller: " << caller << endl;
out() << " thisLoc: " << thisLoc.toString() << endl;
@@ -841,7 +1363,7 @@ found:
while ( 1 ) {
if ( ancestor.isNull() )
break;
- BtreeBucket *an = ancestor.btree();
+ const BtreeBucket *an = ancestor.btree();
for ( int i = 0; i < an->n; i++ ) {
if ( an->childForPos(i+adj) == childLoc ) {
keyOfs = i;
@@ -857,7 +1379,7 @@ found:
return DiskLoc();
}
- DiskLoc BtreeBucket::locate(const IndexDetails& idx, const DiskLoc& thisLoc, const BSONObj& key, const Ordering &order, int& pos, bool& found, DiskLoc recordLoc, int direction) {
+ DiskLoc BtreeBucket::locate(const IndexDetails& idx, const DiskLoc& thisLoc, const BSONObj& key, const Ordering &order, int& pos, bool& found, const DiskLoc &recordLoc, int direction) const {
int p;
found = find(idx, key, recordLoc, order, p, /*assertIfDup*/ false);
if ( found ) {
@@ -880,7 +1402,7 @@ found:
return pos == n ? DiskLoc() /*theend*/ : thisLoc;
}
- bool BtreeBucket::customFind( int l, int h, const BSONObj &keyBegin, int keyBeginLen, const vector< const BSONElement * > &keyEnd, const Ordering &order, int direction, DiskLoc &thisLoc, int &keyOfs, pair< DiskLoc, int > &bestParent ) {
+ bool BtreeBucket::customFind( int l, int h, const BSONObj &keyBegin, int keyBeginLen, bool afterKey, const vector< const BSONElement * > &keyEnd, const vector< bool > &keyEndInclusive, const Ordering &order, int direction, DiskLoc &thisLoc, int &keyOfs, pair< DiskLoc, int > &bestParent ) const {
while( 1 ) {
if ( l + 1 == h ) {
keyOfs = ( direction > 0 ) ? h : l;
@@ -889,101 +1411,123 @@ found:
bestParent = make_pair( thisLoc, keyOfs );
thisLoc = next;
return true;
- } else {
+ }
+ else {
return false;
}
}
int m = l + ( h - l ) / 2;
- int cmp = customBSONCmp( thisLoc.btree()->keyNode( m ).key, keyBegin, keyBeginLen, keyEnd, order );
+ int cmp = customBSONCmp( thisLoc.btree()->keyNode( m ).key, keyBegin, keyBeginLen, afterKey, keyEnd, keyEndInclusive, order, direction );
if ( cmp < 0 ) {
l = m;
- } else if ( cmp > 0 ) {
+ }
+ else if ( cmp > 0 ) {
h = m;
- } else {
+ }
+ else {
if ( direction < 0 ) {
l = m;
- } else {
+ }
+ else {
h = m;
}
}
- }
+ }
}
-
- // find smallest/biggest value greater-equal/less-equal than specified
- // starting thisLoc + keyOfs will be strictly less than/strictly greater than keyBegin/keyBeginLen/keyEnd
- // All the direction checks below allowed me to refactor the code, but possibly separate forward and reverse implementations would be more efficient
- void BtreeBucket::advanceTo(const IndexDetails &id, DiskLoc &thisLoc, int &keyOfs, const BSONObj &keyBegin, int keyBeginLen, const vector< const BSONElement * > &keyEnd, const Ordering &order, int direction ) {
+
+ /**
+ * find smallest/biggest value greater-equal/less-equal than specified
+ * starting thisLoc + keyOfs will be strictly less than/strictly greater than keyBegin/keyBeginLen/keyEnd
+ * All the direction checks below allowed me to refactor the code, but possibly separate forward and reverse implementations would be more efficient
+ */
+ void BtreeBucket::advanceTo(DiskLoc &thisLoc, int &keyOfs, const BSONObj &keyBegin, int keyBeginLen, bool afterKey, const vector< const BSONElement * > &keyEnd, const vector< bool > &keyEndInclusive, const Ordering &order, int direction ) const {
int l,h;
bool dontGoUp;
if ( direction > 0 ) {
l = keyOfs;
h = n - 1;
- dontGoUp = ( customBSONCmp( keyNode( h ).key, keyBegin, keyBeginLen, keyEnd, order ) >= 0 );
- } else {
+ dontGoUp = ( customBSONCmp( keyNode( h ).key, keyBegin, keyBeginLen, afterKey, keyEnd, keyEndInclusive, order, direction ) >= 0 );
+ }
+ else {
l = 0;
h = keyOfs;
- dontGoUp = ( customBSONCmp( keyNode( l ).key, keyBegin, keyBeginLen, keyEnd, order ) <= 0 );
+ dontGoUp = ( customBSONCmp( keyNode( l ).key, keyBegin, keyBeginLen, afterKey, keyEnd, keyEndInclusive, order, direction ) <= 0 );
}
pair< DiskLoc, int > bestParent;
if ( dontGoUp ) {
// this comparison result assures h > l
- if ( !customFind( l, h, keyBegin, keyBeginLen, keyEnd, order, direction, thisLoc, keyOfs, bestParent ) ) {
+ if ( !customFind( l, h, keyBegin, keyBeginLen, afterKey, keyEnd, keyEndInclusive, order, direction, thisLoc, keyOfs, bestParent ) ) {
return;
}
- } else {
+ }
+ else {
// go up parents until rightmost/leftmost node is >=/<= target or at top
while( !thisLoc.btree()->parent.isNull() ) {
thisLoc = thisLoc.btree()->parent;
if ( direction > 0 ) {
- if ( customBSONCmp( thisLoc.btree()->keyNode( thisLoc.btree()->n - 1 ).key, keyBegin, keyBeginLen, keyEnd, order ) >= 0 ) {
+ if ( customBSONCmp( thisLoc.btree()->keyNode( thisLoc.btree()->n - 1 ).key, keyBegin, keyBeginLen, afterKey, keyEnd, keyEndInclusive, order, direction ) >= 0 ) {
break;
}
- } else {
- if ( customBSONCmp( thisLoc.btree()->keyNode( 0 ).key, keyBegin, keyBeginLen, keyEnd, order ) <= 0 ) {
+ }
+ else {
+ if ( customBSONCmp( thisLoc.btree()->keyNode( 0 ).key, keyBegin, keyBeginLen, afterKey, keyEnd, keyEndInclusive, order, direction ) <= 0 ) {
break;
- }
+ }
}
}
}
+ customLocate( thisLoc, keyOfs, keyBegin, keyBeginLen, afterKey, keyEnd, keyEndInclusive, order, direction, bestParent );
+ }
+
+ void BtreeBucket::customLocate(DiskLoc &thisLoc, int &keyOfs, const BSONObj &keyBegin, int keyBeginLen, bool afterKey, const vector< const BSONElement * > &keyEnd, const vector< bool > &keyEndInclusive, const Ordering &order, int direction, pair< DiskLoc, int > &bestParent ) const {
+ if ( thisLoc.btree()->n == 0 ) {
+ thisLoc = DiskLoc();
+ return;
+ }
// go down until find smallest/biggest >=/<= target
while( 1 ) {
- l = 0;
- h = thisLoc.btree()->n - 1;
+ int l = 0;
+ int h = thisLoc.btree()->n - 1;
// leftmost/rightmost key may possibly be >=/<= search key
bool firstCheck;
if ( direction > 0 ) {
- firstCheck = ( customBSONCmp( thisLoc.btree()->keyNode( 0 ).key, keyBegin, keyBeginLen, keyEnd, order ) >= 0 );
- } else {
- firstCheck = ( customBSONCmp( thisLoc.btree()->keyNode( h ).key, keyBegin, keyBeginLen, keyEnd, order ) <= 0 );
+ firstCheck = ( customBSONCmp( thisLoc.btree()->keyNode( 0 ).key, keyBegin, keyBeginLen, afterKey, keyEnd, keyEndInclusive, order, direction ) >= 0 );
+ }
+ else {
+ firstCheck = ( customBSONCmp( thisLoc.btree()->keyNode( h ).key, keyBegin, keyBeginLen, afterKey, keyEnd, keyEndInclusive, order, direction ) <= 0 );
}
if ( firstCheck ) {
DiskLoc next;
if ( direction > 0 ) {
next = thisLoc.btree()->k( 0 ).prevChildBucket;
keyOfs = 0;
- } else {
+ }
+ else {
next = thisLoc.btree()->nextChild;
keyOfs = h;
}
if ( !next.isNull() ) {
- bestParent = make_pair( thisLoc, keyOfs );
+ bestParent = pair< DiskLoc, int >( thisLoc, keyOfs );
thisLoc = next;
continue;
- } else {
+ }
+ else {
return;
}
}
bool secondCheck;
if ( direction > 0 ) {
- secondCheck = ( customBSONCmp( thisLoc.btree()->keyNode( h ).key, keyBegin, keyBeginLen, keyEnd, order ) < 0 );
- } else {
- secondCheck = ( customBSONCmp( thisLoc.btree()->keyNode( 0 ).key, keyBegin, keyBeginLen, keyEnd, order ) > 0 );
+ secondCheck = ( customBSONCmp( thisLoc.btree()->keyNode( h ).key, keyBegin, keyBeginLen, afterKey, keyEnd, keyEndInclusive, order, direction ) < 0 );
+ }
+ else {
+ secondCheck = ( customBSONCmp( thisLoc.btree()->keyNode( 0 ).key, keyBegin, keyBeginLen, afterKey, keyEnd, keyEndInclusive, order, direction ) > 0 );
}
if ( secondCheck ) {
DiskLoc next;
if ( direction > 0 ) {
next = thisLoc.btree()->nextChild;
- } else {
+ }
+ else {
next = thisLoc.btree()->k( 0 ).prevChildBucket;
}
if ( next.isNull() ) {
@@ -991,23 +1535,23 @@ found:
thisLoc = bestParent.first;
keyOfs = bestParent.second;
return;
- } else {
+ }
+ else {
thisLoc = next;
continue;
}
}
- if ( !customFind( l, h, keyBegin, keyBeginLen, keyEnd, order, direction, thisLoc, keyOfs, bestParent ) ) {
+ if ( !customFind( l, h, keyBegin, keyBeginLen, afterKey, keyEnd, keyEndInclusive, order, direction, thisLoc, keyOfs, bestParent ) ) {
return;
}
}
}
-
- /* @thisLoc disk location of *this
- */
- int BtreeBucket::_insert(DiskLoc thisLoc, DiskLoc recordLoc,
+
+ /** @thisLoc disk location of *this */
+ int BtreeBucket::_insert(const DiskLoc thisLoc, const DiskLoc recordLoc,
const BSONObj& key, const Ordering &order, bool dupsAllowed,
- DiskLoc lChild, DiskLoc rChild, IndexDetails& idx) {
+ const DiskLoc lChild, const DiskLoc rChild, IndexDetails& idx) const {
if ( key.objsize() > KeyMax ) {
problem() << "ERROR: key too large len:" << key.objsize() << " max:" << KeyMax << ' ' << key.objsize() << ' ' << idx.indexNamespace() << endl;
return 2;
@@ -1018,34 +1562,34 @@ found:
bool found = find(idx, key, recordLoc, order, pos, !dupsAllowed);
if ( insert_debug ) {
out() << " " << thisLoc.toString() << '.' << "_insert " <<
- key.toString() << '/' << recordLoc.toString() <<
- " l:" << lChild.toString() << " r:" << rChild.toString() << endl;
+ key.toString() << '/' << recordLoc.toString() <<
+ " l:" << lChild.toString() << " r:" << rChild.toString() << endl;
out() << " found:" << found << " pos:" << pos << " n:" << n << endl;
}
if ( found ) {
- _KeyNode& kn = k(pos);
+ const _KeyNode& kn = k(pos);
if ( kn.isUnused() ) {
log(4) << "btree _insert: reusing unused key" << endl;
massert( 10285 , "_insert: reuse key but lchild is not null", lChild.isNull());
massert( 10286 , "_insert: reuse key but rchild is not null", rChild.isNull());
- kn.setUsed();
+ kn.writing().setUsed();
return 0;
}
- DEV {
- out() << "_insert(): key already exists in index (ok for background:true)\n";
- out() << " " << idx.indexNamespace().c_str() << " thisLoc:" << thisLoc.toString() << '\n';
- out() << " " << key.toString() << '\n';
- out() << " " << "recordLoc:" << recordLoc.toString() << " pos:" << pos << endl;
- out() << " old l r: " << childForPos(pos).toString() << ' ' << childForPos(pos+1).toString() << endl;
- out() << " new l r: " << lChild.toString() << ' ' << rChild.toString() << endl;
+ DEV {
+ log() << "_insert(): key already exists in index (ok for background:true)\n";
+ log() << " " << idx.indexNamespace() << " thisLoc:" << thisLoc.toString() << '\n';
+ log() << " " << key.toString() << '\n';
+ log() << " " << "recordLoc:" << recordLoc.toString() << " pos:" << pos << endl;
+ log() << " old l r: " << childForPos(pos).toString() << ' ' << childForPos(pos+1).toString() << endl;
+ log() << " new l r: " << lChild.toString() << ' ' << rChild.toString() << endl;
}
alreadyInIndex();
}
DEBUGGING out() << "TEMP: key: " << key.toString() << endl;
- DiskLoc& child = childForPos(pos);
+ DiskLoc child = childForPos(pos);
if ( insert_debug )
out() << " getChild(" << pos << "): " << child.toString() << endl;
if ( child.isNull() || !rChild.isNull() /* means an 'internal' insert */ ) {
@@ -1056,28 +1600,27 @@ found:
return child.btree()->bt_insert(child, recordLoc, key, order, dupsAllowed, idx, /*toplevel*/false);
}
- void BtreeBucket::dump() {
+ void BtreeBucket::dump() const {
out() << "DUMP btreebucket n:" << n;
out() << " parent:" << hex << parent.getOfs() << dec;
for ( int i = 0; i < n; i++ ) {
out() << '\n';
KeyNode k = keyNode(i);
out() << '\t' << i << '\t' << k.key.toString() << "\tleft:" << hex <<
- k.prevChildBucket.getOfs() << "\tRecLoc:" << k.recordLoc.toString() << dec;
+ k.prevChildBucket.getOfs() << "\tRecLoc:" << k.recordLoc.toString() << dec;
if ( this->k(i).isUnused() )
out() << " UNUSED";
}
out() << " right:" << hex << nextChild.getOfs() << dec << endl;
}
- /* todo: meaning of return code unclear clean up */
- int BtreeBucket::bt_insert(DiskLoc thisLoc, DiskLoc recordLoc,
- const BSONObj& key, const Ordering &order, bool dupsAllowed,
- IndexDetails& idx, bool toplevel)
- {
+ /** todo: meaning of return code unclear clean up */
+ int BtreeBucket::bt_insert(const DiskLoc thisLoc, const DiskLoc recordLoc,
+ const BSONObj& key, const Ordering &order, bool dupsAllowed,
+ IndexDetails& idx, bool toplevel) const {
if ( toplevel ) {
if ( key.objsize() > KeyMax ) {
- problem() << "Btree::insert: key too large to index, skipping " << idx.indexNamespace().c_str() << ' ' << key.objsize() << ' ' << key.toString() << endl;
+ problem() << "Btree::insert: key too large to index, skipping " << idx.indexNamespace() << ' ' << key.objsize() << ' ' << key.toString() << endl;
return 3;
}
}
@@ -1088,22 +1631,30 @@ found:
return x;
}
- void BtreeBucket::shape(stringstream& ss) {
+ void BtreeBucket::shape(stringstream& ss) const {
_shape(0, ss);
}
-
- DiskLoc BtreeBucket::findSingle( const IndexDetails& indexdetails , const DiskLoc& thisLoc, const BSONObj& key ){
+
+ int BtreeBucket::getLowWaterMark() {
+ return lowWaterMark;
+ }
+
+ int BtreeBucket::getKeyMax() {
+ return KeyMax;
+ }
+
+ DiskLoc BtreeBucket::findSingle( const IndexDetails& indexdetails , const DiskLoc& thisLoc, const BSONObj& key ) const {
int pos;
bool found;
- /* TODO: is it really ok here that the order is a default? */
+ // TODO: is it really ok here that the order is a default?
Ordering o = Ordering::make(BSONObj());
DiskLoc bucket = locate( indexdetails , indexdetails.head , key , o , pos , found , minDiskLoc );
if ( bucket.isNull() )
return bucket;
- BtreeBucket *b = bucket.btree();
- while ( 1 ){
- _KeyNode& knraw = b->k(pos);
+ const BtreeBucket *b = bucket.btree();
+ while ( 1 ) {
+ const _KeyNode& knraw = b->k(pos);
if ( knraw.isUsed() )
break;
bucket = b->advance( bucket , pos , 1 , "findSingle" );
@@ -1125,7 +1676,7 @@ found:
namespace mongo {
void BtreeBucket::a_test(IndexDetails& id) {
- BtreeBucket *b = id.head.btree();
+ BtreeBucket *b = id.head.btreemod();
// record locs for testing
DiskLoc A(1, 20);
@@ -1171,26 +1722,37 @@ namespace mongo {
/* --- BtreeBuilder --- */
- BtreeBuilder::BtreeBuilder(bool _dupsAllowed, IndexDetails& _idx) :
- dupsAllowed(_dupsAllowed),
- idx(_idx),
- n(0),
- order( idx.keyPattern() ),
- ordering( Ordering::make(idx.keyPattern()) )
- {
+ BtreeBuilder::BtreeBuilder(bool _dupsAllowed, IndexDetails& _idx) :
+ dupsAllowed(_dupsAllowed),
+ idx(_idx),
+ n(0),
+ order( idx.keyPattern() ),
+ ordering( Ordering::make(idx.keyPattern()) ) {
first = cur = BtreeBucket::addBucket(idx);
b = cur.btreemod();
committed = false;
}
- void BtreeBuilder::newBucket() {
+ void BtreeBuilder::newBucket() {
DiskLoc L = BtreeBucket::addBucket(idx);
b->tempNext() = L;
cur = L;
b = cur.btreemod();
}
- void BtreeBuilder::addKey(BSONObj& key, DiskLoc loc) {
+ void BtreeBuilder::mayCommitProgressDurably() {
+ if ( getDur().commitIfNeeded() ) {
+ b = cur.btreemod();
+ }
+ }
+
+ void BtreeBuilder::addKey(BSONObj& key, DiskLoc loc) {
+ if ( key.objsize() > KeyMax ) {
+ problem() << "Btree::insert: key too large to index, skipping " << idx.indexNamespace()
+ << ' ' << key.objsize() << ' ' << key.toString() << endl;
+ return;
+ }
+
if( !dupsAllowed ) {
if( n > 0 ) {
int cmp = keyLast.woCompare(key, order);
@@ -1203,26 +1765,21 @@ namespace mongo {
keyLast = key;
}
- if ( ! b->_pushBack(loc, key, ordering, DiskLoc()) ){
- // no room
- if ( key.objsize() > KeyMax ) {
- problem() << "Btree::insert: key too large to index, skipping " << idx.indexNamespace().c_str() << ' ' << key.objsize() << ' ' << key.toString() << endl;
- }
- else {
- // bucket was full
- newBucket();
- b->pushBack(loc, key, ordering, DiskLoc());
- }
+ if ( ! b->_pushBack(loc, key, ordering, DiskLoc()) ) {
+ // bucket was full
+ newBucket();
+ b->pushBack(loc, key, ordering, DiskLoc());
}
n++;
+ mayCommitProgressDurably();
}
- void BtreeBuilder::buildNextLevel(DiskLoc loc) {
+ void BtreeBuilder::buildNextLevel(DiskLoc loc) {
int levels = 1;
- while( 1 ) {
- if( loc.btree()->tempNext().isNull() ) {
+ while( 1 ) {
+ if( loc.btree()->tempNext().isNull() ) {
// only 1 bucket at this level. we are done.
- idx.head = loc;
+ getDur().writingDiskLoc(idx.head) = loc;
break;
}
levels++;
@@ -1232,59 +1789,70 @@ namespace mongo {
BtreeBucket *up = upLoc.btreemod();
DiskLoc xloc = loc;
- while( !xloc.isNull() ) {
+ while( !xloc.isNull() ) {
+ if ( getDur().commitIfNeeded() ) {
+ b = cur.btreemod();
+ up = upLoc.btreemod();
+ }
+
BtreeBucket *x = xloc.btreemod();
- BSONObj k;
+ BSONObj k;
DiskLoc r;
x->popBack(r,k);
bool keepX = ( x->n != 0 );
DiskLoc keepLoc = keepX ? xloc : x->nextChild;
- if ( ! up->_pushBack(r, k, ordering, keepLoc) ){
+ if ( ! up->_pushBack(r, k, ordering, keepLoc) ) {
// current bucket full
DiskLoc n = BtreeBucket::addBucket(idx);
up->tempNext() = n;
- upLoc = n;
+ upLoc = n;
up = upLoc.btreemod();
up->pushBack(r, k, ordering, keepLoc);
}
- DiskLoc nextLoc = x->tempNext(); /* get next in chain at current level */
+ DiskLoc nextLoc = x->tempNext(); // get next in chain at current level
if ( keepX ) {
- x->parent = upLoc;
- } else {
+ x->parent = upLoc;
+ }
+ else {
if ( !x->nextChild.isNull() )
x->nextChild.btreemod()->parent = upLoc;
x->deallocBucket( xloc, idx );
}
xloc = nextLoc;
}
-
+
loc = upStart;
+ mayCommitProgressDurably();
}
if( levels > 1 )
log(2) << "btree levels: " << levels << endl;
}
- /* when all addKeys are done, we then build the higher levels of the tree */
- void BtreeBuilder::commit() {
+ /** when all addKeys are done, we then build the higher levels of the tree */
+ void BtreeBuilder::commit() {
buildNextLevel(first);
committed = true;
}
- BtreeBuilder::~BtreeBuilder() {
- if( !committed ) {
- log(2) << "Rolling back partially built index space" << endl;
- DiskLoc x = first;
- while( !x.isNull() ) {
- DiskLoc next = x.btree()->tempNext();
- btreeStore->deleteRecord(idx.indexNamespace().c_str(), x);
- x = next;
+ BtreeBuilder::~BtreeBuilder() {
+ DESTRUCTOR_GUARD(
+ if( !committed ) {
+ log(2) << "Rolling back partially built index space" << endl;
+ DiskLoc x = first;
+ while( !x.isNull() ) {
+ DiskLoc next = x.btree()->tempNext();
+ string ns = idx.indexNamespace();
+ theDataFileMgr._deleteRecord(nsdetails(ns.c_str()), ns.c_str(), x.rec(), x);
+ x = next;
+ getDur().commitIfNeeded();
+ }
+ assert( idx.head.isNull() );
+ log(2) << "done rollback" << endl;
}
- assert( idx.head.isNull() );
- log(2) << "done rollback" << endl;
- }
+ )
}
}
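
The advanceTo()/customFind() hunks earlier in this file boil down to a direction-aware binary search within one bucket's sorted key array. A minimal standalone sketch of that narrowing (plain ints instead of BSON keys; customFindPos is an invented name, not the mongod code):

    #include <cassert>
    #include <vector>

    // Assumes keys is sorted ascending and, mirroring advanceTo()'s precondition,
    // keys[l] is strictly below the target for a forward scan (direction > 0)
    // and keys[h] strictly above it for a reverse scan (direction < 0).
    int customFindPos( const std::vector<int> &keys, int target, int direction ) {
        int l = 0;
        int h = (int)keys.size() - 1;
        while ( h - l > 1 ) {
            int m = l + ( h - l ) / 2;
            if ( keys[ m ] < target ) {
                l = m;
            }
            else if ( keys[ m ] > target ) {
                h = m;
            }
            else {
                // on an exact match keep narrowing toward the scan direction,
                // so runs of equal keys are entered from the correct side
                if ( direction < 0 )
                    l = m;
                else
                    h = m;
            }
        }
        return direction > 0 ? h : l;
    }

    int main() {
        std::vector<int> keys;
        keys.push_back( 2 ); keys.push_back( 4 ); keys.push_back( 4 );
        keys.push_back( 7 ); keys.push_back( 9 );
        assert( customFindPos( keys, 5, 1 ) == 3 );   // smallest key >= 5 is 7 (index 3)
        assert( customFindPos( keys, 5, -1 ) == 2 );  // largest key <= 5 is 4 (index 2)
        assert( customFindPos( keys, 4, 1 ) == 1 );   // forward scan lands on the first 4
        assert( customFindPos( keys, 4, -1 ) == 2 );  // reverse scan lands on the last 4
        return 0;
    }

The same forward/reverse asymmetry is what lets one routine serve both cursor directions in the real code.
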
diff --git a/db/btree.h b/db/btree.h
index 233b4dc..bced95e 100644
--- a/db/btree.h
+++ b/db/btree.h
@@ -25,8 +25,12 @@
namespace mongo {
+ const int BucketSize = 8192;
+
#pragma pack(1)
struct _KeyNode {
+ /** Signals that we are writing this _KeyNode and casts away const */
+ _KeyNode& writing() const;
DiskLoc prevChildBucket; // the lchild
DiskLoc recordLoc; // location of the record associated with the key
short keyDataOfs() const {
@@ -41,15 +45,12 @@ namespace mongo {
_kdo = s;
assert(s>=0);
}
- void setUsed() {
- recordLoc.GETOFS() &= ~1;
- }
+ void setUsed() { recordLoc.GETOFS() &= ~1; }
void setUnused() {
- /* Setting ofs to odd is the sentinel for unused, as real recordLoc's are always
- even numbers.
- Note we need to keep its value basically the same as we use the recordLoc
- as part of the key in the index (to handle duplicate keys efficiently).
- */
+ // Setting ofs to odd is the sentinel for unused, as real recordLoc's are always
+ // even numbers.
+ // Note we need to keep its value basically the same as we use the recordLoc
+ // as part of the key in the index (to handle duplicate keys efficiently).
recordLoc.GETOFS() |= 1;
}
int isUnused() const {
@@ -63,7 +64,12 @@ namespace mongo {
class BucketBasics;
- /* wrapper - this is our in memory representation of the key. _KeyNode is the disk representation. */
+ /**
+ * wrapper - this is our in memory representation of the key.
+ * _KeyNode is the disk representation.
+ *
+ * This object and its bson key will become invalid if the key is moved.
+ */
class KeyNode {
public:
KeyNode(const BucketBasics& bb, const _KeyNode &k);
@@ -73,51 +79,111 @@ namespace mongo {
};
#pragma pack(1)
- /* this class is all about the storage management */
- class BucketBasics {
+ class BtreeData {
+ protected:
+ DiskLoc parent;
+ DiskLoc nextChild; // child bucket off and to the right of the highest key.
+ unsigned short _wasSize; // can be reused, value is 8192 in current pdfile version Apr2010
+ unsigned short _reserved1; // zero
+ int flags;
+
+ // basicInsert() assumes these three are together and in this order:
+ int emptySize; // size of the empty region
+ int topSize; // size of the data at the top of the bucket (keys are at the beginning or 'bottom')
+ int n; // # of keys so far.
+
+ int reserved;
+ char data[4];
+ };
+
+ /**
+ * This class is all about the storage management
+ *
+ * Const member functions of this class are those which may be called on
+ * an object for which writing has not been signaled. Non const member
+ * functions may only be called on objects for which writing has been
+ * signaled. Note that currently some const functions write to the
+ * underlying memory representation of this bucket using optimized methods
+ * to signal write operations.
+ *
+ * DiskLoc parameters that may shadow references within the btree should
+ * be passed by value rather than by reference to non const member
+ * functions or const member functions which may perform writes. This way
+ * a callee need not worry that write operations will change or invalidate
+ * its arguments.
+ *
+ * The current policy for dealing with bson arguments is the opposite of
+ * what is described above for DiskLoc arguments. We do
+ * not want to want to copy bson into memory as an intermediate step for
+ * btree changes, so if bson is to be moved it must be copied to the new
+ * location before the old location is invalidated.
+ */
+ class BucketBasics : public BtreeData {
friend class BtreeBuilder;
friend class KeyNode;
public:
- void dumpTree(DiskLoc thisLoc, const BSONObj &order);
- bool isHead() { return parent.isNull(); }
- void assertValid(const Ordering &order, bool force = false);
- void assertValid(const BSONObj &orderObj, bool force = false) {
- return assertValid(Ordering::make(orderObj),force);
- }
- int fullValidate(const DiskLoc& thisLoc, const BSONObj &order, int *unusedCount = 0); /* traverses everything */
+ /** assert write intent declared for this bucket already */
+ void assertWritable();
- KeyNode keyNode(int i) const {
- if ( i >= n ){
+ void assertValid(const Ordering &order, bool force = false) const;
+ void assertValid(const BSONObj &orderObj, bool force = false) const { return assertValid(Ordering::make(orderObj),force); }
+
+ /**
+ * @return KeyNode for key at index i. The KeyNode will become invalid
+ * if the key is moved or reassigned, or if the node is packed.
+ */
+ const KeyNode keyNode(int i) const {
+ if ( i >= n ) {
massert( 13000 , (string)"invalid keyNode: " + BSON( "i" << i << "n" << n ).jsonString() , i < n );
}
return KeyNode(*this, k(i));
}
- protected:
+ static int headerSize() {
+ const BucketBasics *d = 0;
+ return (char*)&(d->data) - (char*)&(d->parent);
+ }
+ static int bodySize() { return BucketSize - headerSize(); }
- void modified(const DiskLoc& thisLoc);
+ // for testing
+ int nKeys() const { return n; }
+ const DiskLoc getNextChild() const { return nextChild; }
- char * dataAt(short ofs) {
- return data + ofs;
- }
+ protected:
+ char * dataAt(short ofs) { return data + ofs; }
void init(); // initialize a new node
- /* returns false if node is full and must be split
- keypos is where to insert -- inserted after that key #. so keypos=0 is the leftmost one.
- */
- bool basicInsert(const DiskLoc& thisLoc, int &keypos, const DiskLoc& recordLoc, const BSONObj& key, const Ordering &order);
-
/**
- * @return true if works, false if not enough space
+ * @return false if node is full and must be split
+ * @keypos is where to insert -- inserted before that key #. so keypos=0 is the leftmost one.
+ * keypos will be updated if keys are moved as a result of pack()
+ * This function will modify the btree bucket memory representation even
+ * though it is marked const.
*/
- bool _pushBack(const DiskLoc& recordLoc, BSONObj& key, const Ordering &order, DiskLoc prevChild);
- void pushBack(const DiskLoc& recordLoc, BSONObj& key, const Ordering &order, DiskLoc prevChild){
+ bool basicInsert(const DiskLoc thisLoc, int &keypos, const DiskLoc recordLoc, const BSONObj& key, const Ordering &order) const;
+
+ /** @return true if works, false if not enough space */
+ bool _pushBack(const DiskLoc recordLoc, const BSONObj& key, const Ordering &order, const DiskLoc prevChild);
+ void pushBack(const DiskLoc recordLoc, const BSONObj& key, const Ordering &order, const DiskLoc prevChild) {
bool ok = _pushBack( recordLoc , key , order , prevChild );
assert(ok);
}
+
+ /**
+ * This is a special purpose function used by BtreeBuilder. The
+ * interface is quite dangerous if you're not careful. The bson key
+ * returned here points to bucket memory that has been invalidated but
+ * not yet reclaimed.
+ *
+ * TODO Maybe this could be replaced with two functions, one which
+ * returns the last key without deleting it and another which simply
+ * deletes the last key. Then the caller would have enough control to
+ * ensure proper memory integrity.
+ */
void popBack(DiskLoc& recLoc, BSONObj& key);
- void _delKeyAtPos(int keypos); // low level version that doesn't deal with child ptrs.
+
+ void _delKeyAtPos(int keypos, bool mayEmpty = false); // low level version that doesn't deal with child ptrs.
/* !Packed means there is deleted fragment space within the bucket.
We "repack" when we run out of space before considering the node
@@ -125,145 +191,257 @@ namespace mongo {
*/
enum Flags { Packed=1 };
- DiskLoc& childForPos(int p) {
- return p == n ? nextChild : k(p).prevChildBucket;
- }
+ const DiskLoc& childForPos(int p) const { return p == n ? nextChild : k(p).prevChildBucket; }
+ DiskLoc& childForPos(int p) { return p == n ? nextChild : k(p).prevChildBucket; }
int totalDataSize() const;
- void pack( const Ordering &order, int &refPos);
- void setNotPacked();
- void setPacked();
+ /** @return true if the key may be dropped by pack() */
+ bool mayDropKey( int index, int refPos ) const;
+
+ /**
+ * Pack the bucket to reclaim space from invalidated memory.
+     * @refPos is an index in the bucket which may be updated if we
+ * delete keys from the bucket
+ * This function may cast away const and perform a write.
+ */
+ void _pack(const DiskLoc thisLoc, const Ordering &order, int &refPos) const;
+ /** Pack when already writable */
+ void _packReadyForMod(const Ordering &order, int &refPos);
+
+ /**
+ * @return the size of non header data in this bucket if we were to
+ * call pack().
+ */
+ int packedDataSize( int refPos ) const;
+ void setNotPacked() { flags &= ~Packed; }
+ void setPacked() { flags |= Packed; }
int _alloc(int bytes);
void _unalloc(int bytes);
void truncateTo(int N, const Ordering &order, int &refPos);
+ /** drop specified number of keys from beginning of key array, and pack */
+ void dropFront(int nDrop, const Ordering &order, int &refPos);
void markUnused(int keypos);
- /* BtreeBuilder uses the parent var as a temp place to maintain a linked list chain.
- we use tempNext() when we do that to be less confusing. (one might have written a union in C)
- */
+ /**
+ * BtreeBuilder uses the parent var as a temp place to maintain a linked list chain.
+ * we use tempNext() when we do that to be less confusing. (one might have written a union in C)
+ */
+ const DiskLoc& tempNext() const { return parent; }
DiskLoc& tempNext() { return parent; }
- public:
- DiskLoc parent;
-
- string bucketSummary() const {
- stringstream ss;
- ss << " Bucket info:" << endl;
- ss << " n: " << n << endl;
- ss << " parent: " << parent.toString() << endl;
- ss << " nextChild: " << parent.toString() << endl;
- ss << " flags:" << flags << endl;
- ss << " emptySize: " << emptySize << " topSize: " << topSize << endl;
- return ss.str();
- }
-
- bool isUsed( int i ) const {
- return k(i).isUsed();
- }
+ void _shape(int level, stringstream&) const;
+ int Size() const;
+ const _KeyNode& k(int i) const { return ((const _KeyNode*)data)[i]; }
+ _KeyNode& k(int i) { return ((_KeyNode*)data)[i]; }
- protected:
- void _shape(int level, stringstream&);
- DiskLoc nextChild; // child bucket off and to the right of the highest key.
+ /** @return the key position where a split should occur on insert */
+ int splitPos( int keypos ) const;
- private:
- unsigned short _wasSize; // can be reused, value is 8192 in current pdfile version Apr2010
- unsigned short _reserved1; // zero
+ /**
+ * Adds new entries to beginning of key array, shifting existing
+ * entries to the right. After this is called, setKey() must be called
+ * on all the newly created entries in the key array.
+ */
+ void reserveKeysFront( int nAdd );
- protected:
- int Size() const;
- int flags;
- int emptySize; // size of the empty region
- int topSize; // size of the data at the top of the bucket (keys are at the beginning or 'bottom')
- int n; // # of keys so far.
- int reserved;
- const _KeyNode& k(int i) const {
- return ((_KeyNode*)data)[i];
- }
- _KeyNode& k(int i) {
- return ((_KeyNode*)data)[i];
- }
- char data[4];
+ /**
+ * Sets an existing key using the given parameters.
+ * @i index of key to set
+ */
+ void setKey( int i, const DiskLoc recordLoc, const BSONObj &key, const DiskLoc prevChildBucket );
};
-#pragma pack()
-#pragma pack(1)
+ /**
+ * This class adds functionality for manipulating buckets that are assembled
+ * in a tree. The requirements for const and non const functions and
+ * arguments are generally the same as in BtreeBucket. Because this class
+ * deals with tree structure, some functions that are marked const may
+ * trigger modification of another node in the btree or potentially of the
+ * current node. In such cases, the function's implementation explicitly
+ * casts away const when indicating an intent to write to the durability
+ * layer. The DiskLocs provided to such functions should be passed by
+ * value if they shadow pointers within the btree.
+ *
+ * To clarify enforcement of referential integrity in this implementation,
+ * we use the following pattern when deleting data we have a persistent
+ * pointer to. The pointer is cleared or removed explicitly, then the data
+ * it pointed to is cleaned up with a helper function.
+ *
+ * TODO It might make sense to put some of these functions in a class
+ * representing a full btree instead of a single btree bucket. That would
+ * allow us to use the const qualifier in a manner more consistent with
+ * standard usage. Right now the interface is for both a node and a tree,
+ * so assignment of const is sometimes nonideal.
+ *
+ * TODO There are several cases in which the this pointer is invalidated
+     * as a result of deallocation. A separate class representing a btree would
+ * alleviate some fragile cases where the implementation must currently
+ * behave correctly if the this pointer is suddenly invalidated by a
+ * callee.
+ */
class BtreeBucket : public BucketBasics {
friend class BtreeCursor;
public:
- void dump();
+ bool isHead() const { return parent.isNull(); }
+ void dumpTree(const DiskLoc &thisLoc, const BSONObj &order) const;
+ int fullValidate(const DiskLoc& thisLoc, const BSONObj &order, int *unusedCount = 0, bool strict = false) const; /* traverses everything */
- /* @return true if key exists in index
+ bool isUsed( int i ) const { return k(i).isUsed(); }
+ string bucketSummary() const;
+ void dump() const;
- order - indicates order of keys in the index. this is basically the index's key pattern, e.g.:
- BSONObj order = ((IndexDetails&)idx).keyPattern();
- likewise below in bt_insert() etc.
- */
- bool exists(const IndexDetails& idx, DiskLoc thisLoc, const BSONObj& key, const Ordering& order);
+ /**
+ * @return true if key exists in index
+ *
+ * @order - indicates order of keys in the index. this is basically the index's key pattern, e.g.:
+ * BSONObj order = ((IndexDetails&)idx).keyPattern();
+ * likewise below in bt_insert() etc.
+ */
+ bool exists(const IndexDetails& idx, const DiskLoc &thisLoc, const BSONObj& key, const Ordering& order) const;
bool wouldCreateDup(
- const IndexDetails& idx, DiskLoc thisLoc,
+ const IndexDetails& idx, const DiskLoc &thisLoc,
const BSONObj& key, const Ordering& order,
- DiskLoc self);
+ const DiskLoc &self) const;
+
+ static DiskLoc addBucket(const IndexDetails&); /* start a new index off, empty */
+ /** invalidates 'this' and thisLoc */
+ void deallocBucket(const DiskLoc thisLoc, const IndexDetails &id);
- static DiskLoc addBucket(IndexDetails&); /* start a new index off, empty */
- void deallocBucket(const DiskLoc &thisLoc, IndexDetails &id);
-
static void renameIndexNamespace(const char *oldNs, const char *newNs);
- int bt_insert(DiskLoc thisLoc, DiskLoc recordLoc,
- const BSONObj& key, const Ordering &order, bool dupsAllowed,
- IndexDetails& idx, bool toplevel = true);
+ /** This function may change the btree root */
+ int bt_insert(const DiskLoc thisLoc, const DiskLoc recordLoc,
+ const BSONObj& key, const Ordering &order, bool dupsAllowed,
+ IndexDetails& idx, bool toplevel = true) const;
- bool unindex(const DiskLoc& thisLoc, IndexDetails& id, BSONObj& key, const DiskLoc& recordLoc);
+ /** This function may change the btree root */
+ bool unindex(const DiskLoc thisLoc, IndexDetails& id, const BSONObj& key, const DiskLoc recordLoc) const;
- /* locate may return an "unused" key that is just a marker. so be careful.
- looks for a key:recordloc pair.
+ /**
+ * locate may return an "unused" key that is just a marker. so be careful.
+ * looks for a key:recordloc pair.
+ *
+ * @found - returns true if exact match found. note you can get back a position
+ * result even if found is false.
+ */
+ DiskLoc locate(const IndexDetails &idx , const DiskLoc& thisLoc, const BSONObj& key, const Ordering &order,
+ int& pos, bool& found, const DiskLoc &recordLoc, int direction=1) const;
- found - returns true if exact match found. note you can get back a position
- result even if found is false.
- */
- DiskLoc locate(const IndexDetails& , const DiskLoc& thisLoc, const BSONObj& key, const Ordering &order,
- int& pos, bool& found, DiskLoc recordLoc, int direction=1);
-
/**
* find the first instance of the key
* does not handle dups
- * returned DiskLock isNull if can't find anything with that
+ * returned DiskLoc isNull if can't find anything with that
+ * @return the record location of the first match
*/
- DiskLoc findSingle( const IndexDetails& , const DiskLoc& thisLoc, const BSONObj& key );
+ DiskLoc findSingle( const IndexDetails &indexdetails , const DiskLoc& thisLoc, const BSONObj& key ) const;
+
+ /** advance one key position in the index: */
+ DiskLoc advance(const DiskLoc& thisLoc, int& keyOfs, int direction, const char *caller) const;
- /* advance one key position in the index: */
- DiskLoc advance(const DiskLoc& thisLoc, int& keyOfs, int direction, const char *caller);
-
- void advanceTo(const IndexDetails &id, DiskLoc &thisLoc, int &keyOfs, const BSONObj &keyBegin, int keyBeginLen, const vector< const BSONElement * > &keyEnd, const Ordering &order, int direction );
-
- DiskLoc getHead(const DiskLoc& thisLoc);
+ void advanceTo(DiskLoc &thisLoc, int &keyOfs, const BSONObj &keyBegin, int keyBeginLen, bool afterKey, const vector< const BSONElement * > &keyEnd, const vector< bool > &keyEndInclusive, const Ordering &order, int direction ) const;
+ void customLocate(DiskLoc &thisLoc, int &keyOfs, const BSONObj &keyBegin, int keyBeginLen, bool afterKey, const vector< const BSONElement * > &keyEnd, const vector< bool > &keyEndInclusive, const Ordering &order, int direction, pair< DiskLoc, int > &bestParent ) const;
- /* get tree shape */
- void shape(stringstream&);
+ const DiskLoc getHead(const DiskLoc& thisLoc) const;
+
+ /** get tree shape */
+ void shape(stringstream&) const;
static void a_test(IndexDetails&);
- private:
- void fixParentPtrs(const DiskLoc& thisLoc);
- void delBucket(const DiskLoc& thisLoc, IndexDetails&);
- void delKeyAtPos(const DiskLoc& thisLoc, IndexDetails& id, int p);
- BSONObj keyAt(int keyOfs) {
+ static int getLowWaterMark();
+ static int getKeyMax();
+
+ protected:
+ /**
+ * Fix parent pointers for children
+ * @firstIndex first index to modify
+ * @lastIndex last index to modify (-1 means last index is n)
+ */
+ void fixParentPtrs(const DiskLoc thisLoc, int firstIndex = 0, int lastIndex = -1) const;
+
+ /** invalidates this and thisLoc */
+ void delBucket(const DiskLoc thisLoc, const IndexDetails&);
+ /** may invalidate this and thisLoc */
+ void delKeyAtPos(const DiskLoc thisLoc, IndexDetails& id, int p, const Ordering &order);
+
+ /**
+ * May balance utilization of this bucket with a neighbor, either by
+ * merging the buckets or shifting nodes.
+ * @return true iff balancing was performed.
+ * NOTE This function may invalidate thisLoc.
+ */
+ bool mayBalanceWithNeighbors(const DiskLoc thisLoc, IndexDetails &id, const Ordering &order) const;
+
+ /** @return true if balance succeeded */
+ bool tryBalanceChildren( const DiskLoc thisLoc, int leftIndex, IndexDetails &id, const Ordering &order ) const;
+ void doBalanceChildren( const DiskLoc thisLoc, int leftIndex, IndexDetails &id, const Ordering &order );
+ void doBalanceLeftToRight( const DiskLoc thisLoc, int leftIndex, int split,
+ BtreeBucket *l, const DiskLoc lchild,
+ BtreeBucket *r, const DiskLoc rchild,
+ IndexDetails &id, const Ordering &order );
+ void doBalanceRightToLeft( const DiskLoc thisLoc, int leftIndex, int split,
+ BtreeBucket *l, const DiskLoc lchild,
+ BtreeBucket *r, const DiskLoc rchild,
+ IndexDetails &id, const Ordering &order );
+
+ /** may invalidate this and thisLoc */
+ void doMergeChildren( const DiskLoc thisLoc, int leftIndex, IndexDetails &id, const Ordering &order);
+
+ /** will invalidate this and thisLoc */
+ void replaceWithNextChild( const DiskLoc thisLoc, IndexDetails &id );
+
+ /** @return true iff left and right child can be merged into one node */
+ bool canMergeChildren( const DiskLoc &thisLoc, int leftIndex ) const;
+
+ /**
+ * @return index of the rebalanced separator; the index value is
+ * determined as if we had an array
+ * <left bucket keys array>.push( <old separator> ).concat( <right bucket keys array> )
+ * This is only expected to be called if the left and right child
+ * cannot be merged.
+ * This function is expected to be called on packed buckets, see also
+ * comments for splitPos().
+ */
+ int rebalancedSeparatorPos( const DiskLoc &thisLoc, int leftIndex ) const;
+
+ int indexInParent( const DiskLoc &thisLoc ) const;
+ BSONObj keyAt(int keyOfs) const {
return keyOfs >= n ? BSONObj() : keyNode(keyOfs).key;
}
static BtreeBucket* allocTemp(); /* caller must release with free() */
- void insertHere(DiskLoc thisLoc, int keypos,
- DiskLoc recordLoc, const BSONObj& key, const Ordering &order,
- DiskLoc lchild, DiskLoc rchild, IndexDetails&);
- int _insert(DiskLoc thisLoc, DiskLoc recordLoc,
+
+ /** split bucket */
+ void split(const DiskLoc thisLoc, int keypos,
+ const DiskLoc recordLoc, const BSONObj& key,
+ const Ordering& order, const DiskLoc lchild, const DiskLoc rchild, IndexDetails& idx);
+
+ void insertHere(const DiskLoc thisLoc, int keypos,
+ const DiskLoc recordLoc, const BSONObj& key, const Ordering &order,
+ const DiskLoc lchild, const DiskLoc rchild, IndexDetails &idx) const;
+
+ int _insert(const DiskLoc thisLoc, const DiskLoc recordLoc,
const BSONObj& key, const Ordering &order, bool dupsAllowed,
- DiskLoc lChild, DiskLoc rChild, IndexDetails&);
- bool find(const IndexDetails& idx, const BSONObj& key, DiskLoc recordLoc, const Ordering &order, int& pos, bool assertIfDup);
- bool customFind( int l, int h, const BSONObj &keyBegin, int keyBeginLen, const vector< const BSONElement * > &keyEnd, const Ordering &order, int direction, DiskLoc &thisLoc, int &keyOfs, pair< DiskLoc, int > &bestParent );
+ const DiskLoc lChild, const DiskLoc rChild, IndexDetails &idx) const;
+ bool find(const IndexDetails& idx, const BSONObj& key, const DiskLoc &recordLoc, const Ordering &order, int& pos, bool assertIfDup) const;
+ bool customFind( int l, int h, const BSONObj &keyBegin, int keyBeginLen, bool afterKey, const vector< const BSONElement * > &keyEnd, const vector< bool > &keyEndInclusive, const Ordering &order, int direction, DiskLoc &thisLoc, int &keyOfs, pair< DiskLoc, int > &bestParent ) const;
static void findLargestKey(const DiskLoc& thisLoc, DiskLoc& largestLoc, int& largestKey);
- static int customBSONCmp( const BSONObj &l, const BSONObj &rBegin, int rBeginLen, const vector< const BSONElement * > &rEnd, const Ordering &o );
+ static int customBSONCmp( const BSONObj &l, const BSONObj &rBegin, int rBeginLen, bool rSup, const vector< const BSONElement * > &rEnd, const vector< bool > &rEndInclusive, const Ordering &o, int direction );
+ static void fix(const DiskLoc thisLoc, const DiskLoc child);
+
+ /** Replaces an existing key with the new specified key, splitting if necessary */
+ void setInternalKey( const DiskLoc thisLoc, int keypos,
+ const DiskLoc recordLoc, const BSONObj &key, const Ordering &order,
+ const DiskLoc lchild, const DiskLoc rchild, IndexDetails &idx);
+
+ /**
+ * Deletes the specified key, replacing it with the key immediately
+ * preceding or succeeding it in the btree. Either the left or right
+ * child of the specified key must be non null.
+ */
+ void deleteInternalKey( const DiskLoc thisLoc, int keypos, IndexDetails &id, const Ordering &order );
public:
- // simply builds and returns a dup key error message string
+ /** simply builds and returns a dup key error message string */
static string dupKeyError( const IndexDetails& idx , const BSONObj& key );
};
#pragma pack()
@@ -271,76 +449,59 @@ namespace mongo {
class BtreeCursor : public Cursor {
public:
BtreeCursor( NamespaceDetails *_d, int _idxNo, const IndexDetails&, const BSONObj &startKey, const BSONObj &endKey, bool endKeyInclusive, int direction );
-
BtreeCursor( NamespaceDetails *_d, int _idxNo, const IndexDetails& _id, const shared_ptr< FieldRangeVector > &_bounds, int _direction );
- ~BtreeCursor(){
- }
- virtual bool ok() {
- return !bucket.isNull();
- }
- bool eof() {
- return !ok();
- }
+ virtual bool ok() { return !bucket.isNull(); }
virtual bool advance();
-
virtual void noteLocation(); // updates keyAtKeyOfs...
virtual void checkLocation();
virtual bool supportGetMore() { return true; }
virtual bool supportYields() { return true; }
- /* used for multikey index traversal to avoid sending back dups. see Matcher::matches().
- if a multikey index traversal:
- if loc has already been sent, returns true.
- otherwise, marks loc as sent.
- @return true if the loc has not been seen
- */
+ /**
+ * used for multikey index traversal to avoid sending back dups. see Matcher::matches().
+ * if a multikey index traversal:
+ * if loc has already been sent, returns true.
+ * otherwise, marks loc as sent.
+         * @return true if the loc has already been sent
+ */
virtual bool getsetdup(DiskLoc loc) {
- if( multikey ) {
- pair<set<DiskLoc>::iterator, bool> p = dups.insert(loc);
+ if( _multikey ) {
+ pair<set<DiskLoc>::iterator, bool> p = _dups.insert(loc);
return !p.second;
}
return false;
}
- _KeyNode& _currKeyNode() {
+ virtual bool modifiedKeys() const { return _multikey; }
+ virtual bool isMultiKey() const { return _multikey; }
+
+ const _KeyNode& _currKeyNode() const {
assert( !bucket.isNull() );
- _KeyNode& kn = bucket.btree()->k(keyOfs);
+ const _KeyNode& kn = bucket.btree()->k(keyOfs);
assert( kn.isUsed() );
return kn;
}
- KeyNode currKeyNode() const {
+ const KeyNode currKeyNode() const {
assert( !bucket.isNull() );
return bucket.btree()->keyNode(keyOfs);
}
- virtual BSONObj currKey() const {
- return currKeyNode().key;
- }
- virtual BSONObj indexKeyPattern() {
- return indexDetails.keyPattern();
- }
+ virtual BSONObj currKey() const { return currKeyNode().key; }
+ virtual BSONObj indexKeyPattern() { return indexDetails.keyPattern(); }
virtual void aboutToDeleteBucket(const DiskLoc& b) {
if ( bucket == b )
keyOfs = -1;
}
- virtual DiskLoc currLoc() {
- return !bucket.isNull() ? _currKeyNode().recordLoc : DiskLoc();
- }
- virtual DiskLoc refLoc() {
- return currLoc();
- }
- virtual Record* _current() {
- return currLoc().rec();
- }
- virtual BSONObj current() {
- return BSONObj(_current());
- }
+ virtual DiskLoc currLoc() { return !bucket.isNull() ? _currKeyNode().recordLoc : DiskLoc(); }
+ virtual DiskLoc refLoc() { return currLoc(); }
+ virtual Record* _current() { return currLoc().rec(); }
+ virtual BSONObj current() { return BSONObj(_current()); }
virtual string toString() {
string s = string("BtreeCursor ") + indexDetails.indexName();
- if ( direction < 0 ) s += " reverse";
- if ( bounds_.get() && bounds_->size() > 1 ) s += " multi";
+ if ( _direction < 0 ) s += " reverse";
+ if ( _bounds.get() && _bounds->size() > 1 ) s += " multi";
return s;
}
@@ -351,77 +512,81 @@ namespace mongo {
virtual BSONObj prettyIndexBounds() const {
if ( !_independentFieldRanges ) {
return BSON( "start" << prettyKey( startKey ) << "end" << prettyKey( endKey ) );
- } else {
- return bounds_->obj();
+ }
+ else {
+ return _bounds->obj();
}
}
-
+
void forgetEndKey() { endKey = BSONObj(); }
virtual CoveredIndexMatcher *matcher() const { return _matcher.get(); }
-
- virtual void setMatcher( shared_ptr< CoveredIndexMatcher > matcher ) {
- _matcher = matcher;
- }
- // for debugging only
- DiskLoc getBucket() const { return bucket; }
-
+ virtual void setMatcher( shared_ptr< CoveredIndexMatcher > matcher ) { _matcher = matcher; }
+
+ virtual long long nscanned() { return _nscanned; }
+
+ /** for debugging only */
+ const DiskLoc getBucket() const { return bucket; }
+
private:
- /* Our btrees may (rarely) have "unused" keys when items are deleted.
- Skip past them.
- */
+ /**
+ * Our btrees may (rarely) have "unused" keys when items are deleted.
+ * Skip past them.
+ */
bool skipUnusedKeys( bool mayJump );
bool skipOutOfRangeKeysAndCheckEnd();
void skipAndCheck();
void checkEnd();
- // selective audits on construction
+ /** selective audits on construction */
void audit();
- // set initial bucket
+ /** set initial bucket */
void init();
- void advanceTo( const BSONObj &keyBegin, int keyBeginLen, const vector< const BSONElement * > &keyEnd);
-
+ /** if afterKey is true, we want the first key with values of the keyBegin fields greater than keyBegin */
+ void advanceTo( const BSONObj &keyBegin, int keyBeginLen, bool afterKey, const vector< const BSONElement * > &keyEnd, const vector< bool > &keyEndInclusive );
+
friend class BtreeBucket;
- set<DiskLoc> dups;
- NamespaceDetails *d;
- int idxNo;
-
+
+ set<DiskLoc> _dups;
+ NamespaceDetails * const d;
+ const int idxNo;
BSONObj startKey;
BSONObj endKey;
- bool endKeyInclusive_;
-
- bool multikey; // note this must be updated every getmore batch in case someone added a multikey...
-
+ bool _endKeyInclusive;
+ bool _multikey; // this must be updated every getmore batch in case someone added a multikey
const IndexDetails& indexDetails;
- BSONObj order;
- Ordering _ordering;
+ const BSONObj _order;
+ const Ordering _ordering;
DiskLoc bucket;
int keyOfs;
- int direction; // 1=fwd,-1=reverse
+ const int _direction; // 1=fwd,-1=reverse
BSONObj keyAtKeyOfs; // so we can tell if things moved around on us between the query and the getMore call
DiskLoc locAtKeyOfs;
- shared_ptr< FieldRangeVector > bounds_;
+ const shared_ptr< FieldRangeVector > _bounds;
auto_ptr< FieldRangeVector::Iterator > _boundsIterator;
const IndexSpec& _spec;
shared_ptr< CoveredIndexMatcher > _matcher;
bool _independentFieldRanges;
+ long long _nscanned;
};
- inline bool IndexDetails::hasKey(const BSONObj& key) {
+ inline bool IndexDetails::hasKey(const BSONObj& key) {
return head.btree()->exists(*this, head, key, Ordering::make(keyPattern()));
}
- inline bool IndexDetails::wouldCreateDup(const BSONObj& key, DiskLoc self) {
+ inline bool IndexDetails::wouldCreateDup(const BSONObj& key, DiskLoc self) {
return head.btree()->wouldCreateDup(*this, head, key, Ordering::make(keyPattern()), self);
}
- /* build btree from the bottom up */
- /* _ TODO dropDups */
+ /**
+ * build btree from the bottom up
+ * _ TODO dropDups
+ */
class BtreeBuilder {
- bool dupsAllowed;
+ bool dupsAllowed;
IndexDetails& idx;
unsigned long long n;
BSONObj keyLast;
@@ -434,18 +599,20 @@ namespace mongo {
void newBucket();
void buildNextLevel(DiskLoc);
+ void mayCommitProgressDurably();
public:
~BtreeBuilder();
BtreeBuilder(bool _dupsAllowed, IndexDetails& _idx);
- /* keys must be added in order */
+ /** keys must be added in order */
void addKey(BSONObj& key, DiskLoc loc);
- /* commit work. if not called, destructor will clean up partially completed work
- (in case exception has happened).
- */
+ /**
+ * commit work. if not called, destructor will clean up partially completed work
+ * (in case exception has happened).
+ */
void commit();
unsigned long long getn() { return n; }
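
One detail the _KeyNode hunk above leans on: real record offsets are always even, so setting the low bit of the stored offset marks a key as unused without materially disturbing its ordering. A tiny sketch of the same trick, with an invented FakeRecordLoc standing in for the real DiskLoc:

    #include <cassert>

    struct FakeRecordLoc {
        int ofs;                                 // real record offsets are assumed even
        void setUnused() { ofs |= 1; }           // odd offset == unused marker key
        void setUsed()   { ofs &= ~1; }
        bool isUnused() const { return ( ofs & 1 ) != 0; }
    };

    int main() {
        FakeRecordLoc loc;
        loc.ofs = 4096;
        assert( !loc.isUnused() );
        loc.setUnused();
        assert( loc.isUnused() && loc.ofs == 4097 );  // value barely changes, ordering preserved
        loc.setUsed();
        assert( !loc.isUnused() && loc.ofs == 4096 );
        return 0;
    }
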
diff --git a/db/btreecursor.cpp b/db/btreecursor.cpp
index d6d0c09..9cab95f 100644
--- a/db/btreecursor.cpp
+++ b/db/btreecursor.cpp
@@ -20,54 +20,56 @@
#include "btree.h"
#include "pdfile.h"
#include "jsobj.h"
-#include "curop.h"
+#include "curop-inl.h"
namespace mongo {
extern int otherTraceLevel;
- BtreeCursor::BtreeCursor( NamespaceDetails *_d, int _idxNo, const IndexDetails &_id,
+ BtreeCursor::BtreeCursor( NamespaceDetails *_d, int _idxNo, const IndexDetails &_id,
const BSONObj &_startKey, const BSONObj &_endKey, bool endKeyInclusive, int _direction ) :
- d(_d), idxNo(_idxNo),
- startKey( _startKey ),
- endKey( _endKey ),
- endKeyInclusive_( endKeyInclusive ),
- multikey( d->isMultikey( idxNo ) ),
- indexDetails( _id ),
- order( _id.keyPattern() ),
- _ordering( Ordering::make( order ) ),
- direction( _direction ),
- _spec( _id.getSpec() ),
- _independentFieldRanges( false )
- {
+ d(_d), idxNo(_idxNo),
+ startKey( _startKey ),
+ endKey( _endKey ),
+ _endKeyInclusive( endKeyInclusive ),
+ _multikey( d->isMultikey( idxNo ) ),
+ indexDetails( _id ),
+ _order( _id.keyPattern() ),
+ _ordering( Ordering::make( _order ) ),
+ _direction( _direction ),
+ _spec( _id.getSpec() ),
+ _independentFieldRanges( false ),
+ _nscanned( 0 ) {
audit();
init();
- DEV assert( dups.size() == 0 );
+ dassert( _dups.size() == 0 );
}
BtreeCursor::BtreeCursor( NamespaceDetails *_d, int _idxNo, const IndexDetails& _id, const shared_ptr< FieldRangeVector > &_bounds, int _direction )
:
- d(_d), idxNo(_idxNo),
- endKeyInclusive_( true ),
- multikey( d->isMultikey( idxNo ) ),
- indexDetails( _id ),
- order( _id.keyPattern() ),
- _ordering( Ordering::make( order ) ),
- direction( _direction ),
- bounds_( ( assert( _bounds.get() ), _bounds ) ),
- _boundsIterator( new FieldRangeVector::Iterator( *bounds_ ) ),
- _spec( _id.getSpec() ),
- _independentFieldRanges( true )
- {
+ d(_d), idxNo(_idxNo),
+ _endKeyInclusive( true ),
+ _multikey( d->isMultikey( idxNo ) ),
+ indexDetails( _id ),
+ _order( _id.keyPattern() ),
+ _ordering( Ordering::make( _order ) ),
+ _direction( _direction ),
+ _bounds( ( assert( _bounds.get() ), _bounds ) ),
+ _boundsIterator( new FieldRangeVector::Iterator( *_bounds ) ),
+ _spec( _id.getSpec() ),
+ _independentFieldRanges( true ),
+ _nscanned( 0 ) {
massert( 13384, "BtreeCursor FieldRangeVector constructor doesn't accept special indexes", !_spec.getType() );
audit();
- startKey = bounds_->startKey();
- bool found;
+ startKey = _bounds->startKey();
_boundsIterator->advance( startKey ); // handles initialization
- bucket = indexDetails.head.btree()->
- locate(indexDetails, indexDetails.head, startKey, _ordering, keyOfs, found, direction > 0 ? minDiskLoc : maxDiskLoc, direction);
+ _boundsIterator->prepDive();
+ pair< DiskLoc, int > noBestParent;
+ bucket = indexDetails.head;
+ keyOfs = 0;
+ indexDetails.head.btree()->customLocate( bucket, keyOfs, startKey, 0, false, _boundsIterator->cmp(), _boundsIterator->inc(), _ordering, _direction, noBestParent );
skipAndCheck();
- DEV assert( dups.size() == 0 );
+ dassert( _dups.size() == 0 );
}
void BtreeCursor::audit() {
@@ -76,7 +78,7 @@ namespace mongo {
if ( otherTraceLevel >= 12 ) {
if ( otherTraceLevel >= 200 ) {
out() << "::BtreeCursor() qtl>200. validating entire index." << endl;
- indexDetails.head.btree()->fullValidate(indexDetails.head, order);
+ indexDetails.head.btree()->fullValidate(indexDetails.head, _order);
}
else {
out() << "BTreeCursor(). dumping head bucket" << endl;
@@ -86,17 +88,20 @@ namespace mongo {
}
void BtreeCursor::init() {
- if ( _spec.getType() ){
+ if ( _spec.getType() ) {
startKey = _spec.getType()->fixKey( startKey );
endKey = _spec.getType()->fixKey( endKey );
}
bool found;
bucket = indexDetails.head.btree()->
- locate(indexDetails, indexDetails.head, startKey, _ordering, keyOfs, found, direction > 0 ? minDiskLoc : maxDiskLoc, direction);
+ locate(indexDetails, indexDetails.head, startKey, _ordering, keyOfs, found, _direction > 0 ? minDiskLoc : maxDiskLoc, _direction);
+ if ( ok() ) {
+ _nscanned = 1;
+ }
skipUnusedKeys( false );
checkEnd();
}
-
+
void BtreeCursor::skipAndCheck() {
skipUnusedKeys( true );
while( 1 ) {
@@ -109,7 +114,7 @@ namespace mongo {
}
}
}
-
+
bool BtreeCursor::skipOutOfRangeKeysAndCheckEnd() {
if ( !ok() ) {
return false;
@@ -118,25 +123,30 @@ namespace mongo {
if ( ret == -2 ) {
bucket = DiskLoc();
return false;
- } else if ( ret == -1 ) {
+ }
+ else if ( ret == -1 ) {
+ ++_nscanned;
return false;
}
- advanceTo( currKeyNode().key, ret, _boundsIterator->cmp() );
+ ++_nscanned;
+ advanceTo( currKeyNode().key, ret, _boundsIterator->after(), _boundsIterator->cmp(), _boundsIterator->inc() );
return true;
}
-
+
/* skip unused keys. */
bool BtreeCursor::skipUnusedKeys( bool mayJump ) {
int u = 0;
while ( 1 ) {
if ( !ok() )
break;
- BtreeBucket *b = bucket.btree();
- _KeyNode& kn = b->k(keyOfs);
+ const BtreeBucket *b = bucket.btree();
+ const _KeyNode& kn = b->k(keyOfs);
if ( kn.isUsed() )
break;
- bucket = b->advance(bucket, keyOfs, direction, "skipUnusedKeys");
+ bucket = b->advance(bucket, keyOfs, _direction, "skipUnusedKeys");
u++;
+ //don't include unused keys in nscanned
+ //++_nscanned;
if ( mayJump && ( u % 10 == 0 ) ) {
skipOutOfRangeKeysAndCheckEnd();
}
@@ -158,31 +168,34 @@ namespace mongo {
if ( bucket.isNull() )
return;
if ( !endKey.isEmpty() ) {
- int cmp = sgn( endKey.woCompare( currKey(), order ) );
- if ( ( cmp != 0 && cmp != direction ) ||
- ( cmp == 0 && !endKeyInclusive_ ) )
+ int cmp = sgn( endKey.woCompare( currKey(), _order ) );
+ if ( ( cmp != 0 && cmp != _direction ) ||
+ ( cmp == 0 && !_endKeyInclusive ) )
bucket = DiskLoc();
}
}
-
- void BtreeCursor::advanceTo( const BSONObj &keyBegin, int keyBeginLen, const vector< const BSONElement * > &keyEnd) {
- bucket.btree()->advanceTo( indexDetails, bucket, keyOfs, keyBegin, keyBeginLen, keyEnd, _ordering, direction );
+
+ void BtreeCursor::advanceTo( const BSONObj &keyBegin, int keyBeginLen, bool afterKey, const vector< const BSONElement * > &keyEnd, const vector< bool > &keyEndInclusive) {
+ bucket.btree()->advanceTo( bucket, keyOfs, keyBegin, keyBeginLen, afterKey, keyEnd, keyEndInclusive, _ordering, _direction );
}
-
+
bool BtreeCursor::advance() {
killCurrentOp.checkForInterrupt();
if ( bucket.isNull() )
return false;
- bucket = bucket.btree()->advance(bucket, keyOfs, direction, "BtreeCursor::advance");
+ bucket = bucket.btree()->advance(bucket, keyOfs, _direction, "BtreeCursor::advance");
if ( !_independentFieldRanges ) {
skipUnusedKeys( false );
checkEnd();
- return ok();
+ if ( ok() ) {
+ ++_nscanned;
+ }
+ }
+ else {
+ skipAndCheck();
}
-
- skipAndCheck();
return ok();
}
@@ -207,10 +220,10 @@ namespace mongo {
if ( eof() )
return;
- multikey = d->isMultikey(idxNo);
+ _multikey = d->isMultikey(idxNo);
if ( keyOfs >= 0 ) {
- BtreeBucket *b = bucket.btree();
+ const BtreeBucket *b = bucket.btree();
assert( !keyAtKeyOfs.isEmpty() );
@@ -219,17 +232,17 @@ namespace mongo {
int x = 0;
while( 1 ) {
if ( b->keyAt(keyOfs).woEqual(keyAtKeyOfs) &&
- b->k(keyOfs).recordLoc == locAtKeyOfs ) {
- if ( !b->k(keyOfs).isUsed() ) {
- /* we were deleted but still exist as an unused
- marker key. advance.
- */
- skipUnusedKeys( false );
- }
- return;
+ b->k(keyOfs).recordLoc == locAtKeyOfs ) {
+ if ( !b->k(keyOfs).isUsed() ) {
+ /* we were deleted but still exist as an unused
+ marker key. advance.
+ */
+ skipUnusedKeys( false );
+ }
+ return;
}
- /* we check one key earlier too, in case a key was just deleted. this is
+ /* we check one key earlier too, in case a key was just deleted. this is
important so that multi updates are reasonably fast.
*/
if( keyOfs == 0 || x++ )
@@ -245,7 +258,7 @@ namespace mongo {
bool found;
/* TODO: Switch to keep indexdetails and do idx.head! */
- bucket = indexDetails.head.btree()->locate(indexDetails, indexDetails.head, keyAtKeyOfs, _ordering, keyOfs, found, locAtKeyOfs, direction);
+ bucket = indexDetails.head.btree()->locate(indexDetails, indexDetails.head, keyAtKeyOfs, _ordering, keyOfs, found, locAtKeyOfs, _direction);
RARELY log() << " key seems to have moved in the index, refinding. found:" << found << endl;
if ( ! bucket.isNull() )
skipUnusedKeys( false );
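
The renamed _multikey/_dups members above exist because one document can appear under several keys of a multikey index; getsetdup() remembers each record location already returned and reports repeats. A rough sketch of that idea, with long long standing in for DiskLoc and DedupCursor an invented name:

    #include <cassert>
    #include <set>

    class DedupCursor {
        bool _multikey;
        std::set<long long> _dups;   // record locations already returned to the client
    public:
        DedupCursor( bool multikey ) : _multikey( multikey ) {}
        // @return true if loc was already seen and should be skipped
        bool getsetdup( long long loc ) {
            if ( !_multikey )
                return false;                      // single-key index: no dups possible
            return !_dups.insert( loc ).second;    // failed insert == already sent
        }
    };

    int main() {
        DedupCursor c( true );
        assert( !c.getsetdup( 42 ) );   // first sighting: not a dup
        assert( c.getsetdup( 42 ) );    // second sighting: dup, skip it
        assert( !c.getsetdup( 43 ) );
        return 0;
    }
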
diff --git a/db/cap.cpp b/db/cap.cpp
index c676429..198bd54 100644
--- a/db/cap.cpp
+++ b/db/cap.cpp
@@ -1,4 +1,5 @@
-// @file cap.cpp capped collection related
+// @file cap.cpp capped collection related
+// the "old" version (<= v1.6)
/**
* Copyright (C) 2008 10gen Inc.
@@ -49,7 +50,7 @@
namespace mongo {
/* combine adjacent deleted records *for the current extent* of the capped collection
-
+
this is O(n^2) but we call it for capped tables where typically n==1 or 2!
(or 3...there will be a little unused sliver at the end of the extent.)
*/
@@ -62,7 +63,8 @@ namespace mongo {
DiskLoc i = cappedFirstDeletedInCurExtent();
for (; !i.isNull() && inCapExtent( i ); i = i.drec()->nextDeleted )
drecs.push_back( i );
- cappedFirstDeletedInCurExtent() = i;
+
+ getDur().writingDiskLoc( cappedFirstDeletedInCurExtent() ) = i;
// This is the O(n^2) part.
drecs.sort();
@@ -80,7 +82,7 @@ namespace mongo {
DiskLoc b = *j;
while ( a.a() == b.a() && a.getOfs() + a.drec()->lengthWithHeaders == b.getOfs() ) {
// a & b are adjacent. merge.
- a.drec()->lengthWithHeaders += b.drec()->lengthWithHeaders;
+ getDur().writingInt( a.drec()->lengthWithHeaders ) += b.drec()->lengthWithHeaders;
j++;
if ( j == drecs.end() ) {
DEBUGGING out() << "temp: compact adddelrec2\n";
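
The compact() hunks above gather the current extent's deleted records, sort them by location, and fold neighbours whose regions touch into one larger free region. A simplified sketch of that coalescing step over plain offset/length pairs (assumed layout, not the real DeletedRecord):

    #include <algorithm>
    #include <cassert>
    #include <vector>

    struct FreeRegion {
        int ofs;
        int len;
    };

    bool byOffset( const FreeRegion &a, const FreeRegion &b ) { return a.ofs < b.ofs; }

    std::vector<FreeRegion> coalesce( std::vector<FreeRegion> regions ) {
        std::sort( regions.begin(), regions.end(), byOffset );
        std::vector<FreeRegion> merged;
        for ( std::vector<FreeRegion>::const_iterator i = regions.begin(); i != regions.end(); ++i ) {
            if ( !merged.empty() && merged.back().ofs + merged.back().len == i->ofs )
                merged.back().len += i->len;   // adjacent regions: fold into the previous one
            else
                merged.push_back( *i );
        }
        return merged;
    }

    int main() {
        std::vector<FreeRegion> v;
        FreeRegion a = { 300, 50 };  v.push_back( a );
        FreeRegion b = { 100, 100 }; v.push_back( b );
        FreeRegion c = { 200, 100 }; v.push_back( c );
        std::vector<FreeRegion> m = coalesce( v );
        assert( m.size() == 1 && m[0].ofs == 100 && m[0].len == 250 );
        return 0;
    }
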
@@ -106,8 +108,8 @@ namespace mongo {
// migrate old NamespaceDetails format
assert( capped );
if ( capExtent.a() == 0 && capExtent.getOfs() == 0 ) {
- capFirstNewRecord = DiskLoc();
- capFirstNewRecord.setInvalid();
+ //capFirstNewRecord = DiskLoc();
+ capFirstNewRecord.writing().setInvalid();
// put all the DeletedRecords in cappedListOfAllDeletedRecords()
for ( int i = 1; i < Buckets; ++i ) {
DiskLoc first = deletedList[ i ];
@@ -115,14 +117,14 @@ namespace mongo {
continue;
DiskLoc last = first;
for (; !last.drec()->nextDeleted.isNull(); last = last.drec()->nextDeleted );
- last.drec()->nextDeleted = cappedListOfAllDeletedRecords();
- cappedListOfAllDeletedRecords() = first;
- deletedList[ i ] = DiskLoc();
+ last.drec()->nextDeleted.writing() = cappedListOfAllDeletedRecords();
+ cappedListOfAllDeletedRecords().writing() = first;
+ deletedList[i].writing() = DiskLoc();
}
// NOTE cappedLastDelRecLastExtent() set to DiskLoc() in above
// Last, in case we're killed before getting here
- capExtent = firstExtent;
+ capExtent.writing() = firstExtent;
}
}
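
Most of the remaining cap.cpp changes swap direct assignments for writing()/getDur().writingDiskLoc() calls, i.e. a declare-intent-then-mutate pattern: the durability layer notes which bytes are about to change so they can be journaled, and hands back a writable reference. A loose sketch under that assumption; DurLayer and its members are stand-ins, not the actual getDur() interface:

    #include <cstddef>
    #include <iostream>
    #include <vector>

    struct Intent { void *addr; std::size_t len; };

    class DurLayer {
        std::vector<Intent> _intents;   // what would be journaled at the next commit
    public:
        template <typename T>
        T& writing( T &x ) {
            Intent i;
            i.addr = &x;
            i.len = sizeof( T );
            _intents.push_back( i );    // remember the region before it is mutated
            return x;                   // caller assigns through the returned reference
        }
        std::size_t pendingIntents() const { return _intents.size(); }
    };

    int main() {
        DurLayer dur;
        int capExtentOfs = 0;                 // pretend this lives in a memory mapped file
        dur.writing( capExtentOfs ) = 8192;   // declare intent, then mutate
        std::cout << "pending intents: " << dur.pendingIntents()
                  << " value: " << capExtentOfs << std::endl;
        return 0;
    }
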
@@ -144,20 +146,20 @@ namespace mongo {
// We want cappedLastDelRecLastExtent() to be the last DeletedRecord of the prev cap extent
// (or DiskLoc() if new capExtent == firstExtent)
if ( capExtent == lastExtent )
- cappedLastDelRecLastExtent() = DiskLoc();
+ getDur().writingDiskLoc( cappedLastDelRecLastExtent() ) = DiskLoc();
else {
DiskLoc i = cappedFirstDeletedInCurExtent();
for (; !i.isNull() && nextIsInCapExtent( i ); i = i.drec()->nextDeleted );
- cappedLastDelRecLastExtent() = i;
+ getDur().writingDiskLoc( cappedLastDelRecLastExtent() ) = i;
}
- capExtent = theCapExtent()->xnext.isNull() ? firstExtent : theCapExtent()->xnext;
+ getDur().writingDiskLoc( capExtent ) = theCapExtent()->xnext.isNull() ? firstExtent : theCapExtent()->xnext;
/* this isn't true if a collection has been renamed...that is ok just used for diagnostics */
//dassert( theCapExtent()->ns == ns );
theCapExtent()->assertOk();
- capFirstNewRecord = DiskLoc();
+ getDur().writingDiskLoc( capFirstNewRecord ) = DiskLoc();
}
DiskLoc NamespaceDetails::__capAlloc( int len ) {
@@ -176,25 +178,25 @@ namespace mongo {
/* unlink ourself from the deleted list */
if ( !ret.isNull() ) {
if ( prev.isNull() )
- cappedListOfAllDeletedRecords() = ret.drec()->nextDeleted;
+ cappedListOfAllDeletedRecords().writing() = ret.drec()->nextDeleted;
else
- prev.drec()->nextDeleted = ret.drec()->nextDeleted;
- ret.drec()->nextDeleted.setInvalid(); // defensive.
+ prev.drec()->nextDeleted.writing() = ret.drec()->nextDeleted;
+ ret.drec()->nextDeleted.writing().setInvalid(); // defensive.
assert( ret.drec()->extentOfs < ret.getOfs() );
}
return ret;
}
- DiskLoc NamespaceDetails::cappedAlloc(const char *ns, int len) {
+ DiskLoc NamespaceDetails::cappedAlloc(const char *ns, int len) {
// signal done allocating new extents.
if ( !cappedLastDelRecLastExtent().isValid() )
- cappedLastDelRecLastExtent() = DiskLoc();
-
+ getDur().writingDiskLoc( cappedLastDelRecLastExtent() ) = DiskLoc();
+
assert( len < 400000000 );
int passes = 0;
int maxPasses = ( len / 30 ) + 2; // 30 is about the smallest entry that could go in the oplog
- if ( maxPasses < 5000 ){
+ if ( maxPasses < 5000 ) {
// this is for backwards safety since 5000 was the old value
maxPasses = 5000;
}
@@ -208,7 +210,7 @@ namespace mongo {
theCapExtent()->assertOk();
DiskLoc firstEmptyExtent;
while ( 1 ) {
- if ( nrecords < max ) {
+ if ( stats.nrecords < max ) {
loc = __capAlloc( len );
if ( !loc.isNull() )
break;
@@ -217,8 +219,9 @@ namespace mongo {
// If on first iteration through extents, don't delete anything.
if ( !capFirstNewRecord.isValid() ) {
advanceCapExtent( ns );
+
if ( capExtent != firstExtent )
- capFirstNewRecord.setInvalid();
+ capFirstNewRecord.writing().setInvalid();
// else signal done with first iteration through extents.
continue;
}
@@ -247,14 +250,14 @@ namespace mongo {
compact();
if( ++passes > maxPasses ) {
log() << "passes ns:" << ns << " len:" << len << " maxPasses: " << maxPasses << '\n';
- log() << "passes max:" << max << " nrecords:" << nrecords << " datasize: " << datasize << endl;
+ log() << "passes max:" << max << " nrecords:" << stats.nrecords << " datasize: " << stats.datasize << endl;
massert( 10345 , "passes >= maxPasses in capped collection alloc", false );
}
}
// Remember first record allocated on this iteration through capExtent.
if ( capFirstNewRecord.isValid() && capFirstNewRecord.isNull() )
- capFirstNewRecord = loc;
+ getDur().writingDiskLoc(capFirstNewRecord) = loc;
return loc;
}
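For orientation, the bounded-passes behaviour of the allocation loop above can be modelled with a small self-contained toy (hypothetical names, a deque standing in for the extents and deleted-record lists):

    #include <cassert>
    #include <cstddef>
    #include <deque>

    // Toy capped allocator: a fixed byte budget, a FIFO of record sizes, and a
    // bounded number of passes per insert, mirroring the maxPasses guard above.
    struct ToyCapped {
        std::size_t capacity, used = 0;
        std::deque<std::size_t> records;          // oldest record at the front

        explicit ToyCapped(std::size_t cap) : capacity(cap) {}

        void insert(std::size_t len) {
            int passes = 0;
            int maxPasses = (int)(len / 30) + 2;  // ~30 bytes: smallest oplog entry
            if (maxPasses < 5000) maxPasses = 5000;
            while (used + len > capacity) {       // make room by dropping the oldest
                assert(!records.empty() && "record larger than the collection");
                used -= records.front();
                records.pop_front();
                assert(++passes <= maxPasses);    // the massert(10345) analogue
            }
            records.push_back(len);
            used += len;
        }
    };

    int main() {
        ToyCapped c(1000);
        for (int i = 0; i < 100; ++i) c.insert(64);   // old records get evicted
        return 0;
    }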
@@ -269,123 +272,179 @@ namespace mongo {
}
}
- void NamespaceDetails::cappedDumpDelInfo() {
+ void NamespaceDetails::cappedDumpDelInfo() {
cout << "dl[0]: " << deletedList[0].toString() << endl;
- for( DiskLoc z = deletedList[0]; !z.isNull(); z = z.drec()->nextDeleted ) {
- cout << " drec:" << z.toString() << " dreclen:" << hex << z.drec()->lengthWithHeaders <<
- " ext:" << z.drec()->myExtent(z)->myLoc.toString() << endl;
+ for( DiskLoc z = deletedList[0]; !z.isNull(); z = z.drec()->nextDeleted ) {
+ cout << " drec:" << z.toString() << " dreclen:" << hex << z.drec()->lengthWithHeaders <<
+ " ext:" << z.drec()->myExtent(z)->myLoc.toString() << endl;
}
cout << "dl[1]: " << deletedList[1].toString() << endl;
}
- /* everything from end on, eliminate from the capped collection.
- @param inclusive if true, deletes end (i.e. closed or open range)
- */
+ void NamespaceDetails::cappedTruncateLastDelUpdate() {
+ if ( capExtent == firstExtent ) {
+ // Only one extent of the collection is in use, so there
+ // is no deleted record in a previous extent, so nullify
+ // cappedLastDelRecLastExtent().
+ cappedLastDelRecLastExtent().writing() = DiskLoc();
+ }
+ else {
+ // Scan through all deleted records in the collection
+ // until the last deleted record for the extent prior
+ // to the new capExtent is found. Then set
+ // cappedLastDelRecLastExtent() to that deleted record.
+ DiskLoc i = cappedListOfAllDeletedRecords();
+ for( ;
+ !i.drec()->nextDeleted.isNull() &&
+ !inCapExtent( i.drec()->nextDeleted );
+ i = i.drec()->nextDeleted );
+ // In our capped storage model, every extent must have at least one
+ // deleted record. Here we check that 'i' is not the last deleted
+ // record. (We expect that there will be deleted records in the new
+ // capExtent as well.)
+ assert( !i.drec()->nextDeleted.isNull() );
+ cappedLastDelRecLastExtent().writing() = i;
+ }
+ }
+
void NamespaceDetails::cappedTruncateAfter(const char *ns, DiskLoc end, bool inclusive) {
DEV assert( this == nsdetails(ns) );
assert( cappedLastDelRecLastExtent().isValid() );
-
+
+ // We iteratively remove the newest document until the newest document
+ // is 'end', then we remove 'end' if requested.
bool foundLast = false;
while( 1 ) {
if ( foundLast ) {
+ // 'end' has been found and removed, so break.
break;
}
+ // 'curr' will point to the newest document in the collection.
DiskLoc curr = theCapExtent()->lastRecord;
assert( !curr.isNull() );
if ( curr == end ) {
if ( inclusive ) {
+ // 'end' has been found, so break next iteration.
foundLast = true;
- } else {
+ }
+ else {
+ // 'end' has been found, so break.
break;
}
}
-
- uassert( 13415, "emptying the collection is not allowed", nrecords > 1 );
-
+
+ // TODO The algorithm used in this function cannot generate an
+ // empty collection, but we could call emptyCappedCollection() in
+ // this case instead of asserting.
+ uassert( 13415, "emptying the collection is not allowed", stats.nrecords > 1 );
+
+ // Delete the newest record, and coalesce the new deleted
+ // record with existing deleted records.
+ theDataFileMgr.deleteRecord(ns, curr.rec(), curr, true);
+ compact();
+
+ // This is the case where we have not yet had to remove any
+ // documents to make room for other documents, and we are allocating
+ // documents from free space in fresh extents instead of reusing
+ // space from familiar extents.
if ( !capLooped() ) {
- theDataFileMgr.deleteRecord(ns, curr.rec(), curr, true);
- compact();
+
+ // We just removed the last record from the 'capExtent', and
+ // the 'capExtent' can't be empty, so we set 'capExtent' to
+ // capExtent's prev extent.
if ( theCapExtent()->lastRecord.isNull() ) {
assert( !theCapExtent()->xprev.isNull() );
- capExtent = theCapExtent()->xprev;
+ // NOTE Because we didn't delete the last document, and
+ // capLooped() is false, capExtent is not the first extent
+ // so xprev will be nonnull.
+ capExtent.writing() = theCapExtent()->xprev;
theCapExtent()->assertOk();
- if ( capExtent == firstExtent ) {
- cappedLastDelRecLastExtent() = DiskLoc();
- } else {
- // slow - there's no prev ptr for deleted rec
- DiskLoc i = cappedListOfAllDeletedRecords();
- for( ;
- !i.drec()->nextDeleted.isNull() &&
- !inCapExtent( i.drec()->nextDeleted );
- i = i.drec()->nextDeleted );
- assert( !i.drec()->nextDeleted.isNull() ); // I believe there is always at least one drec per extent
- cappedLastDelRecLastExtent() = i;
- }
+
+ // update cappedLastDelRecLastExtent()
+ cappedTruncateLastDelUpdate();
}
continue;
}
- theDataFileMgr.deleteRecord(ns, curr.rec(), curr, true);
- compact();
- if ( curr == capFirstNewRecord ) { // invalid, but can compare locations
- capExtent = ( capExtent == firstExtent ) ? lastExtent : theCapExtent()->xprev;
- theCapExtent()->assertOk();
- assert( !theCapExtent()->firstRecord.isNull() );
- capFirstNewRecord = theCapExtent()->firstRecord;
- if ( capExtent == firstExtent ) {
- cappedLastDelRecLastExtent() = DiskLoc();
- } else {
- // slow - there's no prev ptr for deleted rec
- DiskLoc i = cappedListOfAllDeletedRecords();
- for( ;
- !i.drec()->nextDeleted.isNull() &&
- !inCapExtent( i.drec()->nextDeleted );
- i = i.drec()->nextDeleted );
- assert( !i.drec()->nextDeleted.isNull() ); // I believe there is always at least one drec per extent
- cappedLastDelRecLastExtent() = i;
+ // This is the case where capLooped() is true, and we just deleted
+ // from capExtent, and we just deleted capFirstNewRecord, which was
+ // the last record on the fresh side of capExtent.
+ // NOTE In this comparison, curr and potentially capFirstNewRecord
+ // may point to invalid data, but we can still compare the
+ // references themselves.
+ if ( curr == capFirstNewRecord ) {
+
+ // Set 'capExtent' to the first nonempty extent prior to the
+ // initial capExtent. There must be such an extent because we
+ // have not deleted the last document in the collection. It is
+ // possible that all extents other than the capExtent are empty.
+ // In this case we will keep the initial capExtent and specify
+ // that all records contained within are on the fresh rather than
+ // stale side of the extent.
+ DiskLoc newCapExtent = capExtent;
+ do {
+ // Find the previous extent, looping if necessary.
+ newCapExtent = ( newCapExtent == firstExtent ) ? lastExtent : newCapExtent.ext()->xprev;
+ newCapExtent.ext()->assertOk();
}
+ while ( newCapExtent.ext()->firstRecord.isNull() );
+ capExtent.writing() = newCapExtent;
+
+ // Place all documents in the new capExtent on the fresh side
+ // of the capExtent by setting capFirstNewRecord to the first
+ // document in the new capExtent.
+ capFirstNewRecord.writing() = theCapExtent()->firstRecord;
+
+ // update cappedLastDelRecLastExtent()
+ cappedTruncateLastDelUpdate();
}
}
}
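The truncate-from-the-tail loop above reduces to the following sketch when the extent bookkeeping is stripped away (illustrative container, not the real record storage):

    #include <cassert>
    #include <vector>

    // Records in insertion order, newest at the back. Removes everything newer
    // than 'end'; removes 'end' itself as well when 'inclusive' is true.
    void cappedTruncateAfterSketch(std::vector<int>& recs, int end, bool inclusive) {
        while (true) {
            int curr = recs.back();               // newest record
            if (curr == end && !inclusive)
                break;                            // 'end' stays; nothing newer is left
            assert(recs.size() > 1 && "emptying the collection is not allowed");
            recs.pop_back();                      // delete the newest record
            if (curr == end)
                break;                            // 'end' itself was just removed
        }
    }

    int main() {
        std::vector<int> recs = {1, 2, 3, 4, 5};
        cappedTruncateAfterSketch(recs, 3, /*inclusive=*/false);   // recs -> {1, 2, 3}
        return 0;
    }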
-
+
void NamespaceDetails::emptyCappedCollection( const char *ns ) {
DEV assert( this == nsdetails(ns) );
massert( 13424, "collection must be capped", capped );
- massert( 13425, "background index build in progress", !backgroundIndexBuildInProgress );
+ massert( 13425, "background index build in progress", !indexBuildInProgress );
massert( 13426, "indexes present", nIndexes == 0 );
+ // Clear all references to this namespace.
ClientCursor::invalidate( ns );
- NamespaceDetailsTransient::clearForPrefix( ns );
+ NamespaceDetailsTransient::clearForPrefix( ns );
+
+ // Get a writeable reference to 'this' and reset all pertinent
+ // attributes.
+ NamespaceDetails *t = writingWithoutExtra();
+
+ t->cappedLastDelRecLastExtent() = DiskLoc();
+ t->cappedListOfAllDeletedRecords() = DiskLoc();
- cappedLastDelRecLastExtent() = DiskLoc();
- cappedListOfAllDeletedRecords() = DiskLoc();
-
// preserve firstExtent/lastExtent
- capExtent = firstExtent;
- datasize = nrecords = 0;
+ t->capExtent = firstExtent;
+ t->stats.datasize = stats.nrecords = 0;
// lastExtentSize preserve
// nIndexes preserve 0
// capped preserve true
// max preserve
- paddingFactor = 1.0;
- flags = 0;
- capFirstNewRecord = DiskLoc();
- capFirstNewRecord.setInvalid();
- cappedLastDelRecLastExtent().setInvalid();
+ t->paddingFactor = 1.0;
+ t->flags = 0;
+ t->capFirstNewRecord = DiskLoc();
+ t->capFirstNewRecord.setInvalid();
+ t->cappedLastDelRecLastExtent().setInvalid();
// dataFileVersion preserve
// indexFileVersion preserve
- multiKeyIndexBits = 0;
- reservedA = 0;
- extraOffset = 0;
- // backgroundIndexBuildInProgress preserve 0
- memset(reserved, 0, sizeof(reserved));
+ t->multiKeyIndexBits = 0;
+ t->reservedA = 0;
+ t->extraOffset = 0;
+ // indexBuildInProgress preserve 0
+ memset(t->reserved, 0, sizeof(t->reserved));
+ // Reset all existing extents and recreate the deleted list.
for( DiskLoc ext = firstExtent; !ext.isNull(); ext = ext.ext()->xnext ) {
DiskLoc prev = ext.ext()->xprev;
DiskLoc next = ext.ext()->xnext;
DiskLoc empty = ext.ext()->reuse( ns );
- ext.ext()->xprev = prev;
- ext.ext()->xnext = next;
+ ext.ext()->xprev.writing() = prev;
+ ext.ext()->xnext.writing() = next;
addDeletedRec( empty.drec(), empty );
}
}
diff --git a/db/client.cpp b/db/client.cpp
index f9653f5..e4fd4b9 100644
--- a/db/client.cpp
+++ b/db/client.cpp
@@ -16,14 +16,14 @@
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
-/* Client represents a connection to the database (the server-side) and corresponds
+/* Client represents a connection to the database (the server-side) and corresponds
to an open socket (or logical connection if pooling on sockets) from a client.
*/
#include "pch.h"
#include "db.h"
#include "client.h"
-#include "curop.h"
+#include "curop-inl.h"
#include "json.h"
#include "security.h"
#include "commands.h"
@@ -40,20 +40,31 @@ namespace mongo {
set<Client*> Client::clients; // always be in clientsMutex when manipulating this
boost::thread_specific_ptr<Client> currentClient;
- Client::Client(const char *desc, MessagingPort *p) :
- _context(0),
- _shutdown(false),
- _desc(desc),
- _god(0),
- _lastOp(0),
- _mp(p)
- {
+ /* each thread which does db operations has a Client object in TLS.
+ call this when your thread starts.
+ */
+ Client& Client::initThread(const char *desc, MessagingPort *mp) {
+ assert( currentClient.get() == 0 );
+ Client *c = new Client(desc, mp);
+ currentClient.reset(c);
+ mongo::lastError.initThread();
+ return *c;
+ }
+
+ Client::Client(const char *desc, MessagingPort *p) :
+ _context(0),
+ _shutdown(false),
+ _desc(desc),
+ _god(0),
+ _lastOp(0),
+ _mp(p) {
+ _connectionId = setThreadName(desc);
_curOp = new CurOp( this );
scoped_lock bl(clientsMutex);
clients.insert(this);
}
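The initThread()/cc() pairing relies on one Client per thread stored in thread-local storage; a stripped-down illustration of that pattern (plain thread_local instead of boost::thread_specific_ptr, toy types):

    #include <cassert>
    #include <string>
    #include <thread>

    // Illustrative only: one Client-like object per thread, created once when
    // the thread starts doing database work and looked up afterwards.
    struct ToyClient {
        std::string desc;
        explicit ToyClient(std::string d) : desc(std::move(d)) {}
    };

    thread_local ToyClient* currentToyClient = nullptr;

    ToyClient& toyInitThread(const std::string& desc) {
        assert(currentToyClient == nullptr);      // must run exactly once per thread
        currentToyClient = new ToyClient(desc);
        return *currentToyClient;
    }

    ToyClient& toyCC() {                          // analogue of mongo::cc()
        assert(currentToyClient != nullptr);
        return *currentToyClient;
    }

    int main() {
        std::thread t([] {
            toyInitThread("conn1");
            assert(toyCC().desc == "conn1");
            delete currentToyClient;              // real code tears down in shutdown()
        });
        t.join();
        return 0;
    }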
- Client::~Client() {
+ Client::~Client() {
_god = 0;
if ( _context )
@@ -62,90 +73,33 @@ namespace mongo {
if ( ! _shutdown ) {
error() << "Client::shutdown not called: " << _desc << endl;
}
-
+
scoped_lock bl(clientsMutex);
if ( ! _shutdown )
clients.erase(this);
delete _curOp;
}
-
- void Client::_dropns( const string& ns ){
- Top::global.collectionDropped( ns );
-
- dblock l;
- Client::Context ctx( ns );
- if ( ! nsdetails( ns.c_str() ) )
- return;
-
- try {
- string err;
- BSONObjBuilder b;
- dropCollection( ns , err , b );
- }
- catch ( ... ){
- warning() << "error dropping temp collection: " << ns << endl;
- }
-
- }
-
- void Client::_invalidateDB( const string& db ) {
- assert( db.find( '.' ) == string::npos );
-
- set<string>::iterator min = _tempCollections.lower_bound( db + "." );
- set<string>::iterator max = _tempCollections.lower_bound( db + "|" );
-
- _tempCollections.erase( min , max );
-
- }
-
- void Client::invalidateDB(const string& db) {
- scoped_lock bl(clientsMutex);
- for ( set<Client*>::iterator i = clients.begin(); i!=clients.end(); i++ ){
- Client* cli = *i;
- cli->_invalidateDB(db);
- }
- }
- void Client::invalidateNS( const string& ns ){
- scoped_lock bl(clientsMutex);
- for ( set<Client*>::iterator i = clients.begin(); i!=clients.end(); i++ ){
- Client* cli = *i;
- cli->_tempCollections.erase( ns );
- }
- }
-
-
- void Client::addTempCollection( const string& ns ) {
- _tempCollections.insert( ns );
- }
-
- bool Client::shutdown(){
+ bool Client::shutdown() {
_shutdown = true;
if ( inShutdown() )
return false;
{
scoped_lock bl(clientsMutex);
clients.erase(this);
- }
-
- bool didAnything = false;
-
- if ( _tempCollections.size() ){
- didAnything = true;
- for ( set<string>::iterator i = _tempCollections.begin(); i!=_tempCollections.end(); i++ ){
- _dropns( *i );
+ if ( isSyncThread() ) {
+ syncThread = 0;
}
- _tempCollections.clear();
}
-
- return didAnything;
+
+ return false;
}
- BSONObj CurOp::_tooBig = fromjson("{\"$msg\":\"query not recording (too large)\"}");
+ BSONObj CachedBSONObj::_tooBig = fromjson("{\"$msg\":\"query not recording (too large)\"}");
AtomicUInt CurOp::_nextOpNum;
-
+
Client::Context::Context( string ns , Database * db, bool doauth )
- : _client( currentClient.get() ) , _oldContext( _client->_context ) ,
+ : _client( currentClient.get() ) , _oldContext( _client->_context ) ,
_path( dbpath ) , _lock(0) , _justCreated(false) {
assert( db && db->isOk() );
_ns = ns;
@@ -155,20 +109,36 @@ namespace mongo {
_auth();
}
- void Client::Context::_finishInit( bool doauth ){
+ Client::Context::Context(const string& ns, string path , mongolock * lock , bool doauth )
+ : _client( currentClient.get() ) , _oldContext( _client->_context ) ,
+ _path( path ) , _lock( lock ) ,
+ _ns( ns ), _db(0) {
+ _finishInit( doauth );
+ }
+
+ /* this version saves the context but doesn't yet set the new one: */
+
+ Client::Context::Context()
+ : _client( currentClient.get() ) , _oldContext( _client->_context ),
+ _path( dbpath ) , _lock(0) , _justCreated(false), _db(0) {
+ _client->_context = this;
+ clear();
+ }
+
+ void Client::Context::_finishInit( bool doauth ) {
int lockState = dbMutex.getState();
assert( lockState );
-
+
_db = dbHolder.get( _ns , _path );
- if ( _db ){
+ if ( _db ) {
_justCreated = false;
}
- else if ( dbMutex.getState() > 0 ){
+ else if ( dbMutex.getState() > 0 ) {
// already in a write lock
_db = dbHolder.getOrCreate( _ns , _path , _justCreated );
assert( _db );
}
- else if ( dbMutex.getState() < -1 ){
+ else if ( dbMutex.getState() < -1 ) {
// nested read lock :(
assert( _lock );
_lock->releaseAndWriteLock();
@@ -181,50 +151,52 @@ namespace mongo {
// to do that, we're going to unlock, then get a write lock
                // this is so that if this is the first query and it's long, it doesn't block the db
// we just have to check that the db wasn't closed in the interim where we unlock
- for ( int x=0; x<2; x++ ){
- {
+ for ( int x=0; x<2; x++ ) {
+ {
dbtemprelease unlock;
writelock lk( _ns );
dbHolder.getOrCreate( _ns , _path , _justCreated );
}
-
+
_db = dbHolder.get( _ns , _path );
-
+
if ( _db )
break;
-
+
log() << "db was closed on us right after we opened it: " << _ns << endl;
}
-
+
uassert( 13005 , "can't create db, keeps getting closed" , _db );
}
-
- _client->_context = this;
- _client->_curOp->enter( this );
- if ( doauth )
- _auth( lockState );
- switch ( _client->_curOp->getOp() ){
+ switch ( _client->_curOp->getOp() ) {
case dbGetMore: // getMore's are special and should be handled else where
case dbUpdate: // update & delete check shard version in instance.cpp, so don't check here as well
- case dbDelete:
+ case dbDelete:
break;
default: {
string errmsg;
- if ( ! shardVersionOk( _ns , lockState > 0 , errmsg ) ){
- msgasserted( StaleConfigInContextCode , (string)"[" + _ns + "] shard version not ok in Client::Context: " + errmsg );
+ if ( ! shardVersionOk( _ns , lockState > 0 , errmsg ) ) {
+ ostringstream os;
+ os << "[" << _ns << "] shard version not ok in Client::Context: " << errmsg;
+ msgassertedNoTrace( StaleConfigInContextCode , os.str().c_str() );
}
}
}
+
+ _client->_context = this;
+ _client->_curOp->enter( this );
+ if ( doauth )
+ _auth( lockState );
}
-
- void Client::Context::_auth( int lockState ){
+
+ void Client::Context::_auth( int lockState ) {
if ( _client->_ai.isAuthorizedForLock( _db->name , lockState ) )
return;
// before we assert, do a little cleanup
_client->_context = _oldContext; // note: _oldContext may be null
-
+
stringstream ss;
ss << "unauthorized db:" << _db->name << " lock type:" << lockState << " client:" << _client->clientAddress();
uasserted( 10057 , ss.str() );
@@ -236,9 +208,35 @@ namespace mongo {
_client->_context = _oldContext; // note: _oldContext may be null
}
- string Client::clientAddress() const {
+ bool Client::Context::inDB( const string& db , const string& path ) const {
+ if ( _path != path )
+ return false;
+
+ if ( db == _ns )
+ return true;
+
+ string::size_type idx = _ns.find( db );
+ if ( idx != 0 )
+ return false;
+
+ return _ns[db.size()] == '.';
+ }
+
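The inDB() helper added above is essentially a namespace-prefix test; expressed standalone (hypothetical helper name):

    #include <cassert>
    #include <string>

    // True when 'ns' ("db.collection") belongs to database 'db'.
    bool nsBelongsToDb(const std::string& ns, const std::string& db) {
        if (ns == db)
            return true;                          // ns may be just the db name
        if (ns.compare(0, db.size(), db) != 0)
            return false;                         // 'db' must be a prefix of 'ns'
        return ns.size() > db.size() && ns[db.size()] == '.';
    }

    int main() {
        assert(nsBelongsToDb("test.foo", "test"));
        assert(!nsBelongsToDb("testing.foo", "test"));   // a bare prefix is not enough
        assert(nsBelongsToDb("test", "test"));
        return 0;
    }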
+ void Client::appendLastOp( BSONObjBuilder& b ) const {
+ if( theReplSet ) {
+ b.append("lastOp" , (long long) _lastOp);
+ }
+ else {
+ OpTime lo(_lastOp);
+ if ( ! lo.isNull() )
+ b.appendTimestamp( "lastOp" , lo.asDate() );
+ }
+ }
+
+
+ string Client::clientAddress(bool includePort) const {
if( _curOp )
- return _curOp->getRemoteString(false);
+ return _curOp->getRemoteString(includePort);
return "";
}
@@ -249,63 +247,75 @@ namespace mongo {
return ss.str();
}
- string sayClientState(){
+ string sayClientState() {
Client* c = currentClient.get();
if ( !c )
return "no client";
return c->toString();
}
-
- void curopWaitingForLock( int type ){
+
+ Client* curopWaitingForLock( int type ) {
Client * c = currentClient.get();
assert( c );
CurOp * co = c->curop();
- if ( co ){
+ if ( co ) {
co->waitingForLock( type );
}
+ return c;
}
- void curopGotLock(){
- Client * c = currentClient.get();
+ void curopGotLock(Client *c) {
assert(c);
CurOp * co = c->curop();
- if ( co ){
+ if ( co )
co->gotLock();
- }
}
- CurOp::~CurOp(){
- if ( _wrapped ){
- scoped_lock bl(Client::clientsMutex);
- _client->_curOp = _wrapped;
+ void KillCurrentOp::interruptJs( AtomicUInt *op ) {
+ if ( !globalScriptEngine )
+ return;
+ if ( !op ) {
+ globalScriptEngine->interruptAll();
}
-
- _client = 0;
+ else {
+ globalScriptEngine->interrupt( *op );
+ }
+ }
+
+ void KillCurrentOp::killAll() {
+ _globalKill = true;
+ interruptJs( 0 );
}
- BSONObj CurOp::query( bool threadSafe ) {
- if( querySize() == 1 ) {
- return _tooBig;
+ void KillCurrentOp::kill(AtomicUInt i) {
+ bool found = false;
+ {
+ scoped_lock l( Client::clientsMutex );
+ for( set< Client* >::const_iterator j = Client::clients.begin(); !found && j != Client::clients.end(); ++j ) {
+ for( CurOp *k = ( *j )->curop(); !found && k; k = k->parent() ) {
+ if ( k->opNum() == i ) {
+ k->kill();
+ for( CurOp *l = ( *j )->curop(); l != k; l = l->parent() ) {
+ l->kill();
+ }
+ found = true;
+ }
+ }
+ }
}
-
- if ( ! threadSafe ){
- BSONObj o(_queryBuf);
- return o;
+ if ( found ) {
+ interruptJs( &i );
}
-
- int size = querySize();
- int before = checksum( _queryBuf , size );
- BSONObj a(_queryBuf);
- BSONObj b = a.copy();
- int after = checksum( _queryBuf , size );
-
- if ( before == after )
- return b;
-
- return BSON( "msg" << "query changed while capturing" );
}
+ CurOp::~CurOp() {
+ if ( _wrapped ) {
+ scoped_lock bl(Client::clientsMutex);
+ _client->_curOp = _wrapped;
+ }
+ _client = 0;
+ }
- BSONObj CurOp::infoNoauth( int attempt ) {
+ BSONObj CurOp::infoNoauth() {
BSONObjBuilder b;
b.append("opid", _opNum);
bool a = _active && _start;
@@ -313,40 +323,16 @@ namespace mongo {
if ( _lockType )
b.append("lockType" , _lockType > 0 ? "write" : "read" );
b.append("waitingForLock" , _waitingForLock );
-
- if( a ){
+
+ if( a ) {
b.append("secs_running", elapsedSeconds() );
}
-
+
b.append( "op" , opToString( _op ) );
-
+
b.append("ns", _ns);
-
- {
- int size = querySize();
- if ( size == 0 ){
- // do nothing
- }
- else if ( size == 1 ){
- b.append( "query" , _tooBig );
- }
- else if ( attempt > 2 ){
- b.append( "query" , BSON( "err" << "can't get a clean object" ) );
- log( LL_WARNING ) << "CurOp changing too much to get reading" << endl;
-
- }
- else {
- int before = checksum( _queryBuf , size );
- b.appendObject( "query" , _queryBuf , size );
- int after = checksum( _queryBuf , size );
-
- if ( after != before ){
- // this means something changed
- // going to retry
- return infoNoauth( attempt + 1 );
- }
- }
- }
+
+ _query.append( b , "query" );
// b.append("inLock", ??
stringstream clientStr;
@@ -355,9 +341,9 @@ namespace mongo {
if ( _client )
b.append( "desc" , _client->desc() );
-
- if ( ! _message.empty() ){
- if ( _progressMeter.isActive() ){
+
+ if ( ! _message.empty() ) {
+ if ( _progressMeter.isActive() ) {
StringBuilder buf(128);
buf << _message.toString() << " " << _progressMeter.toString();
b.append( "msg" , buf.str() );
@@ -370,7 +356,7 @@ namespace mongo {
return b.obj();
}
- void Client::gotHandshake( const BSONObj& o ){
+ void Client::gotHandshake( const BSONObj& o ) {
BSONObjIterator i(o);
{
@@ -378,7 +364,7 @@ namespace mongo {
assert( id.type() );
_remoteId = id.wrap( "_id" );
}
-
+
BSONObjBuilder b;
while ( i.more() )
b.append( i.next() );
@@ -388,31 +374,31 @@ namespace mongo {
class HandshakeCmd : public Command {
public:
void help(stringstream& h) const { h << "internal"; }
- HandshakeCmd() : Command( "handshake" ){}
- virtual LockType locktype() const { return NONE; }
+ HandshakeCmd() : Command( "handshake" ) {}
+ virtual LockType locktype() const { return NONE; }
virtual bool slaveOk() const { return true; }
virtual bool adminOnly() const { return false; }
virtual bool run(const string& , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
Client& c = cc();
c.gotHandshake( cmdObj );
return 1;
- }
+ }
} handshakeCmd;
class ClientListPlugin : public WebStatusPlugin {
public:
- ClientListPlugin() : WebStatusPlugin( "clients" , 20 ){}
- virtual void init(){}
-
- virtual void run( stringstream& ss ){
+ ClientListPlugin() : WebStatusPlugin( "clients" , 20 ) {}
+ virtual void init() {}
+
+ virtual void run( stringstream& ss ) {
using namespace mongoutils::html;
ss << "\n<table border=1 cellpadding=2 cellspacing=0>";
ss << "<tr align='left'>"
<< th( a("", "Connections to the database, both internal and external.", "Client") )
<< th( a("http://www.mongodb.org/display/DOCS/Viewing+and+Terminating+Current+Operation", "", "OpId") )
- << "<th>Active</th>"
+ << "<th>Active</th>"
<< "<th>LockType</th>"
<< "<th>Waiting</th>"
<< "<th>SecsRunning</th>"
@@ -426,11 +412,11 @@ namespace mongo {
<< "</tr>\n";
{
scoped_lock bl(Client::clientsMutex);
- for( set<Client*>::iterator i = Client::clients.begin(); i != Client::clients.end(); i++ ) {
+ for( set<Client*>::iterator i = Client::clients.begin(); i != Client::clients.end(); i++ ) {
Client *c = *i;
CurOp& co = *(c->curop());
ss << "<tr><td>" << c->desc() << "</td>";
-
+
tablecell( ss , co.opNum() );
tablecell( ss , co.active() );
{
@@ -447,8 +433,9 @@ namespace mongo {
tablecell( ss , "" );
tablecell( ss , co.getOp() );
tablecell( ss , co.getNS() );
- if ( co.haveQuery() )
- tablecell( ss , co.query( true ) );
+ if ( co.haveQuery() ) {
+ tablecell( ss , co.query() );
+ }
else
tablecell( ss , "" );
tablecell( ss , co.getRemoteString() );
@@ -463,18 +450,18 @@ namespace mongo {
ss << "</table>\n";
}
-
+
} clientListPlugin;
- int Client::recommendedYieldMicros( int * writers , int * readers ){
+ int Client::recommendedYieldMicros( int * writers , int * readers ) {
int num = 0;
int w = 0;
int r = 0;
{
scoped_lock bl(clientsMutex);
- for ( set<Client*>::iterator i=clients.begin(); i!=clients.end(); ++i ){
+ for ( set<Client*>::iterator i=clients.begin(); i!=clients.end(); ++i ) {
Client* c = *i;
- if ( c->curop()->isWaitingForLock() ){
+ if ( c->curop()->isWaitingForLock() ) {
num++;
if ( c->curop()->getLockType() > 0 )
w++;
@@ -483,15 +470,44 @@ namespace mongo {
}
}
}
-
+
if ( writers )
*writers = w;
if ( readers )
*readers = r;
- if ( num > 50 )
- num = 50;
+ int time = r * 100;
+ time += w * 500;
+
+ time = min( time , 1000000 );
+
+ // there has been a kill request for this op - we should yield to allow the op to stop
+ // This function returns empty string if we aren't interrupted
+ if ( killCurrentOp.checkForInterruptNoAssert( false )[0] != '\0' ) {
+ return 100;
+ }
+
+ return time;
+ }
+
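The new yield heuristic weights waiting writers five times as heavily as waiting readers and caps the result at one second; in isolation it is just (sketch with hypothetical inputs):

    #include <algorithm>
    #include <cassert>

    // Microseconds to yield, given counts of ops waiting for read/write locks.
    // Writers are weighted 5x readers; a pending kill request forces a short yield.
    int recommendedYieldMicrosSketch(int readers, int writers, bool interrupted) {
        int time = readers * 100 + writers * 500;
        time = std::min(time, 1000000);
        if (interrupted)
            return 100;            // yield briefly so a killed op can actually stop
        return time;
    }

    int main() {
        assert(recommendedYieldMicrosSketch(3, 2, false) == 1300);
        assert(recommendedYieldMicrosSketch(10000, 10000, false) == 1000000);
        assert(recommendedYieldMicrosSketch(0, 0, true) == 100);
        return 0;
    }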
+ int Client::getActiveClientCount( int& writers, int& readers ) {
+ writers = 0;
+ readers = 0;
+
+ scoped_lock bl(clientsMutex);
+ for ( set<Client*>::iterator i=clients.begin(); i!=clients.end(); ++i ) {
+ Client* c = *i;
+ if ( ! c->curop()->active() )
+ continue;
+
+ int l = c->curop()->getLockType();
+ if ( l > 0 )
+ writers++;
+ else if ( l < 0 )
+ readers++;
+
+ }
- return num * 100;
+ return writers + readers;
}
}
diff --git a/db/client.h b/db/client.h
index d0600e3..4e8589e 100644
--- a/db/client.h
+++ b/db/client.h
@@ -16,7 +16,7 @@
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
-/* Client represents a connection to the database (the server-side) and corresponds
+/* Client represents a connection to the database (the server-side) and corresponds
to an open socket (or logical connection if pooling on sockets) from a client.
todo: switch to asio...this will fit nicely with that.
@@ -26,11 +26,11 @@
#include "../pch.h"
#include "security.h"
-#include "namespace.h"
+#include "namespace-inl.h"
#include "lasterror.h"
#include "stats/top.h"
-namespace mongo {
+namespace mongo {
extern class ReplSet *theReplSet;
class AuthenticationInfo;
@@ -42,18 +42,83 @@ namespace mongo {
extern boost::thread_specific_ptr<Client> currentClient;
- class Client : boost::noncopyable {
+ typedef long long ConnectionId;
+
+ class Client : boost::noncopyable {
public:
+ class Context;
+
+ static mongo::mutex clientsMutex;
+ static set<Client*> clients; // always be in clientsMutex when manipulating this
+ static int recommendedYieldMicros( int * writers = 0 , int * readers = 0 );
+ static int getActiveClientCount( int& writers , int& readers );
+
static Client *syncThread;
- void iAmSyncThread() {
+
+
+ /* each thread which does db operations has a Client object in TLS.
+ call this when your thread starts.
+ */
+ static Client& initThread(const char *desc, MessagingPort *mp = 0);
+
+ /*
+ this has to be called as the client goes away, but before thread termination
+ @return true if anything was done
+ */
+ bool shutdown();
+
+
+ ~Client();
+
+ void iAmSyncThread() {
wassert( syncThread == 0 );
- syncThread = this;
+ syncThread = this;
}
bool isSyncThread() const { return this == syncThread; } // true if this client is the replication secondary pull thread
- static mongo::mutex clientsMutex;
- static set<Client*> clients; // always be in clientsMutex when manipulating this
- static int recommendedYieldMicros( int * writers = 0 , int * readers = 0 );
+
+ string clientAddress(bool includePort=false) const;
+ AuthenticationInfo * getAuthenticationInfo() { return &_ai; }
+ bool isAdmin() { return _ai.isAuthorized( "admin" ); }
+ CurOp* curop() const { return _curOp; }
+ Context* getContext() const { return _context; }
+ Database* database() const { return _context ? _context->db() : 0; }
+ const char *ns() const { return _context->ns(); }
+ const char *desc() const { return _desc; }
+ void setLastOp( ReplTime op ) { _lastOp = op; }
+ ReplTime getLastOp() const { return _lastOp; }
+
+ /* report what the last operation was. used by getlasterror */
+ void appendLastOp( BSONObjBuilder& b ) const;
+
+ bool isGod() const { return _god; } /* this is for map/reduce writes */
+ string toString() const;
+ void gotHandshake( const BSONObj& o );
+ BSONObj getRemoteID() const { return _remoteId; }
+ BSONObj getHandshake() const { return _handshake; }
+
+ MessagingPort * port() const { return _mp; }
+
+ ConnectionId getConnectionId() const { return _connectionId; }
+
+ private:
+ ConnectionId _connectionId; // > 0 for things "conn", 0 otherwise
+ CurOp * _curOp;
+ Context * _context;
+ bool _shutdown;
+ const char *_desc;
+ bool _god;
+ AuthenticationInfo _ai;
+ ReplTime _lastOp;
+ BSONObj _handshake;
+ BSONObj _remoteId;
+ MessagingPort * const _mp;
+
+ Client(const char *desc, MessagingPort *p = 0);
+
+ friend class CurOp;
+
+ public:
/* set _god=true temporarily, safely */
class GodScope {
@@ -63,201 +128,99 @@ namespace mongo {
~GodScope();
};
+
/* Set database we want to use, then, restores when we finish (are out of scope)
       Note this is also helpful if an exception happens, as the state is fixed up.
*/
- class Context : boost::noncopyable{
- Client * _client;
- Context * _oldContext;
-
- string _path;
- mongolock * _lock;
- bool _justCreated;
-
- string _ns;
- Database * _db;
-
+ class Context : boost::noncopyable {
+ public:
/**
- * at this point _client, _oldContext and _ns have to be set
- * _db should not have been touched
- * this will set _db and create if needed
- * will also set _client->_context to this
+ * this is the main constructor
+ * use this unless there is a good reason not to
*/
- void _finishInit( bool doauth=true);
-
- void _auth( int lockState = dbMutex.getState() );
- public:
- Context(const string& ns, string path=dbpath, mongolock * lock = 0 , bool doauth=true )
- : _client( currentClient.get() ) , _oldContext( _client->_context ) ,
- _path( path ) , _lock( lock ) ,
- _ns( ns ), _db(0){
- _finishInit( doauth );
- }
-
+ Context(const string& ns, string path=dbpath, mongolock * lock = 0 , bool doauth=true );
+
/* this version saves the context but doesn't yet set the new one: */
-
- Context()
- : _client( currentClient.get() ) , _oldContext( _client->_context ),
- _path( dbpath ) , _lock(0) , _justCreated(false), _db(0){
- _client->_context = this;
- clear();
- }
-
+ Context();
+
/**
* if you are doing this after allowing a write there could be a race condition
* if someone closes that db. this checks that the DB is still valid
*/
Context( string ns , Database * db, bool doauth=true );
-
+
~Context();
- Client* getClient() const { return _client; }
+ Client* getClient() const { return _client; }
Database* db() const { return _db; }
- const char * ns() const { return _ns.c_str(); }
+ const char * ns() const { return _ns.c_str(); }
+
+ /** @return if the db was created by this Context */
bool justCreated() const { return _justCreated; }
- bool equals( const string& ns , const string& path=dbpath ) const {
- return _ns == ns && _path == path;
- }
+ bool equals( const string& ns , const string& path=dbpath ) const { return _ns == ns && _path == path; }
- bool inDB( const string& db , const string& path=dbpath ) const {
- if ( _path != path )
- return false;
-
- if ( db == _ns )
- return true;
-
- string::size_type idx = _ns.find( db );
- if ( idx != 0 )
- return false;
-
- return _ns[db.size()] == '.';
- }
+ /**
+ * @return true iff the current Context is using db/path
+ */
+ bool inDB( const string& db , const string& path=dbpath ) const;
- void clear(){
- _ns = "";
- _db = 0;
- }
+ void clear() { _ns = ""; _db = 0; }
/**
* call before unlocking, so clear any non-thread safe state
*/
- void unlocked(){
- _db = 0;
- }
+ void unlocked() { _db = 0; }
/**
* call after going back into the lock, will re-establish non-thread safe stuff
*/
- void relocked(){
- _finishInit();
- }
+ void relocked() { _finishInit(); }
friend class CurOp;
- }; // class Client::Context
-
- private:
- void _dropns( const string& ns );
-
- CurOp * _curOp;
- Context * _context;
- bool _shutdown;
- set<string> _tempCollections;
- const char *_desc;
- bool _god;
- AuthenticationInfo _ai;
- ReplTime _lastOp;
- BSONObj _handshake;
- BSONObj _remoteId;
-
- public:
- MessagingPort * const _mp;
- string clientAddress() const;
- AuthenticationInfo * getAuthenticationInfo(){ return &_ai; }
- bool isAdmin() { return _ai.isAuthorized( "admin" ); }
- CurOp* curop() { return _curOp; }
- Context* getContext(){ return _context; }
- Database* database() { return _context ? _context->db() : 0; }
- const char *ns() const { return _context->ns(); }
- const char *desc() const { return _desc; }
-
- Client(const char *desc, MessagingPort *p = 0);
- ~Client();
+ private:
+ /**
+ * at this point _client, _oldContext and _ns have to be set
+ * _db should not have been touched
+ * this will set _db and create if needed
+ * will also set _client->_context to this
+ */
+ void _finishInit( bool doauth=true);
- void addTempCollection( const string& ns );
-
- void _invalidateDB(const string& db);
- static void invalidateDB(const string& db);
- static void invalidateNS( const string& ns );
+ void _auth( int lockState = dbMutex.getState() );
- void setLastOp( ReplTime op ) { _lastOp = op; }
- ReplTime getLastOp() const { return _lastOp; }
+ Client * _client;
+ Context * _oldContext;
- /* report what the last operation was. used by getlasterror */
- void appendLastOp( BSONObjBuilder& b ) {
- if( theReplSet ) {
- b.append("lastOp" , (long long) _lastOp);
- }
- else {
- OpTime lo(_lastOp);
- if ( ! lo.isNull() )
- b.appendTimestamp( "lastOp" , lo.asDate() );
- }
- }
+ string _path;
+ mongolock * _lock;
+ bool _justCreated;
- /* each thread which does db operations has a Client object in TLS.
- call this when your thread starts.
- */
- static Client& initThread(const char *desc, MessagingPort *mp = 0);
+ string _ns;
+ Database * _db;
- /*
- this has to be called as the client goes away, but before thread termination
- @return true if anything was done
- */
- bool shutdown();
-
- /* this is for map/reduce writes */
- bool isGod() const { return _god; }
+ }; // class Client::Context
- friend class CurOp;
- string toString() const;
- void gotHandshake( const BSONObj& o );
- BSONObj getRemoteID() const { return _remoteId; }
- BSONObj getHandshake() const { return _handshake; }
};
-
+
/** get the Client object for this thread. */
- inline Client& cc() {
+ inline Client& cc() {
Client * c = currentClient.get();
assert( c );
return *c;
}
- /* each thread which does db operations has a Client object in TLS.
- call this when your thread starts.
- */
- inline Client& Client::initThread(const char *desc, MessagingPort *mp) {
- setThreadName(desc);
- assert( currentClient.get() == 0 );
- Client *c = new Client(desc, mp);
- currentClient.reset(c);
- mongo::lastError.initThread();
- return *c;
- }
-
- inline Client::GodScope::GodScope(){
+ inline Client::GodScope::GodScope() {
_prev = cc()._god;
cc()._god = true;
}
- inline Client::GodScope::~GodScope(){
- cc()._god = _prev;
- }
+ inline Client::GodScope::~GodScope() { cc()._god = _prev; }
- /* this unlocks, does NOT upgrade. that works for our current usage */
- inline void mongolock::releaseAndWriteLock() {
+ /* this unlocks, does NOT upgrade. that works for our current usage */
+ inline void mongolock::releaseAndWriteLock() {
if( !_writelock ) {
#if BOOST_VERSION >= 103500
@@ -278,6 +241,6 @@ namespace mongo {
}
string sayClientState();
-
+
inline bool haveClient() { return currentClient.get() > 0; }
};
diff --git a/db/clientcursor.cpp b/db/clientcursor.cpp
index 23ef529..bc09457 100644
--- a/db/clientcursor.cpp
+++ b/db/clientcursor.cpp
@@ -32,18 +32,18 @@
namespace mongo {
- typedef multimap<DiskLoc, ClientCursor*> CCByLoc;
-
CCById ClientCursor::clientCursorsById;
boost::recursive_mutex ClientCursor::ccmutex;
long long ClientCursor::numberTimedOut = 0;
- /*static*/ void ClientCursor::assertNoCursors() {
+ void aboutToDeleteForSharding( const Database* db , const DiskLoc& dl ); // from s/d_logic.h
+
+ /*static*/ void ClientCursor::assertNoCursors() {
recursive_scoped_lock lock(ccmutex);
- if( clientCursorsById.size() ) {
+ if( clientCursorsById.size() ) {
log() << "ERROR clientcursors exist but should not at this point" << endl;
ClientCursor *cc = clientCursorsById.begin()->second;
- log() << "first one: " << cc->cursorid << ' ' << cc->ns << endl;
+ log() << "first one: " << cc->_cursorid << ' ' << cc->_ns << endl;
clientCursorsById.clear();
assert(false);
}
@@ -51,18 +51,19 @@ namespace mongo {
void ClientCursor::setLastLoc_inlock(DiskLoc L) {
+ assert( _pos != -2 ); // defensive - see ~ClientCursor
+
if ( L == _lastLoc )
return;
CCByLoc& bl = byLoc();
+
if ( !_lastLoc.isNull() ) {
- CCByLoc::iterator i = kv_find(bl, _lastLoc, this);
- if ( i != bl.end() )
- bl.erase(i);
+ bl.erase( ByLocKey( _lastLoc, _cursorid ) );
}
if ( !L.isNull() )
- bl.insert( make_pair(L, this) );
+ bl[ByLocKey(L,_cursorid)] = this;
_lastLoc = L;
}
@@ -74,8 +75,8 @@ namespace mongo {
/* todo: this implementation is incomplete. we use it as a prefix for dropDatabase, which
works fine as the prefix will end with '.'. however, when used with drop and
- dropIndexes, this could take out cursors that belong to something else -- if you
- drop "foo", currently, this will kill cursors for "foobar".
+ dropIndexes, this could take out cursors that belong to something else -- if you
+ drop "foo", currently, this will kill cursors for "foobar".
*/
void ClientCursor::invalidate(const char *nsPrefix) {
vector<ClientCursor*> toDelete;
@@ -84,6 +85,7 @@ namespace mongo {
assert( len > 0 && strchr(nsPrefix, '.') );
{
+ //cout << "\nTEMP invalidate " << nsPrefix << endl;
recursive_scoped_lock lock(ccmutex);
Database *db = cc().database();
@@ -92,18 +94,18 @@ namespace mongo {
for( CCById::iterator i = clientCursorsById.begin(); i != clientCursorsById.end(); ++i ) {
ClientCursor *cc = i->second;
- if( cc->_db != db )
+ if( cc->_db != db )
continue;
- if ( strncmp(nsPrefix, cc->ns.c_str(), len) == 0 ) {
+ if ( strncmp(nsPrefix, cc->_ns.c_str(), len) == 0 ) {
toDelete.push_back(i->second);
}
}
/*
note : we can't iterate byloc because clientcursors may exist with a loc of null in which case
- they are not in the map. perhaps they should not exist though in the future? something to
+ they are not in the map. perhaps they should not exist though in the future? something to
change???
-
+
CCByLoc& bl = db->ccByLoc;
for ( CCByLoc::iterator i = bl.begin(); i != bl.end(); ++i ) {
ClientCursor *cc = i->second;
@@ -115,10 +117,16 @@ namespace mongo {
for ( vector<ClientCursor*>::iterator i = toDelete.begin(); i != toDelete.end(); ++i )
delete (*i);
+
+ /*cout << "TEMP after invalidate " << endl;
+ for( auto i = clientCursorsById.begin(); i != clientCursorsById.end(); ++i ) {
+ cout << " " << i->second->ns << endl;
+ }
+ cout << "TEMP after invalidate done" << endl;*/
}
}
- bool ClientCursor::shouldTimeout( unsigned millis ){
+ bool ClientCursor::shouldTimeout( unsigned millis ) {
_idleAgeMillis += millis;
return _idleAgeMillis > 600000 && _pinValue == 0;
}
@@ -130,9 +138,9 @@ namespace mongo {
for ( CCById::iterator i = clientCursorsById.begin(); i != clientCursorsById.end(); ) {
CCById::iterator j = i;
i++;
- if( j->second->shouldTimeout( millis ) ){
+ if( j->second->shouldTimeout( millis ) ) {
numberTimedOut++;
- log(1) << "killing old cursor " << j->second->cursorid << ' ' << j->second->ns
+ log(1) << "killing old cursor " << j->second->_cursorid << ' ' << j->second->_ns
<< " idle:" << j->second->idleTime() << "ms\n";
delete j->second;
}
@@ -150,10 +158,10 @@ namespace mongo {
log() << "perf warning: byLoc.size=" << bl.size() << " in aboutToDeleteBucket\n";
}
for ( CCByLoc::iterator i = bl.begin(); i != bl.end(); i++ )
- i->second->c->aboutToDeleteBucket(b);
+ i->second->_c->aboutToDeleteBucket(b);
}
void aboutToDeleteBucket(const DiskLoc& b) {
- ClientCursor::informAboutToDeleteBucket(b);
+ ClientCursor::informAboutToDeleteBucket(b);
}
/* must call this on a delete so we clean up the cursors. */
@@ -162,9 +170,12 @@ namespace mongo {
Database *db = cc().database();
assert(db);
+
+ aboutToDeleteForSharding( db , dl );
+
CCByLoc& bl = db->ccByLoc;
- CCByLoc::iterator j = bl.lower_bound(dl);
- CCByLoc::iterator stop = bl.upper_bound(dl);
+ CCByLoc::iterator j = bl.lower_bound(ByLocKey::min(dl));
+ CCByLoc::iterator stop = bl.upper_bound(ByLocKey::max(dl));
if ( j == stop )
return;
@@ -172,26 +183,45 @@ namespace mongo {
while ( 1 ) {
toAdvance.push_back(j->second);
- DEV assert( j->first == dl );
+ DEV assert( j->first.loc == dl );
++j;
if ( j == stop )
break;
}
- wassert( toAdvance.size() < 5000 );
-
- for ( vector<ClientCursor*>::iterator i = toAdvance.begin(); i != toAdvance.end(); ++i ){
+ if( toAdvance.size() >= 3000 ) {
+ log() << "perf warning MPW101: " << toAdvance.size() << " cursors for one diskloc "
+ << dl.toString()
+ << ' ' << toAdvance[1000]->_ns
+ << ' ' << toAdvance[2000]->_ns
+ << ' ' << toAdvance[1000]->_pinValue
+ << ' ' << toAdvance[2000]->_pinValue
+ << ' ' << toAdvance[1000]->_pos
+ << ' ' << toAdvance[2000]->_pos
+ << ' ' << toAdvance[1000]->_idleAgeMillis
+ << ' ' << toAdvance[2000]->_idleAgeMillis
+ << ' ' << toAdvance[1000]->_doingDeletes
+ << ' ' << toAdvance[2000]->_doingDeletes
+ << endl;
+ //wassert( toAdvance.size() < 5000 );
+ }
+
+ for ( vector<ClientCursor*>::iterator i = toAdvance.begin(); i != toAdvance.end(); ++i ) {
ClientCursor* cc = *i;
wassert(cc->_db == db);
-
+
if ( cc->_doingDeletes ) continue;
- Cursor *c = cc->c.get();
- if ( c->capped() ){
+ Cursor *c = cc->_c.get();
+ if ( c->capped() ) {
+ /* note we cannot advance here. if this condition occurs, writes to the oplog
+                   have "caught" the reader.  skipping ahead, the reader would miss potentially
+ important data.
+ */
delete cc;
continue;
}
-
+
c->checkLocation();
DiskLoc tmp1 = c->refLoc();
if ( tmp1 != dl ) {
@@ -213,53 +243,131 @@ namespace mongo {
}
void aboutToDelete(const DiskLoc& dl) { ClientCursor::aboutToDelete(dl); }
+ ClientCursor::ClientCursor(int queryOptions, const shared_ptr<Cursor>& c, const string& ns, BSONObj query ) :
+ _ns(ns), _db( cc().database() ),
+ _c(c), _pos(0),
+ _query(query), _queryOptions(queryOptions),
+ _idleAgeMillis(0), _pinValue(0),
+ _doingDeletes(false), _yieldSometimesTracker(128,10) {
+ assert( _db );
+ assert( str::startsWith(_ns, _db->name) );
+ if( queryOptions & QueryOption_NoCursorTimeout )
+ noTimeout();
+ recursive_scoped_lock lock(ccmutex);
+ _cursorid = allocCursorId_inlock();
+ clientCursorsById.insert( make_pair(_cursorid, this) );
+
+ if ( ! _c->modifiedKeys() ) {
+ // store index information so we can decide if we can
+ // get something out of the index key rather than full object
+
+ int x = 0;
+ BSONObjIterator i( _c->indexKeyPattern() );
+ while ( i.more() ) {
+ BSONElement e = i.next();
+ if ( e.isNumber() ) {
+ // only want basic index fields, not "2d" etc
+ _indexedFields[e.fieldName()] = x;
+ }
+ x++;
+ }
+ }
+
+ }
+
+
ClientCursor::~ClientCursor() {
- assert( pos != -2 );
+ assert( _pos != -2 );
{
recursive_scoped_lock lock(ccmutex);
setLastLoc_inlock( DiskLoc() ); // removes us from bylocation multimap
- clientCursorsById.erase(cursorid);
+ clientCursorsById.erase(_cursorid);
// defensive:
- (CursorId&) cursorid = -1;
- pos = -2;
+ (CursorId&)_cursorid = -1;
+ _pos = -2;
+ }
+ }
+
+ bool ClientCursor::getFieldsDotted( const string& name, BSONElementSet &ret ) {
+
+ map<string,int>::const_iterator i = _indexedFields.find( name );
+ if ( i == _indexedFields.end() ) {
+ current().getFieldsDotted( name , ret );
+ return false;
+ }
+
+ int x = i->second;
+
+ BSONObjIterator it( currKey() );
+ while ( x && it.more() ) {
+ it.next();
+ x--;
}
+ assert( x == 0 );
+ ret.insert( it.next() );
+ return true;
+ }
+
+ BSONElement ClientCursor::getFieldDotted( const string& name , bool * fromKey ) {
+
+ map<string,int>::const_iterator i = _indexedFields.find( name );
+ if ( i == _indexedFields.end() ) {
+ if ( fromKey )
+ *fromKey = false;
+ return current().getFieldDotted( name );
+ }
+
+ int x = i->second;
+
+ BSONObjIterator it( currKey() );
+ while ( x && it.more() ) {
+ it.next();
+ x--;
+ }
+ assert( x == 0 );
+
+ if ( fromKey )
+ *fromKey = true;
+ return it.next();
}
+
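The covered-field lookups added above work by recording each plain field's position in the index key pattern at construction time and, on read, walking the current key to that position. A container-level sketch of the same idea (plain STL types standing in for BSON):

    #include <cassert>
    #include <map>
    #include <string>
    #include <vector>

    // Build field -> position from an index key pattern such as {a:1, b:1}.
    std::map<std::string, int> indexedFields(const std::vector<std::string>& pattern) {
        std::map<std::string, int> m;
        for (std::size_t i = 0; i < pattern.size(); ++i)
            m[pattern[i]] = (int)i;
        return m;
    }

    // Fetch a field from the current index key when possible; 'fromKey' reports
    // whether the key (rather than the full document) supplied the value.
    int getFieldSketch(const std::map<std::string, int>& fields,
                       const std::vector<int>& currKey,
                       const std::map<std::string, int>& fullDoc,
                       const std::string& name, bool* fromKey) {
        std::map<std::string, int>::const_iterator it = fields.find(name);
        if (it == fields.end()) {
            if (fromKey) *fromKey = false;
            return fullDoc.at(name);              // fall back to the full document
        }
        if (fromKey) *fromKey = true;
        return currKey[it->second];               // value taken straight from the key
    }

    int main() {
        std::map<std::string, int> fields = indexedFields({"a", "b"});
        std::vector<int> key = {7, 9};                        // current index key values
        std::map<std::string, int> doc = {{"a", 7}, {"b", 9}, {"c", 3}};
        bool fromKey = false;
        assert(getFieldSketch(fields, key, doc, "b", &fromKey) == 9 && fromKey);
        assert(getFieldSketch(fields, key, doc, "c", &fromKey) == 3 && !fromKey);
        return 0;
    }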
/* call when cursor's location changes so that we can update the
cursorsbylocation map. if you are locked and internally iterating, only
need to call when you are ready to "unlock".
*/
void ClientCursor::updateLocation() {
- assert( cursorid );
+ assert( _cursorid );
_idleAgeMillis = 0;
- DiskLoc cl = c->refLoc();
+ DiskLoc cl = _c->refLoc();
if ( lastLoc() == cl ) {
//log() << "info: lastloc==curloc " << ns << '\n';
- } else {
+ }
+ else {
recursive_scoped_lock lock(ccmutex);
setLastLoc_inlock(cl);
}
// may be necessary for MultiCursor even when cl hasn't changed
- c->noteLocation();
+ _c->noteLocation();
}
-
+
int ClientCursor::yieldSuggest() {
int writers = 0;
int readers = 0;
-
+
int micros = Client::recommendedYieldMicros( &writers , &readers );
-
- if ( micros > 0 && writers == 0 && dbMutex.getState() <= 0 ){
+
+ if ( micros > 0 && writers == 0 && dbMutex.getState() <= 0 ) {
// we have a read lock, and only reads are coming on, so why bother unlocking
micros = 0;
}
-
+
return micros;
}
-
- bool ClientCursor::yieldSometimes(){
+
+ bool ClientCursor::yieldSometimes() {
if ( ! _yieldSometimesTracker.ping() )
return true;
@@ -267,82 +375,83 @@ namespace mongo {
return ( micros > 0 ) ? yield( micros ) : true;
}
- void ClientCursor::staticYield( int micros ) {
+ void ClientCursor::staticYield( int micros , const StringData& ns ) {
+ killCurrentOp.checkForInterrupt( false );
{
dbtempreleasecond unlock;
- if ( unlock.unlocked() ){
+ if ( unlock.unlocked() ) {
if ( micros == -1 )
micros = Client::recommendedYieldMicros();
if ( micros > 0 )
- sleepmicros( micros );
+ sleepmicros( micros );
}
else {
- log( LL_WARNING ) << "ClientCursor::yield can't unlock b/c of recursive lock" << endl;
+ warning() << "ClientCursor::yield can't unlock b/c of recursive lock ns: " << ns << endl;
}
- }
+ }
}
-
+
bool ClientCursor::prepareToYield( YieldData &data ) {
- if ( ! c->supportYields() )
+ if ( ! _c->supportYields() )
return false;
// need to store in case 'this' gets deleted
- data._id = cursorid;
-
+ data._id = _cursorid;
+
data._doingDeletes = _doingDeletes;
_doingDeletes = false;
-
+
updateLocation();
-
+
{
- /* a quick test that our temprelease is safe.
- todo: make a YieldingCursor class
+ /* a quick test that our temprelease is safe.
+ todo: make a YieldingCursor class
and then make the following code part of a unit test.
*/
const int test = 0;
static bool inEmpty = false;
- if( test && !inEmpty ) {
+ if( test && !inEmpty ) {
inEmpty = true;
log() << "TEST: manipulate collection during cc:yield" << endl;
- if( test == 1 )
- Helpers::emptyCollection(ns.c_str());
+ if( test == 1 )
+ Helpers::emptyCollection(_ns.c_str());
else if( test == 2 ) {
BSONObjBuilder b; string m;
- dropCollection(ns.c_str(), m, b);
+ dropCollection(_ns.c_str(), m, b);
}
- else {
- dropDatabase(ns.c_str());
+ else {
+ dropDatabase(_ns.c_str());
}
}
- }
+ }
return true;
}
-
+
bool ClientCursor::recoverFromYield( const YieldData &data ) {
ClientCursor *cc = ClientCursor::find( data._id , false );
- if ( cc == 0 ){
+ if ( cc == 0 ) {
// id was deleted
return false;
}
-
+
cc->_doingDeletes = data._doingDeletes;
- cc->c->checkLocation();
- return true;
+ cc->_c->checkLocation();
+ return true;
}
-
+
bool ClientCursor::yield( int micros ) {
- if ( ! c->supportYields() )
+ if ( ! _c->supportYields() )
return true;
- YieldData data;
+ YieldData data;
prepareToYield( data );
-
- staticYield( micros );
+
+ staticYield( micros , _ns );
return ClientCursor::recoverFromYield( data );
}
int ctmLast = 0; // so we don't have to do find() which is a little slow very often.
long long ClientCursor::allocCursorId_inlock() {
- if( 0 ) {
+ if( 0 ) {
static long long z;
++z;
cout << "TEMP alloccursorid " << z << endl;
@@ -362,32 +471,32 @@ namespace mongo {
return x;
}
- void ClientCursor::storeOpForSlave( DiskLoc last ){
+ void ClientCursor::storeOpForSlave( DiskLoc last ) {
if ( ! ( _queryOptions & QueryOption_OplogReplay ))
return;
if ( last.isNull() )
return;
-
+
BSONElement e = last.obj()["ts"];
if ( e.type() == Date || e.type() == Timestamp )
_slaveReadTill = e._opTime();
}
-
- void ClientCursor::updateSlaveLocation( CurOp& curop ){
+
+ void ClientCursor::updateSlaveLocation( CurOp& curop ) {
if ( _slaveReadTill.isNull() )
return;
- mongo::updateSlaveLocation( curop , ns.c_str() , _slaveReadTill );
+ mongo::updateSlaveLocation( curop , _ns.c_str() , _slaveReadTill );
}
- void ClientCursor::appendStats( BSONObjBuilder& result ){
+ void ClientCursor::appendStats( BSONObjBuilder& result ) {
recursive_scoped_lock lock(ccmutex);
- result.appendNumber("totalOpen", (int)clientCursorsById.size() );
+ result.appendNumber("totalOpen", clientCursorsById.size() );
result.appendNumber("clientCursors_size", (int) numCursors());
- result.appendNumber("timedOut" , (int)numberTimedOut);
+ result.appendNumber("timedOut" , numberTimedOut);
}
-
+
// QUESTION: Restrict to the namespace from which this command was issued?
// Alternatively, make this command admin-only?
class CmdCursorInfo : public Command {
@@ -398,19 +507,19 @@ namespace mongo {
help << " example: { cursorInfo : 1 }";
}
virtual LockType locktype() const { return NONE; }
- bool run(const string& dbname, BSONObj& jsobj, string& errmsg, BSONObjBuilder& result, bool fromRepl ){
+ bool run(const string& dbname, BSONObj& jsobj, string& errmsg, BSONObjBuilder& result, bool fromRepl ) {
ClientCursor::appendStats( result );
return true;
}
} cmdCursorInfo;
-
- void ClientCursorMonitor::run(){
+
+ void ClientCursorMonitor::run() {
Client::initThread("clientcursormon");
Client& client = cc();
-
+
unsigned old = curTimeMillis();
- while ( ! inShutdown() ){
+ while ( ! inShutdown() ) {
unsigned now = curTimeMillis();
ClientCursor::idleTimeReport( now - old );
old = now;
@@ -420,15 +529,28 @@ namespace mongo {
client.shutdown();
}
- void ClientCursor::find( const string& ns , set<CursorId>& all ){
+ void ClientCursor::find( const string& ns , set<CursorId>& all ) {
recursive_scoped_lock lock(ccmutex);
-
- for ( CCById::iterator i=clientCursorsById.begin(); i!=clientCursorsById.end(); ++i ){
- if ( i->second->ns == ns )
+
+ for ( CCById::iterator i=clientCursorsById.begin(); i!=clientCursorsById.end(); ++i ) {
+ if ( i->second->_ns == ns )
all.insert( i->first );
}
}
+ int ClientCursor::erase(int n, long long *ids) {
+ int found = 0;
+ for ( int i = 0; i < n; i++ ) {
+ if ( erase(ids[i]) )
+ found++;
+
+ if ( inShutdown() )
+ break;
+ }
+ return found;
+
+ }
+
ClientCursorMonitor clientCursorMonitor;
diff --git a/db/clientcursor.h b/db/clientcursor.h
index b895c17..f1d107f 100644
--- a/db/clientcursor.h
+++ b/db/clientcursor.h
@@ -33,6 +33,7 @@
#include "dbhelpers.h"
#include "matcher.h"
#include "../client/dbclient.h"
+#include "projection.h"
namespace mongo {
@@ -41,31 +42,35 @@ namespace mongo {
class ClientCursor;
class ParsedQuery;
+ struct ByLocKey {
+
+ ByLocKey( const DiskLoc & l , const CursorId& i ) : loc(l), id(i) {}
+
+ static ByLocKey min( const DiskLoc& l ) { return ByLocKey( l , numeric_limits<long long>::min() ); }
+ static ByLocKey max( const DiskLoc& l ) { return ByLocKey( l , numeric_limits<long long>::max() ); }
+
+ bool operator<( const ByLocKey &other ) const {
+ int x = loc.compare( other.loc );
+ if ( x )
+ return x < 0;
+ return id < other.id;
+ }
+
+ DiskLoc loc;
+ CursorId id;
+
+ };
+
/* todo: make this map be per connection. this will prevent cursor hijacking security attacks perhaps.
+ * ERH: 9/2010 this may not work since some drivers send getMore over a different connection
*/
typedef map<CursorId, ClientCursor*> CCById;
+ typedef map<ByLocKey, ClientCursor*> CCByLoc;
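Switching CCByLoc from a multimap keyed by DiskLoc to a map keyed by (loc, cursorid) lets callers find every cursor parked on one disk location with a lower_bound/upper_bound range, as aboutToDelete() now does. A self-contained sketch of that idiom (toy key type, long in place of DiskLoc):

    #include <cassert>
    #include <limits>
    #include <map>
    #include <string>

    // Composite key: (location, cursor id), ordered first by location then by id.
    struct ByLocKeySketch {
        long loc;
        long long id;
        bool operator<(const ByLocKeySketch& o) const {
            if (loc != o.loc) return loc < o.loc;
            return id < o.id;
        }
        static ByLocKeySketch first(long l) { return {l, std::numeric_limits<long long>::min()}; }
        static ByLocKeySketch last(long l)  { return {l, std::numeric_limits<long long>::max()}; }
    };

    int main() {
        std::map<ByLocKeySketch, std::string> byLoc;
        byLoc[{10, 1}] = "cursorA";
        byLoc[{10, 2}] = "cursorB";
        byLoc[{20, 3}] = "cursorC";

        // All cursors positioned at location 10, regardless of cursor id:
        auto begin = byLoc.lower_bound(ByLocKeySketch::first(10));
        auto end   = byLoc.upper_bound(ByLocKeySketch::last(10));
        int n = 0;
        for (auto it = begin; it != end; ++it) ++n;
        assert(n == 2);
        return 0;
    }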
extern BSONObj id_obj;
class ClientCursor {
friend class CmdCursorInfo;
- DiskLoc _lastLoc; // use getter and setter not this (important)
- unsigned _idleAgeMillis; // how long has the cursor been around, relative to server idle time
-
- /* 0 = normal
- 1 = no timeout allowed
- 100 = in use (pinned) -- see Pointer class
- */
- unsigned _pinValue;
-
- bool _doingDeletes;
- ElapsedTracker _yieldSometimesTracker;
-
- static CCById clientCursorsById;
- static long long numberTimedOut;
- static boost::recursive_mutex ccmutex; // must use this for all statics above!
- static CursorId allocCursorId_inlock();
-
public:
static void assertNoCursors();
@@ -75,32 +80,38 @@ namespace mongo {
at the same time - which might be bad. That should never happen, but if a client driver
had a bug, it could (or perhaps some sort of attack situation).
*/
- class Pointer : boost::noncopyable {
- public:
+ class Pointer : boost::noncopyable {
ClientCursor *_c;
+ public:
+ ClientCursor * c() { return _c; }
void release() {
if( _c ) {
assert( _c->_pinValue >= 100 );
_c->_pinValue -= 100;
+ _c = 0;
}
+ }
+ /**
+ * call this if during a yield, the cursor got deleted
+             * if so, we don't want to use the pointer address
+ */
+ void deleted() {
_c = 0;
}
+ ~Pointer() { release(); }
Pointer(long long cursorid) {
recursive_scoped_lock lock(ccmutex);
_c = ClientCursor::find_inlock(cursorid, true);
if( _c ) {
if( _c->_pinValue >= 100 ) {
_c = 0;
- uassert(12051, "clientcursor already in use? driver problem?", false);
+ uasserted(12051, "clientcursor already in use? driver problem?");
}
_c->_pinValue += 100;
}
}
- ~Pointer() {
- release();
- }
- };
-
+ };
+
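The pin-value convention (0 = normal, 1 = no timeout allowed, +100 while a Pointer holds the cursor) drives both the hijack check and the idle-timeout test seen earlier; a compact illustration (toy types):

    #include <cassert>

    // Pin-value sketch: 0 = normal, 1 = never time out, >=100 = pinned by a Pointer.
    struct ToyCursor {
        unsigned pinValue = 0;
        unsigned idleMillis = 0;
        bool shouldTimeout(unsigned millis) {
            idleMillis += millis;
            return idleMillis > 600000 && pinValue == 0;   // only unpinned cursors expire
        }
    };

    struct ToyPointer {              // RAII pin, analogous to ClientCursor::Pointer
        ToyCursor* c;
        explicit ToyPointer(ToyCursor* cur) : c(cur) {
            assert(c->pinValue < 100 && "cursor already in use? driver problem?");
            c->pinValue += 100;
        }
        ~ToyPointer() { c->pinValue -= 100; }
    };

    int main() {
        ToyCursor cur;
        {
            ToyPointer p(&cur);
            assert(!cur.shouldTimeout(700000));   // pinned: never times out
        }
        assert(cur.shouldTimeout(1));             // unpinned and idle long enough
        return 0;
    }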
// This object assures safe and reliable cleanup of the ClientCursor.
// The implementation assumes that there will be no duplicate ids among cursors
// (which is assured if cursors must last longer than 1 second).
@@ -108,19 +119,17 @@ namespace mongo {
public:
CleanupPointer() : _c( 0 ), _id( -1 ) {}
void reset( ClientCursor *c = 0 ) {
- if ( c == _c ) {
+ if ( c == _c )
return;
- }
-
if ( _c ) {
// be careful in case cursor was deleted by someone else
ClientCursor::erase( _id );
}
-
if ( c ) {
_c = c;
- _id = c->cursorid;
- } else {
+ _id = c->_cursorid;
+ }
+ else {
_c = 0;
_id = -1;
}
@@ -135,40 +144,19 @@ namespace mongo {
CursorId _id;
};
- /*const*/ CursorId cursorid;
- const string ns;
- const shared_ptr<Cursor> c;
- int pos; // # objects into the cursor so far
- const BSONObj query; // used for logging diags only; optional in constructor
- const int _queryOptions; // see enum QueryOptions dbclient.h
- OpTime _slaveReadTill;
- Database * const _db;
-
- ClientCursor(int queryOptions, shared_ptr<Cursor>& _c, const string& _ns, BSONObj _query = BSONObj()) :
- _idleAgeMillis(0), _pinValue(0),
- _doingDeletes(false), _yieldSometimesTracker(128,10),
- ns(_ns), c(_c),
- pos(0), query(_query),
- _queryOptions(queryOptions),
- _db( cc().database() )
- {
- assert( _db );
- assert( str::startsWith(_ns, _db->name) );
- if( queryOptions & QueryOption_NoCursorTimeout )
- noTimeout();
- recursive_scoped_lock lock(ccmutex);
- cursorid = allocCursorId_inlock();
- clientCursorsById.insert( make_pair(cursorid, this) );
- }
+ ClientCursor(int queryOptions, const shared_ptr<Cursor>& c, const string& ns, BSONObj query = BSONObj() );
+
~ClientCursor();
- DiskLoc lastLoc() const {
- return _lastLoc;
- }
+ // *************** basic accessors *******************
- shared_ptr< ParsedQuery > pq;
- shared_ptr< FieldMatcher > fields; // which fields query wants returned
- Message originalMessage; // this is effectively an auto ptr for data the matcher points to
+ CursorId cursorid() const { return _cursorid; }
+ string ns() const { return _ns; }
+ Database * db() const { return _db; }
+ const BSONObj& query() const { return _query; }
+ int queryOptions() const { return _queryOptions; }
+
+ DiskLoc lastLoc() const { return _lastLoc; }
/* Get rid of cursors for namespaces that begin with nsprefix.
Used by drop, dropIndexes, dropDatabase.
@@ -176,14 +164,14 @@ namespace mongo {
static void invalidate(const char *nsPrefix);
/**
- * @param microsToSleep -1 : ask client
+ * @param microsToSleep -1 : ask client
* >=0 : sleep for that amount
- * do a dbtemprelease
- * note: caller should check matcher.docMatcher().atomic() first and not yield if atomic -
+ * do a dbtemprelease
+ * note: caller should check matcher.docMatcher().atomic() first and not yield if atomic -
* we don't do herein as this->matcher (above) is only initialized for true queries/getmore.
* (ie not set for remote/update)
- * @return if the cursor is still valid.
- * if false is returned, then this ClientCursor should be considered deleted -
+ * @return if the cursor is still valid.
+ * if false is returned, then this ClientCursor should be considered deleted -
* in fact, the whole database could be gone.
*/
bool yield( int microsToSleep = -1 );
@@ -192,72 +180,82 @@ namespace mongo {
* @return same as yield()
*/
bool yieldSometimes();
-
+
static int yieldSuggest();
- static void staticYield( int micros );
-
+ static void staticYield( int micros , const StringData& ns );
+
struct YieldData { CursorId _id; bool _doingDeletes; };
bool prepareToYield( YieldData &data );
static bool recoverFromYield( const YieldData &data );
struct YieldLock : boost::noncopyable {
explicit YieldLock( ptr<ClientCursor> cc )
- : _canYield(cc->c->supportYields()) {
- if ( _canYield ){
+ : _canYield(cc->_c->supportYields()) {
+ if ( _canYield ) {
cc->prepareToYield( _data );
_unlock.reset(new dbtempreleasecond());
}
}
- ~YieldLock(){
- if ( _unlock ){
+ ~YieldLock() {
+ if ( _unlock ) {
log( LL_WARNING ) << "ClientCursor::YieldLock not closed properly" << endl;
relock();
}
}
-
- bool stillOk(){
+ bool stillOk() {
if ( ! _canYield )
return true;
-
relock();
-
return ClientCursor::recoverFromYield( _data );
}
-
- void relock(){
+ void relock() {
_unlock.reset();
}
-
private:
- bool _canYield;
+ const bool _canYield;
YieldData _data;
-
scoped_ptr<dbtempreleasecond> _unlock;
-
};
// --- some pass through helpers for Cursor ---
- BSONObj indexKeyPattern() {
- return c->indexKeyPattern();
- }
+ Cursor* c() const { return _c.get(); }
+ int pos() const { return _pos; }
- bool ok(){
- return c->ok();
- }
+ void incPos( int n ) { _pos += n; } // TODO: this is bad
+ void setPos( int n ) { _pos = n; } // TODO : this is bad too
- bool advance(){
- return c->advance();
- }
+ BSONObj indexKeyPattern() { return _c->indexKeyPattern(); }
+ bool modifiedKeys() const { return _c->modifiedKeys(); }
+ bool isMultiKey() const { return _c->isMultiKey(); }
- bool currentMatches(){
- if ( ! c->matcher() )
- return true;
- return c->matcher()->matchesCurrent( c.get() );
- }
+ bool ok() { return _c->ok(); }
+ bool advance() { return _c->advance(); }
+ BSONObj current() { return _c->current(); }
+ DiskLoc currLoc() { return _c->currLoc(); }
+ BSONObj currKey() const { return _c->currKey(); }
+
+
+ /**
+ * same as BSONObj::getFieldsDotted
+ * if it can be retrieved from key, it is
+ * @return if this was retrieved from key
+ */
+ bool getFieldsDotted( const string& name, BSONElementSet &ret );
+
+ /**
+ * same as BSONObj::getFieldDotted
+ * if it can be retrieved from key, it is
+ * @return if this was retrieved from key
+ */
+ BSONElement getFieldDotted( const string& name , bool * fromKey = 0 );
+
+ bool currentIsDup() { return _c->getsetdup( _c->currLoc() ); }
- BSONObj current(){
- return c->current();
+ bool currentMatches() {
+ if ( ! _c->matcher() )
+ return true;
+ return _c->matcher()->matchesCurrent( _c.get() );
}
private:
@@ -273,12 +271,12 @@ namespace mongo {
return it->second;
}
public:
- static ClientCursor* find(CursorId id, bool warn = true) {
+ static ClientCursor* find(CursorId id, bool warn = true) {
recursive_scoped_lock lock(ccmutex);
ClientCursor *c = find_inlock(id, warn);
- // if this asserts, your code was not thread safe - you either need to set no timeout
- // for the cursor or keep a ClientCursor::Pointer in scope for it.
- massert( 12521, "internal error: use of an unlocked ClientCursor", c == 0 || c->_pinValue );
+ // if this asserts, your code was not thread safe - you either need to set no timeout
+ // for the cursor or keep a ClientCursor::Pointer in scope for it.
+ massert( 12521, "internal error: use of an unlocked ClientCursor", c == 0 || c->_pinValue );
return c;
}
@@ -293,6 +291,11 @@ namespace mongo {
return false;
}
+ /**
+ * @return number of cursors found
+ */
+ static int erase( int n , long long * ids );
+
/* call when cursor's location changes so that we can update the
cursorsbylocation map. if you are locked and internally iterating, only
need to call when you are ready to "unlock".
@@ -314,43 +317,82 @@ namespace mongo {
void storeOpForSlave( DiskLoc last );
void updateSlaveLocation( CurOp& curop );
-
- unsigned idleTime(){
- return _idleAgeMillis;
- }
+
+ unsigned idleTime() const { return _idleAgeMillis; }
+
+ void setDoingDeletes( bool doingDeletes ) {_doingDeletes = doingDeletes; }
+
+ void slaveReadTill( const OpTime& t ) { _slaveReadTill = t; }
+
+ public: // static methods
static void idleTimeReport(unsigned millis);
-private:
- // cursors normally timeout after an inactivy period to prevent excess memory use
- // setting this prevents timeout of the cursor in question.
- void noTimeout() {
- _pinValue++;
- }
- multimap<DiskLoc, ClientCursor*>& byLoc() {
- return _db->ccByLoc;
- }
-public:
- void setDoingDeletes( bool doingDeletes ){
- _doingDeletes = doingDeletes;
- }
-
static void appendStats( BSONObjBuilder& result );
-
static unsigned numCursors() { return clientCursorsById.size(); }
-
static void informAboutToDeleteBucket(const DiskLoc& b);
static void aboutToDelete(const DiskLoc& dl);
-
static void find( const string& ns , set<CursorId>& all );
+
+
+ private: // methods
+
+        // cursors normally time out after an inactivity period to prevent excess memory use
+ // setting this prevents timeout of the cursor in question.
+ void noTimeout() { _pinValue++; }
+
+ CCByLoc& byLoc() { return _db->ccByLoc; }
+
+ private:
+
+ CursorId _cursorid;
+
+ const string _ns;
+ Database * _db;
+
+ const shared_ptr<Cursor> _c;
+ map<string,int> _indexedFields; // map from indexed field to offset in key object
+ int _pos; // # objects into the cursor so far
+
+ const BSONObj _query; // used for logging diags only; optional in constructor
+ int _queryOptions; // see enum QueryOptions dbclient.h
+
+ OpTime _slaveReadTill;
+
+ DiskLoc _lastLoc; // use getter and setter not this (important)
+ unsigned _idleAgeMillis; // how long has the cursor been around, relative to server idle time
+
+ /* 0 = normal
+ 1 = no timeout allowed
+ 100 = in use (pinned) -- see Pointer class
+ */
+ unsigned _pinValue;
+
+ bool _doingDeletes;
+ ElapsedTracker _yieldSometimesTracker;
+
+ public:
+ shared_ptr<ParsedQuery> pq;
+ shared_ptr<Projection> fields; // which fields query wants returned
+ Message originalMessage; // this is effectively an auto ptr for data the matcher points to
+
+
+
+ private: // static members
+
+ static CCById clientCursorsById;
+ static long long numberTimedOut;
+ static boost::recursive_mutex ccmutex; // must use this for all statics above!
+ static CursorId allocCursorId_inlock();
+
};
class ClientCursorMonitor : public BackgroundJob {
public:
+ string name() const { return "ClientCursorMonitor"; }
void run();
- string name() { return "ClientCursorMonitor"; }
};
extern ClientCursorMonitor clientCursorMonitor;
-
+
} // namespace mongo
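
The refactor above hides ClientCursor's data members behind accessors and pass-through helpers (c(), pos(), current(), currentMatches(), getFieldsDotted(), ...). A minimal sketch of how a command-style collection scan drives the new interface, assuming this tree's internal headers and that the caller already holds a read lock and Client::Context; the helper name below is illustrative, not part of the patch:

    // sketch only -- not part of the patch
    #include "db/clientcursor.h"
    #include "db/queryoptimizer.h"

    namespace mongo {
        void scanCollectionExample( const string& ns ) {
            shared_ptr<Cursor> cursor = bestGuessCursor( ns.c_str() , BSONObj() , BSONObj() );
            scoped_ptr<ClientCursor> cc( new ClientCursor( QueryOption_NoCursorTimeout , cursor , ns ) );
            while ( cc->ok() ) {
                if ( cc->currentMatches() ) {
                    BSONObj obj = cc->current();   // pass-through to the underlying Cursor
                    // ... use obj ...
                }
                cc->advance();
                // give up the read lock now and then; a false return means this
                // ClientCursor (or even the whole database) may be gone
                if ( ! cc->yieldSometimes() )
                    break;
            }
        }
    }
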
diff --git a/db/cloner.cpp b/db/cloner.cpp
index 9177a00..fe57463 100644
--- a/db/cloner.cpp
+++ b/db/cloner.cpp
@@ -31,7 +31,7 @@ namespace mongo {
void ensureHaveIdIndex(const char *ns);
- bool replAuthenticate(DBClientConnection *);
+ bool replAuthenticate(DBClientBase *);
class Cloner: boost::noncopyable {
auto_ptr< DBClientWithCommands > conn;
@@ -40,7 +40,7 @@ namespace mongo {
struct Fun;
public:
Cloner() { }
-
+
/* slaveOk - if true it is ok if the source of the data is !ismaster.
useReplAuth - use the credentials we normally use as a replication slave for the cloning
snapshot - use $snapshot mode for copying collections. note this should not be used when it isn't required, as it will be slower.
@@ -92,14 +92,14 @@ namespace mongo {
if ( context ) {
context->relocked();
}
-
+
while( i.moreInCurrentBatch() ) {
if ( n % 128 == 127 /*yield some*/ ) {
dbtemprelease t;
}
-
+
BSONObj tmp = i.nextSafe();
-
+
/* assure object is valid. note this will slow us down a little. */
if ( !tmp.valid() ) {
stringstream ss;
@@ -109,15 +109,15 @@ namespace mongo {
e.validate();
ss << " firstElement: " << e;
}
- catch( ... ){
+ catch( ... ) {
ss << " firstElement corrupt";
}
out() << ss.str() << endl;
continue;
}
-
+
++n;
-
+
BSONObj js = tmp;
if ( isindex ) {
assert( strstr(from_collection, "system.indexes") );
@@ -125,16 +125,18 @@ namespace mongo {
storedForLater->push_back( js.getOwned() );
continue;
}
-
- try {
+
+ try {
theDataFileMgr.insertWithObjMod(to_collection, js);
if ( logForRepl )
logOp("i", to_collection, js);
+
+ getDur().commitIfNeeded();
}
- catch( UserException& e ) {
+ catch( UserException& e ) {
log() << "warning: exception cloning object in " << from_collection << ' ' << e.what() << " obj:" << js.toString() << '\n';
}
-
+
RARELY if ( time( 0 ) - saveLast > 60 ) {
log() << n << " objects cloned so far from collection " << from_collection << endl;
saveLast = time( 0 );
@@ -146,17 +148,17 @@ namespace mongo {
const char *from_collection;
const char *to_collection;
time_t saveLast;
- list<BSONObj> *storedForLater;
+ list<BSONObj> *storedForLater;
bool logForRepl;
Client::Context *context;
};
-
+
/* copy the specified collection
isindex - if true, this is system.indexes collection, in which we do some transformation when copying.
*/
void Cloner::copy(const char *from_collection, const char *to_collection, bool isindex, bool logForRepl, bool masterSameProcess, bool slaveOk, Query query) {
list<BSONObj> storedForLater;
-
+
Fun f;
f.n = 0;
f.isindex = isindex;
@@ -165,7 +167,7 @@ namespace mongo {
f.saveLast = time( 0 );
f.storedForLater = &storedForLater;
f.logForRepl = logForRepl;
-
+
int options = QueryOption_NoCursorTimeout | ( slaveOk ? QueryOption_SlaveOk : 0 );
{
dbtemprelease r;
@@ -173,7 +175,9 @@ namespace mongo {
DBClientConnection *remote = dynamic_cast< DBClientConnection* >( conn.get() );
if ( remote ) {
remote->query( boost::function<void(DBClientCursorBatchIterator &)>( f ), from_collection, query, 0, options );
- } else { // no exhaust mode for direct client, so we have this hack
+ }
+ else {
+ // there is no exhaust mode for direct client, so we have this hack
auto_ptr<DBClientCursor> c = conn->query( from_collection, query, 0, 0, 0, options );
assert( c.get() );
while( c->more() ) {
@@ -182,16 +186,18 @@ namespace mongo {
}
}
}
-
- if ( storedForLater.size() ){
- for ( list<BSONObj>::iterator i = storedForLater.begin(); i!=storedForLater.end(); i++ ){
+
+ if ( storedForLater.size() ) {
+ for ( list<BSONObj>::iterator i = storedForLater.begin(); i!=storedForLater.end(); i++ ) {
BSONObj js = *i;
- try {
+ try {
theDataFileMgr.insertWithObjMod(to_collection, js);
if ( logForRepl )
logOp("i", to_collection, js);
+
+ getDur().commitIfNeeded();
}
- catch( UserException& e ) {
+ catch( UserException& e ) {
log() << "warning: exception cloning object in " << from_collection << ' ' << e.what() << " obj:" << js.toString() << '\n';
}
}
@@ -210,40 +216,44 @@ namespace mongo {
return false;
conn.reset( myconn.release() );
-
+
writelock lk(ns); // TODO: make this lower down
Client::Context ctx(ns);
- { // config
+ {
+ // config
string temp = ctx.db()->name + ".system.namespaces";
BSONObj config = conn->findOne( temp , BSON( "name" << ns ) );
if ( config["options"].isABSONObj() )
if ( ! userCreateNS( ns.c_str() , config["options"].Obj() , errmsg, true , 0 ) )
return false;
}
-
- { // main data
+
+ {
+ // main data
copy( ns.c_str() , ns.c_str() , /*isindex*/false , logForRepl , false , true , Query(query).snapshot() );
}
-
+
/* TODO : copyIndexes bool does not seem to be implemented! */
- if( !copyIndexes ) {
+ if( !copyIndexes ) {
log() << "ERROR copy collection copyIndexes not implemented? " << ns << endl;
}
- { // indexes
+ {
+ // indexes
string temp = ctx.db()->name + ".system.indexes";
copy( temp.c_str() , temp.c_str() , /*isindex*/true , logForRepl , false , true , BSON( "ns" << ns ) );
}
+ getDur().commitIfNeeded();
return true;
}
-
+
extern bool inDBRepair;
void ensureIdIndexForNewNs(const char *ns);
bool Cloner::go(const char *masterHost, string& errmsg, const string& fromdb, bool logForRepl, bool slaveOk, bool useReplAuth, bool snapshot) {
- massert( 10289 , "useReplAuth is not written to replication log", !useReplAuth || !logForRepl );
+ massert( 10289 , "useReplAuth is not written to replication log", !useReplAuth || !logForRepl );
string todb = cc().database()->name;
stringstream a,b;
@@ -263,23 +273,26 @@ namespace mongo {
*/
string ns = fromdb + ".system.namespaces";
list<BSONObj> toClone;
- {
+ {
dbtemprelease r;
-
+
// just using exhaust for collection copying right now
auto_ptr<DBClientCursor> c;
{
if ( conn.get() ) {
// nothing to do
- } else if ( !masterSameProcess ) {
- auto_ptr< DBClientConnection > c( new DBClientConnection() );
- if ( !c->connect( masterHost, errmsg ) )
+ }
+ else if ( !masterSameProcess ) {
+ ConnectionString cs = ConnectionString::parse( masterHost, errmsg );
+ auto_ptr<DBClientBase> con( cs.connect( errmsg ));
+ if ( !con.get() )
return false;
- if( !replAuthenticate(c.get()) )
+ if( !replAuthenticate(con.get()) )
return false;
-
- conn = c;
- } else {
+
+ conn = con;
+ }
+ else {
conn.reset( new DBDirectClient() );
}
c = conn->query( ns.c_str(), BSONObj(), 0, 0, 0, slaveOk ? QueryOption_SlaveOk : 0 );
@@ -289,8 +302,8 @@ namespace mongo {
errmsg = "query failed " + ns;
return false;
}
-
- while ( c->more() ){
+
+ while ( c->more() ) {
BSONObj collection = c->next();
log(2) << "\t cloner got " << collection << endl;
@@ -304,23 +317,23 @@ namespace mongo {
assert( e.type() == String );
const char *from_name = e.valuestr();
- if( strstr(from_name, ".system.") ) {
+ if( strstr(from_name, ".system.") ) {
/* system.users and s.js is cloned -- but nothing else from system.
* system.indexes is handled specially at the end*/
- if( legalClientSystemNS( from_name , true ) == 0 ){
+ if( legalClientSystemNS( from_name , true ) == 0 ) {
log(2) << "\t\t not cloning because system collection" << endl;
continue;
}
}
- if( ! nsDollarCheck( from_name ) ){
+ if( ! isANormalNSName( from_name ) ) {
log(2) << "\t\t not cloning because has $ " << endl;
continue;
- }
+ }
toClone.push_back( collection.getOwned() );
}
}
- for ( list<BSONObj>::iterator i=toClone.begin(); i != toClone.end(); i++ ){
+ for ( list<BSONObj>::iterator i=toClone.begin(); i != toClone.end(); i++ ) {
{
dbtemprelease r;
}
@@ -328,7 +341,7 @@ namespace mongo {
log(2) << " really will clone: " << collection << endl;
const char * from_name = collection["name"].valuestr();
BSONObj options = collection.getObjectField("options");
-
+
/* change name "<fromdb>.collection" -> <todb>.collection */
const char *p = strchr(from_name, '.');
assert(p);
@@ -338,17 +351,17 @@ namespace mongo {
{
string err;
const char *toname = to_name.c_str();
- /* we defer building id index for performance - building it in batch is much faster */
+ /* we defer building id index for performance - building it in batch is much faster */
userCreateNS(toname, options, err, logForRepl, &wantIdIndex);
}
log(1) << "\t\t cloning " << from_name << " -> " << to_name << endl;
Query q;
- if( snapshot )
+ if( snapshot )
q.snapshot();
copy(from_name, to_name.c_str(), false, logForRepl, masterSameProcess, slaveOk, q);
if( wantIdIndex ) {
- /* we need dropDups to be true as we didn't do a true snapshot and this is before applying oplog operations
+ /* we need dropDups to be true as we didn't do a true snapshot and this is before applying oplog operations
that occur during the initial sync. inDBRepair makes dropDups be true.
*/
bool old = inDBRepair;
@@ -357,7 +370,7 @@ namespace mongo {
ensureIdIndexForNewNs(to_name.c_str());
inDBRepair = old;
}
- catch(...) {
+ catch(...) {
inDBRepair = old;
throw;
}
@@ -368,27 +381,26 @@ namespace mongo {
string system_indexes_from = fromdb + ".system.indexes";
string system_indexes_to = todb + ".system.indexes";
- /* [dm]: is the ID index sometimes not called "_id_"? There is other code in the system that looks for a "_id" prefix
- rather than this exact value. we should standardize. OR, remove names - which is in the bugdb. Anyway, this
+ /* [dm]: is the ID index sometimes not called "_id_"? There is other code in the system that looks for a "_id" prefix
+ rather than this exact value. we should standardize. OR, remove names - which is in the bugdb. Anyway, this
is dubious here at the moment.
*/
copy(system_indexes_from.c_str(), system_indexes_to.c_str(), true, logForRepl, masterSameProcess, slaveOk, BSON( "name" << NE << "_id_" ) );
return true;
}
-
+
/* slaveOk - if true it is ok if the source of the data is !ismaster.
useReplAuth - use the credentials we normally use as a replication slave for the cloning
snapshot - use $snapshot mode for copying collections. note this should not be used when it isn't required, as it will be slower.
for example repairDatabase need not use it.
*/
- bool cloneFrom(const char *masterHost, string& errmsg, const string& fromdb, bool logForReplication,
- bool slaveOk, bool useReplAuth, bool snapshot)
- {
+ bool cloneFrom(const char *masterHost, string& errmsg, const string& fromdb, bool logForReplication,
+ bool slaveOk, bool useReplAuth, bool snapshot) {
Cloner c;
return c.go(masterHost, errmsg, fromdb, logForReplication, slaveOk, useReplAuth, snapshot);
}
-
+
/* Usage:
mydb.$cmd.findOne( { clone: "fromhost" } );
*/
@@ -410,11 +422,11 @@ namespace mongo {
/* replication note: we must logOp() not the command, but the cloned data -- if the slave
were to clone it would get a different point-in-time and not match.
*/
- return cloneFrom(from.c_str(), errmsg, dbname,
+ return cloneFrom(from.c_str(), errmsg, dbname,
/*logForReplication=*/!fromRepl, /*slaveok*/false, /*usereplauth*/false, /*snapshot*/true);
}
} cmdclone;
-
+
class CmdCloneCollection : public Command {
public:
virtual bool slaveOk() const {
@@ -424,10 +436,10 @@ namespace mongo {
CmdCloneCollection() : Command("cloneCollection") { }
virtual void help( stringstream &help ) const {
help << "{ cloneCollection: <namespace>, from: <host> [,query: <query_filter>] [,copyIndexes:<bool>] }"
- "\nCopies a collection from one server to another. Do not use on a single server as the destination "
- "is placed at the same db.collection (namespace) as the source.\n"
- "Warning: the local copy of 'ns' is emptied before the copying begins. Any existing data will be lost there."
- ;
+ "\nCopies a collection from one server to another. Do not use on a single server as the destination "
+ "is placed at the same db.collection (namespace) as the source.\n"
+ "Warning: the local copy of 'ns' is emptied before the copying begins. Any existing data will be lost there."
+ ;
}
virtual bool run(const string& dbname , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
string fromhost = cmdObj.getStringField("from");
@@ -437,7 +449,7 @@ namespace mongo {
}
{
HostAndPort h(fromhost);
- if( h.isSelf() ) {
+ if( h.isSelf() ) {
errmsg = "can't cloneCollection from self";
return false;
}
@@ -450,13 +462,13 @@ namespace mongo {
BSONObj query = cmdObj.getObjectField("query");
if ( query.isEmpty() )
query = BSONObj();
-
+
BSONElement copyIndexesSpec = cmdObj.getField("copyindexes");
bool copyIndexes = copyIndexesSpec.isBoolean() ? copyIndexesSpec.boolean() : true;
-
- log() << "cloneCollection. db:" << dbname << " collection:" << collection << " from: " << fromhost
+
+ log() << "cloneCollection. db:" << dbname << " collection:" << collection << " from: " << fromhost
<< " query: " << query << " " << ( copyIndexes ? "" : ", not copying indexes" ) << endl;
-
+
Cloner c;
return c.copyCollection( fromhost , collection , query, errmsg , copyIndexes );
}
@@ -557,7 +569,7 @@ namespace mongo {
return res;
}
} cmdcopydb;
-
+
class CmdRenameCollection : public Command {
public:
CmdRenameCollection() : Command( "renameCollection" ) {}
@@ -581,7 +593,7 @@ namespace mongo {
errmsg = "invalid command syntax";
return false;
}
-
+
bool capped = false;
long long size = 0;
{
@@ -593,10 +605,10 @@ namespace mongo {
for( DiskLoc i = nsd->firstExtent; !i.isNull(); i = i.ext()->xnext )
size += i.ext()->length;
}
-
+
Client::Context ctx( target );
-
- if ( nsdetails( target.c_str() ) ){
+
+ if ( nsdetails( target.c_str() ) ) {
uassert( 10027 , "target namespace exists", cmdObj["dropTarget"].trueValue() );
BSONObjBuilder bb( result.subobjStart( "dropTarget" ) );
dropCollection( target , errmsg , bb );
@@ -623,7 +635,7 @@ namespace mongo {
}
if ( !userCreateNS( target.c_str(), spec.done(), errmsg, false ) )
return false;
-
+
auto_ptr< DBClientCursor > c;
DBDirectClient bridge;
@@ -638,7 +650,7 @@ namespace mongo {
BSONObj o = c->next();
theDataFileMgr.insertWithObjMod( target.c_str(), o );
}
-
+
char cl[256];
nsToDatabase( source.c_str(), cl );
string sourceIndexes = string( cl ) + ".system.indexes";
@@ -661,7 +673,8 @@ namespace mongo {
break;
if ( strcmp( e.fieldName(), "ns" ) == 0 ) {
b.append( "ns", target );
- } else {
+ }
+ else {
b.append( e );
}
}
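
The recurring functional change in cloner.cpp is the getDur().commitIfNeeded() call after each insert: with --dur enabled, a long clone loop would otherwise pile up an unbounded amount of uncommitted journal work. A sketch of the pattern in isolation, assuming this tree's headers and that the write lock and Client::Context are already held (the helper name is illustrative):

    // sketch only: the insert/commit pattern used by the cloner above
    #include "db/pdfile.h"
    #include "db/dur.h"
    #include "db/oplog.h"

    namespace mongo {
        void bulkInsertExample( const char *to_collection ,
                                const list<BSONObj>& docs , bool logForRepl ) {
            for ( list<BSONObj>::const_iterator i = docs.begin(); i != docs.end(); ++i ) {
                BSONObj js = *i;
                theDataFileMgr.insertWithObjMod( to_collection , js );
                if ( logForRepl )
                    logOp( "i" , to_collection , js );
                // let the durability layer group-commit when its buffer gets big,
                // instead of deferring all journal work to the end of the clone
                getDur().commitIfNeeded();
            }
        }
    }
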
diff --git a/db/cmdline.cpp b/db/cmdline.cpp
index 65ee179..900a782 100644
--- a/db/cmdline.cpp
+++ b/db/cmdline.cpp
@@ -20,47 +20,92 @@
#include "cmdline.h"
#include "commands.h"
#include "../util/processinfo.h"
+#include "security_key.h"
+
+#ifdef _WIN32
+#include <direct.h>
+#endif
namespace po = boost::program_options;
+namespace fs = boost::filesystem;
namespace mongo {
- void setupSignals();
+ void setupSignals( bool inFork );
+ string getHostNameCached();
BSONArray argvArray;
- void CmdLine::addGlobalOptions( boost::program_options::options_description& general ,
- boost::program_options::options_description& hidden ){
+ void CmdLine::addGlobalOptions( boost::program_options::options_description& general ,
+ boost::program_options::options_description& hidden ) {
/* support for -vv -vvvv etc. */
for (string s = "vv"; s.length() <= 12; s.append("v")) {
hidden.add_options()(s.c_str(), "verbose");
}
-
+
general.add_options()
- ("help,h", "show this usage information")
- ("version", "show version information")
- ("config,f", po::value<string>(), "configuration file specifying additional options")
- ("verbose,v", "be more verbose (include multiple times for more verbosity e.g. -vvvvv)")
- ("quiet", "quieter output")
- ("port", po::value<int>(&cmdLine.port), "specify port number")
- ("bind_ip", po::value<string>(&cmdLine.bind_ip), "comma separated list of ip addresses to listen on - all local ips by default")
- ("logpath", po::value<string>() , "file to send all output to instead of stdout" )
- ("logappend" , "append to logpath instead of over-writing" )
- ("pidfilepath", po::value<string>(), "full path to pidfile (if not set, no pidfile is created)")
+ ("help,h", "show this usage information")
+ ("version", "show version information")
+ ("config,f", po::value<string>(), "configuration file specifying additional options")
+ ("verbose,v", "be more verbose (include multiple times for more verbosity e.g. -vvvvv)")
+ ("quiet", "quieter output")
+ ("port", po::value<int>(&cmdLine.port), "specify port number")
+ ("bind_ip", po::value<string>(&cmdLine.bind_ip), "comma separated list of ip addresses to listen on - all local ips by default")
+        ("logpath", po::value<string>() , "log file to send writes to instead of stdout - has to be a file, not a directory" )
+ ("logappend" , "append to logpath instead of over-writing" )
+ ("pidfilepath", po::value<string>(), "full path to pidfile (if not set, no pidfile is created)")
+ ("keyFile", po::value<string>(), "private key for cluster authentication (only for replica sets)")
#ifndef _WIN32
- ("fork" , "fork server process" )
+ ("unixSocketPrefix", po::value<string>(), "alternative directory for UNIX domain sockets (defaults to /tmp)")
+ ("fork" , "fork server process" )
#endif
- ;
-
+ ;
+
}
- bool CmdLine::store( int argc , char ** argv ,
+#if defined(_WIN32)
+ void CmdLine::addWindowsOptions( boost::program_options::options_description& windows ,
+ boost::program_options::options_description& hidden ) {
+ windows.add_options()
+ ("install", "install mongodb service")
+ ("remove", "remove mongodb service")
+            ("reinstall", "reinstall mongodb service (equivalent of mongod --remove followed by mongod --install)")
+ ("serviceName", po::value<string>(), "windows service name")
+ ("serviceDisplayName", po::value<string>(), "windows service display name")
+ ("serviceDescription", po::value<string>(), "windows service description")
+ ("serviceUser", po::value<string>(), "user name service executes as")
+ ("servicePassword", po::value<string>(), "password used to authenticate serviceUser")
+ ;
+ hidden.add_options()("service", "start mongodb service");
+ }
+#endif
+
+
+ bool CmdLine::store( int argc , char ** argv ,
boost::program_options::options_description& visible,
boost::program_options::options_description& hidden,
boost::program_options::positional_options_description& positional,
- boost::program_options::variables_map &params ){
-
+ boost::program_options::variables_map &params ) {
+
+
+ {
+ // setup binary name
+ cmdLine.binaryName = argv[0];
+ size_t i = cmdLine.binaryName.rfind( '/' );
+ if ( i != string::npos )
+ cmdLine.binaryName = cmdLine.binaryName.substr( i + 1 );
+
+ // setup cwd
+ char buffer[1024];
+#ifdef _WIN32
+ assert( _getcwd( buffer , 1000 ) );
+#else
+ assert( getcwd( buffer , 1000 ) );
+#endif
+ cmdLine.cwd = buffer;
+ }
+
/* don't allow guessing - creates ambiguities when some options are
* prefixes of others. allow long disguises and don't allow guessing
* to get away with our vvvvvvv trick. */
@@ -69,7 +114,7 @@ namespace mongo {
po::command_line_style::allow_long_disguise) ^
po::command_line_style::allow_sticky);
-
+
try {
po::options_description all;
@@ -80,26 +125,27 @@ namespace mongo {
.options( all )
.positional( positional )
.style( style )
- .run(),
+ .run(),
params );
- if ( params.count("config") ){
+ if ( params.count("config") ) {
ifstream f( params["config"].as<string>().c_str() );
- if ( ! f.is_open() ){
+ if ( ! f.is_open() ) {
cout << "ERROR: could not read from config file" << endl << endl;
cout << visible << endl;
return false;
}
-
+
po::store( po::parse_config_file( f , all ) , params );
f.close();
}
-
+
po::notify(params);
- }
+ }
catch (po::error &e) {
- cout << "ERROR: " << e.what() << endl << endl;
- cout << visible << endl;
+            cout << "error parsing command line: " << e.what() << endl;
+ cout << "use --help for help" << endl;
+ //cout << visible << endl;
return false;
}
@@ -120,44 +166,51 @@ namespace mongo {
string logpath;
#ifndef _WIN32
+ if (params.count("unixSocketPrefix")) {
+ cmdLine.socket = params["unixSocketPrefix"].as<string>();
+ if (!fs::is_directory(cmdLine.socket)) {
+ cout << cmdLine.socket << " must be a directory" << endl;
+ ::exit(-1);
+ }
+ }
+
if (params.count("fork")) {
- if ( ! params.count( "logpath" ) ){
+ if ( ! params.count( "logpath" ) ) {
cout << "--fork has to be used with --logpath" << endl;
::exit(-1);
}
-
- { // test logpath
+
+ {
+ // test logpath
logpath = params["logpath"].as<string>();
assert( logpath.size() );
- if ( logpath[0] != '/' ){
- char temp[256];
- assert( getcwd( temp , 256 ) );
- logpath = (string)temp + "/" + logpath;
+ if ( logpath[0] != '/' ) {
+ logpath = cmdLine.cwd + "/" + logpath;
}
FILE * test = fopen( logpath.c_str() , "a" );
- if ( ! test ){
+ if ( ! test ) {
cout << "can't open [" << logpath << "] for log file: " << errnoWithDescription() << endl;
::exit(-1);
}
fclose( test );
}
-
+
cout.flush();
cerr.flush();
pid_t c = fork();
- if ( c ){
+ if ( c ) {
_exit(0);
}
- if ( chdir("/") < 0 ){
+ if ( chdir("/") < 0 ) {
                cout << "Can't chdir() while forking server process: " << strerror(errno) << endl;
::exit(-1);
}
setsid();
-
+
pid_t c2 = fork();
- if ( c2 ){
+ if ( c2 ) {
cout << "forked process: " << c2 << endl;
_exit(0);
}
@@ -170,19 +223,19 @@ namespace mongo {
fclose(stdin);
FILE* f = freopen("/dev/null", "w", stderr);
- if ( f == NULL ){
+ if ( f == NULL ) {
                cout << "Can't reassign stderr while forking server process: " << strerror(errno) << endl;
::exit(-1);
}
f = freopen("/dev/null", "r", stdin);
- if ( f == NULL ){
+ if ( f == NULL ) {
                cout << "Can't reassign stdin while forking server process: " << strerror(errno) << endl;
::exit(-1);
}
setupCoreSignals();
- setupSignals();
+ setupSignals( true );
}
#endif
if (params.count("logpath")) {
@@ -196,6 +249,18 @@ namespace mongo {
writePidFile( params["pidfilepath"].as<string>() );
}
+ if (params.count("keyFile")) {
+ const string f = params["keyFile"].as<string>();
+
+ if (!setUpSecurityKey(f)) {
+                // error message printed in setUpSecurityKey
+ dbexit(EXIT_BADOPTIONS);
+ }
+
+ noauth = false;
+ }
+
+
{
BSONArrayBuilder b;
for (int i=0; i < argc; i++)
@@ -205,29 +270,51 @@ namespace mongo {
return true;
}
-
- void ignoreSignal( int signal ){
- }
- void setupCoreSignals(){
+ void ignoreSignal( int sig ) {}
+
+ void setupCoreSignals() {
#if !defined(_WIN32)
assert( signal(SIGUSR1 , rotateLogs ) != SIG_ERR );
assert( signal(SIGHUP , ignoreSignal ) != SIG_ERR );
#endif
}
- class CmdGetCmdLineOpts : Command{
- public:
+ class CmdGetCmdLineOpts : Command {
+ public:
CmdGetCmdLineOpts(): Command("getCmdLineOpts") {}
void help(stringstream& h) const { h << "get argv"; }
virtual LockType locktype() const { return NONE; }
virtual bool adminOnly() const { return true; }
virtual bool slaveOk() const { return true; }
- virtual bool run(const string&, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl){
+ virtual bool run(const string&, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
result.append("argv", argvArray);
return true;
}
} cmdGetCmdLineOpts;
+
+ string prettyHostName() {
+ StringBuilder s(128);
+ s << getHostNameCached();
+ if( cmdLine.port != CmdLine::DefaultDBPort )
+ s << ':' << mongo::cmdLine.port;
+ return s.str();
+ }
+
+ ParameterValidator::ParameterValidator( const string& name ) : _name( name ) {
+ if ( ! _all )
+ _all = new map<string,ParameterValidator*>();
+ (*_all)[_name] = this;
+ }
+
+ ParameterValidator * ParameterValidator::get( const string& name ) {
+ map<string,ParameterValidator*>::iterator i = _all->find( name );
+ if ( i == _all->end() )
+ return NULL;
+ return i->second;
+ }
+ map<string,ParameterValidator*> * ParameterValidator::_all = 0;
+
}
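
ParameterValidator instances register themselves in a static map from their constructor, so validation for a setParameter value can live next to the code that actually consumes the parameter. A sketch of one validator against the declarations in db/cmdline.h below; the syncdelay rule shown here is hypothetical, not something this patch adds:

    // sketch only: a hypothetical validator plugging into the registry above
    #include "db/cmdline.h"

    namespace mongo {
        class SyncDelayValidator : public ParameterValidator {
        public:
            SyncDelayValidator() : ParameterValidator( "syncdelay" ) {}
            virtual bool isValid( BSONElement e , string& errmsg ) {
                if ( e.isNumber() && e.numberDouble() >= 0 )
                    return true;
                errmsg = "syncdelay must be a non-negative number";
                return false;
            }
        } syncDelayValidatorInstance;   // static instance: the constructor registers it

        // consumer side (e.g. a setParameter handler) would then do something like:
        //   ParameterValidator * v = ParameterValidator::get( "syncdelay" );
        //   if ( v && ! v->isValid( elem , errmsg ) ) return false;
    }
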
diff --git a/db/cmdline.h b/db/cmdline.h
index ef1bd57..4c8c7c4 100644
--- a/db/cmdline.h
+++ b/db/cmdline.h
@@ -17,72 +17,134 @@
#pragma once
#include "../pch.h"
+#include "jsobj.h"
namespace mongo {
-
- /* command line options
+
+ /* command line options
*/
/* concurrency: OK/READ */
- struct CmdLine {
+ struct CmdLine {
+
+ CmdLine() :
+ port(DefaultDBPort), rest(false), jsonp(false), quiet(false), noTableScan(false), prealloc(true), smallfiles(sizeof(int*) == 4),
+ quota(false), quotaFiles(8), cpu(false), durOptions(0), oplogSize(0), defaultProfile(0), slowMS(100), pretouch(0), moveParanoia( true ),
+ syncdelay(60), socket("/tmp") {
+ // default may change for this later.
+#if defined(_DURABLEDEFAULTON)
+ dur = true;
+#else
+ dur = false;
+#endif
+ }
+
+ string binaryName; // mongod or mongos
+ string cwd; // cwd of when process started
+
int port; // --port
+ enum {
+ DefaultDBPort = 27017,
+ ConfigServerPort = 27019,
+ ShardServerPort = 27018
+ };
+ bool isDefaultPort() const { return port == DefaultDBPort; }
+
string bind_ip; // --bind_ip
bool rest; // --rest
+ bool jsonp; // --jsonp
string _replSet; // --replSet[/<seedlist>]
- string ourSetName() const {
+ string ourSetName() const {
string setname;
size_t sl = _replSet.find('/');
if( sl == string::npos )
return _replSet;
return _replSet.substr(0, sl);
}
+ bool usingReplSets() const { return !_replSet.empty(); }
+ // for master/slave replication
string source; // --source
string only; // --only
-
+
bool quiet; // --quiet
- bool notablescan; // --notablescan
- bool prealloc; // --noprealloc
- bool smallfiles; // --smallfiles
-
+ bool noTableScan; // --notablescan no table scans allowed
+ bool prealloc; // --noprealloc no preallocation of data files
+ bool smallfiles; // --smallfiles allocate smaller data files
+
bool quota; // --quota
int quotaFiles; // --quotaFiles
bool cpu; // --cpu show cpu time periodically
+ bool dur; // --dur durability
+
+ /** --durOptions 7 dump journal and terminate without doing anything further
+ --durOptions 4 recover and terminate without listening
+ */
+ enum { // bits to be ORed
+ DurDumpJournal = 1, // dump diagnostics on the journal during recovery
+ DurScanOnly = 2, // don't do any real work, just scan and dump if dump specified
+ DurRecoverOnly = 4, // terminate after recovery step
+ DurParanoid = 8, // paranoid mode enables extra checks
+ DurAlwaysCommit = 16 // do a group commit every time the writelock is released
+ };
+ int durOptions; // --durOptions <n> for debugging
+
long long oplogSize; // --oplogSize
int defaultProfile; // --profile
int slowMS; // --time in ms that is "slow"
int pretouch; // --pretouch for replication application (experimental)
- bool moveParanoia; // for move chunk paranoia
+ bool moveParanoia; // for move chunk paranoia
+ double syncdelay; // seconds between fsyncs
- enum {
- DefaultDBPort = 27017,
- ConfigServerPort = 27019,
- ShardServerPort = 27018
- };
+ string socket; // UNIX domain socket directory
- CmdLine() :
- port(DefaultDBPort), rest(false), quiet(false), notablescan(false), prealloc(true), smallfiles(false),
- quota(false), quotaFiles(8), cpu(false), oplogSize(0), defaultProfile(0), slowMS(100), pretouch(0), moveParanoia( true )
- { }
-
-
- static void addGlobalOptions( boost::program_options::options_description& general ,
+ static void addGlobalOptions( boost::program_options::options_description& general ,
boost::program_options::options_description& hidden );
-
+ static void addWindowsOptions( boost::program_options::options_description& windows ,
+ boost::program_options::options_description& hidden );
+
+
/**
* @return true if should run program, false if should exit
*/
- static bool store( int argc , char ** argv ,
+ static bool store( int argc , char ** argv ,
boost::program_options::options_description& visible,
boost::program_options::options_description& hidden,
boost::program_options::positional_options_description& positional,
boost::program_options::variables_map &output );
};
-
+
extern CmdLine cmdLine;
-
+
void setupCoreSignals();
+
+ string prettyHostName();
+
+
+ /**
+ * used for setParameter
+     * lets you write validation code that lives with the code using the parameter
+     * rather than having it all in one place in the command handling
+     * also lets you have mongos- or mongod-specific validation
+     * without pulling in all sorts of things
+ */
+ class ParameterValidator {
+ public:
+ ParameterValidator( const string& name );
+ virtual ~ParameterValidator() {}
+
+ virtual bool isValid( BSONElement e , string& errmsg ) = 0;
+
+ static ParameterValidator * get( const string& name );
+
+ private:
+ string _name;
+
+ // don't need to lock since this is all done in static init
+ static map<string,ParameterValidator*> * _all;
+ };
+
}
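
These CmdLine helpers are meant to be driven from a server binary's main(): addGlobalOptions() (plus addWindowsOptions() on Windows) fills in the boost::program_options descriptions, and store() parses argv and any --config file while handling --fork, --logpath, --keyFile and friends. A condensed sketch, assuming this header; the real mongod/mongos entry points add many more options:

    // sketch only: how a server binary would drive CmdLine
    #include "db/cmdline.h"
    #include <boost/program_options.hpp>
    #include <iostream>

    namespace po = boost::program_options;

    int main( int argc , char **argv ) {
        po::options_description general( "General options" );
        po::options_description hidden( "Hidden options" );
        po::positional_options_description positional;
        po::variables_map params;

        mongo::CmdLine::addGlobalOptions( general , hidden );

        // returns false when the process should simply exit
        // (bad arguments, unreadable --config file, failed fork, ...)
        if ( ! mongo::CmdLine::store( argc , argv , general , hidden , positional , params ) )
            return 0;

        std::cout << "listening on port " << mongo::cmdLine.port << std::endl;
        // ... rest of server startup ...
        return 0;
    }
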
diff --git a/db/commands.cpp b/db/commands.cpp
index ef219fe..770d035 100644
--- a/db/commands.cpp
+++ b/db/commands.cpp
@@ -38,7 +38,7 @@ namespace mongo {
}
ss << "\n<tr><td>";
bool web = _webCommands->count(name) != 0;
- if( web ) ss << "<a href=\"/" << name << "?text\">";
+ if( web ) ss << "<a href=\"/" << name << "?text=1\">";
ss << name;
if( web ) ss << "</a>";
ss << "</td>\n";
@@ -55,7 +55,7 @@ namespace mongo {
ss << "<td>";
if( helpStr != "no help defined" ) {
const char *p = helpStr.c_str();
- while( *p ) {
+ while( *p ) {
if( *p == '<' ) {
ss << "&lt;";
p++; continue;
@@ -67,7 +67,7 @@ namespace mongo {
p++;
continue;
}
- if( strncmp(p, "http:", 5) == 0 ) {
+ if( strncmp(p, "http:", 5) == 0 ) {
ss << "<a href=\"";
const char *q = p;
while( *q && *q != ' ' && *q != '\n' )
@@ -79,7 +79,7 @@ namespace mongo {
while( *q && *q != ' ' && *q != '\n' ) {
ss << (*q == '+' ? ' ' : *q);
q++;
- if( *q == '#' )
+ if( *q == '#' )
while( *q && *q != ' ' && *q != '\n' ) q++;
}
ss << "</a>";
@@ -120,7 +120,7 @@ namespace mongo {
void Command::help( stringstream& help ) const {
help << "no help defined";
}
-
+
bool Command::runAgainstRegistered(const char *ns, BSONObj& jsobj, BSONObjBuilder& anObjBuilder) {
const char *p = strchr(ns, '.');
if ( !p ) return false;
@@ -145,7 +145,7 @@ namespace mongo {
ok = false;
errmsg = "access denied - use admin db";
}
- else if ( jsobj.getBoolField( "help" ) ){
+ else if ( jsobj.getBoolField( "help" ) ) {
stringstream help;
help << "help for: " << e.fieldName() << " ";
c->help( help );
@@ -161,18 +161,18 @@ namespace mongo {
if (!have_ok)
anObjBuilder.append( "ok" , ok ? 1.0 : 0.0 );
-
+
if ( !ok && !have_errmsg) {
anObjBuilder.append("errmsg", errmsg);
uassert_nothrow(errmsg.c_str());
}
return true;
}
-
+
return false;
}
- Command* Command::findCommand( const string& name ){
+ Command* Command::findCommand( const string& name ) {
map<string,Command*>::iterator i = _commands->find( name );
if ( i == _commands->end() )
return 0;
@@ -180,7 +180,7 @@ namespace mongo {
}
- Command::LockType Command::locktype( const string& name ){
+ Command::LockType Command::locktype( const string& name ) {
Command * c = findCommand( name );
if ( ! c )
return WRITE;
@@ -189,10 +189,10 @@ namespace mongo {
void Command::logIfSlow( const Timer& timer, const string& msg ) {
int ms = timer.millis();
- if ( ms > cmdLine.slowMS ){
+ if ( ms > cmdLine.slowMS ) {
out() << msg << " took " << ms << " ms." << endl;
}
}
-
-
+
+
} // namespace mongo
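
Command subclasses register themselves through the base-class constructor, which is how runAgainstRegistered() and findCommand() above can look a command up by name when a $cmd request arrives. A minimal subclass sketch against db/commands.h from this patch; the pingExample command is hypothetical:

    // sketch only: a hypothetical command showing the registration pattern
    #include "db/commands.h"

    namespace mongo {
        class PingExampleCommand : public Command {
        public:
            PingExampleCommand() : Command( "pingExample" ) {}
            virtual LockType locktype() const { return NONE; }   // no db lock needed
            virtual bool slaveOk() const { return true; }
            virtual void help( stringstream& h ) const { h << "{ pingExample : 1 }"; }
            virtual bool run( const string& dbname , BSONObj& cmdObj , string& errmsg ,
                              BSONObjBuilder& result , bool fromRepl ) {
                result.append( "reply" , "pong" );
                return true;   // the dispatcher appends ok:1 for us
            }
        } pingExampleCmd;   // static instance: the base-class constructor registers it
    }
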
diff --git a/db/commands.h b/db/commands.h
index a8a61c4..42e46a0 100644
--- a/db/commands.h
+++ b/db/commands.h
@@ -18,7 +18,9 @@
#pragma once
#include "../pch.h"
+
#include "jsobj.h"
+#include "../util/timer.h"
namespace mongo {
@@ -32,7 +34,7 @@ namespace mongo {
*/
class Command {
public:
-
+
enum LockType { READ = -1 , NONE = 0 , WRITE = 1 };
const string name;
@@ -47,11 +49,11 @@ namespace mongo {
*/
virtual bool run(const string& db, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) = 0;
- /*
- note: logTheTop() MUST be false if READ
+ /*
+ note: logTheTop() MUST be false if READ
if NONE, can't use Client::Context setup
use with caution
- */
+ */
virtual LockType locktype() const = 0;
/* Return true if only the admin ns has privileges to run this command. */
@@ -61,7 +63,7 @@ namespace mongo {
void htmlHelp(stringstream&) const;
- /* Like adminOnly, but even stricter: we must either be authenticated for admin db,
+ /* Like adminOnly, but even stricter: we must either be authenticated for admin db,
or, if running without auth, on the local interface.
When localHostOnlyIfNoAuth() is true, adminOnly() must also be true.
@@ -72,7 +74,7 @@ namespace mongo {
(the command directly from a client -- if fromRepl, always allowed).
*/
virtual bool slaveOk() const = 0;
-
+
/* Return true if the client force a command to be run on a slave by
turning on the 'slaveok' option in the command query.
*/
@@ -89,12 +91,12 @@ namespace mongo {
virtual void help( stringstream& help ) const;
- /* Return true if authentication and security applies to the commands. Some commands
+ /* Return true if authentication and security applies to the commands. Some commands
(e.g., getnonce, authenticate) can be done by anyone even unauthorized.
*/
virtual bool requiresAuth() { return true; }
- /** @param webUI expose the command in the web ui as localhost:28017/<name>
+ /** @param webUI expose the command in the web ui as localhost:28017/<name>
@param oldName an optional old, deprecated name for the command
*/
Command(const char *_name, bool webUI = false, const char *oldName = 0);
@@ -102,7 +104,7 @@ namespace mongo {
virtual ~Command() {}
protected:
- BSONObj getQuery( const BSONObj& cmdObj ){
+ BSONObj getQuery( const BSONObj& cmdObj ) {
if ( cmdObj["query"].type() == Object )
return cmdObj["query"].embeddedObject();
if ( cmdObj["q"].type() == Object )
diff --git a/db/commands/distinct.cpp b/db/commands/distinct.cpp
new file mode 100644
index 0000000..2e26bcd
--- /dev/null
+++ b/db/commands/distinct.cpp
@@ -0,0 +1,150 @@
+// distinct.cpp
+
+/**
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+#include "../commands.h"
+#include "../instance.h"
+#include "../queryoptimizer.h"
+#include "../clientcursor.h"
+
+namespace mongo {
+
+ class DistinctCommand : public Command {
+ public:
+ DistinctCommand() : Command("distinct") {}
+ virtual bool slaveOk() const { return true; }
+ virtual LockType locktype() const { return READ; }
+ virtual void help( stringstream &help ) const {
+ help << "{ distinct : 'collection name' , key : 'a.b' , query : {} }";
+ }
+
+ bool run(const string& dbname, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl ) {
+ Timer t;
+ string ns = dbname + '.' + cmdObj.firstElement().valuestr();
+
+ string key = cmdObj["key"].valuestrsafe();
+ BSONObj keyPattern = BSON( key << 1 );
+
+ BSONObj query = getQuery( cmdObj );
+
+ int bufSize = BSONObjMaxUserSize - 4096;
+ BufBuilder bb( bufSize );
+ char * start = bb.buf();
+
+ BSONArrayBuilder arr( bb );
+ BSONElementSet values;
+
+ long long nscanned = 0; // locations looked at
+ long long nscannedObjects = 0; // full objects looked at
+ long long n = 0; // matches
+ MatchDetails md;
+
+ NamespaceDetails * d = nsdetails( ns.c_str() );
+
+ if ( ! d ) {
+ result.appendArray( "values" , BSONObj() );
+ result.append( "stats" , BSON( "n" << 0 << "nscanned" << 0 << "nscannedObjects" << 0 ) );
+ return true;
+ }
+
+ shared_ptr<Cursor> cursor;
+ if ( ! query.isEmpty() ) {
+ cursor = bestGuessCursor(ns.c_str() , query , BSONObj() );
+ }
+ else {
+
+                // query is empty, so let's see if we can find an index
+ // with the key so we don't have to hit the raw data
+ NamespaceDetails::IndexIterator ii = d->ii();
+ while ( ii.more() ) {
+ IndexDetails& idx = ii.next();
+
+ if ( d->isMultikey( ii.pos() - 1 ) )
+ continue;
+
+ if ( idx.inKeyPattern( key ) ) {
+ cursor = bestGuessCursor( ns.c_str() , BSONObj() , idx.keyPattern() );
+ break;
+ }
+
+ }
+
+ if ( ! cursor.get() )
+ cursor = bestGuessCursor(ns.c_str() , query , BSONObj() );
+
+ }
+
+
+
+ scoped_ptr<ClientCursor> cc (new ClientCursor(QueryOption_NoCursorTimeout, cursor, ns));
+
+ while ( cursor->ok() ) {
+ nscanned++;
+ bool loadedObject = false;
+
+ if ( !cursor->matcher() || cursor->matcher()->matchesCurrent( cursor.get() , &md ) ) {
+ n++;
+
+ BSONElementSet temp;
+ loadedObject = ! cc->getFieldsDotted( key , temp );
+
+ for ( BSONElementSet::iterator i=temp.begin(); i!=temp.end(); ++i ) {
+ BSONElement e = *i;
+ if ( values.count( e ) )
+ continue;
+
+ int now = bb.len();
+
+ uassert(10044, "distinct too big, 4mb cap", ( now + e.size() + 1024 ) < bufSize );
+
+ arr.append( e );
+ BSONElement x( start + now );
+
+ values.insert( x );
+ }
+ }
+
+ if ( loadedObject || md.loadedObject )
+ nscannedObjects++;
+
+ cursor->advance();
+
+ if (!cc->yieldSometimes())
+ break;
+
+ RARELY killCurrentOp.checkForInterrupt();
+ }
+
+ assert( start == bb.buf() );
+
+ result.appendArray( "values" , arr.done() );
+
+ {
+ BSONObjBuilder b;
+ b.appendNumber( "n" , n );
+ b.appendNumber( "nscanned" , nscanned );
+ b.appendNumber( "nscannedObjects" , nscannedObjects );
+ b.appendNumber( "timems" , t.millis() );
+ result.append( "stats" , b.obj() );
+ }
+
+ return true;
+ }
+
+ } distinctCmd;
+
+}
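
From a client's point of view the command shape is unchanged; the reply now also carries the stats sub-document built above (n, nscanned, nscannedObjects, timems). A sketch of issuing the command and reading the reply; the host, database and field names are illustrative:

    // sketch only: calling the distinct command added above
    #include "client/dbclient.h"
    #include <iostream>

    using namespace mongo;
    using namespace std;

    void distinctExample() {
        DBClientConnection conn;
        string errmsg;
        if ( ! conn.connect( "localhost:27017" , errmsg ) )   // host is illustrative
            return;

        BSONObj cmd = BSON( "distinct" << "users"
                            << "key" << "address.city"
                            << "query" << BSON( "active" << true ) );
        BSONObj out;
        if ( conn.runCommand( "test" , cmd , out ) ) {
            cout << "values: " << out["values"] << endl;
            // the stats sub-document { n, nscanned, nscannedObjects, timems } is new in this patch
            cout << "stats : " << out["stats"] << endl;
        }
    }
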
diff --git a/db/commands/group.cpp b/db/commands/group.cpp
new file mode 100644
index 0000000..0cc6ab3
--- /dev/null
+++ b/db/commands/group.cpp
@@ -0,0 +1,202 @@
+// group.cpp
+
+/**
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+#include "../commands.h"
+#include "../instance.h"
+#include "../queryoptimizer.h"
+
+namespace mongo {
+
+ class GroupCommand : public Command {
+ public:
+ GroupCommand() : Command("group") {}
+ virtual LockType locktype() const { return READ; }
+ virtual bool slaveOk() const { return false; }
+ virtual bool slaveOverrideOk() { return true; }
+ virtual void help( stringstream &help ) const {
+ help << "http://www.mongodb.org/display/DOCS/Aggregation";
+ }
+
+ BSONObj getKey( const BSONObj& obj , const BSONObj& keyPattern , ScriptingFunction func , double avgSize , Scope * s ) {
+ if ( func ) {
+ BSONObjBuilder b( obj.objsize() + 32 );
+ b.append( "0" , obj );
+ int res = s->invoke( func , b.obj() );
+ uassert( 10041 , (string)"invoke failed in $keyf: " + s->getError() , res == 0 );
+ int type = s->type("return");
+ uassert( 10042 , "return of $key has to be an object" , type == Object );
+ return s->getObject( "return" );
+ }
+ return obj.extractFields( keyPattern , true );
+ }
+
+ bool group( string realdbname , const string& ns , const BSONObj& query ,
+ BSONObj keyPattern , string keyFunctionCode , string reduceCode , const char * reduceScope ,
+ BSONObj initial , string finalize ,
+ string& errmsg , BSONObjBuilder& result ) {
+
+
+ auto_ptr<Scope> s = globalScriptEngine->getPooledScope( realdbname );
+ s->localConnect( realdbname.c_str() );
+
+ if ( reduceScope )
+ s->init( reduceScope );
+
+ s->setObject( "$initial" , initial , true );
+
+ s->exec( "$reduce = " + reduceCode , "reduce setup" , false , true , true , 100 );
+ s->exec( "$arr = [];" , "reduce setup 2" , false , true , true , 100 );
+ ScriptingFunction f = s->createFunction(
+ "function(){ "
+ " if ( $arr[n] == null ){ "
+ " next = {}; "
+ " Object.extend( next , $key ); "
+ " Object.extend( next , $initial , true ); "
+ " $arr[n] = next; "
+ " next = null; "
+ " } "
+ " $reduce( obj , $arr[n] ); "
+ "}" );
+
+ ScriptingFunction keyFunction = 0;
+ if ( keyFunctionCode.size() ) {
+ keyFunction = s->createFunction( keyFunctionCode.c_str() );
+ }
+
+
+ double keysize = keyPattern.objsize() * 3;
+ double keynum = 1;
+
+ map<BSONObj,int,BSONObjCmp> map;
+ list<BSONObj> blah;
+
+ shared_ptr<Cursor> cursor = bestGuessCursor(ns.c_str() , query , BSONObj() );
+
+ while ( cursor->ok() ) {
+ if ( cursor->matcher() && ! cursor->matcher()->matchesCurrent( cursor.get() ) ) {
+ cursor->advance();
+ continue;
+ }
+
+ BSONObj obj = cursor->current();
+ cursor->advance();
+
+ BSONObj key = getKey( obj , keyPattern , keyFunction , keysize / keynum , s.get() );
+ keysize += key.objsize();
+ keynum++;
+
+ int& n = map[key];
+ if ( n == 0 ) {
+ n = map.size();
+ s->setObject( "$key" , key , true );
+
+ uassert( 10043 , "group() can't handle more than 20000 unique keys" , n <= 20000 );
+ }
+
+ s->setObject( "obj" , obj , true );
+ s->setNumber( "n" , n - 1 );
+ if ( s->invoke( f , BSONObj() , 0 , true ) ) {
+ throw UserException( 9010 , (string)"reduce invoke failed: " + s->getError() );
+ }
+ }
+
+ if (!finalize.empty()) {
+ s->exec( "$finalize = " + finalize , "finalize define" , false , true , true , 100 );
+ ScriptingFunction g = s->createFunction(
+ "function(){ "
+ " for(var i=0; i < $arr.length; i++){ "
+ " var ret = $finalize($arr[i]); "
+ " if (ret !== undefined) "
+ " $arr[i] = ret; "
+ " } "
+ "}" );
+ s->invoke( g , BSONObj() , 0 , true );
+ }
+
+ result.appendArray( "retval" , s->getObject( "$arr" ) );
+ result.append( "count" , keynum - 1 );
+ result.append( "keys" , (int)(map.size()) );
+ s->exec( "$arr = [];" , "reduce setup 2" , false , true , true , 100 );
+ s->gc();
+
+ return true;
+ }
+
+ bool run(const string& dbname, BSONObj& jsobj, string& errmsg, BSONObjBuilder& result, bool fromRepl ) {
+
+ /* db.$cmd.findOne( { group : <p> } ) */
+ const BSONObj& p = jsobj.firstElement().embeddedObjectUserCheck();
+
+ BSONObj q;
+ if ( p["cond"].type() == Object )
+ q = p["cond"].embeddedObject();
+ else if ( p["condition"].type() == Object )
+ q = p["condition"].embeddedObject();
+ else
+ q = getQuery( p );
+
+ if ( p["ns"].type() != String ) {
+ errmsg = "ns has to be set";
+ return false;
+ }
+
+ string ns = dbname + "." + p["ns"].String();
+
+ BSONObj key;
+ string keyf;
+ if ( p["key"].type() == Object ) {
+ key = p["key"].embeddedObjectUserCheck();
+ if ( ! p["$keyf"].eoo() ) {
+ errmsg = "can't have key and $keyf";
+ return false;
+ }
+ }
+ else if ( p["$keyf"].type() ) {
+ keyf = p["$keyf"]._asCode();
+ }
+ else {
+ // no key specified, will use entire object as key
+ }
+
+ BSONElement reduce = p["$reduce"];
+ if ( reduce.eoo() ) {
+ errmsg = "$reduce has to be set";
+ return false;
+ }
+
+ BSONElement initial = p["initial"];
+ if ( initial.type() != Object ) {
+ errmsg = "initial has to be an object";
+ return false;
+ }
+
+
+ string finalize;
+ if (p["finalize"].type())
+ finalize = p["finalize"]._asCode();
+
+ return group( dbname , ns , q ,
+ key , keyf , reduce._asCode() , reduce.type() != CodeWScope ? 0 : reduce.codeWScopeScopeData() ,
+ initial.embeddedObject() , finalize ,
+ errmsg , result );
+ }
+
+ } cmdGroup;
+
+
+} // namespace mongo
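
run() above expects the whole group spec under a single 'group' field: ns, an optional cond/condition filter, either key or $keyf, $reduce, initial, and an optional finalize. A sketch of building that spec from C++; the collection and field names are illustrative:

    // sketch only: the command document shape consumed by GroupCommand::run()
    #include "client/dbclient.h"

    using namespace mongo;

    BSONObj buildGroupCommandExample() {
        return BSON( "group" << BSON(
                   "ns"      << "orders"                       // collection within the db
                << "key"     << BSON( "customerId" << 1 )      // grouping key (or use $keyf)
                << "cond"    << BSON( "status" << "shipped" )  // optional filter
                << "$reduce" << "function(obj, prev){ prev.total += obj.amount; }"
                << "initial" << BSON( "total" << 0 ) ) );
        // the reply carries retval (the grouped array), count and keys
    }
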
diff --git a/db/commands/isself.cpp b/db/commands/isself.cpp
new file mode 100644
index 0000000..b97f51e
--- /dev/null
+++ b/db/commands/isself.cpp
@@ -0,0 +1,220 @@
+// isself.cpp
+
+#include "pch.h"
+#include "../../util/message.h"
+#include "../commands.h"
+#include "../../client/dbclient.h"
+
+#ifndef _WIN32
+# ifndef __sunos__
+# include <ifaddrs.h>
+# endif
+# include <sys/resource.h>
+# include <sys/stat.h>
+#endif
+
+
+namespace mongo {
+
+#if !defined(_WIN32) && !defined(__sunos__)
+
+ vector<string> getMyAddrs() {
+ ifaddrs * addrs;
+
+ int status = getifaddrs(&addrs);
+ massert(13469, "getifaddrs failure: " + errnoWithDescription(errno), status == 0);
+
+ vector<string> out;
+
+ // based on example code from linux getifaddrs manpage
+ for (ifaddrs * addr = addrs; addr != NULL; addr = addr->ifa_next) {
+ if ( addr->ifa_addr == NULL ) continue;
+ int family = addr->ifa_addr->sa_family;
+ char host[NI_MAXHOST];
+
+ if (family == AF_INET || family == AF_INET6) {
+ status = getnameinfo(addr->ifa_addr,
+ (family == AF_INET ? sizeof(struct sockaddr_in) : sizeof(struct sockaddr_in6)),
+ host, NI_MAXHOST, NULL, 0, NI_NUMERICHOST);
+ if ( status != 0 ) {
+ freeifaddrs( addrs );
+ addrs = NULL;
+ msgasserted( 13470, string("getnameinfo() failed: ") + gai_strerror(status) );
+ }
+
+ out.push_back(host);
+ }
+
+ }
+
+ freeifaddrs( addrs );
+ addrs = NULL;
+
+ if (logLevel >= 1) {
+ log(1) << "getMyAddrs():";
+ for (vector<string>::const_iterator it=out.begin(), end=out.end(); it!=end; ++it) {
+ log(1) << " [" << *it << ']';
+ }
+ log(1) << endl;
+ }
+
+ return out;
+ }
+
+ vector<string> getAllIPs(StringData iporhost) {
+ addrinfo* addrs = NULL;
+ addrinfo hints;
+ memset(&hints, 0, sizeof(addrinfo));
+ hints.ai_socktype = SOCK_STREAM;
+ hints.ai_family = (IPv6Enabled() ? AF_UNSPEC : AF_INET);
+
+ static string portNum = BSONObjBuilder::numStr(cmdLine.port);
+
+ vector<string> out;
+
+ int ret = getaddrinfo(iporhost.data(), portNum.c_str(), &hints, &addrs);
+ if ( ret ) {
+ warning() << "getaddrinfo(\"" << iporhost.data() << "\") failed: " << gai_strerror(ret) << endl;
+ return out;
+ }
+
+ for (addrinfo* addr = addrs; addr != NULL; addr = addr->ai_next) {
+ int family = addr->ai_family;
+ char host[NI_MAXHOST];
+
+ if (family == AF_INET || family == AF_INET6) {
+ int status = getnameinfo(addr->ai_addr, addr->ai_addrlen, host, NI_MAXHOST, NULL, 0, NI_NUMERICHOST);
+
+ massert(13472, string("getnameinfo() failed: ") + gai_strerror(status), status == 0);
+
+ out.push_back(host);
+ }
+
+ }
+
+ freeaddrinfo(addrs);
+
+ if (logLevel >= 1) {
+ log(1) << "getallIPs(\"" << iporhost << "\"):";
+ for (vector<string>::const_iterator it=out.begin(), end=out.end(); it!=end; ++it) {
+ log(1) << " [" << *it << ']';
+ }
+ log(1) << endl;
+ }
+
+ return out;
+ }
+#endif
+
+
+ class IsSelfCommand : public Command {
+ public:
+ IsSelfCommand() : Command("_isSelf") , _cacheLock( "IsSelfCommand::_cacheLock" ) {}
+ virtual bool slaveOk() const { return true; }
+ virtual LockType locktype() const { return NONE; }
+ virtual void help( stringstream &help ) const {
+ help << "{ _isSelf : 1 } INTERNAL ONLY";
+ }
+
+ bool run(const string& dbname, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl ) {
+ init();
+ result.append( "id" , _id );
+ return true;
+ }
+
+ void init() {
+ scoped_lock lk( _cacheLock );
+ if ( ! _id.isSet() )
+ _id.init();
+ }
+
+ OID _id;
+
+ mongo::mutex _cacheLock;
+ map<string,bool> _cache;
+ } isSelfCommand;
+
+ bool HostAndPort::isSelf() const {
+
+ int p = _port == -1 ? CmdLine::DefaultDBPort : _port;
+
+ if( p != cmdLine.port ) {
+ // shortcut - ports have to match at the very least
+ return false;
+ }
+
+ string host = str::stream() << _host << ":" << p;
+
+ {
+ // check cache for this host
+ // debatably something _could_ change, but I'm not sure right now (erh 10/14/2010)
+ scoped_lock lk( isSelfCommand._cacheLock );
+ map<string,bool>::const_iterator i = isSelfCommand._cache.find( host );
+ if ( i != isSelfCommand._cache.end() )
+ return i->second;
+ }
+
+#if !defined(_WIN32) && !defined(__sunos__)
+ // on linux and os x we can do a quick check for an ip match
+
+ const vector<string> myaddrs = getMyAddrs();
+ const vector<string> addrs = getAllIPs(_host);
+
+ for (vector<string>::const_iterator i=myaddrs.begin(), iend=myaddrs.end(); i!=iend; ++i) {
+ for (vector<string>::const_iterator j=addrs.begin(), jend=addrs.end(); j!=jend; ++j) {
+ string a = *i;
+ string b = *j;
+
+ if ( a == b ||
+ ( str::startsWith( a , "127." ) && str::startsWith( b , "127." ) ) // 127. is all loopback
+ ) {
+
+ // add to cache
+ scoped_lock lk( isSelfCommand._cacheLock );
+ isSelfCommand._cache[host] = true;
+ return true;
+ }
+ }
+ }
+
+#endif
+
+ if ( ! Listener::getTimeTracker() ) {
+ // this ensures we are actually running a server
+ // this may return true later, so may want to retry
+ return false;
+ }
+
+
+ try {
+
+ isSelfCommand.init();
+
+ DBClientConnection conn;
+ string errmsg;
+ if ( ! conn.connect( host , errmsg ) ) {
+ // should this go in the cache?
+ return false;
+ }
+
+ BSONObj out;
+ bool ok = conn.simpleCommand( "admin" , &out , "_isSelf" );
+
+ bool me = ok && out["id"].type() == jstOID && isSelfCommand._id == out["id"].OID();
+
+ // add to cache
+ scoped_lock lk( isSelfCommand._cacheLock );
+ isSelfCommand._cache[host] = me;
+
+ return me;
+ }
+ catch ( std::exception& e ) {
+            warning() << "couldn't check isSelf (" << host << ") " << e.what() << endl;
+ }
+
+ return false;
+ }
+
+
+
+}
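
HostAndPort::isSelf() is the piece replica set code uses to find which configured member refers to the local process: a port check and local-interface IP comparison first, then (with the answer cached) a { _isSelf : 1 } round trip comparing OIDs. A small caller-side sketch, assuming this tree's HostAndPort; the member list is illustrative:

    // sketch only: locating ourselves in a list of configured members
    #include "util/message.h"   // HostAndPort lives here in this tree
    #include <vector>

    namespace mongo {
        int findSelfIndexExample( const std::vector<HostAndPort>& members ) {
            for ( size_t i = 0; i < members.size(); i++ ) {
                // cheap checks first, then the cached { _isSelf : 1 } round trip
                if ( members[i].isSelf() )
                    return (int)i;
            }
            return -1;   // none of the entries is this process
        }
    }
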
diff --git a/db/commands/mr.cpp b/db/commands/mr.cpp
new file mode 100644
index 0000000..16c604a
--- /dev/null
+++ b/db/commands/mr.cpp
@@ -0,0 +1,1074 @@
+// mr.cpp
+
+/**
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "pch.h"
+#include "../db.h"
+#include "../instance.h"
+#include "../commands.h"
+#include "../../scripting/engine.h"
+#include "../../client/dbclient.h"
+#include "../../client/connpool.h"
+#include "../../client/parallel.h"
+#include "../queryoptimizer.h"
+#include "../matcher.h"
+#include "../clientcursor.h"
+#include "../replpair.h"
+#include "../../s/d_chunk_manager.h"
+#include "../../s/d_logic.h"
+
+#include "mr.h"
+
+namespace mongo {
+
+ namespace mr {
+
+ AtomicUInt Config::JOB_NUMBER;
+
+ JSFunction::JSFunction( string type , const BSONElement& e ) {
+ _type = type;
+ _code = e._asCode();
+
+ if ( e.type() == CodeWScope )
+ _wantedScope = e.codeWScopeObject();
+ }
+
+ void JSFunction::init( State * state ) {
+ _scope = state->scope();
+ assert( _scope );
+ _scope->init( &_wantedScope );
+
+ _func = _scope->createFunction( _code.c_str() );
+ uassert( 13598 , str::stream() << "couldn't compile code for: " << _type , _func );
+ }
+
+ void JSMapper::init( State * state ) {
+ _func.init( state );
+ _params = state->config().mapParams;
+ }
+
+ /**
+ * Applies the map function to an object, which should internally call emit()
+ */
+ void JSMapper::map( const BSONObj& o ) {
+ Scope * s = _func.scope();
+ assert( s );
+ s->setThis( &o );
+ if ( s->invoke( _func.func() , _params , 0 , true ) )
+ throw UserException( 9014, str::stream() << "map invoke failed: " + s->getError() );
+ }
+
+ /**
+ * Applies the finalize function to a tuple obj (key, val)
+ * Returns tuple obj {_id: key, value: newval}
+ */
+ BSONObj JSFinalizer::finalize( const BSONObj& o ) {
+ Scope * s = _func.scope();
+
+ Scope::NoDBAccess no = s->disableDBAccess( "can't access db inside finalize" );
+ s->invokeSafe( _func.func() , o );
+
+ // don't want to use o.objsize() to size b
+ // since there are many cases where the point of finalize
+ // is converting many fields to 1
+ BSONObjBuilder b;
+ b.append( o.firstElement() );
+ s->append( b , "value" , "return" );
+ return b.obj();
+ }
+
+ /**
+ * Reduces a list of tuple objects (key, value) to a single tuple {"0": key, "1": value}
+ */
+ BSONObj JSReducer::reduce( const BSONList& tuples ) {
+ if (tuples.size() <= 1)
+ return tuples[0];
+ BSONObj key;
+ int endSizeEstimate = 16;
+ _reduce( tuples , key , endSizeEstimate );
+
+ BSONObjBuilder b(endSizeEstimate);
+ b.appendAs( key.firstElement() , "0" );
+ _func.scope()->append( b , "1" , "return" );
+ return b.obj();
+ }
+
+ /**
+ * Reduces a list of tuple objects (key, value) to a single tuple {_id: key, value: val}
+ * Also applies a finalizer method if present.
+ */
+ BSONObj JSReducer::finalReduce( const BSONList& tuples , Finalizer * finalizer ) {
+
+ BSONObj res;
+ BSONObj key;
+
+ if (tuples.size() == 1) {
+ // 1 obj, just use it
+ key = tuples[0];
+ BSONObjBuilder b(key.objsize());
+ BSONObjIterator it(key);
+ b.appendAs( it.next() , "_id" );
+ b.appendAs( it.next() , "value" );
+ res = b.obj();
+ }
+ else {
+ // need to reduce
+ int endSizeEstimate = 16;
+ _reduce( tuples , key , endSizeEstimate );
+ BSONObjBuilder b(endSizeEstimate);
+ b.appendAs( key.firstElement() , "_id" );
+ _func.scope()->append( b , "value" , "return" );
+ res = b.obj();
+ }
+
+ if ( finalizer ) {
+ res = finalizer->finalize( res );
+ }
+
+ return res;
+ }
+
+ /**
+ * Actually applies a reduce to a list of tuples (key, value).
+ * After the call, tuples will hold a single tuple {"0": key, "1": value}
+ */
+ void JSReducer::_reduce( const BSONList& tuples , BSONObj& key , int& endSizeEstimate ) {
+ uassert( 10074 , "need values" , tuples.size() );
+
+ int sizeEstimate = ( tuples.size() * tuples.begin()->getField( "value" ).size() ) + 128;
+
+ // need to build the reduce args: ( key, [values] )
+ BSONObjBuilder reduceArgs( sizeEstimate );
+ boost::scoped_ptr<BSONArrayBuilder> valueBuilder;
+ int sizeSoFar = 0;
+ unsigned n = 0;
+ for ( ; n<tuples.size(); n++ ) {
+ BSONObjIterator j(tuples[n]);
+ BSONElement keyE = j.next();
+ if ( n == 0 ) {
+ reduceArgs.append( keyE );
+ key = keyE.wrap();
+ sizeSoFar = 5 + keyE.size();
+ valueBuilder.reset(new BSONArrayBuilder( reduceArgs.subarrayStart( "tuples" ) ));
+ }
+
+ BSONElement ee = j.next();
+
+ uassert( 13070 , "value too large to reduce" , ee.size() < ( BSONObjMaxUserSize / 2 ) );
+
+ if ( sizeSoFar + ee.size() > BSONObjMaxUserSize ) {
+ assert( n > 1 ); // if not, inf. loop
+ break;
+ }
+
+ valueBuilder->append( ee );
+ sizeSoFar += ee.size();
+ }
+ assert(valueBuilder);
+ valueBuilder->done();
+ BSONObj args = reduceArgs.obj();
+
+ Scope * s = _func.scope();
+
+ s->invokeSafe( _func.func() , args );
+
+ if ( s->type( "return" ) == Array ) {
+ uasserted( 10075 , "reduce -> multiple not supported yet");
+ return;
+ }
+
+ endSizeEstimate = key.objsize() + ( args.objsize() / tuples.size() );
+
+ if ( n == tuples.size() )
+ return;
+
+ // the input list was too large; add the remaining elements to a new tuple list and reduce again
+ // note: it would be better to use a loop instead of recursion to avoid stack overflows
+ BSONList x;
+ for ( ; n < tuples.size(); n++ ) {
+ x.push_back( tuples[n] );
+ }
+ BSONObjBuilder temp( endSizeEstimate );
+ temp.append( key.firstElement() );
+ s->append( temp , "1" , "return" );
+ x.push_back( temp.obj() );
+ _reduce( x , key , endSizeEstimate );
+ }
+
+ Config::Config( const string& _dbname , const BSONObj& cmdObj ) {
+
+ dbname = _dbname;
+ ns = dbname + "." + cmdObj.firstElement().valuestr();
+
+ verbose = cmdObj["verbose"].trueValue();
+
+ uassert( 13602 , "outType is no longer a valid option" , cmdObj["outType"].eoo() );
+
+ if ( cmdObj["out"].type() == String ) {
+ finalShort = cmdObj["out"].String();
+ outType = REPLACE;
+ }
+ else if ( cmdObj["out"].type() == Object ) {
+ BSONObj o = cmdObj["out"].embeddedObject();
+
+ BSONElement e = o.firstElement();
+ string t = e.fieldName();
+
+ if ( t == "normal" || t == "replace" ) {
+ outType = REPLACE;
+ finalShort = e.String();
+ }
+ else if ( t == "merge" ) {
+ outType = MERGE;
+ finalShort = e.String();
+ }
+ else if ( t == "reduce" ) {
+ outType = REDUCE;
+ finalShort = e.String();
+ }
+ else if ( t == "inline" ) {
+ outType = INMEMORY;
+ }
+ else {
+ uasserted( 13522 , str::stream() << "unknown out specifier [" << t << "]" );
+ }
+
+ if (o.hasElement("db")) {
+ outDB = o["db"].String();
+ }
+ }
+ else {
+ uasserted( 13606 , "'out' has to be a string or an object" );
+ }
+
+ if ( outType != INMEMORY ) { // setup names
+ tempLong = str::stream() << (outDB.empty() ? dbname : outDB) << ".tmp.mr." << cmdObj.firstElement().String() << "_" << finalShort << "_" << JOB_NUMBER++;
+
+ incLong = tempLong + "_inc";
+
+ finalLong = str::stream() << (outDB.empty() ? dbname : outDB) << "." << finalShort;
+ }
+
+ {
+ // scope and code
+
+ if ( cmdObj["scope"].type() == Object )
+ scopeSetup = cmdObj["scope"].embeddedObjectUserCheck();
+
+ mapper.reset( new JSMapper( cmdObj["map"] ) );
+ reducer.reset( new JSReducer( cmdObj["reduce"] ) );
+ if ( cmdObj["finalize"].type() && cmdObj["finalize"].trueValue() )
+ finalizer.reset( new JSFinalizer( cmdObj["finalize"] ) );
+
+ if ( cmdObj["mapparams"].type() == Array ) {
+ mapParams = cmdObj["mapparams"].embeddedObjectUserCheck();
+ }
+
+ }
+
+ {
+ // query options
+ BSONElement q = cmdObj["query"];
+ if ( q.type() == Object )
+ filter = q.embeddedObjectUserCheck();
+ else
+ uassert( 13608 , "query has to be blank or an Object" , ! q.trueValue() );
+
+
+ BSONElement s = cmdObj["sort"];
+ if ( s.type() == Object )
+ sort = s.embeddedObjectUserCheck();
+ else
+ uassert( 13609 , "sort has to be blank or an Object" , ! s.trueValue() );
+
+ if ( cmdObj["limit"].isNumber() )
+ limit = cmdObj["limit"].numberLong();
+ else
+ limit = 0;
+ }
+ }
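For illustration, a command object this constructor accepts might look like the following, built with the same BSON() stream macro used elsewhere in this file; the collection, database and field values are hypothetical, and map/reduce are shown as plain strings although clients usually send Code-typed elements:

    BSONObj cmd = BSON( "mapreduce" << "events"
                        << "map" << "function(){ emit( this.user , 1 ); }"
                        << "reduce" << "function(k,vals){ var n=0; vals.forEach(function(v){ n+=v; }); return n; }"
                        << "query" << BSON( "type" << "click" )
                        << "sort" << BSON( "ts" << 1 )
                        << "limit" << 1000
                        << "out" << BSON( "merge" << "totals" << "db" << "stats" ) );
    // Config( dbname, cmd ) would set outType = MERGE, finalShort = "totals", outDB = "stats",
    // and derive tempLong / incLong / finalLong from them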
+
+ /**
+ * Create temporary collection, set up indexes
+ */
+ void State::prepTempCollection() {
+ if ( ! _onDisk )
+ return;
+
+ _db.dropCollection( _config.tempLong );
+
+ {
+ // create
+ writelock lock( _config.tempLong.c_str() );
+ Client::Context ctx( _config.tempLong.c_str() );
+ string errmsg;
+ if ( ! userCreateNS( _config.tempLong.c_str() , BSONObj() , errmsg , true ) ) {
+ uasserted( 13630 , str::stream() << "userCreateNS failed for mr tempLong ns: " << _config.tempLong << " err: " << errmsg );
+ }
+ }
+
+
+ {
+ // copy indexes
+ auto_ptr<DBClientCursor> idx = _db.getIndexes( _config.finalLong );
+ while ( idx->more() ) {
+ BSONObj i = idx->next();
+
+ BSONObjBuilder b( i.objsize() + 16 );
+ b.append( "ns" , _config.tempLong );
+ BSONObjIterator j( i );
+ while ( j.more() ) {
+ BSONElement e = j.next();
+ if ( str::equals( e.fieldName() , "_id" ) ||
+ str::equals( e.fieldName() , "ns" ) )
+ continue;
+
+ b.append( e );
+ }
+
+ BSONObj indexToInsert = b.obj();
+ insert( Namespace( _config.tempLong.c_str() ).getSisterNS( "system.indexes" ).c_str() , indexToInsert );
+ }
+
+ }
+
+ }
+
+ /**
+ * For inline mode, appends results to output object.
+ * Makes sure (key, value) tuple is formatted as {_id: key, value: val}
+ */
+ void State::appendResults( BSONObjBuilder& final ) {
+ if ( _onDisk )
+ return;
+
+ uassert( 13604 , "too much data for in memory map/reduce" , _size < ( BSONObjMaxUserSize / 2 ) );
+
+ BSONArrayBuilder b( (int)(_size * 1.2) ); // _size is data size, doesn't count overhead and keys
+
+ for ( InMemory::iterator i=_temp->begin(); i!=_temp->end(); ++i ) {
+ BSONObj key = i->first;
+ BSONList& all = i->second;
+
+ assert( all.size() == 1 );
+
+ BSONObjIterator vi( all[0] );
+ vi.next();
+
+ BSONObjBuilder temp( b.subobjStart() );
+ temp.appendAs( key.firstElement() , "_id" );
+ temp.appendAs( vi.next() , "value" );
+ temp.done();
+ }
+
+ BSONArray res = b.arr();
+ uassert( 13605 , "too much data for in memory map/reduce" , res.objsize() < ( BSONObjMaxUserSize * 2 / 3 ) );
+
+ final.append( "results" , res );
+ }
+
+ /**
+ * Does post processing on output collection.
+ * This may involve replacing, merging or reducing.
+ */
+ long long State::postProcessCollection() {
+ if ( _onDisk == false || _config.outType == Config::INMEMORY )
+ return _temp->size();
+
+ dblock lock;
+
+ if ( _config.finalLong == _config.tempLong )
+ return _db.count( _config.finalLong );
+
+ if ( _config.outType == Config::REPLACE || _db.count( _config.finalLong ) == 0 ) {
+ // replace: just rename from temp to final collection name, dropping previous collection
+ _db.dropCollection( _config.finalLong );
+ BSONObj info;
+ uassert( 10076 , "rename failed" ,
+ _db.runCommand( "admin" , BSON( "renameCollection" << _config.tempLong << "to" << _config.finalLong ) , info ) );
+ _db.dropCollection( _config.tempLong );
+ }
+ else if ( _config.outType == Config::MERGE ) {
+ // merge: upsert new docs into old collection
+ auto_ptr<DBClientCursor> cursor = _db.query( _config.tempLong , BSONObj() );
+ while ( cursor->more() ) {
+ BSONObj o = cursor->next();
+ Helpers::upsert( _config.finalLong , o );
+ getDur().commitIfNeeded();
+ }
+ _db.dropCollection( _config.tempLong );
+ }
+ else if ( _config.outType == Config::REDUCE ) {
+ // reduce: apply reduce op on new result and existing one
+ BSONList values;
+
+ auto_ptr<DBClientCursor> cursor = _db.query( _config.tempLong , BSONObj() );
+ while ( cursor->more() ) {
+ BSONObj temp = cursor->next();
+ BSONObj old;
+
+ bool found;
+ {
+ Client::Context tx( _config.finalLong );
+ found = Helpers::findOne( _config.finalLong.c_str() , temp["_id"].wrap() , old , true );
+ }
+
+ if ( found ) {
+ // need to reduce
+ values.clear();
+ values.push_back( temp );
+ values.push_back( old );
+ Helpers::upsert( _config.finalLong , _config.reducer->finalReduce( values , _config.finalizer.get() ) );
+ }
+ else {
+ Helpers::upsert( _config.finalLong , temp );
+ }
+ getDur().commitIfNeeded();
+ }
+ _db.dropCollection( _config.tempLong );
+ }
+
+ return _db.count( _config.finalLong );
+ }
+
+ /**
+ * Insert doc in collection
+ */
+ void State::insert( const string& ns , BSONObj& o ) {
+ assert( _onDisk );
+
+ writelock l( ns );
+ Client::Context ctx( ns );
+
+ theDataFileMgr.insertAndLog( ns.c_str() , o , false );
+ }
+
+ /**
+ * Insert doc into the inc collection
+ */
+ void State::_insertToInc( BSONObj& o ) {
+ assert( _onDisk );
+ theDataFileMgr.insertWithObjMod( _config.incLong.c_str() , o , true );
+ getDur().commitIfNeeded();
+ }
+
+ State::State( const Config& c ) : _config( c ), _size(0), _numEmits(0) {
+ _temp.reset( new InMemory() );
+ _onDisk = _config.outType != Config::INMEMORY;
+ }
+
+ bool State::sourceExists() {
+ return _db.exists( _config.ns );
+ }
+
+ long long State::incomingDocuments() {
+ return _db.count( _config.ns , _config.filter , QueryOption_SlaveOk , (unsigned) _config.limit );
+ }
+
+ State::~State() {
+ if ( _onDisk ) {
+ try {
+ _db.dropCollection( _config.tempLong );
+ _db.dropCollection( _config.incLong );
+ }
+ catch ( std::exception& e ) {
+ error() << "couldn't cleanup after map reduce: " << e.what() << endl;
+ }
+ }
+ }
+
+ /**
+ * Initialize the mapreduce operation, creating the inc collection
+ */
+ void State::init() {
+ // setup js
+ _scope.reset(globalScriptEngine->getPooledScope( _config.dbname ).release() );
+ _scope->localConnect( _config.dbname.c_str() );
+
+ if ( ! _config.scopeSetup.isEmpty() )
+ _scope->init( &_config.scopeSetup );
+
+ _config.mapper->init( this );
+ _config.reducer->init( this );
+ if ( _config.finalizer )
+ _config.finalizer->init( this );
+
+ _scope->injectNative( "emit" , fast_emit );
+
+ if ( _onDisk ) {
+ // clear temp collections
+ _db.dropCollection( _config.tempLong );
+ _db.dropCollection( _config.incLong );
+
+ // create the inc collection and make sure we have index on "0" key
+ {
+ writelock l( _config.incLong );
+ Client::Context ctx( _config.incLong );
+ string err;
+ if ( ! userCreateNS( _config.incLong.c_str() , BSON( "autoIndexId" << 0 ) , err , false ) ) {
+ uasserted( 13631 , str::stream() << "userCreateNS failed for mr incLong ns: " << _config.incLong << " err: " << err );
+ }
+ }
+
+ BSONObj sortKey = BSON( "0" << 1 );
+ _db.ensureIndex( _config.incLong , sortKey );
+
+ }
+
+ }
+
+ /**
+ * Applies last reduce and finalize on a list of tuples (key, val)
+ * Inserts single result {_id: key, value: val} into temp collection
+ */
+ void State::finalReduce( BSONList& values ) {
+ if ( !_onDisk || values.size() == 0 )
+ return;
+
+ BSONObj res = _config.reducer->finalReduce( values , _config.finalizer.get() );
+ insert( _config.tempLong , res );
+ }
+
+ /**
+ * Applies last reduce and finalize.
+ * After calling this method, the temp collection will be completed.
+ * If inline, the results will be in the in memory map
+ */
+ void State::finalReduce( CurOp * op , ProgressMeterHolder& pm ) {
+ if ( ! _onDisk ) {
+ // all data has already been reduced, just finalize
+ if ( _config.finalizer ) {
+ long size = 0;
+ for ( InMemory::iterator i=_temp->begin(); i!=_temp->end(); ++i ) {
+ BSONObj key = i->first;
+ BSONList& all = i->second;
+
+ assert( all.size() == 1 );
+
+ BSONObj res = _config.finalizer->finalize( all[0] );
+
+ all.clear();
+ all.push_back( res );
+ size += res.objsize();
+ }
+ _size = size;
+ }
+ return;
+ }
+
+ // use index on "0" to pull sorted data
+ assert( _temp->size() == 0 );
+ BSONObj sortKey = BSON( "0" << 1 );
+ {
+ bool foundIndex = false;
+
+ auto_ptr<DBClientCursor> idx = _db.getIndexes( _config.incLong );
+ while ( idx.get() && idx->more() ) {
+ BSONObj x = idx->next();
+ if ( sortKey.woCompare( x["key"].embeddedObject() ) == 0 ) {
+ foundIndex = true;
+ break;
+ }
+ }
+
+ assert( foundIndex );
+ }
+
+ readlock rl( _config.incLong.c_str() );
+ Client::Context ctx( _config.incLong );
+
+ BSONObj prev;
+ BSONList all;
+
+ assert( pm == op->setMessage( "m/r: (3/3) final reduce to collection" , _db.count( _config.incLong, BSONObj(), QueryOption_SlaveOk ) ) );
+
+ shared_ptr<Cursor> temp = bestGuessCursor( _config.incLong.c_str() , BSONObj() , sortKey );
+ auto_ptr<ClientCursor> cursor( new ClientCursor( QueryOption_NoCursorTimeout , temp , _config.incLong.c_str() ) );
+
+ // iterate over all sorted objects
+ while ( cursor->ok() ) {
+ BSONObj o = cursor->current().getOwned();
+ cursor->advance();
+
+ pm.hit();
+
+ if ( o.woSortOrder( prev , sortKey ) == 0 ) {
+ // object is same as previous, add to array
+ all.push_back( o );
+ if ( pm->hits() % 1000 == 0 ) {
+ if ( ! cursor->yield() ) {
+ cursor.release();
+ break;
+ }
+ killCurrentOp.checkForInterrupt();
+ }
+ continue;
+ }
+
+ ClientCursor::YieldLock yield (cursor.get());
+ // reduce and finalize array
+ finalReduce( all );
+
+ all.clear();
+ prev = o;
+ all.push_back( o );
+
+ if ( ! yield.stillOk() ) {
+ cursor.release();
+ break;
+ }
+
+ killCurrentOp.checkForInterrupt();
+ }
+
+ // we need to release the cursor here since we temporarily release the lock below
+ cursor.release();
+
+ {
+ dbtempreleasecond tl;
+ if ( ! tl.unlocked() )
+ log( LL_WARNING ) << "map/reduce can't temp release" << endl;
+ // reduce and finalize last array
+ finalReduce( all );
+ }
+
+ pm.finished();
+ }
+
+ /**
+ * Attempts to reduce objects in the memory map.
+ * A new memory map will be created to hold the results.
+ * If applicable, objects with unique key may be dumped to inc collection.
+ * Input and output objects are both {"0": key, "1": val}
+ */
+ void State::reduceInMemory() {
+
+ auto_ptr<InMemory> n( new InMemory() ); // for new data
+ long nSize = 0;
+ long dupCount = 0;
+
+ for ( InMemory::iterator i=_temp->begin(); i!=_temp->end(); ++i ) {
+ BSONObj key = i->first;
+ BSONList& all = i->second;
+
+ if ( all.size() == 1 ) {
+ // only 1 value for this key
+ if ( _onDisk ) {
+ // this key has low cardinality, so just write to collection
+ writelock l(_config.incLong);
+ Client::Context ctx(_config.incLong.c_str());
+ _insertToInc( *(all.begin()) );
+ }
+ else {
+ // add to new map
+ _add( n.get() , all[0] , nSize, dupCount );
+ }
+ }
+ else if ( all.size() > 1 ) {
+ // several values, reduce and add to map
+ BSONObj res = _config.reducer->reduce( all );
+ _add( n.get() , res , nSize, dupCount );
+ }
+ }
+
+ // swap maps
+ _temp.reset( n.release() );
+ _size = nSize;
+ _dupCount = dupCount;
+ }
+
+ /**
+ * Dumps the entire in memory map to the inc collection.
+ */
+ void State::dumpToInc() {
+ if ( ! _onDisk )
+ return;
+
+ writelock l(_config.incLong);
+ Client::Context ctx(_config.incLong);
+
+ for ( InMemory::iterator i=_temp->begin(); i!=_temp->end(); i++ ) {
+ BSONList& all = i->second;
+ if ( all.size() < 1 )
+ continue;
+
+ for ( BSONList::iterator j=all.begin(); j!=all.end(); j++ )
+ _insertToInc( *j );
+ }
+ _temp->clear();
+ _size = 0;
+
+ }
+
+ /**
+ * Adds object to in memory map
+ */
+ void State::emit( const BSONObj& a ) {
+ _numEmits++;
+ _add( _temp.get() , a , _size, _dupCount );
+ }
+
+ void State::_add( InMemory* im, const BSONObj& a , long& size, long& dupCount ) {
+ BSONList& all = (*im)[a];
+ all.push_back( a );
+ size += a.objsize() + 16;
+ if (all.size() > 1)
+ ++dupCount;
+ }
+
+ /**
+ * this method checks the size of the in-memory map and potentially flushes it to disk
+ */
+ void State::checkSize() {
+ if ( _size < 1024 * 50 )
+ return;
+
+ // attempt to reduce in memory map, if we've seen duplicates
+ if ( _dupCount > 0) {
+ long before = _size;
+ reduceInMemory();
+ log(1) << " mr: did reduceInMemory " << before << " -->> " << _size << endl;
+ }
+
+ if ( ! _onDisk || _size < 1024 * 100 )
+ return;
+
+ dumpToInc();
+ log(1) << " mr: dumping to db" << endl;
+ }
+
+ boost::thread_specific_ptr<State*> _tl;
+
+ /**
+ * the emit function injected into the js scope and called by the map function
+ */
+ BSONObj fast_emit( const BSONObj& args ) {
+ uassert( 10077 , "fast_emit takes 2 args" , args.nFields() == 2 );
+ uassert( 13069 , "an emit can't be more than half max bson size" , args.objsize() < ( BSONObjMaxUserSize / 2 ) );
+ (*_tl)->emit( args );
+ return BSONObj();
+ }
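For illustration, the tuple shape fast_emit hands to State::emit() for an emit( key , value ) call from the map function (the key and value here are hypothetical):

    BSONObj emitted = BSON( "0" << "alice" << "1" << 1 );   // {"0": key, "1": value}
    // State::_add() uses the whole tuple as the InMemory map key, but TupleKeyCmp orders
    // entries by the first element only, so emits sharing a key collect into one BSONList;
    // a second emit with the same key bumps _dupCount, which is what later makes
    // checkSize() attempt reduceInMemory()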
+
+ /**
+ * This class represents a map/reduce command executed on a single server
+ */
+ class MapReduceCommand : public Command {
+ public:
+ MapReduceCommand() : Command("mapReduce", false, "mapreduce") {}
+ virtual bool slaveOk() const { return !replSet; }
+ virtual bool slaveOverrideOk() { return true; }
+
+ virtual void help( stringstream &help ) const {
+ help << "Run a map/reduce operation on the server.\n";
+ help << "Note this is used for aggregation, not querying, in MongoDB.\n";
+ help << "http://www.mongodb.org/display/DOCS/MapReduce";
+ }
+ virtual LockType locktype() const { return NONE; }
+ bool run(const string& dbname , BSONObj& cmd, string& errmsg, BSONObjBuilder& result, bool fromRepl ) {
+ Timer t;
+ Client::GodScope cg;
+ Client& client = cc();
+ CurOp * op = client.curop();
+
+ Config config( dbname , cmd );
+
+ log(1) << "mr ns: " << config.ns << endl;
+
+ bool shouldHaveData = false;
+
+ long long num = 0;
+ long long inReduce = 0;
+
+ BSONObjBuilder countsBuilder;
+ BSONObjBuilder timingBuilder;
+ State state( config );
+
+ if ( ! state.sourceExists() ) {
+ errmsg = "ns doesn't exist";
+ return false;
+ }
+
+ if (replSet && state.isOnDisk()) {
+ // this means it will be doing a write operation, so make sure we are on master
+ // ideally this check should be in slaveOk(), but at that point config is not known
+ if (!isMaster(dbname.c_str())) {
+ errmsg = "not master";
+ return false;
+ }
+ }
+
+ try {
+ state.init();
+
+ {
+ State** s = new State*();
+ s[0] = &state;
+ _tl.reset( s );
+ }
+
+ wassert( config.limit < 0x4000000 ); // limit is cast to a 32 bit unsigned in incomingDocuments() below
+ ProgressMeterHolder pm( op->setMessage( "m/r: (1/3) emit phase" , state.incomingDocuments() ) );
+ long long mapTime = 0;
+ {
+ readlock lock( config.ns );
+ Client::Context ctx( config.ns );
+
+ ShardChunkManagerPtr chunkManager;
+ if ( shardingState.needShardChunkManager( config.ns ) ) {
+ chunkManager = shardingState.getShardChunkManager( config.ns );
+ }
+
+ // obtain cursor on data to apply mr to, sorted
+ shared_ptr<Cursor> temp = bestGuessCursor( config.ns.c_str(), config.filter, config.sort );
+ auto_ptr<ClientCursor> cursor( new ClientCursor( QueryOption_NoCursorTimeout , temp , config.ns.c_str() ) );
+
+ Timer mt;
+ // go through each doc
+ while ( cursor->ok() ) {
+ // make sure we don't process duplicates in case data gets moved around during map
+ if ( cursor->currentIsDup() ) {
+ cursor->advance();
+ continue;
+ }
+
+ if ( ! cursor->currentMatches() ) {
+ cursor->advance();
+ continue;
+ }
+
+ BSONObj o = cursor->current();
+ cursor->advance();
+
+ // check to see if this is a new object we don't own yet
+ // because of a chunk migration
+ if ( chunkManager && ! chunkManager->belongsToMe( o ) )
+ continue;
+
+ // do map
+ if ( config.verbose ) mt.reset();
+ config.mapper->map( o );
+ if ( config.verbose ) mapTime += mt.micros();
+
+ num++;
+ if ( num % 100 == 0 ) {
+ // try to yield lock regularly
+ ClientCursor::YieldLock yield (cursor.get());
+ Timer t;
+ // check if map needs to be dumped to disk
+ state.checkSize();
+ inReduce += t.micros();
+
+ if ( ! yield.stillOk() ) {
+ cursor.release();
+ break;
+ }
+
+ killCurrentOp.checkForInterrupt();
+ }
+ pm.hit();
+
+ if ( config.limit && num >= config.limit )
+ break;
+ }
+ }
+ pm.finished();
+
+ killCurrentOp.checkForInterrupt();
+ // update counters
+ countsBuilder.appendNumber( "input" , num );
+ countsBuilder.appendNumber( "emit" , state.numEmits() );
+ if ( state.numEmits() )
+ shouldHaveData = true;
+
+ timingBuilder.append( "mapTime" , mapTime / 1000 );
+ timingBuilder.append( "emitLoop" , t.millis() );
+
+ op->setMessage( "m/r: (2/3) final reduce in memory" );
+ // do reduce in memory
+ // this will be the last reduce needed for inline mode
+ state.reduceInMemory();
+ // if not inline: dump the in memory map to inc collection, all data is on disk
+ state.dumpToInc();
+ state.prepTempCollection();
+ // final reduce
+ state.finalReduce( op , pm );
+
+ _tl.reset();
+ }
+ catch ( ... ) {
+ log() << "mr failed, removing collection" << endl;
+ throw;
+ }
+
+ long long finalCount = state.postProcessCollection();
+ state.appendResults( result );
+
+ timingBuilder.append( "total" , t.millis() );
+
+ if (!config.outDB.empty()) {
+ BSONObjBuilder loc;
+ if ( !config.outDB.empty())
+ loc.append( "db" , config.outDB );
+ if ( !config.finalShort.empty() )
+ loc.append( "collection" , config.finalShort );
+ result.append("result", loc.obj());
+ }
+ else {
+ if ( !config.finalShort.empty() )
+ result.append( "result" , config.finalShort );
+ }
+ result.append( "timeMillis" , t.millis() );
+ countsBuilder.appendNumber( "output" , finalCount );
+ if ( config.verbose ) result.append( "timing" , timingBuilder.obj() );
+ result.append( "counts" , countsBuilder.obj() );
+
+ if ( finalCount == 0 && shouldHaveData ) {
+ result.append( "cmd" , cmd );
+ errmsg = "there were emits but no data!";
+ return false;
+ }
+
+ return true;
+ }
+
+ } mapReduceCommand;
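Taken together, run() drives State through a fixed sequence; a minimal sketch of that flow, with locking, yielding, progress metering and error handling omitted (so not a drop-in substitute for the code above):

    State state( config );
    state.init();                                 // compile map/reduce/finalize, create inc collection
    // emit phase: config.mapper->map( o ) runs for every matching document,
    // with state.checkSize() called periodically to reduce/spill the in-memory map
    state.reduceInMemory();                       // collapse duplicate keys still held in RAM
    state.dumpToInc();                            // spill remaining tuples to the inc collection
    state.prepTempCollection();                   // create temp collection, copy indexes from final
    state.finalReduce( op , pm );                 // sorted scan of inc; reduce + finalize into temp
    long long n = state.postProcessCollection();  // rename/merge/reduce temp into the final collection
    state.appendResults( result );                // inline mode: results go into the reply instead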
+
+ /**
+ * This class represents a map/reduce command executed on the output server of a sharded environment
+ */
+ class MapReduceFinishCommand : public Command {
+ public:
+ MapReduceFinishCommand() : Command( "mapreduce.shardedfinish" ) {}
+ virtual bool slaveOk() const { return !replSet; }
+ virtual bool slaveOverrideOk() { return true; }
+
+ virtual LockType locktype() const { return NONE; }
+ bool run(const string& dbname , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool) {
+ string shardedOutputCollection = cmdObj["shardedOutputCollection"].valuestrsafe();
+
+ Config config( dbname , cmdObj.firstElement().embeddedObjectUserCheck() );
+ config.incLong = config.tempLong;
+
+ set<ServerAndQuery> servers;
+
+ BSONObjBuilder shardCounts;
+ map<string,long long> counts;
+
+ BSONObj shards = cmdObj["shards"].embeddedObjectUserCheck();
+ vector< auto_ptr<DBClientCursor> > shardCursors;
+
+ {
+ // parse per shard results
+ BSONObjIterator i( shards );
+ while ( i.more() ) {
+ BSONElement e = i.next();
+ string shard = e.fieldName();
+
+ BSONObj res = e.embeddedObjectUserCheck();
+
+ uassert( 10078 , "something bad happened" , shardedOutputCollection == res["result"].valuestrsafe() );
+ servers.insert( shard );
+ shardCounts.appendAs( res["counts"] , shard );
+
+ BSONObjIterator j( res["counts"].embeddedObjectUserCheck() );
+ while ( j.more() ) {
+ BSONElement temp = j.next();
+ counts[temp.fieldName()] += temp.numberLong();
+ }
+
+ }
+
+ }
+
+ State state(config);
+ state.prepTempCollection();
+
+ {
+ // reduce from each stream
+
+ BSONObj sortKey = BSON( "_id" << 1 );
+
+ ParallelSortClusteredCursor cursor( servers , dbname + "." + shardedOutputCollection ,
+ Query().sort( sortKey ) );
+ cursor.init();
+ state.init();
+
+ BSONList values;
+ if (!config.outDB.empty()) {
+ BSONObjBuilder loc;
+ if ( !config.outDB.empty())
+ loc.append( "db" , config.outDB );
+ if ( !config.finalShort.empty() )
+ loc.append( "collection" , config.finalShort );
+ result.append("result", loc.obj());
+ }
+ else {
+ if ( !config.finalShort.empty() )
+ result.append( "result" , config.finalShort );
+ }
+
+ while ( cursor.more() ) {
+ BSONObj t = cursor.next().getOwned();
+
+ if ( values.size() == 0 ) {
+ values.push_back( t );
+ continue;
+ }
+
+ if ( t.woSortOrder( *(values.begin()) , sortKey ) == 0 ) {
+ values.push_back( t );
+ continue;
+ }
+
+
+ state.emit( config.reducer->finalReduce( values , config.finalizer.get() ) );
+ values.clear();
+ values.push_back( t );
+ }
+
+ if ( values.size() )
+ state.emit( config.reducer->finalReduce( values , config.finalizer.get() ) );
+ }
+
+
+ state.dumpToInc();
+ state.postProcessCollection();
+ state.appendResults( result );
+
+ for ( set<ServerAndQuery>::iterator i=servers.begin(); i!=servers.end(); i++ ) {
+ ScopedDbConnection conn( i->_server );
+ conn->dropCollection( dbname + "." + shardedOutputCollection );
+ conn.done();
+ }
+
+ result.append( "shardCounts" , shardCounts.obj() );
+
+ {
+ BSONObjBuilder c;
+ for ( map<string,long long>::iterator i=counts.begin(); i!=counts.end(); i++ ) {
+ c.append( i->first , i->second );
+ }
+ result.append( "counts" , c.obj() );
+ }
+
+ return 1;
+ }
+ } mapReduceFinishCommand;
+
+ }
+
+}
+
diff --git a/db/commands/mr.h b/db/commands/mr.h
new file mode 100644
index 0000000..f505a45
--- /dev/null
+++ b/db/commands/mr.h
@@ -0,0 +1,291 @@
+// mr.h
+
+/**
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "pch.h"
+
+namespace mongo {
+
+ namespace mr {
+
+ typedef vector<BSONObj> BSONList;
+
+ class State;
+
+ // ------------ function interfaces -----------
+
+ class Mapper : boost::noncopyable {
+ public:
+ virtual ~Mapper() {}
+ virtual void init( State * state ) = 0;
+
+ virtual void map( const BSONObj& o ) = 0;
+ };
+
+ class Finalizer : boost::noncopyable {
+ public:
+ virtual ~Finalizer() {}
+ virtual void init( State * state ) = 0;
+
+ /**
+ * this takes a tuple and returns a tuple
+ */
+ virtual BSONObj finalize( const BSONObj& tuple ) = 0;
+ };
+
+ class Reducer : boost::noncopyable {
+ public:
+ virtual ~Reducer() {}
+ virtual void init( State * state ) = 0;
+
+ virtual BSONObj reduce( const BSONList& tuples ) = 0;
+ /** this means it's a final reduce, even if there is no finalizer */
+ virtual BSONObj finalReduce( const BSONList& tuples , Finalizer * finalizer ) = 0;
+ };
+
+ // ------------ js function implementations -----------
+
+ /**
+ * used as a holder for Scope and ScriptingFunction
+ * visitor-like pattern, as the Scope is obtained on first access
+ */
+ class JSFunction : boost::noncopyable {
+ public:
+ /**
+ * @param type (map|reduce|finalize)
+ */
+ JSFunction( string type , const BSONElement& e );
+ virtual ~JSFunction() {}
+
+ virtual void init( State * state );
+
+ Scope * scope() const { return _scope; }
+ ScriptingFunction func() const { return _func; }
+
+ private:
+ string _type;
+ string _code; // actual javascript code
+ BSONObj _wantedScope; // this is for CodeWScope
+
+ Scope * _scope; // this is not owned by us, and might be shared
+ ScriptingFunction _func;
+ };
+
+ class JSMapper : public Mapper {
+ public:
+ JSMapper( const BSONElement & code ) : _func( "map" , code ) {}
+ virtual void map( const BSONObj& o );
+ virtual void init( State * state );
+
+ private:
+ JSFunction _func;
+ BSONObj _params;
+ };
+
+ class JSReducer : public Reducer {
+ public:
+ JSReducer( const BSONElement& code ) : _func( "reduce" , code ) {}
+ virtual void init( State * state ) { _func.init( state ); }
+
+ virtual BSONObj reduce( const BSONList& tuples );
+ virtual BSONObj finalReduce( const BSONList& tuples , Finalizer * finalizer );
+
+ private:
+
+ /**
+ * result in "return"
+ * @param key OUT
+ * @param endSizeEstimate OUT
+ */
+ void _reduce( const BSONList& values , BSONObj& key , int& endSizeEstimate );
+
+ JSFunction _func;
+
+ };
+
+ class JSFinalizer : public Finalizer {
+ public:
+ JSFinalizer( const BSONElement& code ) : _func( "finalize" , code ) {}
+ virtual BSONObj finalize( const BSONObj& o );
+ virtual void init( State * state ) { _func.init( state ); }
+ private:
+ JSFunction _func;
+
+ };
+
+ // -----------------
+
+
+ class TupleKeyCmp {
+ public:
+ TupleKeyCmp() {}
+ bool operator()( const BSONObj &l, const BSONObj &r ) const {
+ return l.firstElement().woCompare( r.firstElement() ) < 0;
+ }
+ };
+
+ typedef map< BSONObj,BSONList,TupleKeyCmp > InMemory; // from key to list of tuples
+
+ /**
+ * holds map/reduce config information
+ */
+ class Config {
+ public:
+ Config( const string& _dbname , const BSONObj& cmdObj );
+
+ string dbname;
+ string ns;
+
+ // options
+ bool verbose;
+
+ // query options
+
+ BSONObj filter;
+ BSONObj sort;
+ long long limit;
+
+ // functions
+
+ scoped_ptr<Mapper> mapper;
+ scoped_ptr<Reducer> reducer;
+ scoped_ptr<Finalizer> finalizer;
+
+ BSONObj mapParams;
+ BSONObj scopeSetup;
+
+ // output tables
+ string incLong;
+ string tempLong;
+
+ string finalShort;
+ string finalLong;
+
+ string outDB;
+
+ enum { REPLACE , // atomically replace the collection
+ MERGE , // merge keys, override dups
+ REDUCE , // merge keys, reduce dups
+ INMEMORY // only store in memory, limited in size
+ } outType;
+
+ static AtomicUInt JOB_NUMBER;
+ }; // end Config
+
+ /**
+ * stores information about intermediate map reduce state
+ * controls flow of data from map->reduce->finalize->output
+ */
+ class State {
+ public:
+ State( const Config& c );
+ ~State();
+
+ void init();
+
+ // ---- prep -----
+ bool sourceExists();
+
+ long long incomingDocuments();
+
+ // ---- map stage ----
+
+ /**
+ * stages an emitted (key, value) tuple in in-memory storage
+ */
+ void emit( const BSONObj& a );
+
+ /**
+ * if size is big, run a reduce
+ * if it's still big, dump to the inc collection
+ */
+ void checkSize();
+
+ /**
+ * run reduce on _temp
+ */
+ void reduceInMemory();
+
+ /**
+ * transfers in-memory storage to the inc collection
+ */
+ void dumpToInc();
+
+ // ------ reduce stage -----------
+
+ void prepTempCollection();
+
+ void finalReduce( BSONList& values );
+
+ void finalReduce( CurOp * op , ProgressMeterHolder& pm );
+
+ // ------- cleanup/data positioning ----------
+
+ /**
+ @return number of objects in collection
+ */
+ long long postProcessCollection();
+
+ /**
+ * if INMEMORY, appends the results array to b
+ * may also append stats or anything else it likes
+ */
+ void appendResults( BSONObjBuilder& b );
+
+ // -------- util ------------
+
+ /**
+ * inserts with correct replication semantics
+ */
+ void insert( const string& ns , BSONObj& o );
+
+ // ------ simple accessors -----
+
+ /** State maintains ownership, do not use past State lifetime */
+ Scope* scope() { return _scope.get(); }
+
+ const Config& config() { return _config; }
+
+ const bool isOnDisk() { return _onDisk; }
+
+ long long numEmits() const { return _numEmits; }
+
+ protected:
+
+ void _insertToInc( BSONObj& o );
+ static void _add( InMemory* im , const BSONObj& a , long& size, long& dupCount );
+
+ scoped_ptr<Scope> _scope;
+ const Config& _config;
+ bool _onDisk; // if the end result of this map reduce is disk or not
+
+ DBDirectClient _db;
+
+ scoped_ptr<InMemory> _temp;
+ long _size; // bytes in _temp
+ long _dupCount; // number of duplicate key entries
+
+ long long _numEmits;
+ };
+
+ BSONObj fast_emit( const BSONObj& args );
+
+ } // end mr namespace
+}
+
+
diff --git a/db/common.cpp b/db/common.cpp
index b7883f5..44bc54d 100644
--- a/db/common.cpp
+++ b/db/common.cpp
@@ -26,4 +26,8 @@ namespace mongo {
/* we use new here so we don't have to worry about destructor orders at program shutdown */
MongoMutex &dbMutex( *(new MongoMutex("rw:dbMutex")) );
+ MongoMutex::MongoMutex(const char *name) : _m(name) {
+ _remapPrivateViewRequested = false;
+ }
+
}
diff --git a/db/compact.cpp b/db/compact.cpp
new file mode 100644
index 0000000..6bafd91
--- /dev/null
+++ b/db/compact.cpp
@@ -0,0 +1,199 @@
+/* @file compact.cpp
+ compaction of deleted space in pdfiles (datafiles)
+*/
+
+/* NOTE 6Oct2010 : this file PRELIMINARY, EXPERIMENTAL, NOT DONE, NOT USED YET (not in SConstruct) */
+
+/**
+* Copyright (C) 2010 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+#include "pdfile.h"
+#include "concurrency.h"
+#include "commands.h"
+#include "curop-inl.h"
+#include "../util/concurrency/task.h"
+
+namespace mongo {
+
+ class CompactJob : public task::Task {
+ public:
+ CompactJob(string ns) : _ns(ns) { }
+ private:
+ virtual string name() const { return "compact"; }
+ virtual void doWork();
+ NamespaceDetails * beginBlock();
+ void doBatch();
+ void prep();
+ const string _ns;
+ unsigned long long _nrecords;
+ unsigned long long _ncompacted;
+ DiskLoc _firstExtent;
+ };
+
+ // lock & set context first. this checks that collection still exists, and that it hasn't
+ // morphed into a capped collection between locks (which is possible)
+ NamespaceDetails * CompactJob::beginBlock() {
+ NamespaceDetails *nsd = nsdetails(_ns.c_str());
+ if( nsd == 0 ) throw "ns no longer present";
+ if( nsd->firstExtent.isNull() )
+ throw "no first extent";
+ if( nsd->capped )
+ throw "capped collection";
+ return nsd;
+ }
+
+ void CompactJob::doBatch() {
+ unsigned n = 0;
+ {
+ /* pre-touch records in a read lock so that paging happens in read not write lock.
+ note we are only touching the records though; if indexes aren't in RAM, they will
+ page later. So the concept is only partial.
+ */
+ readlock lk;
+ Timer t;
+ Client::Context ctx(_ns);
+ NamespaceDetails *nsd = beginBlock();
+ if( nsd->firstExtent != _firstExtent ) {
+ // TEMP DEV - stop after 1st extent
+ throw "change of first extent";
+ }
+ DiskLoc loc = nsd->firstExtent.ext()->firstRecord;
+ while( !loc.isNull() ) {
+ Record *r = loc.rec();
+ loc = r->getNext(loc);
+ if( ++n >= 100 || (n % 8 == 0 && t.millis() > 50) )
+ break;
+ }
+ }
+ {
+ writelock lk;
+ Client::Context ctx(_ns);
+ NamespaceDetails *nsd = beginBlock();
+ for( unsigned i = 0; i < n; i++ ) {
+ if( nsd->firstExtent != _firstExtent ) {
+ // TEMP DEV - stop after 1st extent
+ throw "change of first extent (or it is now null)";
+ }
+ DiskLoc loc = nsd->firstExtent.ext()->firstRecord;
+ Record *rec = loc.rec();
+ BSONObj o = loc.obj().getOwned(); // todo: inefficient, double mem copy...
+ try {
+ theDataFileMgr.deleteRecord(_ns.c_str(), rec, loc, false);
+ }
+ catch(DBException&) { throw "error deleting record"; }
+ try {
+ theDataFileMgr.insertNoReturnVal(_ns.c_str(), o);
+ }
+ catch(DBException&) {
+ /* todo: save the record somehow??? try again with 'avoid' logic? */
+ log() << "compact: error re-inserting record ns:" << _ns << " n:" << _nrecords << " _id:" << o["_id"].toString() << endl;
+ throw "error re-inserting record";
+ }
+ ++_ncompacted;
+ if( killCurrentOp.globalInterruptCheck() )
+ throw "interrupted";
+ }
+ }
+ }
+
+ void CompactJob::prep() {
+ readlock lk;
+ Client::Context ctx(_ns);
+ NamespaceDetails *nsd = beginBlock();
+ DiskLoc L = nsd->firstExtent;
+ assert( !L.isNull() );
+ _firstExtent = L;
+ _nrecords = nsd->stats.nrecords;
+ _ncompacted = 0;
+ }
+
+ static mutex m("compact");
+ static volatile bool running;
+
+ void CompactJob::doWork() {
+ Client::initThread("compact");
+ cc().curop()->reset();
+ cc().curop()->setNS(_ns.c_str());
+ cc().curop()->markCommand();
+ sleepsecs(60);
+ try {
+ prep();
+ while( _ncompacted < _nrecords )
+ doBatch();
+ }
+ catch(const char *p) {
+ log() << "info: exception compact " << p << endl;
+ }
+ catch(...) {
+ log() << "info: exception compact" << endl;
+ }
+ mongo::running = false;
+ cc().shutdown();
+ }
+
+ /* --- CompactCmd --- */
+
+ class CompactCmd : public Command {
+ public:
+ virtual bool run(const string& db, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ string coll = cmdObj.firstElement().valuestr();
+ if( coll.empty() || db.empty() ) {
+ errmsg = "no collection name specified";
+ return false;
+ }
+ string ns = db + '.' + coll;
+ assert( isANormalNSName(ns.c_str()) );
+ {
+ readlock lk;
+ Client::Context ctx(ns);
+ if( nsdetails(ns.c_str()) == 0 ) {
+ errmsg = "namespace " + ns + " does not exist";
+ return false;
+ }
+ }
+ {
+ scoped_lock lk(m);
+ if( running ) {
+ errmsg = "a compaction is already running";
+ return false;
+ }
+ running = true;
+ task::fork( new CompactJob(ns) );
+ return true;
+ }
+ errmsg = "not done";
+ return false;
+ }
+
+ virtual LockType locktype() const { return NONE; }
+ virtual bool adminOnly() const { return false; }
+ virtual bool slaveOk() const { return true; }
+ virtual bool logTheOp() { return false; }
+ virtual void help( stringstream& help ) const {
+ help << "compact / defragment a collection in the background, slowly, attempting to minimize disruptions to other operations\n"
+ "{ compact : <collection> }";
+ }
+ virtual bool requiresAuth() { return true; }
+
+ /** @param webUI expose the command in the web ui as localhost:28017/<name>
+ @param oldName an optional old, deprecated name for the command
+ */
+ CompactCmd() : Command("compact") { }
+ };
+ static CompactCmd compactCmd;
+
+}
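For illustration, a client-side invocation sketch (host, database and collection names hypothetical); the command takes the bare collection name and returns once the job is queued, while CompactJob itself runs as a background task:

    DBClientConnection c;
    string errmsg;
    if ( c.connect( "localhost" , errmsg ) ) {
        BSONObj info;
        bool queued = c.runCommand( "test" , BSON( "compact" << "events" ) , info );
        // progress and errors are reported in the server log, not in 'info'
    }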
diff --git a/db/concurrency.h b/db/concurrency.h
index 9b91b0f..39cd853 100644
--- a/db/concurrency.h
+++ b/db/concurrency.h
@@ -1,3 +1,5 @@
+// @file concurrency.h
+
/*
* Copyright (C) 2010 10gen Inc.
*
@@ -14,9 +16,7 @@
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
-/* concurrency.h
-
- mongod concurrency rules & notes will be placed here.
+/*mongod concurrency rules & notes will be placed here.
Mutex hierarchy (1 = "leaf")
name level
@@ -31,19 +31,22 @@
#include "../util/concurrency/rwlock.h"
#include "../util/mmap.h"
+#include "../util/time_support.h"
namespace mongo {
string sayClientState();
bool haveClient();
-
- void curopWaitingForLock( int type );
- void curopGotLock();
+
+ class Client;
+ Client* curopWaitingForLock( int type );
+ void curopGotLock(Client*);
/* mutex time stats */
class MutexInfo {
- unsigned long long start, enter, timeLocked; // all in microseconds
+ unsigned long long enter, timeLocked; // microseconds
int locked;
+ unsigned long long start; // last as we touch this least often
public:
MutexInfo() : timeLocked(0) , locked(0) {
@@ -61,215 +64,53 @@ namespace mongo {
if ( locked == 0 )
timeLocked += curTimeMicros64() - enter;
}
- int isLocked() const {
- return locked;
- }
+ int isLocked() const { return locked; }
void getTimingInfo(unsigned long long &s, unsigned long long &tl) const {
s = start;
tl = timeLocked;
}
- unsigned long long getTimeLocked() const {
- return timeLocked;
- }
+ unsigned long long getTimeLocked() const { return timeLocked; }
};
- class MongoMutex {
- MutexInfo _minfo;
- RWLock _m;
- ThreadLocalValue<int> _state;
-
- /* we use a separate TLS value for releasedEarly - that is ok as
- our normal/common code path, we never even touch it.
- */
- ThreadLocalValue<bool> _releasedEarly;
- public:
- MongoMutex(const char * name) : _m(name) { }
-
- /**
- * @return
- * > 0 write lock
- * = 0 no lock
- * < 0 read lock
- */
- int getState() { return _state.get(); }
- bool isWriteLocked() { return getState() > 0; }
- void assertWriteLocked() {
- assert( getState() > 0 );
- DEV assert( !_releasedEarly.get() );
- }
- bool atLeastReadLocked() { return _state.get() != 0; }
- void assertAtLeastReadLocked() { assert(atLeastReadLocked()); }
-
- bool _checkWriteLockAlready(){
- //DEV cout << "LOCK" << endl;
- DEV assert( haveClient() );
-
- int s = _state.get();
- if( s > 0 ) {
- _state.set(s+1);
- return true;
- }
-
- massert( 10293 , (string)"internal error: locks are not upgradeable: " + sayClientState() , s == 0 );
-
- return false;
- }
-
- void lock() {
- if ( _checkWriteLockAlready() )
- return;
-
- _state.set(1);
-
- curopWaitingForLock( 1 );
- _m.lock();
- curopGotLock();
-
- _minfo.entered();
-
- MongoFile::lockAll();
- }
-
- bool lock_try( int millis ) {
- if ( _checkWriteLockAlready() )
- return true;
-
- curopWaitingForLock( 1 );
- bool got = _m.lock_try( millis );
- curopGotLock();
-
- if ( got ){
- _minfo.entered();
- _state.set(1);
- MongoFile::lockAll();
- }
-
- return got;
- }
-
-
- void unlock() {
- //DEV cout << "UNLOCK" << endl;
- int s = _state.get();
- if( s > 1 ) {
- _state.set(s-1);
- return;
- }
- if( s != 1 ) {
- if( _releasedEarly.get() ) {
- _releasedEarly.set(false);
- return;
- }
- massert( 12599, "internal error: attempt to unlock when wasn't in a write lock", false);
- }
-
- MongoFile::unlockAll();
-
- _state.set(0);
- _minfo.leaving();
- _m.unlock();
- }
-
- /* unlock (write lock), and when unlock() is called later,
- be smart then and don't unlock it again.
- */
- void releaseEarly() {
- assert( getState() == 1 ); // must not be recursive
- assert( !_releasedEarly.get() );
- _releasedEarly.set(true);
- unlock();
- }
-
- void lock_shared() {
- //DEV cout << " LOCKSHARED" << endl;
- int s = _state.get();
- if( s ) {
- if( s > 0 ) {
- // already in write lock - just be recursive and stay write locked
- _state.set(s+1);
- return;
- }
- else {
- // already in read lock - recurse
- _state.set(s-1);
- return;
- }
- }
- _state.set(-1);
- curopWaitingForLock( -1 );
- _m.lock_shared();
- curopGotLock();
- }
-
- bool lock_shared_try( int millis ) {
- int s = _state.get();
- if ( s ){
- // we already have a lock, so no need to try
- lock_shared();
- return true;
- }
+}
- bool got = _m.lock_shared_try( millis );
- if ( got )
- _state.set(-1);
- return got;
- }
-
- void unlock_shared() {
- //DEV cout << " UNLOCKSHARED" << endl;
- int s = _state.get();
- if( s > 0 ) {
- assert( s > 1 ); /* we must have done a lock write first to have s > 1 */
- _state.set(s-1);
- return;
- }
- if( s < -1 ) {
- _state.set(s+1);
- return;
- }
- assert( s == -1 );
- _state.set(0);
- _m.unlock_shared();
- }
-
- MutexInfo& info() { return _minfo; }
- };
+#include "mongomutex.h"
- extern MongoMutex &dbMutex;
+namespace mongo {
inline void dbunlocking_write() { }
inline void dbunlocking_read() { }
struct writelock {
- writelock(const string& ns) {
- dbMutex.lock();
- }
- ~writelock() {
+ writelock() { dbMutex.lock(); }
+ writelock(const string& ns) { dbMutex.lock(); }
+ ~writelock() {
DESTRUCTOR_GUARD(
dbunlocking_write();
dbMutex.unlock();
);
}
};
-
+
struct readlock {
readlock(const string& ns) {
dbMutex.lock_shared();
}
- ~readlock() {
+ readlock() { dbMutex.lock_shared(); }
+ ~readlock() {
DESTRUCTOR_GUARD(
dbunlocking_read();
dbMutex.unlock_shared();
);
}
- };
+ };
struct readlocktry {
- readlocktry( const string&ns , int tryms ){
+ readlocktry( const string&ns , int tryms ) {
_got = dbMutex.lock_shared_try( tryms );
}
~readlocktry() {
- if ( _got ){
+ if ( _got ) {
dbunlocking_read();
dbMutex.unlock_shared();
}
@@ -280,11 +121,11 @@ namespace mongo {
};
struct writelocktry {
- writelocktry( const string&ns , int tryms ){
+ writelocktry( const string&ns , int tryms ) {
_got = dbMutex.lock_try( tryms );
}
~writelocktry() {
- if ( _got ){
+ if ( _got ) {
dbunlocking_read();
dbMutex.unlock();
}
@@ -294,10 +135,10 @@ namespace mongo {
bool _got;
};
- struct readlocktryassert : public readlocktry {
- readlocktryassert(const string& ns, int tryms) :
- readlocktry(ns,tryms) {
- uassert(13142, "timeout getting readlock", got());
+ struct readlocktryassert : public readlocktry {
+ readlocktryassert(const string& ns, int tryms) :
+ readlocktry(ns,tryms) {
+ uassert(13142, "timeout getting readlock", got());
}
};
@@ -305,12 +146,12 @@ namespace mongo {
if you have a write lock, that's ok too.
*/
struct atleastreadlock {
- atleastreadlock( const string& ns ){
+ atleastreadlock( const string& ns ) {
_prev = dbMutex.getState();
if ( _prev == 0 )
dbMutex.lock_shared();
}
- ~atleastreadlock(){
+ ~atleastreadlock() {
if ( _prev == 0 )
dbMutex.unlock_shared();
}
@@ -318,6 +159,9 @@ namespace mongo {
int _prev;
};
+ /* parameterized choice of read or write locking
+ use readlock and writelock instead of this when statically known which you want
+ */
class mongolock {
bool _writelock;
public:
@@ -328,27 +172,28 @@ namespace mongo {
else
dbMutex.lock_shared();
}
- ~mongolock() {
+ ~mongolock() {
DESTRUCTOR_GUARD(
- if( _writelock ) {
- dbunlocking_write();
- dbMutex.unlock();
- } else {
- dbunlocking_read();
- dbMutex.unlock_shared();
- }
+ if( _writelock ) {
+ dbunlocking_write();
+ dbMutex.unlock();
+ }
+ else {
+ dbunlocking_read();
+ dbMutex.unlock_shared();
+ }
);
}
/* this unlocks, does NOT upgrade. that works for our current usage */
void releaseAndWriteLock();
};
-
- /* use writelock and readlock instead */
+
+ /* deprecated - use writelock and readlock instead */
struct dblock : public writelock {
dblock() : writelock("") { }
};
- // eliminate
+ // eliminate this - we should just type "dbMutex.assertWriteLocked();" instead
inline void assertInWriteLock() { dbMutex.assertWriteLocked(); }
}
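For context, a minimal sketch of how these RAII helpers are used throughout the tree (function name hypothetical); the string argument is informational only, and the single global dbMutex is taken either way:

    void touchCollection( const string& ns ) {
        readlock lk( ns );              // dbMutex.lock_shared() for the life of this scope
        Client::Context ctx( ns );
        // ... read-only work on ns ...
    }                                   // released by ~readlock, guarded by DESTRUCTOR_GUARD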
diff --git a/db/curop-inl.h b/db/curop-inl.h
new file mode 100644
index 0000000..21d6f0a
--- /dev/null
+++ b/db/curop-inl.h
@@ -0,0 +1,42 @@
+// @file curop-inl.h
+
+/**
+* Copyright (C) 2009 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+#include "curop.h"
+
+namespace mongo {
+
+ // todo : move more here
+
+ inline CurOp::CurOp( Client * client , CurOp * wrapped ) {
+ _client = client;
+ _wrapped = wrapped;
+ if ( _wrapped )
+ _client->_curOp = this;
+ _start = _checkpoint = 0;
+ _active = false;
+ _reset();
+ _op = 0;
+ // These addresses should never be written to again. The zeroes are
+ // placed here as a precaution because currentOp may be accessed
+ // without the db mutex.
+ memset(_ns, 0, sizeof(_ns));
+ }
+
+}
diff --git a/db/curop.h b/db/curop.h
index bf06a69..c6e949b 100644
--- a/db/curop.h
+++ b/db/curop.h
@@ -1,4 +1,5 @@
-// curop.h
+// @file curop.h
+
/*
* Copyright (C) 2010 10gen Inc.
*
@@ -18,152 +19,188 @@
#pragma once
-#include "namespace.h"
+#include "namespace-inl.h"
#include "client.h"
#include "../bson/util/atomic_int.h"
+#include "../util/concurrency/spin_lock.h"
+#include "../util/time_support.h"
#include "db.h"
+#include "../scripting/engine.h"
-namespace mongo {
+namespace mongo {
/* lifespan is different than CurOp because of recursives with DBDirectClient */
class OpDebug {
public:
StringBuilder str;
-
- void reset(){
- str.reset();
- }
+ void reset() { str.reset(); }
};
-
- /* Current operation (for the current Client).
- an embedded member of Client class, and typically used from within the mutex there. */
- class CurOp : boost::noncopyable {
- static AtomicUInt _nextOpNum;
+
+ /**
+ * stores a copy of a bson obj in a fixed size buffer
+ * if its too big for the buffer, says "too big"
+ * useful for keeping a copy around indefinitely without wasting a lot of space or doing malloc
+ */
+ class CachedBSONObj {
+ public:
+ enum { TOO_BIG_SENTINEL = 1 } ;
static BSONObj _tooBig; // { $msg : "query not recording (too large)" }
-
- Client * _client;
- CurOp * _wrapped;
- unsigned long long _start;
- unsigned long long _checkpoint;
- unsigned long long _end;
+ CachedBSONObj() {
+ _size = (int*)_buf;
+ reset();
+ }
- bool _active;
- int _op;
- bool _command;
- int _lockType; // see concurrency.h for values
- bool _waitingForLock;
- int _dbprofile; // 0=off, 1=slow, 2=all
- AtomicUInt _opNum;
- char _ns[Namespace::MaxNsLen+2];
- struct SockAddr _remote;
- char _queryBuf[256];
-
- void resetQuery(int x=0) { *((int *)_queryBuf) = x; }
-
- OpDebug _debug;
-
- ThreadSafeString _message;
- ProgressMeter _progressMeter;
+ void reset( int sz = 0 ) {
+ _lock.lock();
+ _reset( sz );
+ _lock.unlock();
+ }
+
+ void set( const BSONObj& o ) {
+ _lock.lock();
+ try {
+ int sz = o.objsize();
+
+ if ( sz > (int) sizeof(_buf) ) {
+ _reset(TOO_BIG_SENTINEL);
+ }
+ else {
+ memcpy(_buf, o.objdata(), sz );
+ }
+
+ _lock.unlock();
+ }
+ catch ( ... ) {
+ _lock.unlock();
+ throw;
+ }
- void _reset(){
- _command = false;
- _lockType = 0;
- _dbprofile = 0;
- _end = 0;
- _waitingForLock = false;
- _message = "";
- _progressMeter.finished();
}
- void setNS(const char *ns) {
- strncpy(_ns, ns, Namespace::MaxNsLen);
+ int size() const { return *_size; }
+ bool have() const { return size() > 0; }
+
+ BSONObj get() {
+ _lock.lock();
+ BSONObj o;
+ try {
+ o = _get();
+ _lock.unlock();
+ }
+ catch ( ... ) {
+ _lock.unlock();
+ throw;
+ }
+ return o;
+ }
+
+ void append( BSONObjBuilder& b , const StringData& name ) {
+ _lock.lock();
+ try {
+ BSONObj temp = _get();
+ b.append( name , temp );
+ _lock.unlock();
+ }
+ catch ( ... ) {
+ _lock.unlock();
+ throw;
+ }
}
+ private:
+ /** you have to be locked when you call this */
+ BSONObj _get() {
+ int sz = size();
+ if ( sz == 0 )
+ return BSONObj();
+ if ( sz == TOO_BIG_SENTINEL )
+ return _tooBig;
+ return BSONObj( _buf ).copy();
+ }
+
+ /** you have to be locked when you call this */
+ void _reset( int sz ) { _size[0] = sz; }
+
+ SpinLock _lock;
+ int * _size;
+ char _buf[512];
+ };
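A small usage sketch of the class above, mirroring how CurOp uses it for the current query (the stored object is hypothetical):

    CachedBSONObj q;
    q.set( BSON( "x" << 1 ) );          // copied into the fixed 512 byte buffer under the spin lock
    if ( q.have() ) {
        BSONObj copy = q.get();         // a safe copy, or the _tooBig sentinel if it didn't fit
        BSONObjBuilder b;
        q.append( b , "query" );        // e.g. when reporting the current op
    }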
+
+ /* Current operation (for the current Client).
+ an embedded member of Client class, and typically used from within the mutex there.
+ */
+ class CurOp : boost::noncopyable {
public:
-
- int querySize() const { return *((int *) _queryBuf); }
- bool haveQuery() const { return querySize() != 0; }
+ CurOp( Client * client , CurOp * wrapped = 0 );
+ ~CurOp();
- BSONObj query( bool threadSafe = false);
+ bool haveQuery() const { return _query.have(); }
+ BSONObj query() { return _query.get(); }
- void ensureStarted(){
+ void ensureStarted() {
if ( _start == 0 )
- _start = _checkpoint = curTimeMicros64();
+ _start = _checkpoint = curTimeMicros64();
}
- void enter( Client::Context * context ){
+ void enter( Client::Context * context ) {
ensureStarted();
setNS( context->ns() );
if ( context->_db && context->_db->profile > _dbprofile )
_dbprofile = context->_db->profile;
}
- void leave( Client::Context * context ){
+ void leave( Client::Context * context ) {
unsigned long long now = curTimeMicros64();
Top::global.record( _ns , _op , _lockType , now - _checkpoint , _command );
_checkpoint = now;
}
- void reset(){
+ void reset() {
_reset();
_start = _checkpoint = 0;
_active = true;
_opNum = _nextOpNum++;
_ns[0] = '?'; // just in case not set later
_debug.reset();
- resetQuery();
+ _query.reset();
}
-
+
void reset( const SockAddr & remote, int op ) {
reset();
_remote = remote;
_op = op;
}
-
- void markCommand(){
- _command = true;
- }
- void waitingForLock( int type ){
+ void markCommand() { _command = true; }
+
+ void waitingForLock( int type ) {
_waitingForLock = true;
if ( type > 0 )
_lockType = 1;
else
_lockType = -1;
}
- void gotLock(){
- _waitingForLock = false;
- }
-
- OpDebug& debug(){
- return _debug;
- }
-
- int profileLevel() const {
- return _dbprofile;
- }
-
- const char * getNS() const {
- return _ns;
- }
+ void gotLock() { _waitingForLock = false; }
+ OpDebug& debug() { return _debug; }
+ int profileLevel() const { return _dbprofile; }
+ const char * getNS() const { return _ns; }
bool shouldDBProfile( int ms ) const {
if ( _dbprofile <= 0 )
return false;
-
+
return _dbprofile >= 2 || ms >= cmdLine.slowMS;
}
-
+
AtomicUInt opNum() const { return _opNum; }
/** if this op is running */
bool active() const { return _active; }
-
+
int getLockType() const { return _lockType; }
- bool isWaitingForLock() const { return _waitingForLock; }
+ bool isWaitingForLock() const { return _waitingForLock; }
int getOp() const { return _op; }
-
-
+
/** micros */
unsigned long long startTime() {
ensureStarted();
@@ -174,75 +211,41 @@ namespace mongo {
_active = false;
_end = curTimeMicros64();
}
-
+
unsigned long long totalTimeMicros() {
massert( 12601 , "CurOp not marked done yet" , ! _active );
return _end - startTime();
}
- int totalTimeMillis() {
- return (int) (totalTimeMicros() / 1000);
- }
+ int totalTimeMillis() { return (int) (totalTimeMicros() / 1000); }
int elapsedMillis() {
unsigned long long total = curTimeMicros64() - startTime();
return (int) (total / 1000);
}
- int elapsedSeconds() {
- return elapsedMillis() / 1000;
- }
+ int elapsedSeconds() { return elapsedMillis() / 1000; }
- void setQuery(const BSONObj& query) {
- if( query.objsize() > (int) sizeof(_queryBuf) ) {
- resetQuery(1); // flag as too big and return
- return;
- }
- memcpy(_queryBuf, query.objdata(), query.objsize());
- }
+ void setQuery(const BSONObj& query) { _query.set( query ); }
- Client * getClient() const {
- return _client;
- }
+ Client * getClient() const { return _client; }
- CurOp( Client * client , CurOp * wrapped = 0 ) {
- _client = client;
- _wrapped = wrapped;
- if ( _wrapped ){
- _client->_curOp = this;
- }
- _start = _checkpoint = 0;
- _active = false;
- _reset();
- _op = 0;
- // These addresses should never be written to again. The zeroes are
- // placed here as a precaution because currentOp may be accessed
- // without the db mutex.
- memset(_ns, 0, sizeof(_ns));
- memset(_queryBuf, 0, sizeof(_queryBuf));
- }
-
- ~CurOp();
-
- BSONObj info() {
- if( ! cc().getAuthenticationInfo()->isAuthorized("admin") ) {
+ BSONObj info() {
+ if( ! cc().getAuthenticationInfo()->isAuthorized("admin") ) {
BSONObjBuilder b;
b.append("err", "unauthorized");
return b.obj();
}
return infoNoauth();
}
-
- BSONObj infoNoauth( int attempt = 0 );
- string getRemoteString( bool includePort = true ){
- return _remote.toString(includePort);
- }
+ BSONObj infoNoauth();
- ProgressMeter& setMessage( const char * msg , long long progressMeterTotal = 0 , int secondsBetween = 3 ){
+ string getRemoteString( bool includePort = true ) { return _remote.toString(includePort); }
- if ( progressMeterTotal ){
- if ( _progressMeter.isActive() ){
+ ProgressMeter& setMessage( const char * msg , unsigned long long progressMeterTotal = 0 , int secondsBetween = 3 ) {
+ if ( progressMeterTotal ) {
+ if ( _progressMeter.isActive() ) {
cout << "about to assert, old _message: " << _message << " new message:" << msg << endl;
assert( ! _progressMeter.isActive() );
}
@@ -251,38 +254,93 @@ namespace mongo {
else {
_progressMeter.finished();
}
-
+
_message = msg;
-
+
return _progressMeter;
}
-
+
string getMessage() const { return _message.toString(); }
ProgressMeter& getProgressMeter() { return _progressMeter; }
-
+ CurOp *parent() const { return _wrapped; }
+ void kill() { _killed = true; }
+ bool killed() const { return _killed; }
+ void setNS(const char *ns) {
+ strncpy(_ns, ns, Namespace::MaxNsLen);
+ _ns[Namespace::MaxNsLen] = 0;
+ }
friend class Client;
+
+ private:
+ static AtomicUInt _nextOpNum;
+ Client * _client;
+ CurOp * _wrapped;
+ unsigned long long _start;
+ unsigned long long _checkpoint;
+ unsigned long long _end;
+ bool _active;
+ int _op;
+ bool _command;
+ int _lockType; // see concurrency.h for values
+ bool _waitingForLock;
+ int _dbprofile; // 0=off, 1=slow, 2=all
+ AtomicUInt _opNum;
+ char _ns[Namespace::MaxNsLen+2];
+ struct SockAddr _remote;
+ CachedBSONObj _query;
+ OpDebug _debug;
+ ThreadSafeString _message;
+ ProgressMeter _progressMeter;
+ volatile bool _killed;
+
+ void _reset() {
+ _command = false;
+ _lockType = 0;
+ _dbprofile = 0;
+ _end = 0;
+ _waitingForLock = false;
+ _message = "";
+ _progressMeter.finished();
+ _killed = false;
+ }
};
- /* 0 = ok
- 1 = kill current operation and reset this to 0
- future: maybe use this as a "going away" thing on process termination with a higher flag value
+ /* _globalKill: we are shutting down
+ otherwise kill attribute set on specified CurOp
+ this class does not handle races between interruptJs and the checkForInterrupt functions - those must be
+ handled by the client of this class
*/
- extern class KillCurrentOp {
- enum { Off, On, All } state;
- AtomicUInt toKill;
+ extern class KillCurrentOp {
public:
- void killAll() { state = All; }
- void kill(AtomicUInt i) { toKill = i; state = On; }
-
- void checkForInterrupt() {
- if( state != Off ) {
- if( state == All )
- uasserted(11600,"interrupted at shutdown");
- if( cc().curop()->opNum() == toKill ) {
- state = Off;
- uasserted(11601,"interrupted");
- }
- }
+ void killAll();
+ void kill(AtomicUInt i);
+
+ /** @return true if global interrupt and should terminate the operation */
+ bool globalInterruptCheck() const { return _globalKill; }
+
+ void checkForInterrupt( bool heedMutex = true ) {
+ if ( heedMutex && dbMutex.isWriteLocked() )
+ return;
+ if( _globalKill )
+ uasserted(11600,"interrupted at shutdown");
+ if( cc().curop()->killed() )
+ uasserted(11601,"interrupted");
}
+
+ /** @return "" if not interrupted. otherwise, you should stop. */
+ const char *checkForInterruptNoAssert( bool heedMutex = true ) {
+ if ( heedMutex && dbMutex.isWriteLocked() )
+ return "";
+ if( _globalKill )
+ return "interrupted at shutdown";
+ if( cc().curop()->killed() )
+ return "interrupted";
+ return "";
+ }
+
+ private:
+ void interruptJs( AtomicUInt *op );
+ volatile bool _globalKill;
} killCurrentOp;
+
}
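[editor's note] A minimal, self-contained sketch of the interrupt-check pattern the reworked KillCurrentOp code above relies on. The names and types here are illustrative only (not the actual mongod classes): a global kill flag for shutdown, a per-operation kill flag, one check that throws and one no-assert variant that returns a message instead.

    // sketch_interrupt.cpp -- illustrative only; not part of the mongod source
    #include <atomic>
    #include <stdexcept>
    #include <string>
    #include <iostream>

    struct Op {
        std::atomic<bool> killed{false};      // set by kill() on a specific operation
    };

    std::atomic<bool> globalKill{false};      // set once at shutdown (killAll)

    // throwing variant: long-running loops call this and let the exception unwind
    void checkForInterrupt(const Op& op) {
        if (globalKill.load()) throw std::runtime_error("interrupted at shutdown");
        if (op.killed.load())  throw std::runtime_error("interrupted");
    }

    // non-throwing variant: returns "" when it is safe to continue
    const char* checkForInterruptNoAssert(const Op& op) {
        if (globalKill.load()) return "interrupted at shutdown";
        if (op.killed.load())  return "interrupted";
        return "";
    }

    int main() {
        Op op;
        op.killed = true;                                          // simulate a kill request
        std::cout << checkForInterruptNoAssert(op) << std::endl;   // prints "interrupted"
        try { checkForInterrupt(op); }
        catch (const std::exception& e) { std::cout << e.what() << std::endl; }
    }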
diff --git a/db/cursor.cpp b/db/cursor.cpp
index e98cb7a..ac7afc1 100644
--- a/db/cursor.cpp
+++ b/db/cursor.cpp
@@ -16,7 +16,7 @@
#include "pch.h"
#include "pdfile.h"
-#include "curop.h"
+#include "curop-inl.h"
namespace mongo {
@@ -24,14 +24,17 @@ namespace mongo {
killCurrentOp.checkForInterrupt();
if ( eof() ) {
if ( tailable_ && !last.isNull() ) {
- curr = s->next( last );
- } else {
+ curr = s->next( last );
+ }
+ else {
return false;
}
- } else {
+ }
+ else {
last = curr;
curr = s->next( curr );
}
+ incNscanned();
return ok();
}
@@ -72,7 +75,7 @@ namespace mongo {
}
ForwardCappedCursor::ForwardCappedCursor( NamespaceDetails *_nsd, const DiskLoc &startLoc ) :
- nsd( _nsd ) {
+ nsd( _nsd ) {
if ( !nsd )
return;
DiskLoc start = startLoc;
@@ -89,6 +92,7 @@ namespace mongo {
}
curr = start;
s = this;
+ incNscanned();
}
DiskLoc ForwardCappedCursor::next( const DiskLoc &prev ) const {
@@ -112,19 +116,21 @@ namespace mongo {
}
ReverseCappedCursor::ReverseCappedCursor( NamespaceDetails *_nsd, const DiskLoc &startLoc ) :
- nsd( _nsd ) {
+ nsd( _nsd ) {
if ( !nsd )
return;
DiskLoc start = startLoc;
if ( start.isNull() ) {
if ( !nsd->capLooped() ) {
start = nsd->lastRecord();
- } else {
+ }
+ else {
start = nsd->capExtent.ext()->lastRecord;
}
}
curr = start;
s = this;
+ incNscanned();
}
DiskLoc ReverseCappedCursor::next( const DiskLoc &prev ) const {
@@ -138,7 +144,8 @@ namespace mongo {
if ( i == nextLoop( nsd, nsd->capExtent.ext()->lastRecord ) ) {
return DiskLoc();
}
- } else {
+ }
+ else {
if ( i == nsd->capExtent.ext()->firstRecord ) {
return DiskLoc();
}
diff --git a/db/cursor.h b/db/cursor.h
index db5d9a3..9797d66 100644
--- a/db/cursor.h
+++ b/db/cursor.h
@@ -23,14 +23,15 @@
#include "matcher.h"
namespace mongo {
-
+
+ class NamespaceDetails;
class Record;
class CoveredIndexMatcher;
/* Query cursors, base class. This is for our internal cursors. "ClientCursor" is a separate
concept and is for the user's cursor.
- WARNING concurrency: the vfunctions below are called back from within a
+ WARNING concurrency: the vfunctions below are called back from within a
ClientCursor::ccmutex. Don't cause a deadlock, you've been warned.
*/
class Cursor : boost::noncopyable {
@@ -49,7 +50,7 @@ namespace mongo {
virtual DiskLoc refLoc() = 0;
/* Implement these if you want the cursor to be "tailable" */
-
+
/* Request that the cursor starts tailing after advancing past last record. */
/* The implementation may or may not honor this request. */
virtual void setTailable() {}
@@ -76,10 +77,10 @@ namespace mongo {
/* called before query getmore block is iterated */
virtual void checkLocation() { }
-
+
virtual bool supportGetMore() = 0;
virtual bool supportYields() = 0;
-
+
virtual string toString() { return "abstract?"; }
/* used for multikey index traversal to avoid sending back dups. see Matcher::matches().
@@ -87,20 +88,33 @@ namespace mongo {
if loc has already been sent, returns true.
otherwise, marks loc as sent.
@param deep - match was against an array, so we know it is multikey. this is legacy and kept
- for backwards datafile compatibility. 'deep' can be eliminated next time we
+ for backwards datafile compatibility. 'deep' can be eliminated next time we
force a data file conversion. 7Jul09
*/
virtual bool getsetdup(DiskLoc loc) = 0;
+ virtual bool isMultiKey() const = 0;
+
+ /**
+ * return true if the keys in the index have been modified from the main doc
+ * if you have { a : 1 , b : [ 1 , 2 ] }
+ * an index on { a : 1 } would not be modified
+ * an index on { b : 1 } would be since the values of the array are put in the index
+ * not the array
+ */
+ virtual bool modifiedKeys() const = 0;
+
virtual BSONObj prettyIndexBounds() const { return BSONArray(); }
virtual bool capped() const { return false; }
+ virtual long long nscanned() = 0;
+
// The implementation may return different matchers depending on the
// position of the cursor. If matcher() is nonzero at the start,
// matcher() should be checked each time advance() is called.
virtual CoveredIndexMatcher *matcher() const { return 0; }
-
+
// A convenience function for setting the value of matcher() manually
// so it may accessed later. Implementations which must generate
// their own matcher() should assert here.
@@ -121,20 +135,15 @@ namespace mongo {
/* table-scan style cursor */
class BasicCursor : public Cursor {
- protected:
- DiskLoc curr, last;
- const AdvanceStrategy *s;
-
- private:
- bool tailable_;
- shared_ptr< CoveredIndexMatcher > _matcher;
- void init() {
- tailable_ = false;
- }
public:
- bool ok() {
- return !curr.isNull();
+ BasicCursor(DiskLoc dl, const AdvanceStrategy *_s = forward()) : curr(dl), s( _s ), _nscanned() {
+ incNscanned();
+ init();
}
+ BasicCursor(const AdvanceStrategy *_s = forward()) : s( _s ), _nscanned() {
+ init();
+ }
+ bool ok() { return !curr.isNull(); }
Record* _current() {
assert( ok() );
return curr.rec();
@@ -144,42 +153,33 @@ namespace mongo {
BSONObj j(r);
return j;
}
- virtual DiskLoc currLoc() {
- return curr;
- }
- virtual DiskLoc refLoc() {
- return curr.isNull() ? last : curr;
- }
-
+ virtual DiskLoc currLoc() { return curr; }
+ virtual DiskLoc refLoc() { return curr.isNull() ? last : curr; }
bool advance();
-
- BasicCursor(DiskLoc dl, const AdvanceStrategy *_s = forward()) : curr(dl), s( _s ) {
- init();
- }
- BasicCursor(const AdvanceStrategy *_s = forward()) : s( _s ) {
- init();
- }
- virtual string toString() {
- return "BasicCursor";
- }
+ virtual string toString() { return "BasicCursor"; }
virtual void setTailable() {
if ( !curr.isNull() || !last.isNull() )
tailable_ = true;
}
- virtual bool tailable() {
- return tailable_;
- }
+ virtual bool tailable() { return tailable_; }
virtual bool getsetdup(DiskLoc loc) { return false; }
-
+ virtual bool isMultiKey() const { return false; }
+ virtual bool modifiedKeys() const { return false; }
virtual bool supportGetMore() { return true; }
virtual bool supportYields() { return true; }
-
virtual CoveredIndexMatcher *matcher() const { return _matcher.get(); }
-
- virtual void setMatcher( shared_ptr< CoveredIndexMatcher > matcher ) {
- _matcher = matcher;
- }
-
+ virtual void setMatcher( shared_ptr< CoveredIndexMatcher > matcher ) { _matcher = matcher; }
+ virtual long long nscanned() { return _nscanned; }
+
+ protected:
+ DiskLoc curr, last;
+ const AdvanceStrategy *s;
+ void incNscanned() { if ( !curr.isNull() ) { ++_nscanned; } }
+ private:
+ bool tailable_;
+ shared_ptr< CoveredIndexMatcher > _matcher;
+ long long _nscanned;
+ void init() { tailable_ = false; }
};
/* used for order { $natural: -1 } */
@@ -187,13 +187,9 @@ namespace mongo {
public:
ReverseCursor(DiskLoc dl) : BasicCursor( dl, reverse() ) { }
ReverseCursor() : BasicCursor( reverse() ) { }
- virtual string toString() {
- return "ReverseCursor";
- }
+ virtual string toString() { return "ReverseCursor"; }
};
- class NamespaceDetails;
-
class ForwardCappedCursor : public BasicCursor, public AdvanceStrategy {
public:
ForwardCappedCursor( NamespaceDetails *nsd = 0, const DiskLoc &startLoc = DiskLoc() );
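[editor's note] The cursor changes above thread an nscanned counter through construction and advance(). A small stand-alone sketch of that bookkeeping, using a plain vector instead of DiskLoc records (hypothetical class, not the real Cursor API): count the starting position if it is valid, then count every position the cursor visits, guarding against counting an end-of-data position.

    // sketch_cursor_nscanned.cpp -- illustrative only
    #include <vector>
    #include <cstddef>
    #include <iostream>

    class VecCursor {
    public:
        explicit VecCursor(const std::vector<int>& data) : _data(data), _pos(0), _nscanned(0) {
            incNscanned();                 // count the starting position, as the ctor does above
        }
        bool ok() const { return _pos < _data.size(); }
        bool advance() {
            if (!ok()) return false;
            ++_pos;
            incNscanned();                 // count every position visited
            return ok();
        }
        int current() const { return _data[_pos]; }
        long long nscanned() const { return _nscanned; }
    private:
        void incNscanned() { if (ok()) ++_nscanned; }   // only count real records (cf. the null check)
        std::vector<int> _data;
        std::size_t _pos;
        long long _nscanned;
    };

    int main() {
        VecCursor c(std::vector<int>{10, 20, 30});
        while (c.ok()) { std::cout << c.current() << ' '; c.advance(); }
        std::cout << "\nnscanned=" << c.nscanned() << std::endl;   // 3
    }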
diff --git a/db/database.cpp b/db/database.cpp
index dde117f..d164ba5 100644
--- a/db/database.cpp
+++ b/db/database.cpp
@@ -20,15 +20,29 @@
#include "pdfile.h"
#include "database.h"
#include "instance.h"
+#include "clientcursor.h"
namespace mongo {
bool Database::_openAllFiles = false;
+ Database::~Database() {
+ magic = 0;
+ size_t n = files.size();
+ for ( size_t i = 0; i < n; i++ )
+ delete files[i];
+ if( ccByLoc.size() ) {
+ log() << "\n\n\nWARNING: ccByLoc not empty on database close! " << ccByLoc.size() << ' ' << name << endl;
+ }
+ }
+
Database::Database(const char *nm, bool& newDb, const string& _path )
- : name(nm), path(_path), namespaceIndex( path, name ) {
-
- { // check db name is valid
+ : name(nm), path(_path), namespaceIndex( path, name ),
+ profileName(name + ".system.profile") {
+ try {
+
+ {
+ // check db name is valid
size_t L = strlen(nm);
uassert( 10028 , "db name is empty", L > 0 );
uassert( 10029 , "bad db name [1]", *nm != '.' );
@@ -36,66 +50,184 @@ namespace mongo {
uassert( 10031 , "bad char(s) in db name", strchr(nm, ' ') == 0 );
uassert( 10032 , "db name too long", L < 64 );
}
-
+
newDb = namespaceIndex.exists();
profile = 0;
- profileName = name + ".system.profile";
{
vector<string> others;
getDatabaseNames( others , path );
-
- for ( unsigned i=0; i<others.size(); i++ ){
+
+ for ( unsigned i=0; i<others.size(); i++ ) {
if ( strcasecmp( others[i].c_str() , nm ) )
continue;
if ( strcmp( others[i].c_str() , nm ) == 0 )
continue;
-
+
stringstream ss;
ss << "db already exists with different case other: [" << others[i] << "] me [" << nm << "]";
uasserted( DatabaseDifferCaseCode , ss.str() );
}
}
-
+
// If already exists, open. Otherwise behave as if empty until
// there's a write, then open.
if ( ! newDb || cmdLine.defaultProfile ) {
namespaceIndex.init();
if( _openAllFiles )
openAllFiles();
-
+
}
-
+
magic = 781231;
+ } catch(...) {
+ // since destructor won't be called:
+ for ( size_t i = 0; i < files.size(); i++ )
+ delete files[i];
+ throw;
+ }
+ }
+
+ boost::filesystem::path Database::fileName( int n ) const {
+ stringstream ss;
+ ss << name << '.' << n;
+ boost::filesystem::path fullName;
+ fullName = boost::filesystem::path(path);
+ if ( directoryperdb )
+ fullName /= name;
+ fullName /= ss.str();
+ return fullName;
+ }
+
+ void Database::openAllFiles() {
+ int n = 0;
+ while( exists(n) ) {
+ getFile(n);
+ n++;
+ }
+ // If last file is empty, consider it preallocated and make sure it's not mapped
+ // until a write is requested
+ if ( n > 1 && getFile( n - 1 )->getHeader()->isEmpty() ) {
+ delete files[ n - 1 ];
+ files.pop_back();
+ }
+ }
+
+ MongoDataFile* Database::getFile( int n, int sizeNeeded , bool preallocateOnly) {
+ assert(this);
+
+ namespaceIndex.init();
+ if ( n < 0 || n >= DiskLoc::MaxFiles ) {
+ out() << "getFile(): n=" << n << endl;
+ massert( 10295 , "getFile(): bad file number value (corrupt db?): run repair", false);
+ }
+ DEV {
+ if ( n > 100 )
+ out() << "getFile(): n=" << n << "?" << endl;
+ }
+ MongoDataFile* p = 0;
+ if ( !preallocateOnly ) {
+ while ( n >= (int) files.size() )
+ files.push_back(0);
+ p = files[n];
+ }
+ if ( p == 0 ) {
+ boost::filesystem::path fullName = fileName( n );
+ string fullNameString = fullName.string();
+ p = new MongoDataFile(n);
+ int minSize = 0;
+ if ( n != 0 && files[ n - 1 ] )
+ minSize = files[ n - 1 ]->getHeader()->fileLength;
+ if ( sizeNeeded + DataFileHeader::HeaderSize > minSize )
+ minSize = sizeNeeded + DataFileHeader::HeaderSize;
+ try {
+ p->open( fullNameString.c_str(), minSize, preallocateOnly );
+ }
+ catch ( AssertionException& ) {
+ delete p;
+ throw;
+ }
+ if ( preallocateOnly )
+ delete p;
+ else
+ files[n] = p;
+ }
+ return preallocateOnly ? 0 : p;
+ }
+
+ MongoDataFile* Database::addAFile( int sizeNeeded, bool preallocateNextFile ) {
+ int n = (int) files.size();
+ MongoDataFile *ret = getFile( n, sizeNeeded );
+ if ( preallocateNextFile )
+ preallocateAFile();
+ return ret;
}
+ MongoDataFile* Database::suitableFile( int sizeNeeded, bool preallocate ) {
- bool Database::setProfilingLevel( int newLevel , string& errmsg ){
+ // check existing files
+ for ( int i=numFiles()-1; i>=0; i-- ) {
+ MongoDataFile* f = getFile( i );
+ if ( f->getHeader()->unusedLength >= sizeNeeded )
+ return f;
+ }
+
+ // allocate files until we either get one big enough or hit maxSize
+ for ( int i = 0; i < 8; i++ ) {
+ MongoDataFile* f = addAFile( sizeNeeded, preallocate );
+
+ if ( f->getHeader()->unusedLength >= sizeNeeded )
+ return f;
+
+ if ( f->getHeader()->fileLength >= MongoDataFile::maxSize() ) // this is as big as they get so might as well stop
+ return f;
+ }
+
+ return 0;
+ }
+
+ MongoDataFile* Database::newestFile() {
+ int n = numFiles();
+ if ( n == 0 )
+ return 0;
+ return getFile(n-1);
+ }
+
+
+ Extent* Database::allocExtent( const char *ns, int size, bool capped ) {
+ Extent *e = DataFileMgr::allocFromFreeList( ns, size, capped );
+ if( e )
+ return e;
+ return suitableFile( size, !capped )->createExtent( ns, size, capped );
+ }
+
+
+ bool Database::setProfilingLevel( int newLevel , string& errmsg ) {
if ( profile == newLevel )
return true;
-
- if ( newLevel < 0 || newLevel > 2 ){
+
+ if ( newLevel < 0 || newLevel > 2 ) {
errmsg = "profiling level has to be >=0 and <= 2";
return false;
}
-
- if ( newLevel == 0 ){
+
+ if ( newLevel == 0 ) {
profile = 0;
return true;
}
-
+
assert( cc().database() == this );
- if ( ! namespaceIndex.details( profileName.c_str() ) ){
+ if ( ! namespaceIndex.details( profileName.c_str() ) ) {
log(1) << "creating profile ns: " << profileName << endl;
BSONObjBuilder spec;
spec.appendBool( "capped", true );
spec.append( "size", 131072.0 );
- if ( ! userCreateNS( profileName.c_str(), spec.done(), errmsg , true ) ){
+ if ( ! userCreateNS( profileName.c_str(), spec.done(), errmsg , true ) ) {
return false;
}
}
@@ -103,26 +235,57 @@ namespace mongo {
return true;
}
- void Database::finishInit(){
+ void Database::finishInit() {
if ( cmdLine.defaultProfile == profile )
return;
-
+
string errmsg;
massert( 12506 , errmsg , setProfilingLevel( cmdLine.defaultProfile , errmsg ) );
}
- bool Database::validDBName( const string& ns ){
+ bool Database::validDBName( const string& ns ) {
if ( ns.size() == 0 || ns.size() > 64 )
return false;
size_t good = strcspn( ns.c_str() , "/\\. \"" );
return good == ns.size();
}
- void Database::flushFiles( bool sync ){
+ void Database::flushFiles( bool sync ) const {
dbMutex.assertAtLeastReadLocked();
- for ( unsigned i=0; i<files.size(); i++ ){
+ for ( unsigned i=0; i<files.size(); i++ ) {
files[i]->flush( sync );
}
}
+ long long Database::fileSize() const {
+ long long size=0;
+ for (int n=0; exists(n); n++)
+ size += boost::filesystem::file_size( fileName(n) );
+ return size;
+ }
+
+ Database* DatabaseHolder::getOrCreate( const string& ns , const string& path , bool& justCreated ) {
+ dbMutex.assertWriteLocked();
+ DBs& m = _paths[path];
+
+ string dbname = _todb( ns );
+
+ Database* & db = m[dbname];
+ if ( db ) {
+ justCreated = false;
+ return db;
+ }
+
+ log(1) << "Accessing: " << dbname << " for the first time" << endl;
+ try {
+ db = new Database( dbname.c_str() , justCreated , path );
+ }
+ catch ( ... ) {
+ m.erase( dbname );
+ throw;
+ }
+ _size++;
+ return db;
+ }
+
} // namespace mongo
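[editor's note] The new Database::suitableFile() walks existing files newest-first looking for enough unused space, then grows the database up to eight more files, stopping early once a file is big enough or has hit the maximum size. A toy model of that strategy (illustrative sizes and types, not the real MongoDataFile):

    // sketch_suitable_file.cpp -- illustrative only
    #include <vector>
    #include <algorithm>
    #include <iostream>

    struct File { long long length; long long unused; };

    const long long kMaxFileSize = 2000;     // stand-in for MongoDataFile::maxSize()

    // check existing files newest-first, then add files until one fits
    // or the newest file is already at the maximum size (mirrors the 8-iteration bound)
    File* suitableFile(std::vector<File>& files, long long sizeNeeded) {
        for (int i = (int)files.size() - 1; i >= 0; i--)
            if (files[i].unused >= sizeNeeded)
                return &files[i];
        for (int i = 0; i < 8; i++) {
            long long len = files.empty() ? 500 : std::min(files.back().length * 2, kMaxFileSize);
            files.push_back(File{len, len});                     // a new file is entirely unused
            if (files.back().unused >= sizeNeeded) return &files.back();
            if (files.back().length >= kMaxFileSize) return &files.back();  // as big as they get
        }
        return 0;
    }

    int main() {
        std::vector<File> files{ {500, 10}, {1000, 50} };
        File* f = suitableFile(files, 800);
        std::cout << "files=" << files.size() << " chosen length=" << (f ? f->length : -1) << std::endl;
    }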
diff --git a/db/database.h b/db/database.h
index c7d72c5..6e72ba8 100644
--- a/db/database.h
+++ b/db/database.h
@@ -23,6 +23,8 @@
namespace mongo {
class ClientCursor;
+ struct ByLocKey;
+ typedef map<ByLocKey, ClientCursor*> CCByLoc;
/**
      * Database represents a database

@@ -32,176 +34,90 @@ namespace mongo {
class Database {
public:
static bool _openAllFiles;
-
- Database(const char *nm, bool& newDb, const string& _path = dbpath);
-
- ~Database() {
- magic = 0;
- btreeStore->closeFiles(name, path);
- size_t n = files.size();
- for ( size_t i = 0; i < n; i++ )
- delete files[i];
- }
-
+
+ Database(const char *nm, /*out*/ bool& newDb, const string& _path = dbpath);
+ private:
+ ~Database();
+ public:
+ /* you must use this to close - there is essential code in this method that is not in the ~Database destructor.
+ thus the destructor is private. this could be cleaned up one day...
+ */
+ static void closeDatabase( const char *db, const string& path );
+
+ void openAllFiles();
+
+ void finishInit();
+
/**
* tries to make sure that this hasn't been deleted
*/
- bool isOk(){
- return magic == 781231;
- }
+ bool isOk() const { return magic == 781231; }
- bool isEmpty(){
- return ! namespaceIndex.allocated();
- }
+ bool isEmpty() { return ! namespaceIndex.allocated(); }
- boost::filesystem::path fileName( int n ) {
- stringstream ss;
- ss << name << '.' << n;
- boost::filesystem::path fullName;
- fullName = boost::filesystem::path(path);
- if ( directoryperdb )
- fullName /= name;
- fullName /= ss.str();
- return fullName;
- }
-
- bool exists(int n) {
- return boost::filesystem::exists( fileName( n ) );
- }
+ /**
+ * total file size of Database in bytes
+ */
+ long long fileSize() const;
- void openAllFiles() {
- int n = 0;
- while( exists(n) ) {
- getFile(n);
- n++;
- }
- // If last file is empty, consider it preallocated and make sure it's not mapped
- // until a write is requested
- if ( n > 1 && getFile( n - 1 )->getHeader()->isEmpty() ) {
- delete files[ n - 1 ];
- files.pop_back();
- }
- }
+ int numFiles() const { return (int)files.size(); }
- MongoDataFile* getFile( int n, int sizeNeeded = 0, bool preallocateOnly = false ) {
- assert(this);
-
- namespaceIndex.init();
- if ( n < 0 || n >= DiskLoc::MaxFiles ) {
- out() << "getFile(): n=" << n << endl;
-#if 0
- if( n >= RecCache::Base && n <= RecCache::Base+1000 )
- massert( 10294 , "getFile(): bad file number - using recstore db w/nonrecstore db build?", false);
-#endif
- massert( 10295 , "getFile(): bad file number value (corrupt db?): run repair", false);
- }
- DEV {
- if ( n > 100 )
- out() << "getFile(): n=" << n << "?" << endl;
- }
- MongoDataFile* p = 0;
- if ( !preallocateOnly ) {
- while ( n >= (int) files.size() )
- files.push_back(0);
- p = files[n];
- }
- if ( p == 0 ) {
- boost::filesystem::path fullName = fileName( n );
- string fullNameString = fullName.string();
- p = new MongoDataFile(n);
- int minSize = 0;
- if ( n != 0 && files[ n - 1 ] )
- minSize = files[ n - 1 ]->getHeader()->fileLength;
- if ( sizeNeeded + DataFileHeader::HeaderSize > minSize )
- minSize = sizeNeeded + DataFileHeader::HeaderSize;
- try {
- p->open( fullNameString.c_str(), minSize, preallocateOnly );
- }
- catch ( AssertionException& ) {
- delete p;
- throw;
- }
- if ( preallocateOnly )
- delete p;
- else
- files[n] = p;
- }
- return preallocateOnly ? 0 : p;
- }
+ /**
+         * returns the path for file number n
+ */
+ boost::filesystem::path fileName( int n ) const;
- MongoDataFile* addAFile( int sizeNeeded, bool preallocateNextFile ) {
- int n = (int) files.size();
- MongoDataFile *ret = getFile( n, sizeNeeded );
- if ( preallocateNextFile )
- preallocateAFile();
- return ret;
- }
-
- // safe to call this multiple times - the implementation will only preallocate one file
- void preallocateAFile() {
- int n = (int) files.size();
- getFile( n, 0, true );
- }
+ bool exists(int n) const { return boost::filesystem::exists( fileName( n ) ); }
- MongoDataFile* suitableFile( int sizeNeeded, bool preallocate ) {
- MongoDataFile* f = newestFile();
- if ( !f ) {
- f = addAFile( sizeNeeded, preallocate );
- }
- for ( int i = 0; i < 8; i++ ) {
- if ( f->getHeader()->unusedLength >= sizeNeeded )
- break;
- f = addAFile( sizeNeeded, preallocate );
- if ( f->getHeader()->fileLength >= MongoDataFile::maxSize() ) // this is as big as they get so might as well stop
- break;
- }
- return f;
- }
+ /**
+ * return file n. if it doesn't exist, create it
+ */
+ MongoDataFile* getFile( int n, int sizeNeeded = 0, bool preallocateOnly = false );
+
+ MongoDataFile* addAFile( int sizeNeeded, bool preallocateNextFile );
+
+ /**
+ * makes sure we have an extra file at the end that is empty
+ * safe to call this multiple times - the implementation will only preallocate one file
+ */
+ void preallocateAFile() { getFile( numFiles() , 0, true ); }
+
+ MongoDataFile* suitableFile( int sizeNeeded, bool preallocate );
+
+ Extent* allocExtent( const char *ns, int size, bool capped );
+
+ MongoDataFile* newestFile();
- Extent* allocExtent( const char *ns, int size, bool capped ) {
- Extent *e = DataFileMgr::allocFromFreeList( ns, size, capped );
- if( e ) return e;
- return suitableFile( size, !capped )->createExtent( ns, size, capped );
- }
-
- MongoDataFile* newestFile() {
- int n = (int) files.size();
- if ( n > 0 ) {
- n--;
- } else {
- return 0;
- }
- return getFile(n);
- }
-
/**
- * @return true if success, false otherwise
+ * @return true if success. false if bad level or error creating profile ns
*/
bool setProfilingLevel( int newLevel , string& errmsg );
- void finishInit();
- static bool validDBName( const string& ns );
+ void flushFiles( bool sync ) const;
- long long fileSize(){
- long long size=0;
- for (int n=0; exists(n); n++)
- size += boost::filesystem::file_size( fileName(n) );
- return size;
+ /**
+ * @return true if ns is part of the database
+ * ns=foo.bar, db=foo returns true
+ */
+ bool ownsNS( const string& ns ) const {
+ if ( ! startsWith( ns , name ) )
+ return false;
+ return ns[name.size()] == '.';
}
- void flushFiles( bool sync );
-
+ static bool validDBName( const string& ns );
+
+ public: // this should be private later
+
vector<MongoDataFile*> files;
- string name; // "alleyinsider"
- string path;
+ const string name; // "alleyinsider"
+ const string path;
NamespaceIndex namespaceIndex;
int profile; // 0=off.
- string profileName; // "alleyinsider.system.profile"
-
- multimap<DiskLoc, ClientCursor*> ccByLoc;
-
- int magic; // used for making sure the object is still loaded in memory
+ const string profileName; // "alleyinsider.system.profile"
+ CCByLoc ccByLoc;
+ int magic; // used for making sure the object is still loaded in memory
};
} // namespace mongo
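[editor's note] The new Database::ownsNS() accepts "foo.bar" for db "foo" but must not accept "foobar.baz"; the trick is requiring a '.' immediately after the db-name prefix. A minimal stand-alone version of the same check (hypothetical helper, not the member function itself):

    // sketch_owns_ns.cpp -- illustrative only
    #include <string>
    #include <iostream>

    // true if ns belongs to db: "foo.bar" belongs to "foo", but "foobar.baz" does not
    bool ownsNS(const std::string& dbName, const std::string& ns) {
        if (ns.compare(0, dbName.size(), dbName) != 0)    // must start with the db name
            return false;
        return ns.size() > dbName.size() && ns[dbName.size()] == '.';   // next char must be '.'
    }

    int main() {
        std::cout << ownsNS("foo", "foo.bar") << ' '      // 1
                  << ownsNS("foo", "foobar.baz") << ' '   // 0
                  << ownsNS("foo", "foo") << std::endl;   // 0
    }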
diff --git a/db/db.cpp b/db/db.cpp
index d5b9339..548ac14 100644
--- a/db/db.cpp
+++ b/db/db.cpp
@@ -1,4 +1,4 @@
-// @file db.cpp : Defines the entry point for the mongod application.
+// @file db.cpp : Defines main() for the mongod program.
/**
* Copyright (C) 2008 10gen Inc.
@@ -37,7 +37,10 @@
#include "../util/concurrency/task.h"
#include "../util/version.h"
#include "client.h"
+#include "restapi.h"
#include "dbwebserver.h"
+#include "dur.h"
+#include "concurrency.h"
#if defined(_WIN32)
# include "../util/ntservice.h"
@@ -55,31 +58,25 @@ namespace mongo {
extern char *appsrvPath;
extern int diagLogging;
- extern int lenForNewNsFiles;
+ extern unsigned lenForNewNsFiles;
extern int lockFile;
- extern bool checkNsFilesOnLoad;
+ extern bool checkNsFilesOnLoad;
extern string repairpath;
-#if defined(_WIN32)
- std::wstring windowsServiceName = L"MongoDB";
- std::wstring windowsServiceUser = L"";
- std::wstring windowsServicePassword = L"";
-#endif
-
- void setupSignals();
+ void setupSignals( bool inFork );
void startReplSets(ReplSetCmdline*);
void startReplication();
void pairWith(const char *remoteEnd, const char *arb);
void exitCleanly( ExitCode code );
CmdLine cmdLine;
- bool useJNI = true;
+ static bool scriptingEnabled = true;
bool noHttpInterface = false;
bool shouldRepairDatabases = 0;
- bool forceRepair = 0;
+ static bool forceRepair = 0;
Timer startupSrandTimer;
- const char *ourgetns() {
+ const char *ourgetns() {
Client *c = currentClient.get();
if ( ! c )
return "";
@@ -102,7 +99,7 @@ namespace mongo {
OurListener(const string &ip, int p) : Listener(ip, p) { }
virtual void accepted(MessagingPort *mp) {
- if ( ! connTicketHolder.tryAcquire() ){
+ if ( ! connTicketHolder.tryAcquire() ) {
log() << "connection refused because too many open connections: " << connTicketHolder.used() << " of " << connTicketHolder.outof() << endl;
// TODO: would be nice if we notified them...
mp->shutdown();
@@ -113,12 +110,12 @@ namespace mongo {
try {
boost::thread thr(boost::bind(&connThread,mp));
}
- catch ( boost::thread_resource_error& ){
+ catch ( boost::thread_resource_error& ) {
log() << "can't create new thread, closing connection" << endl;
mp->shutdown();
delete mp;
}
- catch ( ... ){
+ catch ( ... ) {
log() << "unkonwn exception starting connThread" << endl;
mp->shutdown();
delete mp;
@@ -126,14 +123,14 @@ namespace mongo {
}
};
-/* todo: make this a real test. the stuff in dbtests/ seem to do all dbdirectclient which exhaust doesn't support yet. */
+    /* todo: make this a real test. the stuff in dbtests/ seems to use dbdirectclient only, which exhaust doesn't support yet. */
// QueryOption_Exhaust
#define TESTEXHAUST 0
#if( TESTEXHAUST )
- void testExhaust() {
+ void testExhaust() {
sleepsecs(1);
unsigned n = 0;
- auto f = [&n](const BSONObj& o) {
+ auto f = [&n](const BSONObj& o) {
assert( o.valid() );
//cout << o << endl;
n++;
@@ -145,20 +142,20 @@ namespace mongo {
db.connect("localhost");
const char *ns = "local.foo";
if( db.count(ns) < 10000 )
- for( int i = 0; i < 20000; i++ )
+ for( int i = 0; i < 20000; i++ )
db.insert(ns, BSON("aaa" << 3 << "b" << "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"));
try {
db.query(f, ns, Query() );
}
- catch(...) {
+ catch(...) {
cout << "hmmm" << endl;
}
try {
db.query(f, ns, Query() );
}
- catch(...) {
+ catch(...) {
cout << "caught" << endl;
}
@@ -173,7 +170,7 @@ namespace mongo {
l.setAsTimeTracker();
startReplication();
if ( !noHttpInterface )
- boost::thread thr(webServerThread);
+ boost::thread web( boost::bind(&webServerThread, new RestAdminAccess() /* takes ownership */));
#if(TESTEXHAUST)
boost::thread thr(testExhaust);
@@ -203,8 +200,7 @@ namespace mongo {
app server will open a pool of threads.
todo: one day, asio...
*/
- void connThread( MessagingPort * inPort )
- {
+ void connThread( MessagingPort * inPort ) {
TicketHolderReleaser connTicketReleaser( &connTicketHolder );
/* todo: move to Client object */
@@ -221,11 +217,11 @@ namespace mongo {
Message m;
while ( 1 ) {
- m.reset();
+ inPort->clearCounters();
if ( !dbMsgPort->recv(m) ) {
if( !cmdLine.quiet )
- log() << "end connection " << dbMsgPort->farEnd.toString() << endl;
+ log() << "end connection " << dbMsgPort->farEnd.toString() << endl;
dbMsgPort->shutdown();
break;
}
@@ -234,27 +230,15 @@ sendmore:
log() << "got request after shutdown()" << endl;
break;
}
-
+
lastError.startRequest( m , le );
DbResponse dbresponse;
- if ( !assembleResponse( m, dbresponse, dbMsgPort->farEnd ) ) {
- log() << curTimeMillis() % 10000 << " end msg " << dbMsgPort->farEnd.toString() << endl;
- /* todo: we may not wish to allow this, even on localhost: very low priv accounts could stop us. */
- if ( dbMsgPort->farEnd.isLocalHost() ) {
- dbMsgPort->shutdown();
- sleepmillis(50);
- problem() << "exiting end msg" << endl;
- dbexit(EXIT_CLEAN);
- }
- else {
- log() << " (not from localhost, ignoring end msg)" << endl;
- }
- }
+ assembleResponse( m, dbresponse, dbMsgPort->farEnd );
if ( dbresponse.response ) {
dbMsgPort->reply(m, *dbresponse.response, dbresponse.responseTo);
- if( dbresponse.exhaust ) {
+ if( dbresponse.exhaust ) {
MsgData *header = dbresponse.response->header();
QueryResult *qr = (QueryResult *) header;
long long cursorid = qr->cursorId;
@@ -279,6 +263,10 @@ sendmore:
}
}
}
+
+ networkCounter.hit( inPort->getBytesIn() , inPort->getBytesOut() );
+
+ m.reset();
}
}
@@ -293,7 +281,7 @@ sendmore:
}
catch ( const ClockSkewException & ) {
exitCleanly( EXIT_CLOCK_SKEW );
- }
+ }
catch ( std::exception &e ) {
problem() << "Uncaught std::exception: " << e.what() << ", terminating" << endl;
dbexit( EXIT_UNCAUGHT );
@@ -303,91 +291,48 @@ sendmore:
dbexit( EXIT_UNCAUGHT );
}
- // any thread cleanup can happen here
-
- if ( currentClient.get() )
- currentClient->shutdown();
- globalScriptEngine->threadDone();
- }
-
- void msg(const char *m, const char *address, int port, int extras = 0) {
- SockAddr db(address, port);
-
- // SockAddr db("127.0.0.1", DBPort);
- // SockAddr db("192.168.37.1", MessagingPort::DBPort);
- // SockAddr db("10.0.21.60", MessagingPort::DBPort);
- // SockAddr db("172.16.0.179", MessagingPort::DBPort);
-
- MessagingPort p;
- if ( !p.connect(db) ){
- out() << "msg couldn't connect" << endl;
- return;
- }
-
- const int Loops = 1;
- for ( int q = 0; q < Loops; q++ ) {
- Message send;
- Message response;
-
- send.setData( dbMsg , m);
- int len = send.header()->dataLen();
-
- for ( int i = 0; i < extras; i++ )
- p.say(/*db, */send);
-
- Timer t;
- bool ok = p.call(send, response);
- double tm = ((double) t.micros()) + 1;
- out() << " ****ok. response.data:" << ok << " time:" << tm / 1000.0 << "ms "
- << "len: " << len << " data: " << response.singleData()->_data << endl;
-
- if ( q+1 < Loops ) {
- out() << "\t\tSLEEP 8 then sending again as a test" << endl;
- sleepsecs(8);
- }
+ // thread ending...
+ {
+ Client * c = currentClient.get();
+ if( c ) c->shutdown();
}
- sleepsecs(1);
-
- p.shutdown();
- }
-
- void msg(const char *m, int extras = 0) {
- msg(m, "127.0.0.1", CmdLine::DefaultDBPort, extras);
+ globalScriptEngine->threadDone();
}
- bool doDBUpgrade( const string& dbName , string errmsg , DataFileHeader * h ){
+ bool doDBUpgrade( const string& dbName , string errmsg , DataFileHeader * h ) {
static DBDirectClient db;
-
- if ( h->version == 4 && h->versionMinor == 4 ){
+
+ if ( h->version == 4 && h->versionMinor == 4 ) {
assert( VERSION == 4 );
assert( VERSION_MINOR == 5 );
-
+
list<string> colls = db.getCollectionNames( dbName );
- for ( list<string>::iterator i=colls.begin(); i!=colls.end(); i++){
+ for ( list<string>::iterator i=colls.begin(); i!=colls.end(); i++) {
string c = *i;
log() << "\t upgrading collection:" << c << endl;
BSONObj out;
bool ok = db.runCommand( dbName , BSON( "reIndex" << c.substr( dbName.size() + 1 ) ) , out );
- if ( ! ok ){
+ if ( ! ok ) {
errmsg = "reindex failed";
log() << "\t\t reindex failed: " << out << endl;
return false;
}
}
-
+
h->versionMinor = 5;
return true;
}
-
+
// do this in the general case
return repairDatabase( dbName.c_str(), errmsg );
}
-
- void repairDatabases() {
- // LastError * le = lastError.get( true );
+
+ // ran at startup.
+ static void repairDatabasesAndCheckVersion() {
+ // LastError * le = lastError.get( true );
Client::GodScope gs;
log(1) << "enter repairDatabases (to check pdfile version #)" << endl;
-
+
//assert(checkNsFilesOnLoad);
checkNsFilesOnLoad = false; // we are mainly just checking the header - don't scan the whole .ns file for every db here.
@@ -400,33 +345,39 @@ sendmore:
Client::Context ctx( dbName );
MongoDataFile *p = cc().database()->getFile( 0 );
DataFileHeader *h = p->getHeader();
- if ( !h->currentVersion() || forceRepair ) {
+ if ( !h->isCurrentVersion() || forceRepair ) {
+
+ if( h->version <= 0 ) {
+ uasserted(10000, str::stream() << "db " << dbName << " appears corrupt pdfile version: " << h->version << " info: " << h->versionMinor << ' ' << h->fileLength);
+ }
+
log() << "****" << endl;
log() << "****" << endl;
log() << "need to upgrade database " << dbName << " with pdfile version " << h->version << "." << h->versionMinor << ", "
<< "new version: " << VERSION << "." << VERSION_MINOR << endl;
- if ( shouldRepairDatabases ){
+ if ( shouldRepairDatabases ) {
// QUESTION: Repair even if file format is higher version than code?
log() << "\t starting upgrade" << endl;
string errmsg;
assert( doDBUpgrade( dbName , errmsg , h ) );
}
else {
- log() << "\t Not upgrading, exiting!" << endl;
+ log() << "\t Not upgrading, exiting" << endl;
log() << "\t run --upgrade to upgrade dbs, then start again" << endl;
log() << "****" << endl;
dbexit( EXIT_NEED_UPGRADE );
shouldRepairDatabases = 1;
return;
}
- } else {
- closeDatabase( dbName.c_str() );
+ }
+ else {
+ Database::closeDatabase( dbName.c_str(), dbpath );
}
}
log(1) << "done repairDatabases" << endl;
- if ( shouldRepairDatabases ){
+ if ( shouldRepairDatabases ) {
log() << "finished checking dbs" << endl;
cc().shutdown();
dbexit( EXIT_CLEAN );
@@ -441,11 +392,11 @@ sendmore:
i != boost::filesystem::directory_iterator(); ++i ) {
string fileName = boost::filesystem::path(*i).leaf();
if ( boost::filesystem::is_directory( *i ) &&
- fileName.length() && fileName[ 0 ] == '$' )
+ fileName.length() && fileName[ 0 ] == '$' )
boost::filesystem::remove_all( *i );
}
}
-
+
void clearTmpCollections() {
Client::GodScope gs;
vector< string > toDelete;
@@ -460,35 +411,38 @@ sendmore:
cli.dropCollection( *i );
}
}
-
+
+ void flushDiagLog();
+
/**
* does background async flushes of mmapped files
*/
class DataFileSync : public BackgroundJob {
public:
- string name() { return "DataFileSync"; }
- void run(){
- if( _sleepsecs == 0 )
+ string name() const { return "DataFileSync"; }
+ void run() {
+ if( cmdLine.syncdelay == 0 )
log() << "warning: --syncdelay 0 is not recommended and can have strange performance" << endl;
- else if( _sleepsecs == 1 )
+ else if( cmdLine.syncdelay == 1 )
log() << "--syncdelay 1" << endl;
- else if( _sleepsecs != 60 )
- log(1) << "--syncdelay " << _sleepsecs << endl;
+ else if( cmdLine.syncdelay != 60 )
+ log(1) << "--syncdelay " << cmdLine.syncdelay << endl;
int time_flushing = 0;
- while ( ! inShutdown() ){
- if ( _sleepsecs == 0 ){
+ while ( ! inShutdown() ) {
+ flushDiagLog();
+ if ( cmdLine.syncdelay == 0 ) {
// in case at some point we add an option to change at runtime
sleepsecs(5);
continue;
}
- sleepmillis( (long long) std::max(0.0, (_sleepsecs * 1000) - time_flushing) );
-
- if ( inShutdown() ){
+ sleepmillis( (long long) std::max(0.0, (cmdLine.syncdelay * 1000) - time_flushing) );
+
+ if ( inShutdown() ) {
// occasional issue trying to flush during shutdown when sleep interrupted
break;
}
-
+
Date_t start = jsTime();
int numFiles = MemoryMappedFile::flushAll( true );
time_flushing = (int) (jsTime() - start);
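[editor's note] The flush loop above budgets a fixed interval per cycle and subtracts the time the previous flush took, never sleeping a negative amount. The arithmetic in isolation (hypothetical helper name):

    // sketch_syncdelay.cpp -- illustrative only
    #include <algorithm>
    #include <iostream>

    // how long to sleep before the next flush, given how long the last flush took (ms)
    long long nextSleepMillis(double syncdelaySeconds, int lastFlushMillis) {
        return (long long) std::max(0.0, syncdelaySeconds * 1000 - lastFlushMillis);
    }

    int main() {
        std::cout << nextSleepMillis(60, 250) << std::endl;   // 59750: the 250ms flush comes out of the 60s budget
        std::cout << nextSleepMillis(1, 2500) << std::endl;   // 0: flush took longer than the interval
    }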
@@ -498,12 +452,22 @@ sendmore:
log(1) << "flushing mmap took " << time_flushing << "ms " << " for " << numFiles << " files" << endl;
}
}
-
- double _sleepsecs; // default value controlled by program options
+
} dataFileSync;
+ const char * jsInterruptCallback() {
+ // should be safe to interrupt in js code, even if we have a write lock
+ return killCurrentOp.checkForInterruptNoAssert( false );
+ }
+
+ unsigned jsGetInterruptSpecCallback() {
+ return cc().curop()->opNum();
+ }
+
void _initAndListen(int listenPort, const char *appserverLoc = NULL) {
+ Client::initThread("initandlisten");
+
bool is32bit = sizeof(int*) == 4;
{
@@ -534,38 +498,37 @@ sendmore:
ss << "repairpath (" << repairpath << ") does not exist";
uassert( 12590 , ss.str().c_str(), boost::filesystem::exists( repairpath ) );
}
-
+
acquirePathLock();
remove_all( dbpath + "/_tmp/" );
- theFileAllocator().start();
+ FileAllocator::get()->start();
BOOST_CHECK_EXCEPTION( clearTmpFiles() );
- Client::initThread("initandlisten");
_diaglog.init();
+ dur::startup();
+
+ if( cmdLine.durOptions & CmdLine::DurRecoverOnly )
+ return;
+
+ // comes after getDur().startup() because this reads from the database
clearTmpCollections();
Module::initAll();
-#if 0
- {
- stringstream indexpath;
- indexpath << dbpath << "/indexes.dat";
- RecCache::tempStore.init(indexpath.str().c_str(), BucketSize);
- }
-#endif
-
- if ( useJNI ) {
+ if ( scriptingEnabled ) {
ScriptEngine::setup();
+ globalScriptEngine->setCheckInterruptCallback( jsInterruptCallback );
+ globalScriptEngine->setGetInterruptSpecCallback( jsGetInterruptSpecCallback );
}
- repairDatabases();
+ repairDatabasesAndCheckVersion();
         /* we didn't want to pre-open all files for the repair check above. for regular
operation we do for read/write lock concurrency reasons.
- */
+ */
Database::_openAllFiles = true;
if ( shouldRepairDatabases )
@@ -597,7 +560,7 @@ sendmore:
log() << "exception in initAndListen std::exception: " << e.what() << ", terminating" << endl;
dbexit( EXIT_UNCAUGHT );
}
- catch ( int& n ){
+ catch ( int& n ) {
log() << "exception in initAndListen int: " << n << ", terminating" << endl;
dbexit( EXIT_UNCAUGHT );
}
@@ -607,13 +570,13 @@ sendmore:
}
}
- #if defined(_WIN32)
+#if defined(_WIN32)
bool initService() {
ServiceController::reportStatus( SERVICE_RUNNING );
initAndListen( cmdLine.port, appsrvPath );
return true;
}
- #endif
+#endif
} // namespace mongo
@@ -647,16 +610,17 @@ string arg_error_check(int argc, char* argv[]) {
return "";
}
-int main(int argc, char* argv[], char *envp[] )
-{
+int main(int argc, char* argv[]) {
static StaticObserver staticObserver;
getcurns = ourgetns;
po::options_description general_options("General options");
- #if defined(_WIN32)
- po::options_description windows_scm_options("Windows Service Control Manager options");
- #endif
+#if defined(_WIN32)
+ po::options_description windows_scm_options("Windows Service Control Manager options");
+#endif
po::options_description replication_options("Replication options");
+ po::options_description ms_options("Master/slave options");
+ po::options_description rs_options("Replica set options");
po::options_description sharding_options("Sharding options");
po::options_description visible_options("Allowed options");
po::options_description hidden_options("Hidden options");
@@ -666,94 +630,106 @@ int main(int argc, char* argv[], char *envp[] )
CmdLine::addGlobalOptions( general_options , hidden_options );
general_options.add_options()
- ("dbpath", po::value<string>() , "directory for datafiles")
- ("directoryperdb", "each database will be stored in a separate directory")
- ("repairpath", po::value<string>() , "root directory for repair files - defaults to dbpath" )
- ("cpu", "periodically show cpu and iowait utilization")
- ("noauth", "run without security")
- ("auth", "run with security")
- ("objcheck", "inspect client data for validity on receipt")
- ("quota", "enable db quota management")
- ("quotaFiles", po::value<int>(), "number of files allower per db, requires --quota")
- ("appsrvpath", po::value<string>(), "root directory for the babble app server")
- ("nocursors", "diagnostic/debugging option")
- ("nohints", "ignore query hints")
- ("nohttpinterface", "disable http interface")
- ("rest","turn on simple rest api")
- ("noscripting", "disable scripting engine")
- ("noprealloc", "disable data file preallocation")
- ("smallfiles", "use a smaller default file size")
- ("nssize", po::value<int>()->default_value(16), ".ns file size (in MB) for new databases")
- ("diaglog", po::value<int>(), "0=off 1=W 2=R 3=both 7=W+some reads")
- ("sysinfo", "print some diagnostic system information")
- ("upgrade", "upgrade db if needed")
- ("repair", "run repair on all dbs")
- ("notablescan", "do not allow table scans")
- ("syncdelay",po::value<double>(&dataFileSync._sleepsecs)->default_value(60), "seconds between disk syncs (0=never, but not recommended)")
- ("profile",po::value<int>(), "0=off 1=slow, 2=all")
- ("slowms",po::value<int>(&cmdLine.slowMS)->default_value(100), "value of slow for profile and console log" )
- ("maxConns",po::value<int>(), "max number of simultaneous connections")
- #if !defined(_WIN32)
- ("nounixsocket", "disable listening on unix sockets")
- #endif
- ("ipv6", "enable IPv6 support (disabled by default)")
- ;
- #if defined(_WIN32)
- windows_scm_options.add_options()
- ("install", "install mongodb service")
- ("remove", "remove mongodb service")
- ("reinstall", "reinstall mongodb service (equivilant of mongod --remove followed by mongod --install)")
- ("service", "start mongodb service")
- ("serviceName", po::value<string>(), "windows service name")
- ("serviceUser", po::value<string>(), "user name service executes as")
- ("servicePassword", po::value<string>(), "password used to authenticate serviceUser")
- ;
- #endif
-
- replication_options.add_options()
- ("master", "master mode")
- ("slave", "slave mode")
- ("source", po::value<string>(), "when slave: specify master as <server:port>")
- ("only", po::value<string>(), "when slave: specify a single database to replicate")
- ("pairwith", po::value<string>(), "address of server to pair with")
- ("arbiter", po::value<string>(), "address of arbiter server")
- ("slavedelay", po::value<int>(), "specify delay (in seconds) to be used when applying master ops to slave")
- ("fastsync", "indicate that this instance is starting from a dbpath snapshot of the repl peer")
- ("autoresync", "automatically resync if slave data is stale")
- ("oplogSize", po::value<int>(), "size limit (in MB) for op log")
- ("opIdMem", po::value<long>(), "size limit (in bytes) for in memory storage of op ids")
- ;
-
- sharding_options.add_options()
- ("configsvr", "declare this is a config db of a cluster")
- ("shardsvr", "declare this is a shard db of a cluster")
- ("noMoveParanoia" , "turn off paranoid saving of data for moveChunk. this is on by default for now, but default will switch" )
- ;
+ ("auth", "run with security")
+ ("cpu", "periodically show cpu and iowait utilization")
+ ("dbpath", po::value<string>() , "directory for datafiles")
+ ("diaglog", po::value<int>(), "0=off 1=W 2=R 3=both 7=W+some reads")
+ ("directoryperdb", "each database will be stored in a separate directory")
+ ("journal", "enable journaling")
+ ("journalOptions", po::value<int>(), "journal diagnostic options")
+ ("ipv6", "enable IPv6 support (disabled by default)")
+ ("jsonp","allow JSONP access via http (has security implications)")
+ ("maxConns",po::value<int>(), "max number of simultaneous connections")
+ ("noauth", "run without security")
+ ("nohttpinterface", "disable http interface")
+ ("noprealloc", "disable data file preallocation - will often hurt performance")
+ ("noscripting", "disable scripting engine")
+ ("notablescan", "do not allow table scans")
+#if !defined(_WIN32)
+ ("nounixsocket", "disable listening on unix sockets")
+#endif
+ ("nssize", po::value<int>()->default_value(16), ".ns file size (in MB) for new databases")
+ ("objcheck", "inspect client data for validity on receipt")
+ ("profile",po::value<int>(), "0=off 1=slow, 2=all")
+ ("quota", "limits each database to a certain number of files (8 default)")
+ ("quotaFiles", po::value<int>(), "number of files allower per db, requires --quota")
+ ("rest","turn on simple rest api")
+ ("repair", "run repair on all dbs")
+ ("repairpath", po::value<string>() , "root directory for repair files - defaults to dbpath" )
+ ("slowms",po::value<int>(&cmdLine.slowMS)->default_value(100), "value of slow for profile and console log" )
+ ("smallfiles", "use a smaller default file size")
+ ("syncdelay",po::value<double>(&cmdLine.syncdelay)->default_value(60), "seconds between disk syncs (0=never, but not recommended)")
+ ("sysinfo", "print some diagnostic system information")
+ ("upgrade", "upgrade db if needed")
+ ;
+
+#if defined(_WIN32)
+ CmdLine::addWindowsOptions( windows_scm_options, hidden_options );
+#endif
+
+ replication_options.add_options()
+ ("fastsync", "indicate that this instance is starting from a dbpath snapshot of the repl peer")
+ ("autoresync", "automatically resync if slave data is stale")
+ ("oplogSize", po::value<int>(), "size limit (in MB) for op log")
+ ;
+
+ ms_options.add_options()
+ ("master", "master mode")
+ ("slave", "slave mode")
+ ("source", po::value<string>(), "when slave: specify master as <server:port>")
+ ("only", po::value<string>(), "when slave: specify a single database to replicate")
+ ("slavedelay", po::value<int>(), "specify delay (in seconds) to be used when applying master ops to slave")
+ ;
+
+ rs_options.add_options()
+ ("replSet", po::value<string>(), "arg is <setname>[/<optionalseedhostlist>]")
+ ;
+
+ sharding_options.add_options()
+ ("configsvr", "declare this is a config db of a cluster; default port 27019; default dir /data/configdb")
+ ("shardsvr", "declare this is a shard db of a cluster; default port 27018")
+ ("noMoveParanoia" , "turn off paranoid saving of data for moveChunk. this is on by default for now, but default will switch" )
+ ;
hidden_options.add_options()
- ("pretouch", po::value<int>(), "n pretouch threads for applying replicationed operations")
- ("replSet", po::value<string>(), "specify repl set seed hostnames format <set id>/<host1>,<host2>,etc...")
- ("command", po::value< vector<string> >(), "command")
- ("cacheSize", po::value<long>(), "cache size (in MB) for rec store")
- ;
+ ("pretouch", po::value<int>(), "n pretouch threads for applying replicationed operations")
+ ("command", po::value< vector<string> >(), "command")
+ ("cacheSize", po::value<long>(), "cache size (in MB) for rec store")
+ // these move to unhidden later:
+ ("opIdMem", po::value<long>(), "size limit (in bytes) for in memory storage of op ids for replica pairs DEPRECATED")
+ ("pairwith", po::value<string>(), "address of server to pair with DEPRECATED")
+ ("arbiter", po::value<string>(), "address of replica pair arbiter server DEPRECATED")
+ ("nodur", "disable journaling (currently the default)")
+ ("appsrvpath", po::value<string>(), "root directory for the babble app server")
+ ("nocursors", "diagnostic/debugging option that turns off cursors DO NOT USE IN PRODUCTION")
+ ("nohints", "ignore query hints")
+ ("dur", "enable journaling") // deprecated version
+ ("durOptions", po::value<int>(), "durability diagnostic options") // deprecated version
+ ;
positional_options.add("command", 3);
visible_options.add(general_options);
- #if defined(_WIN32)
- visible_options.add(windows_scm_options);
- #endif
+#if defined(_WIN32)
+ visible_options.add(windows_scm_options);
+#endif
visible_options.add(replication_options);
+ visible_options.add(ms_options);
+ visible_options.add(rs_options);
visible_options.add(sharding_options);
Module::addOptions( visible_options );
setupCoreSignals();
- setupSignals();
+ setupSignals( false );
dbExecCommand = argv[0];
srand(curTimeMicros());
+#if( BOOST_VERSION >= 104500 )
+ boost::filesystem::path::default_name_check( boost::filesystem2::no_check );
+#else
boost::filesystem::path::default_name_check( boost::filesystem::no_check );
+#endif
{
unsigned x = 0x12345678;
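[editor's note] The option handling in the hunk above follows the standard boost::program_options declare/parse/query pattern. A minimal, self-contained sketch of that pattern with hypothetical option names (not the full mongod option set):

    // sketch_options.cpp -- illustrative only
    #include <boost/program_options.hpp>
    #include <iostream>
    #include <string>

    namespace po = boost::program_options;

    int main(int argc, char* argv[]) {
        po::options_description general("General options");
        general.add_options()
            ("help", "show help")
            ("dbpath", po::value<std::string>()->default_value("/data/db/"), "directory for datafiles")
            ("syncdelay", po::value<double>()->default_value(60), "seconds between disk syncs")
            ;

        po::variables_map params;
        po::store(po::parse_command_line(argc, argv, general), params);
        po::notify(params);

        if (params.count("help")) { std::cout << general << std::endl; return 0; }

        // same count()/as<T>() pattern the real option handling uses
        std::cout << "dbpath=" << params["dbpath"].as<std::string>()
                  << " syncdelay=" << params["syncdelay"].as<double>() << std::endl;
        return 0;
    }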
@@ -764,18 +740,12 @@ int main(int argc, char* argv[], char *envp[] )
}
}
- UnitTest::runTests();
-
if( argc == 1 )
cout << dbExecCommand << " --help for help and startup options" << endl;
{
- bool installService = false;
- bool removeService = false;
- bool reinstallService = false;
- bool startService = false;
po::variables_map params;
-
+
string error_message = arg_error_check(argc, argv);
if (error_message != "") {
cout << error_message << endl << endl;
@@ -795,10 +765,19 @@ int main(int argc, char* argv[], char *envp[] )
printGitVersion();
return 0;
}
- if ( params.count( "dbpath" ) )
+ if ( params.count( "dbpath" ) ) {
dbpath = params["dbpath"].as<string>();
- else
+ if ( params.count( "fork" ) && dbpath[0] != '/' ) {
+ // we need to change dbpath if we fork since we change
+ // cwd to "/"
+ // fork only exists on *nix
+ // so '/' is safe
+ dbpath = cmdLine.cwd + "/" + dbpath;
+ }
+ }
+ else {
dbpath = "/data/db/";
+ }
if ( params.count("directoryperdb")) {
directoryperdb = true;
@@ -819,6 +798,18 @@ int main(int argc, char* argv[], char *envp[] )
cmdLine.quota = true;
cmdLine.quotaFiles = params["quotaFiles"].as<int>() - 1;
}
+ if( params.count("nodur") ) {
+ cmdLine.dur = false;
+ }
+ if( params.count("dur") || params.count( "journal" ) ) {
+ cmdLine.dur = true;
+ }
+ if (params.count("durOptions")) {
+ cmdLine.durOptions = params["durOptions"].as<int>();
+ }
+ if (params.count("journalOptions")) {
+ cmdLine.durOptions = params["durOptions"].as<int>();
+ }
if (params.count("objcheck")) {
objcheck = true;
}
@@ -828,8 +819,12 @@ int main(int argc, char* argv[], char *envp[] )
}
if (params.count("repairpath")) {
repairpath = params["repairpath"].as<string>();
- uassert( 12589, "repairpath has to be non-zero", repairpath.size() );
- } else {
+ if (!repairpath.size()) {
+ out() << "repairpath has to be non-zero" << endl;
+ dbexit( EXIT_BADOPTIONS );
+ }
+ }
+ else {
repairpath = dbpath;
}
if (params.count("nocursors")) {
@@ -844,11 +839,15 @@ int main(int argc, char* argv[], char *envp[] )
if (params.count("rest")) {
cmdLine.rest = true;
}
+ if (params.count("jsonp")) {
+ cmdLine.jsonp = true;
+ }
if (params.count("noscripting")) {
- useJNI = false;
+ scriptingEnabled = false;
}
if (params.count("noprealloc")) {
cmdLine.prealloc = false;
+ cout << "note: noprealloc may hurt performance in many applications" << endl;
}
if (params.count("smallfiles")) {
cmdLine.smallfiles = true;
@@ -873,29 +872,7 @@ int main(int argc, char* argv[], char *envp[] )
shouldRepairDatabases = 1;
}
if (params.count("notablescan")) {
- cmdLine.notablescan = true;
- }
- if (params.count("install")) {
- if ( ! params.count( "logpath" ) ){
- cout << "--install has to be used with --logpath" << endl;
- ::exit(-1);
- }
-
- installService = true;
- }
- if (params.count("remove")) {
- removeService = true;
- }
- if (params.count("reinstall")) {
- if ( ! params.count( "logpath" ) ){
- cout << "--reinstall has to be used with --logpath" << endl;
- ::exit(-1);
- }
-
- reinstallService = true;
- }
- if (params.count("service")) {
- startService = true;
+ cmdLine.noTableScan = true;
}
if (params.count("master")) {
replSettings.master = true;
@@ -916,16 +893,17 @@ int main(int argc, char* argv[], char *envp[] )
/* specifies what the source in local.sources should be */
cmdLine.source = params["source"].as<string>().c_str();
}
- if( params.count("pretouch") ) {
+ if( params.count("pretouch") ) {
cmdLine.pretouch = params["pretouch"].as<int>();
}
if (params.count("replSet")) {
if (params.count("slavedelay")) {
- cout << "--slavedelay cannot be used with --replSet" << endl;
- ::exit(-1);
- } else if (params.count("only")) {
- cout << "--only cannot be used with --replSet" << endl;
- ::exit(-1);
+ out() << "--slavedelay cannot be used with --replSet" << endl;
+ dbexit( EXIT_BADOPTIONS );
+ }
+ else if (params.count("only")) {
+ out() << "--only cannot be used with --replSet" << endl;
+ dbexit( EXIT_BADOPTIONS );
}
/* seed list of hosts for the repl set */
cmdLine._replSet = params["replSet"].as<string>().c_str();
@@ -937,103 +915,108 @@ int main(int argc, char* argv[], char *envp[] )
cout << "***********************************\n"
<< "WARNING WARNING WARNING\n"
<< " replica pairs are deprecated\n"
- << " see: http://www.mongodb.org/display/DOCS/Replica+Pairs \n"
+ << " see: http://www.mongodb.org/display/DOCS/Replica+Pairs \n"
<< "***********************************" << endl;
string paired = params["pairwith"].as<string>();
if (params.count("arbiter")) {
string arbiter = params["arbiter"].as<string>();
pairWith(paired.c_str(), arbiter.c_str());
- } else {
+ }
+ else {
pairWith(paired.c_str(), "-");
}
- } else if (params.count("arbiter")) {
- uasserted(10999,"specifying --arbiter without --pairwith");
+ }
+ else if (params.count("arbiter")) {
+ out() << "specifying --arbiter without --pairwith" << endl;
+ dbexit( EXIT_BADOPTIONS );
}
if( params.count("nssize") ) {
int x = params["nssize"].as<int>();
- uassert( 10034 , "bad --nssize arg", x > 0 && x <= (0x7fffffff/1024/1024));
+ if (x <= 0 || x > (0x7fffffff/1024/1024)) {
+ out() << "bad --nssize arg" << endl;
+ dbexit( EXIT_BADOPTIONS );
+ }
lenForNewNsFiles = x * 1024 * 1024;
assert(lenForNewNsFiles > 0);
}
if (params.count("oplogSize")) {
- long x = params["oplogSize"].as<int>();
- uassert( 10035 , "bad --oplogSize arg", x > 0);
+ long long x = params["oplogSize"].as<int>();
+ if (x <= 0) {
+ out() << "bad --oplogSize arg" << endl;
+ dbexit( EXIT_BADOPTIONS );
+ }
+ // note a small size such as x==1 is ok for an arbiter.
+ if( x > 1000 && sizeof(void*) == 4 ) {
+ out() << "--oplogSize of " << x << "MB is too big for 32 bit version. Use 64 bit build instead." << endl;
+ dbexit( EXIT_BADOPTIONS );
+ }
cmdLine.oplogSize = x * 1024 * 1024;
assert(cmdLine.oplogSize > 0);
}
if (params.count("opIdMem")) {
long x = params["opIdMem"].as<long>();
- uassert( 10036 , "bad --opIdMem arg", x > 0);
+ if (x <= 0) {
+ out() << "bad --opIdMem arg" << endl;
+ dbexit( EXIT_BADOPTIONS );
+ }
replSettings.opIdMem = x;
assert(replSettings.opIdMem > 0);
}
if (params.count("cacheSize")) {
long x = params["cacheSize"].as<long>();
- uassert( 10037 , "bad --cacheSize arg", x > 0);
+ if (x <= 0) {
+ out() << "bad --cacheSize arg" << endl;
+ dbexit( EXIT_BADOPTIONS );
+ }
log() << "--cacheSize option not currently supported" << endl;
- //setRecCacheSize(x);
- }
- if (params.count("port") == 0 ) {
- if( params.count("configsvr") ) {
- cmdLine.port = CmdLine::ConfigServerPort;
- }
- if( params.count("shardsvr") )
- cmdLine.port = CmdLine::ShardServerPort;
- }
- else {
- if ( cmdLine.port <= 0 || cmdLine.port > 65535 ){
+ }
+ if (params.count("port") == 0 ) {
+ if( params.count("configsvr") ) {
+ cmdLine.port = CmdLine::ConfigServerPort;
+ }
+ if( params.count("shardsvr") )
+ cmdLine.port = CmdLine::ShardServerPort;
+ }
+ else {
+ if ( cmdLine.port <= 0 || cmdLine.port > 65535 ) {
out() << "bad --port number" << endl;
dbexit( EXIT_BADOPTIONS );
}
}
- if ( params.count("configsvr" ) ){
+ if ( params.count("configsvr" ) ) {
+ if (cmdLine.usingReplSets() || replSettings.master || replSettings.slave) {
+ log() << "replication should not be enabled on a config server" << endl;
+ ::exit(-1);
+ }
if ( params.count( "diaglog" ) == 0 )
_diaglog.level = 1;
if ( params.count( "dbpath" ) == 0 )
dbpath = "/data/configdb";
}
- if ( params.count( "profile" ) ){
+ if ( params.count( "profile" ) ) {
cmdLine.defaultProfile = params["profile"].as<int>();
}
- if ( params.count( "maxConns" ) ){
+ if ( params.count( "maxConns" ) ) {
int newSize = params["maxConns"].as<int>();
- uassert( 12507 , "maxConns has to be at least 5" , newSize >= 5 );
- uassert( 12508 , "maxConns can't be greater than 10000000" , newSize < 10000000 );
+ if ( newSize < 5 ) {
+ out() << "maxConns has to be at least 5" << endl;
+ dbexit( EXIT_BADOPTIONS );
+ }
+ else if ( newSize >= 10000000 ) {
+ out() << "maxConns can't be greater than 10000000" << endl;
+ dbexit( EXIT_BADOPTIONS );
+ }
connTicketHolder.resize( newSize );
}
- if (params.count("nounixsocket")){
+ if (params.count("nounixsocket")) {
noUnixSocket = true;
}
- if (params.count("ipv6")){
+ if (params.count("ipv6")) {
enableIPv6();
}
- if (params.count("noMoveParanoia")){
+ if (params.count("noMoveParanoia")) {
cmdLine.moveParanoia = false;
}
-#if defined(_WIN32)
- if (params.count("serviceName")){
- string x = params["serviceName"].as<string>();
- windowsServiceName = wstring(x.size(),L' ');
- for ( size_t i=0; i<x.size(); i++) {
- windowsServiceName[i] = x[i];
- }
- }
- if (params.count("serviceUser")){
- string x = params["serviceUser"].as<string>();
- windowsServiceUser = wstring(x.size(),L' ');
- for ( size_t i=0; i<x.size(); i++) {
- windowsServiceUser[i] = x[i];
- }
- }
- if (params.count("servicePassword")){
- string x = params["servicePassword"].as<string>();
- windowsServicePassword = wstring(x.size(),L' ');
- for ( size_t i=0; i<x.size(); i++) {
- windowsServicePassword[i] = x[i];
- }
- }
- #endif
-
Module::configAll( params );
dataFileSync.go();
@@ -1041,20 +1024,6 @@ int main(int argc, char* argv[], char *envp[] )
if (params.count("command")) {
vector<string> command = params["command"].as< vector<string> >();
- if (command[0].compare("msg") == 0) {
- const char *m;
-
- if (command.size() < 3) {
- cout << "Too few parameters to 'msg' command" << endl;
- cout << visible_options << endl;
- return 0;
- }
-
- m = command[1].c_str();
-
- msg(m, "127.0.0.1", atoi(command[2].c_str()));
- return 0;
- }
if (command[0].compare("run") == 0) {
if (command.size() > 1) {
cout << "Too many parameters to 'run' command" << endl;
@@ -1076,31 +1045,17 @@ int main(int argc, char* argv[], char *envp[] )
return 0;
}
+ if( cmdLine.pretouch )
+ log() << "--pretouch " << cmdLine.pretouch << endl;
+
#if defined(_WIN32)
- if ( reinstallService ) {
- ServiceController::removeService( windowsServiceName );
- }
- if ( installService || reinstallService ) {
- if ( !ServiceController::installService( windowsServiceName , L"Mongo DB", L"Mongo DB Server", windowsServiceUser, windowsServicePassword, dbpath, argc, argv ) )
- dbexit( EXIT_NTSERVICE_ERROR );
- dbexit( EXIT_CLEAN );
- }
- else if ( removeService ) {
- if ( !ServiceController::removeService( windowsServiceName ) )
- dbexit( EXIT_NTSERVICE_ERROR );
- dbexit( EXIT_CLEAN );
- }
- else if ( startService ) {
- if ( !ServiceController::startService( windowsServiceName , mongo::initService ) )
- dbexit( EXIT_NTSERVICE_ERROR );
- dbexit( EXIT_CLEAN );
+ if (serviceParamsCheck( params, dbpath, argc, argv )) {
+ return 0;
}
#endif
}
- if( cmdLine.pretouch )
- log() << "--pretouch " << cmdLine.pretouch << endl;
-
+ UnitTest::runTests();
initAndListen(cmdLine.port, appsrvPath);
dbexit(EXIT_CLEAN);
return 0;
@@ -1113,12 +1068,11 @@ namespace mongo {
#undef out
void exitCleanly( ExitCode code ) {
- goingAway = true;
killCurrentOp.killAll();
{
dblock lk;
log() << "now exiting" << endl;
- dbexit( code );
+ dbexit( code );
}
}
@@ -1154,7 +1108,12 @@ namespace mongo {
oss << "Backtrace:" << endl;
printStackTrace( oss );
rawOut( oss.str() );
- dbexit( EXIT_ABRUBT );
+
+ if( cmdLine.dur ) {
+ ::exit(EXIT_ABRUPT);
+ }
+
+ dbexit( EXIT_ABRUPT );
}
sigset_t asyncSignals;
@@ -1171,12 +1130,14 @@ namespace mongo {
// this will be called in certain c++ error cases, for example if there are two active
// exceptions
void myterminate() {
- rawOut( "terminate() called, printing stack:\n" );
+ rawOut( "terminate() called, printing stack:" );
printStackTrace();
abort();
}
-
- void setupSignals() {
+
+ void setupSignals_ignoreHelper( int signal ) {}
+
+ void setupSignals( bool inFork ) {
assert( signal(SIGSEGV, abruptQuit) != SIG_ERR );
assert( signal(SIGFPE, abruptQuit) != SIG_ERR );
assert( signal(SIGABRT, abruptQuit) != SIG_ERR );
@@ -1187,55 +1148,58 @@ namespace mongo {
setupSIGTRAPforGDB();
sigemptyset( &asyncSignals );
- sigaddset( &asyncSignals, SIGHUP );
+
+ if ( inFork )
+ assert( signal( SIGHUP , setupSignals_ignoreHelper ) != SIG_ERR );
+ else
+ sigaddset( &asyncSignals, SIGHUP );
+
sigaddset( &asyncSignals, SIGINT );
sigaddset( &asyncSignals, SIGTERM );
assert( pthread_sigmask( SIG_SETMASK, &asyncSignals, 0 ) == 0 );
boost::thread it( interruptThread );
-
+
set_terminate( myterminate );
}
#else
-void ctrlCTerminate() {
- log() << "got kill or ctrl-c signal, will terminate after current cmd ends" << endl;
- Client::initThread( "ctrlCTerminate" );
- exitCleanly( EXIT_KILL );
-}
-BOOL CtrlHandler( DWORD fdwCtrlType )
-{
- switch( fdwCtrlType )
- {
- case CTRL_C_EVENT:
- rawOut("Ctrl-C signal\n");
- ctrlCTerminate();
- return( TRUE );
- case CTRL_CLOSE_EVENT:
- rawOut("CTRL_CLOSE_EVENT signal\n");
- ctrlCTerminate();
- return( TRUE );
- case CTRL_BREAK_EVENT:
- rawOut("CTRL_BREAK_EVENT signal\n");
- ctrlCTerminate();
- return TRUE;
- case CTRL_LOGOFF_EVENT:
- rawOut("CTRL_LOGOFF_EVENT signal (ignored)\n");
- return FALSE;
- case CTRL_SHUTDOWN_EVENT:
- rawOut("CTRL_SHUTDOWN_EVENT signal (ignored)\n");
- return FALSE;
- default:
- return FALSE;
+ void ctrlCTerminate() {
+ log() << "got kill or ctrl-c signal, will terminate after current cmd ends" << endl;
+ Client::initThread( "ctrlCTerminate" );
+ exitCleanly( EXIT_KILL );
+ }
+ BOOL CtrlHandler( DWORD fdwCtrlType ) {
+ switch( fdwCtrlType ) {
+ case CTRL_C_EVENT:
+ rawOut("Ctrl-C signal");
+ ctrlCTerminate();
+ return( TRUE );
+ case CTRL_CLOSE_EVENT:
+ rawOut("CTRL_CLOSE_EVENT signal");
+ ctrlCTerminate();
+ return( TRUE );
+ case CTRL_BREAK_EVENT:
+ rawOut("CTRL_BREAK_EVENT signal");
+ ctrlCTerminate();
+ return TRUE;
+ case CTRL_LOGOFF_EVENT:
+ rawOut("CTRL_LOGOFF_EVENT signal (ignored)");
+ return FALSE;
+ case CTRL_SHUTDOWN_EVENT:
+ rawOut("CTRL_SHUTDOWN_EVENT signal (ignored)");
+ return FALSE;
+ default:
+ return FALSE;
+ }
}
-}
void myPurecallHandler() {
- rawOut( "pure virtual method called, printing stack:\n" );
+ rawOut( "pure virtual method called, printing stack:" );
printStackTrace();
- abort();
+ abort();
}
-
- void setupSignals() {
+
+ void setupSignals( bool inFork ) {
if( SetConsoleCtrlHandler( (PHANDLER_ROUTINE) CtrlHandler, TRUE ) )
;
else
@@ -1245,6 +1209,3 @@ BOOL CtrlHandler( DWORD fdwCtrlType )
#endif
} // namespace mongo
-
-//#include "recstore.h"
-//#include "reccache.h"
diff --git a/db/db.h b/db/db.h
index a261f58..7ef7d03 100644
--- a/db/db.h
+++ b/db/db.h
@@ -26,19 +26,6 @@ namespace mongo {
// void jniCallback(Message& m, Message& out);
- /* Note the limit here is rather arbitrary and is simply a standard. generally the code works
- with any object that fits in ram.
-
- Also note that the server has some basic checks to enforce this limit but those checks are not exhaustive
- for example need to check for size too big after
- update $push (append) operation
- various db.eval() type operations
-
- Note also we sometimes do work with objects slightly larger - an object in the replication local.oplog
- could be slightly larger.
- */
- const int MaxBSONObjectSize = 4 * 1024 * 1024;
-
/**
* class to hold path + dbname -> Database
* might be able to optimizer further
@@ -48,8 +35,7 @@ namespace mongo {
typedef map<string,Database*> DBs;
typedef map<string,DBs> Paths;
- DatabaseHolder() : _size(0){
- }
+ DatabaseHolder() : _size(0) { }
bool isLoaded( const string& ns , const string& path ) const {
dbMutex.assertAtLeastReadLocked();
@@ -57,29 +43,29 @@ namespace mongo {
if ( x == _paths.end() )
return false;
const DBs& m = x->second;
-
+
string db = _todb( ns );
DBs::const_iterator it = m.find(db);
return it != m.end();
}
-
+
Database * get( const string& ns , const string& path ) const {
dbMutex.assertAtLeastReadLocked();
Paths::const_iterator x = _paths.find( path );
if ( x == _paths.end() )
return 0;
const DBs& m = x->second;
-
+
string db = _todb( ns );
DBs::const_iterator it = m.find(db);
- if ( it != m.end() )
+ if ( it != m.end() )
return it->second;
return 0;
}
-
- void put( const string& ns , const string& path , Database * db ){
+
+ void put( const string& ns , const string& path , Database * db ) {
dbMutex.assertWriteLocked();
DBs& m = _paths[path];
Database*& d = m[_todb(ns)];
@@ -87,35 +73,10 @@ namespace mongo {
_size++;
d = db;
}
-
- Database* getOrCreate( const string& ns , const string& path , bool& justCreated ){
- dbMutex.assertWriteLocked();
- DBs& m = _paths[path];
-
- string dbname = _todb( ns );
-
- Database* & db = m[dbname];
- if ( db ){
- justCreated = false;
- return db;
- }
-
- log(1) << "Accessing: " << dbname << " for the first time" << endl;
- try {
- db = new Database( dbname.c_str() , justCreated , path );
- }
- catch ( ... ){
- m.erase( dbname );
- throw;
- }
- _size++;
- return db;
- }
-
+ Database* getOrCreate( const string& ns , const string& path , bool& justCreated );
-
- void erase( const string& ns , const string& path ){
+ void erase( const string& ns , const string& path ) {
dbMutex.assertWriteLocked();
DBs& m = _paths[path];
_size -= (int)m.erase( _todb( ns ) );
@@ -124,71 +85,77 @@ namespace mongo {
/* force - force close even if something underway - use at shutdown */
bool closeAll( const string& path , BSONObjBuilder& result, bool force );
- int size(){
+ int size() {
return _size;
}
-
+
+ void forEach(boost::function<void(Database *)> f) const {
+ dbMutex.assertAtLeastReadLocked();
+ for ( Paths::const_iterator i=_paths.begin(); i!=_paths.end(); i++ ) {
+ DBs m = i->second;
+ for( DBs::const_iterator j=m.begin(); j!=m.end(); j++ ) {
+ f(j->second);
+ }
+ }
+ }
+
/**
* gets all unique db names, ignoring paths
*/
void getAllShortNames( set<string>& all ) const {
dbMutex.assertAtLeastReadLocked();
- for ( Paths::const_iterator i=_paths.begin(); i!=_paths.end(); i++ ){
+ for ( Paths::const_iterator i=_paths.begin(); i!=_paths.end(); i++ ) {
DBs m = i->second;
- for( DBs::const_iterator j=m.begin(); j!=m.end(); j++ ){
+ for( DBs::const_iterator j=m.begin(); j!=m.end(); j++ ) {
all.insert( j->first );
}
}
}
private:
-
+
string _todb( const string& ns ) const {
string d = __todb( ns );
- uassert( 13280 , (string)"invalid db name: " + ns , Database::validDBName( d ) );
+ uassert( 13280 , (string)"invalid db name: " + ns , Database::validDBName( d ) );
return d;
}
string __todb( const string& ns ) const {
size_t i = ns.find( '.' );
- if ( i == string::npos ){
+ if ( i == string::npos ) {
uassert( 13074 , "db name can't be empty" , ns.size() );
return ns;
}
uassert( 13075 , "db name can't be empty" , i > 0 );
return ns.substr( 0 , i );
}
-
+
Paths _paths;
int _size;
-
+
};
extern DatabaseHolder dbHolder;
- // shared functionality for removing references to a database from this program instance
- // does not delete the files on disk
- void closeDatabase( const char *cl, const string& path = dbpath );
-
struct dbtemprelease {
Client::Context * _context;
int _locktype;
-
+
dbtemprelease() {
_context = cc().getContext();
_locktype = dbMutex.getState();
assert( _locktype );
-
+
if ( _locktype > 0 ) {
- massert( 10298 , "can't temprelease nested write lock", _locktype == 1);
+ massert( 10298 , "can't temprelease nested write lock", _locktype == 1);
if ( _context ) _context->unlocked();
dbMutex.unlock();
- }
+ }
else {
- massert( 10299 , "can't temprelease nested read lock", _locktype == -1);
+ massert( 10299 , "can't temprelease nested read lock", _locktype == -1);
if ( _context ) _context->unlocked();
dbMutex.unlock_shared();
- }
+ }
}
~dbtemprelease() {
@@ -196,11 +163,11 @@ namespace mongo {
dbMutex.lock();
else
dbMutex.lock_shared();
-
+
if ( _context ) _context->relocked();
}
};
-
+
/**
only does a temp release if we're not nested and have a lock
@@ -208,22 +175,22 @@ namespace mongo {
struct dbtempreleasecond {
dbtemprelease * real;
int locktype;
-
- dbtempreleasecond(){
+
+ dbtempreleasecond() {
real = 0;
locktype = dbMutex.getState();
if ( locktype == 1 || locktype == -1 )
real = new dbtemprelease();
}
-
- ~dbtempreleasecond(){
- if ( real ){
+
+ ~dbtempreleasecond() {
+ if ( real ) {
delete real;
real = 0;
}
}
-
- bool unlocked(){
+
+ bool unlocked() {
return real > 0;
}
};
diff --git a/db/db.sln b/db/db.sln
deleted file mode 100644
index b02b79d..0000000
--- a/db/db.sln
+++ /dev/null
@@ -1,86 +0,0 @@
-
-Microsoft Visual Studio Solution File, Format Version 10.00
-# Visual Studio 2008
-Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "mongod", "db.vcproj", "{215B2D68-0A70-4D10-8E75-B31010C62A91}"
-EndProject
-Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "examples", "examples", "{4082881B-EB00-486F-906C-843B8EC06E18}"
- ProjectSection(SolutionItems) = preProject
- driverHelpers.cpp = driverHelpers.cpp
- EndProjectSection
-EndProject
-Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "tools", "tools", "{2B262D59-9DC7-4BF1-A431-1BD4966899A5}"
- ProjectSection(SolutionItems) = preProject
- ..\tools\bridge.cpp = ..\tools\bridge.cpp
- ..\tools\dump.cpp = ..\tools\dump.cpp
- ..\tools\export.cpp = ..\tools\export.cpp
- ..\tools\files.cpp = ..\tools\files.cpp
- ..\tools\import.cpp = ..\tools\import.cpp
- ..\tools\restore.cpp = ..\tools\restore.cpp
- ..\tools\sniffer.cpp = ..\tools\sniffer.cpp
- ..\tools\stat.cpp = ..\tools\stat.cpp
- ..\tools\tool.cpp = ..\tools\tool.cpp
- ..\tools\tool.h = ..\tools\tool.h
- EndProjectSection
-EndProject
-Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "mongos", "..\s\dbgrid.vcproj", "{E03717ED-69B4-4D21-BC55-DF6690B585C6}"
-EndProject
-Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "test", "..\dbtests\test.vcproj", "{215B2D68-0A70-4D10-8E75-B33010C62A91}"
-EndProject
-Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "unix files", "unix files", "{2F760952-C71B-4865-998F-AABAE96D1373}"
- ProjectSection(SolutionItems) = preProject
- ..\util\mmap_posix.cpp = ..\util\mmap_posix.cpp
- ..\util\processinfo_darwin.cpp = ..\util\processinfo_darwin.cpp
- ..\util\processinfo_linux2.cpp = ..\util\processinfo_linux2.cpp
- ..\util\processinfo_none.cpp = ..\util\processinfo_none.cpp
- EndProjectSection
-EndProject
-Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "shell", "shell", "{407B4B88-3451-433C-B74F-31B31FEB5791}"
- ProjectSection(SolutionItems) = preProject
- ..\shell\dbshell.cpp = ..\shell\dbshell.cpp
- ..\shell\mongo_vstudio.cpp = ..\shell\mongo_vstudio.cpp
- ..\shell\utils.cpp = ..\shell\utils.cpp
- ..\shell\utils.h = ..\shell\utils.h
- EndProjectSection
-EndProject
-Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "other source files", "other source files", "{12B11474-2D74-48C3-BB3D-F03249BEA88F}"
- ProjectSection(SolutionItems) = preProject
- ..\buildscripts\buildboost.bat = ..\buildscripts\buildboost.bat
- ..\buildscripts\buildboost64.bat = ..\buildscripts\buildboost64.bat
- ..\SConstruct = ..\SConstruct
- EndProjectSection
-EndProject
-Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "bsondemo", "..\bson\bsondemo\bsondemo.vcproj", "{C9DB5EB7-81AA-4185-BAA1-DA035654402F}"
-EndProject
-Global
- GlobalSection(SolutionConfigurationPlatforms) = preSolution
- Debug|Win32 = Debug|Win32
- Release|Win32 = Release|Win32
- EndGlobalSection
- GlobalSection(ProjectConfigurationPlatforms) = postSolution
- {215B2D68-0A70-4D10-8E75-B31010C62A91}.Debug|Win32.ActiveCfg = Debug|Win32
- {215B2D68-0A70-4D10-8E75-B31010C62A91}.Debug|Win32.Build.0 = Debug|Win32
- {215B2D68-0A70-4D10-8E75-B31010C62A91}.Release|Win32.ActiveCfg = Release|Win32
- {215B2D68-0A70-4D10-8E75-B31010C62A91}.Release|Win32.Build.0 = Release|Win32
- {E03717ED-69B4-4D21-BC55-DF6690B585C6}.Debug|Win32.ActiveCfg = Debug|Win32
- {E03717ED-69B4-4D21-BC55-DF6690B585C6}.Debug|Win32.Build.0 = Debug|Win32
- {E03717ED-69B4-4D21-BC55-DF6690B585C6}.Release|Win32.ActiveCfg = Release|Win32
- {E03717ED-69B4-4D21-BC55-DF6690B585C6}.Release|Win32.Build.0 = Release|Win32
- {215B2D68-0A70-4D10-8E75-B33010C62A91}.Debug|Win32.ActiveCfg = Debug|Win32
- {215B2D68-0A70-4D10-8E75-B33010C62A91}.Debug|Win32.Build.0 = Debug|Win32
- {215B2D68-0A70-4D10-8E75-B33010C62A91}.Release|Win32.ActiveCfg = Release|Win32
- {215B2D68-0A70-4D10-8E75-B33010C62A91}.Release|Win32.Build.0 = Release|Win32
- {C9DB5EB7-81AA-4185-BAA1-DA035654402F}.Debug|Win32.ActiveCfg = Debug|Win32
- {C9DB5EB7-81AA-4185-BAA1-DA035654402F}.Debug|Win32.Build.0 = Debug|Win32
- {C9DB5EB7-81AA-4185-BAA1-DA035654402F}.Release|Win32.ActiveCfg = Release|Win32
- {C9DB5EB7-81AA-4185-BAA1-DA035654402F}.Release|Win32.Build.0 = Release|Win32
- EndGlobalSection
- GlobalSection(SolutionProperties) = preSolution
- HideSolutionNode = FALSE
- EndGlobalSection
- GlobalSection(NestedProjects) = preSolution
- {2B262D59-9DC7-4BF1-A431-1BD4966899A5} = {12B11474-2D74-48C3-BB3D-F03249BEA88F}
- {2F760952-C71B-4865-998F-AABAE96D1373} = {12B11474-2D74-48C3-BB3D-F03249BEA88F}
- {407B4B88-3451-433C-B74F-31B31FEB5791} = {12B11474-2D74-48C3-BB3D-F03249BEA88F}
- {4082881B-EB00-486F-906C-843B8EC06E18} = {12B11474-2D74-48C3-BB3D-F03249BEA88F}
- EndGlobalSection
-EndGlobal
diff --git a/db/db.vcproj b/db/db.vcproj
deleted file mode 100644
index 2eac6eb..0000000
--- a/db/db.vcproj
+++ /dev/null
@@ -1,1885 +0,0 @@
-<?xml version="1.0" encoding="Windows-1252"?>
-<VisualStudioProject
- ProjectType="Visual C++"
- Version="9.00"
- Name="mongod"
- ProjectGUID="{215B2D68-0A70-4D10-8E75-B31010C62A91}"
- RootNamespace="db"
- Keyword="Win32Proj"
- TargetFrameworkVersion="196613"
- >
- <Platforms>
- <Platform
- Name="Win32"
- />
- </Platforms>
- <ToolFiles>
- </ToolFiles>
- <Configurations>
- <Configuration
- Name="Debug|Win32"
- OutputDirectory="$(SolutionDir)$(ConfigurationName)"
- IntermediateDirectory="$(ConfigurationName)"
- ConfigurationType="1"
- UseOfMFC="0"
- UseOfATL="0"
- CharacterSet="1"
- >
- <Tool
- Name="VCPreBuildEventTool"
- />
- <Tool
- Name="VCCustomBuildTool"
- />
- <Tool
- Name="VCXMLDataGeneratorTool"
- />
- <Tool
- Name="VCWebServiceProxyGeneratorTool"
- />
- <Tool
- Name="VCMIDLTool"
- />
- <Tool
- Name="VCCLCompilerTool"
- Optimization="0"
- AdditionalIncludeDirectories="&quot;c:\program files\boost\latest&quot;;..\..\js\src;&quot;..\pcre-7.4&quot;;c:\boost;\boost"
- PreprocessorDefinitions="MONGO_EXPOSE_MACROS;OLDJS;STATIC_JS_API;XP_WIN;WIN32;_DEBUG;_CONSOLE;_CRT_SECURE_NO_WARNINGS;HAVE_CONFIG_H;PCRE_STATIC"
- MinimalRebuild="true"
- BasicRuntimeChecks="3"
- RuntimeLibrary="3"
- UsePrecompiledHeader="2"
- PrecompiledHeaderThrough="pch.h"
- WarningLevel="3"
- Detect64BitPortabilityProblems="false"
- DebugInformationFormat="4"
- DisableSpecificWarnings="4355;4800"
- />
- <Tool
- Name="VCManagedResourceCompilerTool"
- />
- <Tool
- Name="VCResourceCompilerTool"
- />
- <Tool
- Name="VCPreLinkEventTool"
- />
- <Tool
- Name="VCLinkerTool"
- AdditionalDependencies="ws2_32.lib Psapi.lib"
- LinkIncremental="2"
- AdditionalLibraryDirectories="&quot;c:\program files\boost\latest\lib&quot;;c:\boost\lib;\boost\lib"
- IgnoreAllDefaultLibraries="false"
- IgnoreDefaultLibraryNames=""
- GenerateDebugInformation="true"
- SubSystem="1"
- TargetMachine="1"
- />
- <Tool
- Name="VCALinkTool"
- />
- <Tool
- Name="VCManifestTool"
- />
- <Tool
- Name="VCXDCMakeTool"
- />
- <Tool
- Name="VCBscMakeTool"
- />
- <Tool
- Name="VCFxCopTool"
- />
- <Tool
- Name="VCAppVerifierTool"
- />
- <Tool
- Name="VCPostBuildEventTool"
- />
- </Configuration>
- <Configuration
- Name="Release|Win32"
- OutputDirectory="$(SolutionDir)$(ConfigurationName)"
- IntermediateDirectory="$(ConfigurationName)"
- ConfigurationType="1"
- CharacterSet="1"
- WholeProgramOptimization="1"
- >
- <Tool
- Name="VCPreBuildEventTool"
- />
- <Tool
- Name="VCCustomBuildTool"
- />
- <Tool
- Name="VCXMLDataGeneratorTool"
- />
- <Tool
- Name="VCWebServiceProxyGeneratorTool"
- />
- <Tool
- Name="VCMIDLTool"
- />
- <Tool
- Name="VCCLCompilerTool"
- Optimization="2"
- EnableIntrinsicFunctions="true"
- AdditionalIncludeDirectories="&quot;c:\program files\boost\latest&quot;;..\..\js\src;&quot;..\pcre-7.4&quot;;c:\boost;\boost"
- PreprocessorDefinitions="MONGO_EXPOSE_MACROS;OLDJS;STATIC_JS_API;XP_WIN;WIN32;NDEBUG;_CONSOLE;_CRT_SECURE_NO_WARNINGS;HAVE_CONFIG_H;PCRE_STATIC"
- RuntimeLibrary="0"
- EnableFunctionLevelLinking="true"
- UsePrecompiledHeader="2"
- PrecompiledHeaderThrough="pch.h"
- WarningLevel="3"
- DebugInformationFormat="3"
- DisableSpecificWarnings="4355;4800"
- />
- <Tool
- Name="VCManagedResourceCompilerTool"
- />
- <Tool
- Name="VCResourceCompilerTool"
- />
- <Tool
- Name="VCPreLinkEventTool"
- />
- <Tool
- Name="VCLinkerTool"
- AdditionalDependencies="ws2_32.lib psapi.lib"
- LinkIncremental="1"
- AdditionalLibraryDirectories="&quot;c:\program files\boost\latest\lib&quot;;c:\boost\lib;\boost\lib"
- GenerateDebugInformation="true"
- SubSystem="1"
- OptimizeReferences="2"
- EnableCOMDATFolding="2"
- TargetMachine="1"
- />
- <Tool
- Name="VCALinkTool"
- />
- <Tool
- Name="VCManifestTool"
- />
- <Tool
- Name="VCXDCMakeTool"
- />
- <Tool
- Name="VCBscMakeTool"
- />
- <Tool
- Name="VCFxCopTool"
- />
- <Tool
- Name="VCAppVerifierTool"
- />
- <Tool
- Name="VCPostBuildEventTool"
- />
- </Configuration>
- </Configurations>
- <References>
- </References>
- <Files>
- <Filter
- Name="libs"
- >
- <File
- RelativePath=".\db.rc"
- >
- </File>
- <File
- RelativePath="..\..\js\src\js.lib"
- >
- </File>
- <File
- RelativePath="..\pcre-7.4\pcrecpp.cc"
- >
- <FileConfiguration
- Name="Debug|Win32"
- >
- <Tool
- Name="VCCLCompilerTool"
- UsePrecompiledHeader="0"
- />
- </FileConfiguration>
- <FileConfiguration
- Name="Release|Win32"
- >
- <Tool
- Name="VCCLCompilerTool"
- UsePrecompiledHeader="0"
- />
- </FileConfiguration>
- <FileConfiguration
- Name="release_nojni|Win32"
- >
- <Tool
- Name="VCCLCompilerTool"
- UsePrecompiledHeader="0"
- />
- </FileConfiguration>
- </File>
- <File
- RelativePath="..\SConstruct"
- >
- </File>
- <File
- RelativePath="..\targetver.h"
- >
- </File>
- <Filter
- Name="pcre"
- >
- <File
- RelativePath="..\pcre-7.4\config.h"
- >
- </File>
- <File
- RelativePath="..\pcre-7.4\pcre.h"
- >
- </File>
- <File
- RelativePath="..\pcre-7.4\pcre_chartables.c"
- >
- <FileConfiguration
- Name="Debug|Win32"
- >
- <Tool
- Name="VCCLCompilerTool"
- UsePrecompiledHeader="0"
- />
- </FileConfiguration>
- <FileConfiguration
- Name="Release|Win32"
- >
- <Tool
- Name="VCCLCompilerTool"
- UsePrecompiledHeader="0"
- />
- </FileConfiguration>
- <FileConfiguration
- Name="release_nojni|Win32"
- >
- <Tool
- Name="VCCLCompilerTool"
- UsePrecompiledHeader="0"
- />
- </FileConfiguration>
- </File>
- <File
- RelativePath="..\pcre-7.4\pcre_compile.c"
- >
- <FileConfiguration
- Name="Debug|Win32"
- >
- <Tool
- Name="VCCLCompilerTool"
- UsePrecompiledHeader="0"
- />
- </FileConfiguration>
- <FileConfiguration
- Name="Release|Win32"
- >
- <Tool
- Name="VCCLCompilerTool"
- UsePrecompiledHeader="0"
- />
- </FileConfiguration>
- <FileConfiguration
- Name="release_nojni|Win32"
- >
- <Tool
- Name="VCCLCompilerTool"
- UsePrecompiledHeader="0"
- />
- </FileConfiguration>
- </File>
- <File
- RelativePath="..\pcre-7.4\pcre_config.c"
- >
- <FileConfiguration
- Name="Debug|Win32"
- >
- <Tool
- Name="VCCLCompilerTool"
- UsePrecompiledHeader="0"
- />
- </FileConfiguration>
- <FileConfiguration
- Name="Release|Win32"
- >
- <Tool
- Name="VCCLCompilerTool"
- UsePrecompiledHeader="0"
- />
- </FileConfiguration>
- <FileConfiguration
- Name="release_nojni|Win32"
- >
- <Tool
- Name="VCCLCompilerTool"
- UsePrecompiledHeader="0"
- />
- </FileConfiguration>
- </File>
- <File
- RelativePath="..\pcre-7.4\pcre_dfa_exec.c"
- >
- <FileConfiguration
- Name="Debug|Win32"
- >
- <Tool
- Name="VCCLCompilerTool"
- UsePrecompiledHeader="0"
- />
- </FileConfiguration>
- <FileConfiguration
- Name="Release|Win32"
- >
- <Tool
- Name="VCCLCompilerTool"
- UsePrecompiledHeader="0"
- />
- </FileConfiguration>
- <FileConfiguration
- Name="release_nojni|Win32"
- >
- <Tool
- Name="VCCLCompilerTool"
- UsePrecompiledHeader="0"
- />
- </FileConfiguration>
- </File>
- <File
- RelativePath="..\pcre-7.4\pcre_exec.c"
- >
- <FileConfiguration
- Name="Debug|Win32"
- >
- <Tool
- Name="VCCLCompilerTool"
- UsePrecompiledHeader="0"
- />
- </FileConfiguration>
- <FileConfiguration
- Name="Release|Win32"
- >
- <Tool
- Name="VCCLCompilerTool"
- UsePrecompiledHeader="0"
- />
- </FileConfiguration>
- <FileConfiguration
- Name="release_nojni|Win32"
- >
- <Tool
- Name="VCCLCompilerTool"
- UsePrecompiledHeader="0"
- />
- </FileConfiguration>
- </File>
- <File
- RelativePath="..\pcre-7.4\pcre_fullinfo.c"
- >
- <FileConfiguration
- Name="Debug|Win32"
- >
- <Tool
- Name="VCCLCompilerTool"
- UsePrecompiledHeader="0"
- />
- </FileConfiguration>
- <FileConfiguration
- Name="Release|Win32"
- >
- <Tool
- Name="VCCLCompilerTool"
- UsePrecompiledHeader="0"
- />
- </FileConfiguration>
- <FileConfiguration
- Name="release_nojni|Win32"
- >
- <Tool
- Name="VCCLCompilerTool"
- UsePrecompiledHeader="0"
- />
- </FileConfiguration>
- </File>
- <File
- RelativePath="..\pcre-7.4\pcre_get.c"
- >
- <FileConfiguration
- Name="Debug|Win32"
- >
- <Tool
- Name="VCCLCompilerTool"
- UsePrecompiledHeader="0"
- />
- </FileConfiguration>
- <FileConfiguration
- Name="Release|Win32"
- >
- <Tool
- Name="VCCLCompilerTool"
- UsePrecompiledHeader="0"
- />
- </FileConfiguration>
- <FileConfiguration
- Name="release_nojni|Win32"
- >
- <Tool
- Name="VCCLCompilerTool"
- UsePrecompiledHeader="0"
- />
- </FileConfiguration>
- </File>
- <File
- RelativePath="..\pcre-7.4\pcre_globals.c"
- >
- <FileConfiguration
- Name="Debug|Win32"
- >
- <Tool
- Name="VCCLCompilerTool"
- UsePrecompiledHeader="0"
- />
- </FileConfiguration>
- <FileConfiguration
- Name="Release|Win32"
- >
- <Tool
- Name="VCCLCompilerTool"
- UsePrecompiledHeader="0"
- />
- </FileConfiguration>
- <FileConfiguration
- Name="release_nojni|Win32"
- >
- <Tool
- Name="VCCLCompilerTool"
- UsePrecompiledHeader="0"
- />
- </FileConfiguration>
- </File>
- <File
- RelativePath="..\pcre-7.4\pcre_info.c"
- >
- <FileConfiguration
- Name="Debug|Win32"
- >
- <Tool
- Name="VCCLCompilerTool"
- UsePrecompiledHeader="0"
- />
- </FileConfiguration>
- <FileConfiguration
- Name="Release|Win32"
- >
- <Tool
- Name="VCCLCompilerTool"
- UsePrecompiledHeader="0"
- />
- </FileConfiguration>
- <FileConfiguration
- Name="release_nojni|Win32"
- >
- <Tool
- Name="VCCLCompilerTool"
- UsePrecompiledHeader="0"
- />
- </FileConfiguration>
- </File>
- <File
- RelativePath="..\pcre-7.4\pcre_maketables.c"
- >
- <FileConfiguration
- Name="Debug|Win32"
- >
- <Tool
- Name="VCCLCompilerTool"
- UsePrecompiledHeader="0"
- />
- </FileConfiguration>
- <FileConfiguration
- Name="Release|Win32"
- >
- <Tool
- Name="VCCLCompilerTool"
- UsePrecompiledHeader="0"
- />
- </FileConfiguration>
- <FileConfiguration
- Name="release_nojni|Win32"
- >
- <Tool
- Name="VCCLCompilerTool"
- UsePrecompiledHeader="0"
- />
- </FileConfiguration>
- </File>
- <File
- RelativePath="..\pcre-7.4\pcre_newline.c"
- >
- <FileConfiguration
- Name="Debug|Win32"
- >
- <Tool
- Name="VCCLCompilerTool"
- UsePrecompiledHeader="0"
- />
- </FileConfiguration>
- <FileConfiguration
- Name="Release|Win32"
- >
- <Tool
- Name="VCCLCompilerTool"
- UsePrecompiledHeader="0"
- />
- </FileConfiguration>
- <FileConfiguration
- Name="release_nojni|Win32"
- >
- <Tool
- Name="VCCLCompilerTool"
- UsePrecompiledHeader="0"
- />
- </FileConfiguration>
- </File>
- <File
- RelativePath="..\pcre-7.4\pcre_ord2utf8.c"
- >
- <FileConfiguration
- Name="Debug|Win32"
- >
- <Tool
- Name="VCCLCompilerTool"
- UsePrecompiledHeader="0"
- />
- </FileConfiguration>
- <FileConfiguration
- Name="Release|Win32"
- >
- <Tool
- Name="VCCLCompilerTool"
- UsePrecompiledHeader="0"
- />
- </FileConfiguration>
- <FileConfiguration
- Name="release_nojni|Win32"
- >
- <Tool
- Name="VCCLCompilerTool"
- UsePrecompiledHeader="0"
- />
- </FileConfiguration>
- </File>
- <File
- RelativePath="..\pcre-7.4\pcre_refcount.c"
- >
- <FileConfiguration
- Name="Debug|Win32"
- >
- <Tool
- Name="VCCLCompilerTool"
- UsePrecompiledHeader="0"
- />
- </FileConfiguration>
- <FileConfiguration
- Name="Release|Win32"
- >
- <Tool
- Name="VCCLCompilerTool"
- UsePrecompiledHeader="0"
- />
- </FileConfiguration>
- <FileConfiguration
- Name="release_nojni|Win32"
- >
- <Tool
- Name="VCCLCompilerTool"
- UsePrecompiledHeader="0"
- />
- </FileConfiguration>
- </File>
- <File
- RelativePath="..\pcre-7.4\pcre_scanner.cc"
- >
- <FileConfiguration
- Name="Debug|Win32"
- >
- <Tool
- Name="VCCLCompilerTool"
- UsePrecompiledHeader="0"
- />
- </FileConfiguration>
- <FileConfiguration
- Name="Release|Win32"
- >
- <Tool
- Name="VCCLCompilerTool"
- UsePrecompiledHeader="0"
- />
- </FileConfiguration>
- <FileConfiguration
- Name="release_nojni|Win32"
- >
- <Tool
- Name="VCCLCompilerTool"
- UsePrecompiledHeader="0"
- />
- </FileConfiguration>
- </File>
- <File
- RelativePath="..\pcre-7.4\pcre_stringpiece.cc"
- >
- <FileConfiguration
- Name="Debug|Win32"
- >
- <Tool
- Name="VCCLCompilerTool"
- UsePrecompiledHeader="0"
- />
- </FileConfiguration>
- <FileConfiguration
- Name="Release|Win32"
- >
- <Tool
- Name="VCCLCompilerTool"
- UsePrecompiledHeader="0"
- />
- </FileConfiguration>
- <FileConfiguration
- Name="release_nojni|Win32"
- >
- <Tool
- Name="VCCLCompilerTool"
- UsePrecompiledHeader="0"
- />
- </FileConfiguration>
- </File>
- <File
- RelativePath="..\pcre-7.4\pcre_study.c"
- >
- <FileConfiguration
- Name="Debug|Win32"
- >
- <Tool
- Name="VCCLCompilerTool"
- UsePrecompiledHeader="0"
- />
- </FileConfiguration>
- <FileConfiguration
- Name="Release|Win32"
- >
- <Tool
- Name="VCCLCompilerTool"
- UsePrecompiledHeader="0"
- />
- </FileConfiguration>
- <FileConfiguration
- Name="release_nojni|Win32"
- >
- <Tool
- Name="VCCLCompilerTool"
- UsePrecompiledHeader="0"
- />
- </FileConfiguration>
- <FileConfiguration
- Name="Debug Recstore|Win32"
- >
- <Tool
- Name="VCCLCompilerTool"
- UsePrecompiledHeader="0"
- />
- </FileConfiguration>
- </File>
- <File
- RelativePath="..\pcre-7.4\pcre_tables.c"
- >
- <FileConfiguration
- Name="Debug|Win32"
- >
- <Tool
- Name="VCCLCompilerTool"
- UsePrecompiledHeader="0"
- />
- </FileConfiguration>
- <FileConfiguration
- Name="Release|Win32"
- >
- <Tool
- Name="VCCLCompilerTool"
- UsePrecompiledHeader="0"
- />
- </FileConfiguration>
- <FileConfiguration
- Name="release_nojni|Win32"
- >
- <Tool
- Name="VCCLCompilerTool"
- UsePrecompiledHeader="0"
- />
- </FileConfiguration>
- <FileConfiguration
- Name="Debug Recstore|Win32"
- >
- <Tool
- Name="VCCLCompilerTool"
- UsePrecompiledHeader="0"
- />
- </FileConfiguration>
- </File>
- <File
- RelativePath="..\pcre-7.4\pcre_try_flipped.c"
- >
- <FileConfiguration
- Name="Debug|Win32"
- >
- <Tool
- Name="VCCLCompilerTool"
- UsePrecompiledHeader="0"
- />
- </FileConfiguration>
- <FileConfiguration
- Name="Release|Win32"
- >
- <Tool
- Name="VCCLCompilerTool"
- UsePrecompiledHeader="0"
- />
- </FileConfiguration>
- <FileConfiguration
- Name="release_nojni|Win32"
- >
- <Tool
- Name="VCCLCompilerTool"
- UsePrecompiledHeader="0"
- />
- </FileConfiguration>
- <FileConfiguration
- Name="Debug Recstore|Win32"
- >
- <Tool
- Name="VCCLCompilerTool"
- UsePrecompiledHeader="0"
- />
- </FileConfiguration>
- </File>
- <File
- RelativePath="..\pcre-7.4\pcre_ucp_searchfuncs.c"
- >
- <FileConfiguration
- Name="Debug|Win32"
- >
- <Tool
- Name="VCCLCompilerTool"
- UsePrecompiledHeader="0"
- />
- </FileConfiguration>
- <FileConfiguration
- Name="Release|Win32"
- >
- <Tool
- Name="VCCLCompilerTool"
- UsePrecompiledHeader="0"
- />
- </FileConfiguration>
- <FileConfiguration
- Name="release_nojni|Win32"
- >
- <Tool
- Name="VCCLCompilerTool"
- UsePrecompiledHeader="0"
- />
- </FileConfiguration>
- <FileConfiguration
- Name="Debug Recstore|Win32"
- >
- <Tool
- Name="VCCLCompilerTool"
- UsePrecompiledHeader="0"
- />
- </FileConfiguration>
- </File>
- <File
- RelativePath="..\pcre-7.4\pcre_valid_utf8.c"
- >
- <FileConfiguration
- Name="Debug|Win32"
- >
- <Tool
- Name="VCCLCompilerTool"
- UsePrecompiledHeader="0"
- />
- </FileConfiguration>
- <FileConfiguration
- Name="Release|Win32"
- >
- <Tool
- Name="VCCLCompilerTool"
- UsePrecompiledHeader="0"
- />
- </FileConfiguration>
- <FileConfiguration
- Name="release_nojni|Win32"
- >
- <Tool
- Name="VCCLCompilerTool"
- UsePrecompiledHeader="0"
- />
- </FileConfiguration>
- <FileConfiguration
- Name="Debug Recstore|Win32"
- >
- <Tool
- Name="VCCLCompilerTool"
- UsePrecompiledHeader="0"
- />
- </FileConfiguration>
- </File>
- <File
- RelativePath="..\pcre-7.4\pcre_version.c"
- >
- <FileConfiguration
- Name="Debug|Win32"
- >
- <Tool
- Name="VCCLCompilerTool"
- UsePrecompiledHeader="0"
- />
- </FileConfiguration>
- <FileConfiguration
- Name="Release|Win32"
- >
- <Tool
- Name="VCCLCompilerTool"
- UsePrecompiledHeader="0"
- />
- </FileConfiguration>
- <FileConfiguration
- Name="release_nojni|Win32"
- >
- <Tool
- Name="VCCLCompilerTool"
- UsePrecompiledHeader="0"
- />
- </FileConfiguration>
- <FileConfiguration
- Name="Debug Recstore|Win32"
- >
- <Tool
- Name="VCCLCompilerTool"
- UsePrecompiledHeader="0"
- />
- </FileConfiguration>
- </File>
- <File
- RelativePath="..\pcre-7.4\pcre_xclass.c"
- >
- <FileConfiguration
- Name="Debug|Win32"
- >
- <Tool
- Name="VCCLCompilerTool"
- UsePrecompiledHeader="0"
- />
- </FileConfiguration>
- <FileConfiguration
- Name="Release|Win32"
- >
- <Tool
- Name="VCCLCompilerTool"
- UsePrecompiledHeader="0"
- />
- </FileConfiguration>
- <FileConfiguration
- Name="release_nojni|Win32"
- >
- <Tool
- Name="VCCLCompilerTool"
- UsePrecompiledHeader="0"
- />
- </FileConfiguration>
- <FileConfiguration
- Name="Debug Recstore|Win32"
- >
- <Tool
- Name="VCCLCompilerTool"
- UsePrecompiledHeader="0"
- />
- </FileConfiguration>
- </File>
- <File
- RelativePath="..\pcre-7.4\pcreposix.c"
- >
- <FileConfiguration
- Name="Debug|Win32"
- >
- <Tool
- Name="VCCLCompilerTool"
- UsePrecompiledHeader="0"
- />
- </FileConfiguration>
- <FileConfiguration
- Name="Release|Win32"
- >
- <Tool
- Name="VCCLCompilerTool"
- UsePrecompiledHeader="0"
- />
- </FileConfiguration>
- <FileConfiguration
- Name="release_nojni|Win32"
- >
- <Tool
- Name="VCCLCompilerTool"
- UsePrecompiledHeader="0"
- />
- </FileConfiguration>
- <FileConfiguration
- Name="Debug Recstore|Win32"
- >
- <Tool
- Name="VCCLCompilerTool"
- UsePrecompiledHeader="0"
- />
- </FileConfiguration>
- </File>
- </Filter>
- <Filter
- Name="old_repl"
- >
- <File
- RelativePath=".\oplog.cpp"
- >
- </File>
- <File
- RelativePath=".\repl.cpp"
- >
- </File>
- <File
- RelativePath=".\repl.h"
- >
- </File>
- <File
- RelativePath=".\repl_block.cpp"
- >
- </File>
- <File
- RelativePath=".\replpair.h"
- >
- </File>
- </Filter>
- </Filter>
- <Filter
- Name="storage related"
- >
- <File
- RelativePath=".\rec.h"
- >
- </File>
- <File
- RelativePath=".\reccache.h"
- >
- </File>
- <File
- RelativePath=".\reci.h"
- >
- </File>
- <File
- RelativePath=".\recstore.h"
- >
- </File>
- <File
- RelativePath=".\storage.cpp"
- >
- </File>
- <File
- RelativePath=".\storage.h"
- >
- </File>
- </Filter>
- <Filter
- Name="client"
- >
- <File
- RelativePath="..\client\connpool.cpp"
- >
- </File>
- <File
- RelativePath="..\client\connpool.h"
- >
- </File>
- <File
- RelativePath="..\client\dbclient.cpp"
- >
- </File>
- <File
- RelativePath="..\client\dbclient.h"
- >
- </File>
- <File
- RelativePath="..\client\dbclientcursor.cpp"
- >
- </File>
- <File
- RelativePath="..\client\model.h"
- >
- </File>
- <File
- RelativePath="..\client\redef_macros.h"
- >
- </File>
- <File
- RelativePath="..\client\syncclusterconnection.cpp"
- >
- </File>
- <File
- RelativePath="..\client\syncclusterconnection.h"
- >
- </File>
- <File
- RelativePath="..\client\undef_macros.h"
- >
- </File>
- </Filter>
- <Filter
- Name="db"
- >
- <File
- RelativePath="..\pch.cpp"
- >
- <FileConfiguration
- Name="Debug|Win32"
- >
- <Tool
- Name="VCCLCompilerTool"
- UsePrecompiledHeader="1"
- />
- </FileConfiguration>
- <FileConfiguration
- Name="Release|Win32"
- >
- <Tool
- Name="VCCLCompilerTool"
- UsePrecompiledHeader="1"
- />
- </FileConfiguration>
- </File>
- <Filter
- Name="cpp"
- Filter="cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx"
- UniqueIdentifier="{4FC737F1-C7A5-4376-A066-2A32D752A2FF}"
- >
- <File
- RelativePath=".\client.cpp"
- >
- </File>
- <File
- RelativePath=".\clientcursor.cpp"
- >
- </File>
- <File
- RelativePath=".\cloner.cpp"
- >
- </File>
- <File
- RelativePath=".\commands.cpp"
- >
- </File>
- <File
- RelativePath=".\common.cpp"
- >
- </File>
- <File
- RelativePath=".\cursor.cpp"
- >
- </File>
- <File
- RelativePath="..\s\d_util.cpp"
- >
- </File>
- <File
- RelativePath=".\database.cpp"
- >
- </File>
- <File
- RelativePath=".\db.cpp"
- >
- </File>
- <File
- RelativePath=".\dbcommands.cpp"
- >
- </File>
- <File
- RelativePath=".\dbcommands_admin.cpp"
- >
- </File>
- <File
- RelativePath=".\dbeval.cpp"
- >
- </File>
- <File
- RelativePath=".\dbhelpers.cpp"
- >
- </File>
- <File
- RelativePath=".\dbwebserver.cpp"
- >
- </File>
- <File
- RelativePath=".\extsort.cpp"
- >
- </File>
- <File
- RelativePath=".\index.cpp"
- >
- </File>
- <File
- RelativePath=".\index_geo2d.cpp"
- >
- </File>
- <File
- RelativePath=".\indexkey.cpp"
- >
- </File>
- <File
- RelativePath=".\instance.cpp"
- >
- </File>
- <File
- RelativePath=".\introspect.cpp"
- >
- </File>
- <File
- RelativePath=".\jsobj.cpp"
- >
- </File>
- <File
- RelativePath=".\json.cpp"
- >
- </File>
- <File
- RelativePath=".\lasterror.cpp"
- >
- </File>
- <File
- RelativePath=".\matcher.cpp"
- >
- </File>
- <File
- RelativePath=".\matcher_covered.cpp"
- >
- </File>
- <File
- RelativePath="..\util\mmap_win.cpp"
- >
- </File>
- <File
- RelativePath=".\modules\mms.cpp"
- >
- </File>
- <File
- RelativePath=".\module.cpp"
- >
- </File>
- <File
- RelativePath=".\mr.cpp"
- >
- </File>
- <File
- RelativePath=".\namespace.cpp"
- >
- </File>
- <File
- RelativePath=".\nonce.cpp"
- >
- </File>
- <File
- RelativePath="..\client\parallel.cpp"
- >
- </File>
- <File
- RelativePath=".\pdfile.cpp"
- >
- </File>
- <File
- RelativePath=".\query.cpp"
- >
- </File>
- <File
- RelativePath=".\queryoptimizer.cpp"
- >
- </File>
- <File
- RelativePath="..\util\ramstore.cpp"
- >
- </File>
- <File
- RelativePath=".\security.cpp"
- >
- </File>
- <File
- RelativePath=".\security_commands.cpp"
- >
- </File>
- <File
- RelativePath=".\tests.cpp"
- >
- </File>
- <File
- RelativePath=".\update.cpp"
- >
- </File>
- </Filter>
- <Filter
- Name="h"
- >
- <File
- RelativePath=".\background.h"
- >
- </File>
- <File
- RelativePath=".\client.h"
- >
- </File>
- <File
- RelativePath=".\clientcursor.h"
- >
- </File>
- <File
- RelativePath=".\cmdline.cpp"
- >
- </File>
- <File
- RelativePath=".\cmdline.h"
- >
- </File>
- <File
- RelativePath=".\commands.h"
- >
- </File>
- <File
- RelativePath=".\concurrency.h"
- >
- </File>
- <File
- RelativePath=".\curop.h"
- >
- </File>
- <File
- RelativePath=".\cursor.h"
- >
- </File>
- <File
- RelativePath=".\database.h"
- >
- </File>
- <File
- RelativePath=".\db.h"
- >
- </File>
- <File
- RelativePath=".\dbhelpers.h"
- >
- </File>
- <File
- RelativePath=".\dbinfo.h"
- >
- </File>
- <File
- RelativePath=".\dbmessage.h"
- >
- </File>
- <File
- RelativePath=".\diskloc.h"
- >
- </File>
- <File
- RelativePath=".\index.h"
- >
- </File>
- <File
- RelativePath=".\indexkey.h"
- >
- </File>
- <File
- RelativePath=".\introspect.h"
- >
- </File>
- <File
- RelativePath=".\json.h"
- >
- </File>
- <File
- RelativePath=".\matcher.h"
- >
- </File>
- <File
- RelativePath="..\grid\message.h"
- >
- </File>
- <File
- RelativePath=".\minilex.h"
- >
- </File>
- <File
- RelativePath=".\namespace.h"
- >
- </File>
- <File
- RelativePath="..\pch.h"
- >
- </File>
- <File
- RelativePath=".\pdfile.h"
- >
- </File>
- <File
- RelativePath="..\grid\protocol.h"
- >
- </File>
- <File
- RelativePath=".\query.h"
- >
- </File>
- <File
- RelativePath=".\queryoptimizer.h"
- >
- </File>
- <File
- RelativePath=".\queryutil.cpp"
- >
- </File>
- <File
- RelativePath=".\resource.h"
- >
- </File>
- <File
- RelativePath=".\scanandorder.h"
- >
- </File>
- <File
- RelativePath=".\security.h"
- >
- </File>
- <File
- RelativePath=".\update.h"
- >
- </File>
- </Filter>
- </Filter>
- <Filter
- Name="util"
- >
- <File
- RelativePath="..\util\allocator.h"
- >
- </File>
- <File
- RelativePath="..\util\array.h"
- >
- </File>
- <File
- RelativePath="..\util\assert_util.cpp"
- >
- </File>
- <File
- RelativePath="..\util\assert_util.h"
- >
- </File>
- <File
- RelativePath="..\util\background.cpp"
- >
- </File>
- <File
- RelativePath="..\util\background.h"
- >
- </File>
- <File
- RelativePath="..\util\base64.cpp"
- >
- </File>
- <File
- RelativePath="..\util\base64.h"
- >
- </File>
- <File
- RelativePath="..\util\builder.h"
- >
- </File>
- <File
- RelativePath="..\util\debug_util.h"
- >
- </File>
- <File
- RelativePath="..\util\embedded_builder.h"
- >
- </File>
- <File
- RelativePath="..\util\file.h"
- >
- </File>
- <File
- RelativePath="..\util\file_allocator.h"
- >
- </File>
- <File
- RelativePath="..\util\goodies.h"
- >
- </File>
- <File
- RelativePath="..\util\hashtab.h"
- >
- </File>
- <File
- RelativePath="..\util\hex.h"
- >
- </File>
- <File
- RelativePath="..\util\interlocked.h"
- >
- </File>
- <File
- RelativePath=".\lasterror.h"
- >
- </File>
- <File
- RelativePath="..\util\rwlock.h"
- >
- </File>
- <File
- RelativePath="..\util\log.h"
- >
- </File>
- <File
- RelativePath="..\util\lruishmap.h"
- >
- </File>
- <File
- RelativePath="..\util\mmap.cpp"
- >
- </File>
- <File
- RelativePath="..\util\mmap.h"
- >
- </File>
- <File
- RelativePath="..\util\mvar.h"
- >
- </File>
- <File
- RelativePath="..\util\ntservice.cpp"
- >
- </File>
- <File
- RelativePath="..\util\ntservice.h"
- >
- </File>
- <File
- RelativePath="..\util\optime.h"
- >
- </File>
- <File
- RelativePath="..\util\processinfo.h"
- >
- </File>
- <File
- RelativePath="..\util\processinfo_win32.cpp"
- >
- </File>
- <File
- RelativePath="..\util\queue.h"
- >
- </File>
- <File
- RelativePath="..\util\ramstore.h"
- >
- </File>
- <File
- RelativePath="..\util\thread_pool.h"
- >
- </File>
- <File
- RelativePath="..\util\text.cpp"
- >
- </File>
- <File
- RelativePath="..\util\text.h"
- >
- </File>
- <File
- RelativePath="..\util\unittest.h"
- >
- </File>
- <File
- RelativePath="..\util\util.cpp"
- >
- </File>
- <Filter
- Name="concurrency"
- >
- <File
- RelativePath="..\util\concurrency\list.h"
- >
- </File>
- <File
- RelativePath="..\util\concurrency\msg.h"
- >
- </File>
- <File
- RelativePath="..\util\concurrency\task.cpp"
- >
- </File>
- <File
- RelativePath="..\util\concurrency\task.h"
- >
- </File>
- <File
- RelativePath="..\util\concurrency\value.h"
- >
- </File>
- <File
- RelativePath="..\util\concurrency\vars.cpp"
- >
- </File>
- </Filter>
- <Filter
- Name="web"
- >
- <File
- RelativePath="..\util\web\html.h"
- >
- </File>
- <File
- RelativePath="..\util\httpclient.cpp"
- >
- </File>
- <File
- RelativePath="..\util\httpclient.h"
- >
- </File>
- <File
- RelativePath="..\util\miniwebserver.cpp"
- >
- </File>
- <File
- RelativePath="..\util\miniwebserver.h"
- >
- </File>
- </Filter>
- <Filter
- Name="md5"
- >
- <File
- RelativePath="..\util\md5.c"
- >
- <FileConfiguration
- Name="Debug|Win32"
- >
- <Tool
- Name="VCCLCompilerTool"
- UsePrecompiledHeader="0"
- />
- </FileConfiguration>
- <FileConfiguration
- Name="Release|Win32"
- >
- <Tool
- Name="VCCLCompilerTool"
- UsePrecompiledHeader="0"
- PrecompiledHeaderThrough=""
- />
- </FileConfiguration>
- <FileConfiguration
- Name="release_nojni|Win32"
- >
- <Tool
- Name="VCCLCompilerTool"
- UsePrecompiledHeader="0"
- />
- </FileConfiguration>
- <FileConfiguration
- Name="Debug Recstore|Win32"
- >
- <Tool
- Name="VCCLCompilerTool"
- UsePrecompiledHeader="0"
- />
- </FileConfiguration>
- </File>
- <File
- RelativePath="..\util\md5.h"
- >
- </File>
- <File
- RelativePath="..\util\md5.hpp"
- >
- </File>
- <File
- RelativePath="..\util\md5main.cpp"
- >
- <FileConfiguration
- Name="Debug|Win32"
- >
- <Tool
- Name="VCCLCompilerTool"
- UsePrecompiledHeader="2"
- />
- </FileConfiguration>
- </File>
- </Filter>
- <Filter
- Name="net"
- >
- <File
- RelativePath="..\util\message.cpp"
- >
- </File>
- <File
- RelativePath="..\util\message.h"
- >
- </File>
- <File
- RelativePath="..\util\message_server.h"
- >
- </File>
- <File
- RelativePath="..\util\message_server_port.cpp"
- >
- </File>
- <File
- RelativePath="..\util\sock.cpp"
- >
- </File>
- <File
- RelativePath="..\util\sock.h"
- >
- </File>
- </Filter>
- </Filter>
- <Filter
- Name="shard"
- >
- <File
- RelativePath="..\s\d_logic.cpp"
- >
- </File>
- <File
- RelativePath="..\s\shardconnection.cpp"
- >
- </File>
- </Filter>
- <Filter
- Name="scripting"
- >
- <File
- RelativePath="..\scripting\engine.cpp"
- >
- </File>
- <File
- RelativePath="..\scripting\engine.h"
- >
- </File>
- <File
- RelativePath="..\scripting\engine_java.h"
- >
- </File>
- <File
- RelativePath="..\scripting\engine_spidermonkey.cpp"
- >
- </File>
- <File
- RelativePath="..\scripting\engine_spidermonkey.h"
- >
- </File>
- <File
- RelativePath="..\scripting\engine_v8.h"
- >
- </File>
- <File
- RelativePath="..\shell\mongo_vstudio.cpp"
- >
- <FileConfiguration
- Name="Debug|Win32"
- >
- <Tool
- Name="VCCLCompilerTool"
- UsePrecompiledHeader="0"
- />
- </FileConfiguration>
- <FileConfiguration
- Name="Release|Win32"
- >
- <Tool
- Name="VCCLCompilerTool"
- UsePrecompiledHeader="0"
- />
- </FileConfiguration>
- <FileConfiguration
- Name="Debug Recstore|Win32"
- >
- <Tool
- Name="VCCLCompilerTool"
- UsePrecompiledHeader="0"
- />
- </FileConfiguration>
- </File>
- <File
- RelativePath="..\scripting\utils.cpp"
- >
- </File>
- <File
- RelativePath="..\scripting\v8_db.h"
- >
- </File>
- <File
- RelativePath="..\scripting\v8_utils.h"
- >
- </File>
- <File
- RelativePath="..\scripting\v8_wrapper.h"
- >
- </File>
- </Filter>
- <Filter
- Name="stats"
- >
- <File
- RelativePath=".\stats\counters.cpp"
- >
- </File>
- <File
- RelativePath=".\stats\snapshots.cpp"
- >
- </File>
- <File
- RelativePath=".\stats\top.cpp"
- >
- </File>
- </Filter>
- <Filter
- Name="btree"
- >
- <File
- RelativePath=".\btree.cpp"
- >
- </File>
- <File
- RelativePath=".\btree.h"
- >
- </File>
- <File
- RelativePath=".\btreecursor.cpp"
- >
- </File>
- </Filter>
- <Filter
- Name="replsets"
- >
- <File
- RelativePath=".\repl\connections.h"
- >
- </File>
- <File
- RelativePath=".\repl\consensus.cpp"
- >
- </File>
- <File
- RelativePath=".\repl\health.cpp"
- >
- </File>
- <File
- RelativePath=".\repl\health.h"
- >
- </File>
- <File
- RelativePath=".\repl\heartbeat.cpp"
- >
- </File>
- <File
- RelativePath="..\util\hostandport.h"
- >
- </File>
- <File
- RelativePath=".\repl\manager.cpp"
- >
- </File>
- <File
- RelativePath=".\repl\replset.cpp"
- >
- </File>
- <File
- RelativePath=".\repl\replset.h"
- >
- </File>
- <File
- RelativePath=".\repl\replset_commands.cpp"
- >
- </File>
- <File
- RelativePath=".\repl\rs_config.cpp"
- >
- </File>
- <File
- RelativePath=".\repl\rs_sync.cpp"
- >
- </File>
- <File
- RelativePath=".\repl\rs_config.h"
- >
- </File>
- <File
- RelativePath=".\repl\rs_initiate.cpp"
- >
- </File>
- </Filter>
- <Filter
- Name="bson"
- >
- <File
- RelativePath="..\bson\bsonelement.h"
- >
- </File>
- <File
- RelativePath="..\bson\bsoninlines.h"
- >
- </File>
- <File
- RelativePath="..\bson\bsonmisc.h"
- >
- </File>
- <File
- RelativePath="..\bson\bsonobj.h"
- >
- </File>
- <File
- RelativePath="..\bson\bsonobjbuilder.h"
- >
- </File>
- <File
- RelativePath="..\bson\bsonobjiterator.h"
- >
- </File>
- <File
- RelativePath="..\bson\bsontypes.h"
- >
- </File>
- <File
- RelativePath=".\jsobj.h"
- >
- </File>
- <File
- RelativePath="..\bson\oid.h"
- >
- </File>
- <File
- RelativePath="..\bson\ordering.h"
- >
- </File>
- </Filter>
- </Files>
- <Globals>
- </Globals>
-</VisualStudioProject>
diff --git a/db/db.vcxproj b/db/db.vcxproj
index 0cabbd0..ad9c6d2 100644
--- a/db/db.vcxproj
+++ b/db/db.vcxproj
@@ -89,6 +89,10 @@
<CodeAnalysisRules Condition="'$(Configuration)|$(Platform)'=='Release|x64'" />
<CodeAnalysisRuleAssemblies Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" />
<CodeAnalysisRuleAssemblies Condition="'$(Configuration)|$(Platform)'=='Release|x64'" />
+ <IncludePath Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">.;..;$(IncludePath)</IncludePath>
+ <IncludePath Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">..;$(IncludePath)</IncludePath>
+ <IncludePath Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">..;$(IncludePath)</IncludePath>
+ <IncludePath Condition="'$(Configuration)|$(Platform)'=='Release|x64'">..;$(IncludePath)</IncludePath>
</PropertyGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
<ClCompile>
@@ -192,7 +196,9 @@
</Link>
</ItemDefinitionGroup>
<ItemGroup>
+ <ClCompile Include="..\bson\oid.cpp" />
<ClCompile Include="..\client\dbclientcursor.cpp" />
+ <ClCompile Include="..\client\dbclient_rs.cpp" />
<ClCompile Include="..\client\distlock.cpp" />
<ClCompile Include="..\client\model.cpp" />
<ClCompile Include="..\pcre-7.4\pcrecpp.cc">
@@ -435,8 +441,16 @@
<PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
</PrecompiledHeader>
</ClCompile>
+ <ClCompile Include="..\scripting\bench.cpp" />
+ <ClCompile Include="..\shell\mongo_vstudio.cpp">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">NotUsing</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">NotUsing</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">NotUsing</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">NotUsing</PrecompiledHeader>
+ </ClCompile>
<ClCompile Include="..\s\chunk.cpp" />
<ClCompile Include="..\s\config.cpp" />
+ <ClCompile Include="..\s\d_chunk_manager.cpp" />
<ClCompile Include="..\s\d_migrate.cpp" />
<ClCompile Include="..\s\d_split.cpp" />
<ClCompile Include="..\s\d_state.cpp" />
@@ -445,19 +459,40 @@
<ClCompile Include="..\s\shard.cpp" />
<ClCompile Include="..\s\shardconnection.cpp" />
<ClCompile Include="..\s\shardkey.cpp" />
+ <ClCompile Include="..\util\alignedbuilder.cpp">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">NotUsing</PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\util\concurrency\spin_lock.cpp" />
+ <ClCompile Include="..\util\concurrency\synchronization.cpp" />
<ClCompile Include="..\util\concurrency\task.cpp" />
<ClCompile Include="..\util\concurrency\thread_pool.cpp" />
<ClCompile Include="..\util\concurrency\vars.cpp" />
+ <ClCompile Include="..\util\file_allocator.cpp" />
<ClCompile Include="..\util\log.cpp" />
+ <ClCompile Include="..\util\logfile.cpp" />
<ClCompile Include="..\util\processinfo.cpp" />
<ClCompile Include="..\util\stringutils.cpp" />
<ClCompile Include="..\util\text.cpp" />
<ClCompile Include="..\util\version.cpp" />
<ClCompile Include="cap.cpp" />
+ <ClCompile Include="commands\distinct.cpp" />
+ <ClCompile Include="commands\group.cpp" />
+ <ClCompile Include="commands\isself.cpp" />
+ <ClCompile Include="commands\mr.cpp" />
+ <ClCompile Include="compact.cpp" />
<ClCompile Include="dbcommands_generic.cpp" />
+ <ClCompile Include="dur.cpp" />
+ <ClCompile Include="durop.cpp" />
+ <ClCompile Include="dur_commitjob.cpp" />
+ <ClCompile Include="dur_journal.cpp" />
+ <ClCompile Include="dur_preplogbuffer.cpp" />
+ <ClCompile Include="dur_recover.cpp" />
+ <ClCompile Include="dur_writetodatafiles.cpp" />
<ClCompile Include="geo\2d.cpp" />
<ClCompile Include="geo\haystack.cpp" />
+ <ClCompile Include="mongommf.cpp" />
<ClCompile Include="oplog.cpp" />
+ <ClCompile Include="projection.cpp" />
<ClCompile Include="repl.cpp" />
<ClCompile Include="repl\consensus.cpp" />
<ClCompile Include="repl\heartbeat.cpp" />
@@ -468,7 +503,6 @@
<ClCompile Include="repl\rs_sync.cpp" />
<ClCompile Include="repl_block.cpp" />
<ClCompile Include="restapi.cpp" />
- <ClCompile Include="storage.cpp" />
<ClCompile Include="..\client\connpool.cpp" />
<ClCompile Include="..\client\dbclient.cpp" />
<ClCompile Include="..\client\syncclusterconnection.cpp" />
@@ -484,7 +518,6 @@
<ClCompile Include="commands.cpp" />
<ClCompile Include="common.cpp" />
<ClCompile Include="cursor.cpp" />
- <ClCompile Include="..\s\d_util.cpp" />
<ClCompile Include="database.cpp" />
<ClCompile Include="db.cpp" />
<ClCompile Include="dbcommands.cpp" />
@@ -505,16 +538,15 @@
<ClCompile Include="..\util\mmap_win.cpp" />
<ClCompile Include="modules\mms.cpp" />
<ClCompile Include="module.cpp" />
- <ClCompile Include="mr.cpp" />
<ClCompile Include="namespace.cpp" />
<ClCompile Include="nonce.cpp" />
<ClCompile Include="..\client\parallel.cpp" />
<ClCompile Include="pdfile.cpp" />
<ClCompile Include="query.cpp" />
<ClCompile Include="queryoptimizer.cpp" />
- <ClCompile Include="..\util\ramstore.cpp" />
<ClCompile Include="security.cpp" />
<ClCompile Include="security_commands.cpp" />
+ <ClCompile Include="security_key.cpp" />
<ClCompile Include="tests.cpp" />
<ClCompile Include="update.cpp" />
<ClCompile Include="cmdline.cpp" />
@@ -552,16 +584,6 @@
<ClCompile Include="..\s\d_logic.cpp" />
<ClCompile Include="..\scripting\engine.cpp" />
<ClCompile Include="..\scripting\engine_spidermonkey.cpp" />
- <ClCompile Include="..\shell\mongo_vstudio.cpp">
- <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
- </PrecompiledHeader>
- <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
- </PrecompiledHeader>
- <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
- </PrecompiledHeader>
- <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
- </PrecompiledHeader>
- </ClCompile>
<ClCompile Include="..\scripting\utils.cpp" />
<ClCompile Include="stats\counters.cpp" />
<ClCompile Include="stats\snapshots.cpp" />
@@ -574,6 +596,8 @@
<ClCompile Include="repl\rs_config.cpp" />
</ItemGroup>
<ItemGroup>
+ <None Include="..\jstests\dur\basic1.sh" />
+ <None Include="..\jstests\dur\dur1.js" />
<None Include="..\jstests\replsets\replset1.js" />
<None Include="..\jstests\replsets\replset2.js" />
<None Include="..\jstests\replsets\replset3.js" />
@@ -604,26 +628,37 @@
<ClInclude Include="..\targetver.h" />
<ClInclude Include="..\pcre-7.4\config.h" />
<ClInclude Include="..\pcre-7.4\pcre.h" />
+ <ClInclude Include="..\util\concurrency\race.h" />
<ClInclude Include="..\util\concurrency\rwlock.h" />
<ClInclude Include="..\util\concurrency\msg.h" />
<ClInclude Include="..\util\concurrency\mutex.h" />
<ClInclude Include="..\util\concurrency\mvar.h" />
<ClInclude Include="..\util\concurrency\task.h" />
<ClInclude Include="..\util\concurrency\thread_pool.h" />
+ <ClInclude Include="..\util\logfile.h" />
+ <ClInclude Include="..\util\mongoutils\checksum.h" />
<ClInclude Include="..\util\mongoutils\html.h" />
<ClInclude Include="..\util\mongoutils\str.h" />
+ <ClInclude Include="..\util\paths.h" />
<ClInclude Include="..\util\ramlog.h" />
<ClInclude Include="..\util\text.h" />
+ <ClInclude Include="..\util\time_support.h" />
+ <ClInclude Include="durop.h" />
+ <ClInclude Include="dur_commitjob.h" />
+ <ClInclude Include="dur_journal.h" />
+ <ClInclude Include="dur_journalformat.h" />
+ <ClInclude Include="dur_journalimpl.h" />
+ <ClInclude Include="dur_stats.h" />
<ClInclude Include="geo\core.h" />
<ClInclude Include="helpers\dblogger.h" />
<ClInclude Include="instance.h" />
+ <ClInclude Include="mongommf.h" />
+ <ClInclude Include="mongomutex.h" />
+ <ClInclude Include="namespace-inl.h" />
<ClInclude Include="oplogreader.h" />
+ <ClInclude Include="projection.h" />
<ClInclude Include="repl.h" />
<ClInclude Include="replpair.h" />
- <ClInclude Include="rec.h" />
- <ClInclude Include="reccache.h" />
- <ClInclude Include="reci.h" />
- <ClInclude Include="recstore.h" />
<ClInclude Include="repl\connections.h" />
<ClInclude Include="repl\multicmd.h" />
<ClInclude Include="repl\rsmember.h" />
@@ -656,7 +691,6 @@
<ClInclude Include="introspect.h" />
<ClInclude Include="json.h" />
<ClInclude Include="matcher.h" />
- <ClInclude Include="minilex.h" />
<ClInclude Include="namespace.h" />
<ClInclude Include="..\pch.h" />
<ClInclude Include="pdfile.h" />
diff --git a/db/db.vcxproj.filters b/db/db.vcxproj.filters
index bf30b4e..a2011df 100755
--- a/db/db.vcxproj.filters
+++ b/db/db.vcxproj.filters
@@ -1,928 +1,329 @@
<?xml version="1.0" encoding="utf-8"?>
<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<ItemGroup>
- <ClCompile Include="repl\replset_commands.cpp">
- <Filter>replSets</Filter>
- </ClCompile>
- <ClCompile Include="repl\rs_config.cpp">
- <Filter>replSets</Filter>
- </ClCompile>
- <ClCompile Include="repl\health.cpp">
- <Filter>replSets</Filter>
- </ClCompile>
- <ClCompile Include="btree.cpp">
- <Filter>db\btree</Filter>
- </ClCompile>
- <ClCompile Include="btreecursor.cpp">
- <Filter>db\btree</Filter>
- </ClCompile>
- <ClCompile Include="repl\consensus.cpp">
- <Filter>replSets</Filter>
- </ClCompile>
- <ClCompile Include="..\client\connpool.cpp">
- <Filter>client</Filter>
- </ClCompile>
- <ClCompile Include="..\client\dbclient.cpp">
- <Filter>client</Filter>
- </ClCompile>
- <ClCompile Include="..\client\dbclientcursor.cpp">
- <Filter>client</Filter>
- </ClCompile>
- <ClCompile Include="repl\manager.cpp">
- <Filter>replSets</Filter>
- </ClCompile>
- <ClCompile Include="..\pcre-7.4\pcreposix.c">
- <Filter>util\pcre</Filter>
- </ClCompile>
- <ClCompile Include="..\pcre-7.4\pcre_chartables.c">
- <Filter>util\pcre</Filter>
- </ClCompile>
- <ClCompile Include="..\pcre-7.4\pcre_compile.c">
- <Filter>util\pcre</Filter>
- </ClCompile>
- <ClCompile Include="..\pcre-7.4\pcre_config.c">
- <Filter>util\pcre</Filter>
- </ClCompile>
- <ClCompile Include="..\pcre-7.4\pcre_dfa_exec.c">
- <Filter>util\pcre</Filter>
- </ClCompile>
- <ClCompile Include="..\pcre-7.4\pcre_exec.c">
- <Filter>util\pcre</Filter>
- </ClCompile>
- <ClCompile Include="..\pcre-7.4\pcre_fullinfo.c">
- <Filter>util\pcre</Filter>
- </ClCompile>
- <ClCompile Include="..\pcre-7.4\pcre_get.c">
- <Filter>util\pcre</Filter>
- </ClCompile>
- <ClCompile Include="..\pcre-7.4\pcre_globals.c">
- <Filter>util\pcre</Filter>
- </ClCompile>
- <ClCompile Include="..\pcre-7.4\pcre_info.c">
- <Filter>util\pcre</Filter>
- </ClCompile>
- <ClCompile Include="..\pcre-7.4\pcre_maketables.c">
- <Filter>util\pcre</Filter>
- </ClCompile>
- <ClCompile Include="..\pcre-7.4\pcre_newline.c">
- <Filter>util\pcre</Filter>
- </ClCompile>
- <ClCompile Include="..\pcre-7.4\pcre_ord2utf8.c">
- <Filter>util\pcre</Filter>
- </ClCompile>
- <ClCompile Include="..\pcre-7.4\pcre_refcount.c">
- <Filter>util\pcre</Filter>
- </ClCompile>
- <ClCompile Include="..\pcre-7.4\pcre_scanner.cc">
- <Filter>util\pcre</Filter>
- </ClCompile>
- <ClCompile Include="..\pcre-7.4\pcre_stringpiece.cc">
- <Filter>util\pcre</Filter>
- </ClCompile>
- <ClCompile Include="..\pcre-7.4\pcre_study.c">
- <Filter>util\pcre</Filter>
- </ClCompile>
- <ClCompile Include="..\pcre-7.4\pcre_tables.c">
- <Filter>util\pcre</Filter>
- </ClCompile>
- <ClCompile Include="..\pcre-7.4\pcre_try_flipped.c">
- <Filter>util\pcre</Filter>
- </ClCompile>
- <ClCompile Include="..\pcre-7.4\pcre_ucp_searchfuncs.c">
- <Filter>util\pcre</Filter>
- </ClCompile>
- <ClCompile Include="..\pcre-7.4\pcre_valid_utf8.c">
- <Filter>util\pcre</Filter>
- </ClCompile>
- <ClCompile Include="..\pcre-7.4\pcre_version.c">
- <Filter>util\pcre</Filter>
- </ClCompile>
- <ClCompile Include="..\pcre-7.4\pcre_xclass.c">
- <Filter>util\pcre</Filter>
- </ClCompile>
- <ClCompile Include="..\pcre-7.4\pcrecpp.cc">
- <Filter>util\pcre</Filter>
- </ClCompile>
- <ClCompile Include="repl\rs_initiate.cpp">
- <Filter>replSets</Filter>
- </ClCompile>
- <ClCompile Include="..\util\concurrency\vars.cpp">
- <Filter>util\concurrency</Filter>
- </ClCompile>
- <ClCompile Include="..\util\concurrency\task.cpp">
- <Filter>util\concurrency</Filter>
- </ClCompile>
- <ClCompile Include="repl\heartbeat.cpp">
- <Filter>replSets</Filter>
- </ClCompile>
- <ClCompile Include="..\scripting\engine_spidermonkey.cpp">
- <Filter>scripting</Filter>
- </ClCompile>
- <ClCompile Include="..\scripting\engine.cpp">
- <Filter>scripting</Filter>
- </ClCompile>
- <ClCompile Include="..\scripting\utils.cpp">
- <Filter>scripting</Filter>
- </ClCompile>
- <ClCompile Include="..\client\syncclusterconnection.cpp">
- <Filter>client</Filter>
- </ClCompile>
- <ClCompile Include="repl\rs.cpp">
- <Filter>replSets</Filter>
- </ClCompile>
- <ClCompile Include="oplog.cpp">
- <Filter>repl_old</Filter>
- </ClCompile>
- <ClCompile Include="client.cpp">
- <Filter>db\core</Filter>
- </ClCompile>
- <ClCompile Include="clientcursor.cpp">
- <Filter>db\core</Filter>
- </ClCompile>
- <ClCompile Include="cloner.cpp">
- <Filter>repl_old</Filter>
- </ClCompile>
- <ClCompile Include="cmdline.cpp">
- <Filter>db\core</Filter>
- </ClCompile>
- <ClCompile Include="commands.cpp">
- <Filter>db\core</Filter>
- </ClCompile>
- <ClCompile Include="common.cpp">
- <Filter>db\core</Filter>
- </ClCompile>
- <ClCompile Include="stats\counters.cpp">
- <Filter>db\core</Filter>
- </ClCompile>
- <ClCompile Include="cursor.cpp">
- <Filter>db\core</Filter>
- </ClCompile>
- <ClCompile Include="..\s\d_util.cpp">
- <Filter>sharding</Filter>
- </ClCompile>
- <ClCompile Include="..\s\d_logic.cpp">
- <Filter>sharding</Filter>
- </ClCompile>
- <ClCompile Include="database.cpp">
- <Filter>db\core</Filter>
- </ClCompile>
- <ClCompile Include="db.cpp">
- <Filter>db\core</Filter>
- </ClCompile>
- <ClCompile Include="dbeval.cpp">
- <Filter>db\core</Filter>
- </ClCompile>
- <ClCompile Include="dbcommands.cpp">
- <Filter>db\core</Filter>
- </ClCompile>
- <ClCompile Include="dbcommands_admin.cpp">
- <Filter>db\core</Filter>
- </ClCompile>
- <ClCompile Include="dbhelpers.cpp">
- <Filter>db\core</Filter>
- </ClCompile>
- <ClCompile Include="extsort.cpp">
- <Filter>db\core</Filter>
- </ClCompile>
- <ClCompile Include="dbwebserver.cpp">
- <Filter>db\core</Filter>
- </ClCompile>
- <ClCompile Include="..\util\httpclient.cpp">
- <Filter>util\core</Filter>
- </ClCompile>
- <ClCompile Include="index.cpp">
- <Filter>db\core</Filter>
- </ClCompile>
- <ClCompile Include="indexkey.cpp">
- <Filter>db\core</Filter>
- </ClCompile>
- <ClCompile Include="instance.cpp">
- <Filter>db\core</Filter>
- </ClCompile>
- <ClCompile Include="introspect.cpp">
- <Filter>db\core</Filter>
- </ClCompile>
- <ClCompile Include="jsobj.cpp">
- <Filter>db\core</Filter>
- </ClCompile>
- <ClCompile Include="json.cpp">
- <Filter>db\core</Filter>
- </ClCompile>
- <ClCompile Include="..\util\md5.c">
- <Filter>util\core</Filter>
- </ClCompile>
- <ClCompile Include="matcher_covered.cpp">
- <Filter>db\core</Filter>
- </ClCompile>
- <ClCompile Include="lasterror.cpp">
- <Filter>db\core</Filter>
- </ClCompile>
- <ClCompile Include="matcher.cpp">
- <Filter>db\core</Filter>
- </ClCompile>
- <ClCompile Include="..\util\message_server_port.cpp">
- <Filter>util\core</Filter>
- </ClCompile>
- <ClCompile Include="..\util\md5main.cpp">
- <Filter>util\core</Filter>
- </ClCompile>
- <ClCompile Include="..\util\message.cpp">
- <Filter>util\core</Filter>
- </ClCompile>
- <ClCompile Include="modules\mms.cpp">
- <Filter>db\modules</Filter>
- </ClCompile>
- <ClCompile Include="..\util\mmap.cpp">
- <Filter>util\core</Filter>
- </ClCompile>
- <ClCompile Include="..\util\mmap_win.cpp">
- <Filter>util\core</Filter>
- </ClCompile>
- <ClCompile Include="..\shell\mongo_vstudio.cpp">
- <Filter>db\core</Filter>
- </ClCompile>
- <ClCompile Include="module.cpp">
- <Filter>db\core</Filter>
- </ClCompile>
- <ClCompile Include="mr.cpp">
- <Filter>db\core</Filter>
- </ClCompile>
- <ClCompile Include="namespace.cpp">
- <Filter>db\core</Filter>
- </ClCompile>
- <ClCompile Include="nonce.cpp">
- <Filter>db\core</Filter>
- </ClCompile>
- <ClCompile Include="..\util\ntservice.cpp">
- <Filter>util\core</Filter>
- </ClCompile>
- <ClCompile Include="queryutil.cpp">
- <Filter>db\core</Filter>
- </ClCompile>
- <ClCompile Include="..\client\parallel.cpp">
- <Filter>db\core</Filter>
- </ClCompile>
- <ClCompile Include="pdfile.cpp">
- <Filter>db\core</Filter>
- </ClCompile>
- <ClCompile Include="query.cpp">
- <Filter>db\core</Filter>
- </ClCompile>
- <ClCompile Include="queryoptimizer.cpp">
- <Filter>db\core</Filter>
- </ClCompile>
- <ClCompile Include="..\util\ramstore.cpp">
- <Filter>db\storage engine</Filter>
- </ClCompile>
- <ClCompile Include="repl_block.cpp">
- <Filter>repl_old</Filter>
- </ClCompile>
- <ClCompile Include="repl.cpp">
- <Filter>repl_old</Filter>
- </ClCompile>
- <ClCompile Include="..\s\shardconnection.cpp">
- <Filter>sharding</Filter>
- </ClCompile>
- <ClCompile Include="..\util\sock.cpp">
- <Filter>util\core</Filter>
- </ClCompile>
- <ClCompile Include="..\util\util.cpp">
- <Filter>util\core</Filter>
- </ClCompile>
- <ClCompile Include="security.cpp">
- <Filter>util\core</Filter>
- </ClCompile>
- <ClCompile Include="security_commands.cpp">
- <Filter>util\core</Filter>
- </ClCompile>
- <ClCompile Include="stats\snapshots.cpp">
- <Filter>util\core</Filter>
- </ClCompile>
- <ClCompile Include="storage.cpp">
- <Filter>util\core</Filter>
- </ClCompile>
- <ClCompile Include="tests.cpp">
- <Filter>util\core</Filter>
- </ClCompile>
- <ClCompile Include="stats\top.cpp">
- <Filter>util\core</Filter>
- </ClCompile>
- <ClCompile Include="update.cpp">
- <Filter>util\core</Filter>
- </ClCompile>
- <ClCompile Include="..\util\background.cpp">
- <Filter>util\concurrency</Filter>
- </ClCompile>
- <ClCompile Include="..\util\assert_util.cpp">
- <Filter>util\core</Filter>
- </ClCompile>
- <ClCompile Include="..\util\base64.cpp">
- <Filter>util\core</Filter>
- </ClCompile>
- <ClCompile Include="..\util\miniwebserver.cpp">
- <Filter>util\core</Filter>
- </ClCompile>
- <ClCompile Include="..\util\processinfo_win32.cpp">
- <Filter>util\core</Filter>
- </ClCompile>
- <ClCompile Include="..\util\version.cpp">
- <Filter>db</Filter>
- </ClCompile>
- <ClCompile Include="..\pch.cpp">
- <Filter>db</Filter>
- </ClCompile>
- <ClCompile Include="..\s\d_writeback.cpp">
- <Filter>sharding</Filter>
- </ClCompile>
- <ClCompile Include="..\s\d_state.cpp">
- <Filter>sharding</Filter>
- </ClCompile>
- <ClCompile Include="..\util\text.cpp">
- <Filter>util\core</Filter>
- </ClCompile>
- <ClCompile Include="geo\2d.cpp">
- <Filter>db\geo</Filter>
- </ClCompile>
- <ClCompile Include="dbcommands_generic.cpp">
- <Filter>db\core</Filter>
- </ClCompile>
- <ClCompile Include="..\s\config.cpp">
- <Filter>sharding</Filter>
- </ClCompile>
- <ClCompile Include="..\s\chunk.cpp">
- <Filter>sharding</Filter>
- </ClCompile>
- <ClCompile Include="..\s\shard.cpp">
- <Filter>sharding</Filter>
- </ClCompile>
- <ClCompile Include="..\s\shardkey.cpp">
- <Filter>sharding</Filter>
- </ClCompile>
- <ClCompile Include="..\client\model.cpp">
- <Filter>client</Filter>
- </ClCompile>
- <ClCompile Include="..\util\stringutils.cpp">
- <Filter>util\core</Filter>
- </ClCompile>
- <ClCompile Include="..\util\concurrency\thread_pool.cpp">
- <Filter>util\concurrency</Filter>
- </ClCompile>
- <ClCompile Include="..\client\distlock.cpp">
- <Filter>client</Filter>
- </ClCompile>
- <ClCompile Include="..\s\d_migrate.cpp">
- <Filter>sharding</Filter>
- </ClCompile>
- <ClCompile Include="..\s\d_split.cpp">
- <Filter>sharding</Filter>
- </ClCompile>
- <ClCompile Include="repl\rs_sync.cpp">
- <Filter>replSets</Filter>
- </ClCompile>
- <ClCompile Include="repl\rs_initialsync.cpp">
- <Filter>replSets</Filter>
- </ClCompile>
- <ClCompile Include="repl\rs_rollback.cpp">
- <Filter>replSets</Filter>
- </ClCompile>
- <ClCompile Include="geo\haystack.cpp">
- <Filter>db\geo</Filter>
- </ClCompile>
- <ClCompile Include="cap.cpp">
- <Filter>db\core</Filter>
- </ClCompile>
- <ClCompile Include="..\util\log.cpp">
- <Filter>util</Filter>
- </ClCompile>
- <ClCompile Include="..\util\processinfo.cpp">
- <Filter>util</Filter>
- </ClCompile>
- <ClCompile Include="restapi.cpp">
- <Filter>db</Filter>
- </ClCompile>
+ <ClCompile Include="..\bson\oid.cpp" />
+ <ClCompile Include="..\client\dbclientcursor.cpp" />
+ <ClCompile Include="..\client\dbclient_rs.cpp" />
+ <ClCompile Include="..\client\distlock.cpp" />
+ <ClCompile Include="..\client\model.cpp" />
+ <ClCompile Include="..\pcre-7.4\pcrecpp.cc" />
+ <ClCompile Include="..\pcre-7.4\pcre_chartables.c" />
+ <ClCompile Include="..\pcre-7.4\pcre_compile.c" />
+ <ClCompile Include="..\pcre-7.4\pcre_config.c" />
+ <ClCompile Include="..\pcre-7.4\pcre_dfa_exec.c" />
+ <ClCompile Include="..\pcre-7.4\pcre_exec.c" />
+ <ClCompile Include="..\pcre-7.4\pcre_fullinfo.c" />
+ <ClCompile Include="..\pcre-7.4\pcre_get.c" />
+ <ClCompile Include="..\pcre-7.4\pcre_globals.c" />
+ <ClCompile Include="..\pcre-7.4\pcre_info.c" />
+ <ClCompile Include="..\pcre-7.4\pcre_maketables.c" />
+ <ClCompile Include="..\pcre-7.4\pcre_newline.c" />
+ <ClCompile Include="..\pcre-7.4\pcre_ord2utf8.c" />
+ <ClCompile Include="..\pcre-7.4\pcre_refcount.c" />
+ <ClCompile Include="..\pcre-7.4\pcre_scanner.cc" />
+ <ClCompile Include="..\pcre-7.4\pcre_stringpiece.cc" />
+ <ClCompile Include="..\pcre-7.4\pcre_study.c" />
+ <ClCompile Include="..\pcre-7.4\pcre_tables.c" />
+ <ClCompile Include="..\pcre-7.4\pcre_try_flipped.c" />
+ <ClCompile Include="..\pcre-7.4\pcre_ucp_searchfuncs.c" />
+ <ClCompile Include="..\pcre-7.4\pcre_valid_utf8.c" />
+ <ClCompile Include="..\pcre-7.4\pcre_version.c" />
+ <ClCompile Include="..\pcre-7.4\pcre_xclass.c" />
+ <ClCompile Include="..\pcre-7.4\pcreposix.c" />
+ <ClCompile Include="..\scripting\bench.cpp" />
+ <ClCompile Include="..\shell\mongo_vstudio.cpp" />
+ <ClCompile Include="..\s\chunk.cpp" />
+ <ClCompile Include="..\s\config.cpp" />
+ <ClCompile Include="..\s\d_chunk_manager.cpp" />
+ <ClCompile Include="..\s\d_migrate.cpp" />
+ <ClCompile Include="..\s\d_split.cpp" />
+ <ClCompile Include="..\s\d_state.cpp" />
+ <ClCompile Include="..\s\d_writeback.cpp" />
<ClCompile Include="..\s\grid.cpp" />
+ <ClCompile Include="..\s\shard.cpp" />
+ <ClCompile Include="..\s\shardconnection.cpp" />
+ <ClCompile Include="..\s\shardkey.cpp" />
+ <ClCompile Include="..\util\alignedbuilder.cpp" />
+ <ClCompile Include="..\util\concurrency\spin_lock.cpp" />
+ <ClCompile Include="..\util\concurrency\synchronization.cpp" />
+ <ClCompile Include="..\util\concurrency\task.cpp" />
+ <ClCompile Include="..\util\concurrency\thread_pool.cpp" />
+ <ClCompile Include="..\util\concurrency\vars.cpp" />
+ <ClCompile Include="..\util\log.cpp" />
+ <ClCompile Include="..\util\logfile.cpp" />
+ <ClCompile Include="..\util\processinfo.cpp" />
+ <ClCompile Include="..\util\stringutils.cpp" />
+ <ClCompile Include="..\util\text.cpp" />
+ <ClCompile Include="..\util\version.cpp" />
+ <ClCompile Include="cap.cpp" />
+ <ClCompile Include="commands\distinct.cpp" />
+ <ClCompile Include="commands\group.cpp" />
+ <ClCompile Include="commands\isself.cpp" />
+ <ClCompile Include="commands\mr.cpp" />
+ <ClCompile Include="compact.cpp" />
+ <ClCompile Include="dbcommands_generic.cpp" />
+ <ClCompile Include="dur.cpp" />
+ <ClCompile Include="durop.cpp" />
+ <ClCompile Include="dur_commitjob.cpp" />
+ <ClCompile Include="dur_journal.cpp" />
+ <ClCompile Include="dur_preplogbuffer.cpp" />
+ <ClCompile Include="dur_recover.cpp" />
+ <ClCompile Include="dur_writetodatafiles.cpp" />
+ <ClCompile Include="geo\2d.cpp" />
+ <ClCompile Include="geo\haystack.cpp" />
+ <ClCompile Include="mongommf.cpp" />
+ <ClCompile Include="oplog.cpp" />
+ <ClCompile Include="projection.cpp" />
+ <ClCompile Include="repl.cpp" />
+ <ClCompile Include="repl\consensus.cpp" />
+ <ClCompile Include="repl\heartbeat.cpp" />
+ <ClCompile Include="repl\manager.cpp" />
+ <ClCompile Include="repl\rs_initialsync.cpp" />
+ <ClCompile Include="repl\rs_initiate.cpp" />
+ <ClCompile Include="repl\rs_rollback.cpp" />
+ <ClCompile Include="repl\rs_sync.cpp" />
+ <ClCompile Include="repl_block.cpp" />
+ <ClCompile Include="restapi.cpp" />
+ <ClCompile Include="..\client\connpool.cpp" />
+ <ClCompile Include="..\client\dbclient.cpp" />
+ <ClCompile Include="..\client\syncclusterconnection.cpp" />
+ <ClCompile Include="..\pch.cpp" />
+ <ClCompile Include="client.cpp" />
+ <ClCompile Include="clientcursor.cpp" />
+ <ClCompile Include="cloner.cpp" />
+ <ClCompile Include="commands.cpp" />
+ <ClCompile Include="common.cpp" />
+ <ClCompile Include="cursor.cpp" />
+ <ClCompile Include="database.cpp" />
+ <ClCompile Include="db.cpp" />
+ <ClCompile Include="dbcommands.cpp" />
+ <ClCompile Include="dbcommands_admin.cpp" />
+ <ClCompile Include="dbeval.cpp" />
+ <ClCompile Include="dbhelpers.cpp" />
+ <ClCompile Include="dbwebserver.cpp" />
+ <ClCompile Include="extsort.cpp" />
+ <ClCompile Include="index.cpp" />
+ <ClCompile Include="indexkey.cpp" />
+ <ClCompile Include="instance.cpp" />
+ <ClCompile Include="introspect.cpp" />
+ <ClCompile Include="jsobj.cpp" />
+ <ClCompile Include="json.cpp" />
+ <ClCompile Include="lasterror.cpp" />
+ <ClCompile Include="matcher.cpp" />
+ <ClCompile Include="matcher_covered.cpp" />
+ <ClCompile Include="..\util\mmap_win.cpp" />
+ <ClCompile Include="modules\mms.cpp" />
+ <ClCompile Include="module.cpp" />
+ <ClCompile Include="namespace.cpp" />
+ <ClCompile Include="nonce.cpp" />
+ <ClCompile Include="..\client\parallel.cpp" />
+ <ClCompile Include="pdfile.cpp" />
+ <ClCompile Include="query.cpp" />
+ <ClCompile Include="queryoptimizer.cpp" />
+ <ClCompile Include="security.cpp" />
+ <ClCompile Include="security_commands.cpp" />
+ <ClCompile Include="tests.cpp" />
+ <ClCompile Include="update.cpp" />
+ <ClCompile Include="cmdline.cpp" />
+ <ClCompile Include="queryutil.cpp" />
+ <ClCompile Include="..\util\assert_util.cpp" />
+ <ClCompile Include="..\util\background.cpp" />
+ <ClCompile Include="..\util\base64.cpp" />
+ <ClCompile Include="..\util\mmap.cpp" />
+ <ClCompile Include="..\util\ntservice.cpp" />
+ <ClCompile Include="..\util\processinfo_win32.cpp" />
+ <ClCompile Include="..\util\util.cpp" />
+ <ClCompile Include="..\util\httpclient.cpp" />
+ <ClCompile Include="..\util\miniwebserver.cpp" />
+ <ClCompile Include="..\util\md5.c" />
+ <ClCompile Include="..\util\md5main.cpp" />
+ <ClCompile Include="..\util\message.cpp" />
+ <ClCompile Include="..\util\message_server_port.cpp" />
+ <ClCompile Include="..\util\sock.cpp" />
+ <ClCompile Include="..\s\d_logic.cpp" />
+ <ClCompile Include="..\scripting\engine.cpp" />
+ <ClCompile Include="..\scripting\engine_spidermonkey.cpp" />
+ <ClCompile Include="..\scripting\utils.cpp" />
+ <ClCompile Include="stats\counters.cpp" />
+ <ClCompile Include="stats\snapshots.cpp" />
+ <ClCompile Include="stats\top.cpp" />
+ <ClCompile Include="btree.cpp" />
+ <ClCompile Include="btreecursor.cpp" />
+ <ClCompile Include="repl\health.cpp" />
+ <ClCompile Include="repl\rs.cpp" />
+ <ClCompile Include="repl\replset_commands.cpp" />
+ <ClCompile Include="repl\rs_config.cpp" />
+ <ClCompile Include="security_key.cpp" />
+ <ClCompile Include="..\util\file_allocator.cpp" />
</ItemGroup>
<ItemGroup>
- <ClInclude Include="repl\rs_config.h">
- <Filter>replSets</Filter>
- </ClInclude>
- <ClInclude Include="repl\health.h">
- <Filter>replSets</Filter>
- </ClInclude>
- <ClInclude Include="..\bson\bsonelement.h">
- <Filter>bson</Filter>
- </ClInclude>
- <ClInclude Include="..\bson\bsontypes.h">
- <Filter>bson</Filter>
- </ClInclude>
- <ClInclude Include="..\bson\bsoninlines.h">
- <Filter>bson</Filter>
- </ClInclude>
- <ClInclude Include="..\bson\bsonmisc.h">
- <Filter>bson</Filter>
- </ClInclude>
- <ClInclude Include="..\bson\bsonobj.h">
- <Filter>bson</Filter>
- </ClInclude>
- <ClInclude Include="..\bson\bsonobjbuilder.h">
- <Filter>bson</Filter>
- </ClInclude>
- <ClInclude Include="..\bson\bsonobjiterator.h">
- <Filter>bson</Filter>
- </ClInclude>
- <ClInclude Include="btree.h">
- <Filter>db\btree</Filter>
- </ClInclude>
- <ClInclude Include="repl\connections.h">
- <Filter>replSets</Filter>
- </ClInclude>
- <ClInclude Include="..\client\connpool.h">
- <Filter>client</Filter>
- </ClInclude>
- <ClInclude Include="..\client\dbclient.h">
- <Filter>client</Filter>
- </ClInclude>
- <ClInclude Include="..\util\mongoutils\html.h">
- <Filter>util\mongoutils</Filter>
- </ClInclude>
- <ClInclude Include="..\util\mongoutils\str.h">
- <Filter>util\mongoutils</Filter>
- </ClInclude>
- <ClInclude Include="..\pcre-7.4\pcre.h">
- <Filter>util\pcre</Filter>
- </ClInclude>
- <ClInclude Include="repl\rsmember.h">
- <Filter>replSets</Filter>
- </ClInclude>
- <ClInclude Include="..\util\concurrency\list.h">
- <Filter>util\concurrency</Filter>
- </ClInclude>
- <ClInclude Include="..\util\concurrency\value.h">
- <Filter>util\concurrency</Filter>
- </ClInclude>
- <ClInclude Include="..\client\dbclientcursor.h">
- <Filter>client</Filter>
- </ClInclude>
- <ClInclude Include="..\client\gridfs.h">
- <Filter>client</Filter>
- </ClInclude>
- <ClInclude Include="..\client\parallel.h">
- <Filter>client</Filter>
- </ClInclude>
- <ClInclude Include="..\util\concurrency\task.h">
- <Filter>util\concurrency</Filter>
- </ClInclude>
- <ClInclude Include="repl\multicmd.h">
- <Filter>replSets</Filter>
- </ClInclude>
- <ClInclude Include="..\util\concurrency\msg.h">
- <Filter>util\concurrency</Filter>
- </ClInclude>
- <ClInclude Include="..\util\concurrency\mutex.h">
- <Filter>util\concurrency</Filter>
- </ClInclude>
- <ClInclude Include="stats\counters.h">
- <Filter>stats</Filter>
- </ClInclude>
- <ClInclude Include="stats\snapshots.h">
- <Filter>stats</Filter>
- </ClInclude>
- <ClInclude Include="stats\top.h">
- <Filter>stats</Filter>
- </ClInclude>
- <ClInclude Include="..\util\concurrency\rwlock.h">
- <Filter>util\concurrency</Filter>
- </ClInclude>
- <ClInclude Include="..\util\concurrency\mvar.h">
- <Filter>util\concurrency</Filter>
- </ClInclude>
- <ClInclude Include="..\util\concurrency\thread_pool.h">
- <Filter>util\concurrency</Filter>
- </ClInclude>
- <ClInclude Include="..\scripting\v8_db.h">
- <Filter>scripting</Filter>
- </ClInclude>
- <ClInclude Include="..\scripting\v8_wrapper.h">
- <Filter>scripting</Filter>
- </ClInclude>
- <ClInclude Include="..\scripting\v8_utils.h">
- <Filter>scripting</Filter>
- </ClInclude>
- <ClInclude Include="..\scripting\engine.h">
- <Filter>scripting</Filter>
- </ClInclude>
- <ClInclude Include="..\scripting\engine_spidermonkey.h">
- <Filter>scripting</Filter>
- </ClInclude>
- <ClInclude Include="..\scripting\engine_v8.h">
- <Filter>scripting</Filter>
- </ClInclude>
- <ClInclude Include="..\client\syncclusterconnection.h">
- <Filter>client</Filter>
- </ClInclude>
- <ClInclude Include="repl\rs_optime.h">
- <Filter>replSets</Filter>
- </ClInclude>
- <ClInclude Include="repl\rs.h">
- <Filter>replSets</Filter>
- </ClInclude>
- <ClInclude Include="..\util\optime.h">
- <Filter>repl_old</Filter>
- </ClInclude>
- <ClInclude Include="client.h">
- <Filter>db\core</Filter>
- </ClInclude>
- <ClInclude Include="clientcursor.h">
- <Filter>db\core</Filter>
- </ClInclude>
- <ClInclude Include="cursor.h">
- <Filter>db\core</Filter>
- </ClInclude>
- <ClInclude Include="cmdline.h">
- <Filter>db\core</Filter>
- </ClInclude>
- <ClInclude Include="commands.h">
- <Filter>db\core</Filter>
- </ClInclude>
- <ClInclude Include="concurrency.h">
- <Filter>db\core</Filter>
- </ClInclude>
- <ClInclude Include="..\pcre-7.4\config.h">
- <Filter>db\core</Filter>
- </ClInclude>
- <ClInclude Include="curop.h">
- <Filter>db\core</Filter>
- </ClInclude>
- <ClInclude Include="database.h">
- <Filter>db\core</Filter>
- </ClInclude>
- <ClInclude Include="db.h">
- <Filter>db\core</Filter>
- </ClInclude>
- <ClInclude Include="..\util\embedded_builder.h">
- <Filter>util\core</Filter>
- </ClInclude>
- <ClInclude Include="..\util\debug_util.h">
- <Filter>util\core</Filter>
- </ClInclude>
- <ClInclude Include="diskloc.h">
- <Filter>db\storage engine</Filter>
- </ClInclude>
- <ClInclude Include="..\util\hashtab.h">
- <Filter>util\core</Filter>
- </ClInclude>
- <ClInclude Include="..\util\file.h">
- <Filter>util\core</Filter>
- </ClInclude>
- <ClInclude Include="..\util\file_allocator.h">
- <Filter>util\core</Filter>
- </ClInclude>
- <ClInclude Include="..\util\goodies.h">
- <Filter>util\core</Filter>
- </ClInclude>
- <ClInclude Include="dbhelpers.h">
- <Filter>db\core</Filter>
- </ClInclude>
- <ClInclude Include="dbinfo.h">
- <Filter>db\core</Filter>
- </ClInclude>
- <ClInclude Include="helpers\dblogger.h">
- <Filter>db\core</Filter>
- </ClInclude>
- <ClInclude Include="dbmessage.h">
- <Filter>db\core</Filter>
- </ClInclude>
- <ClInclude Include="..\util\hostandport.h">
- <Filter>util\core</Filter>
- </ClInclude>
- <ClInclude Include="..\util\hex.h">
- <Filter>util\core</Filter>
- </ClInclude>
- <ClInclude Include="..\util\web\html.h">
- <Filter>util\core</Filter>
- </ClInclude>
- <ClInclude Include="..\util\httpclient.h">
- <Filter>util\core</Filter>
- </ClInclude>
- <ClInclude Include="introspect.h">
- <Filter>db\core</Filter>
- </ClInclude>
- <ClInclude Include="index.h">
- <Filter>db\core</Filter>
- </ClInclude>
- <ClInclude Include="indexkey.h">
- <Filter>db\core</Filter>
- </ClInclude>
- <ClInclude Include="json.h">
- <Filter>db\core</Filter>
- </ClInclude>
- <ClInclude Include="jsobj.h">
- <Filter>db\core</Filter>
- </ClInclude>
- <ClInclude Include="..\util\log.h">
- <Filter>util\core</Filter>
- </ClInclude>
- <ClInclude Include="..\util\md5.h">
- <Filter>util\core</Filter>
- </ClInclude>
- <ClInclude Include="lasterror.h">
- <Filter>db\core</Filter>
- </ClInclude>
- <ClInclude Include="matcher.h">
- <Filter>db\core</Filter>
- </ClInclude>
- <ClInclude Include="..\util\md5.hpp">
- <Filter>util\core</Filter>
- </ClInclude>
- <ClInclude Include="..\util\message.h">
- <Filter>util\core</Filter>
- </ClInclude>
- <ClInclude Include="..\util\message_server.h">
- <Filter>util\core</Filter>
- </ClInclude>
- <ClInclude Include="minilex.h">
- <Filter>db\core</Filter>
- </ClInclude>
- <ClInclude Include="..\util\mmap.h">
- <Filter>util\core</Filter>
- </ClInclude>
- <ClInclude Include="..\client\model.h">
- <Filter>client</Filter>
- </ClInclude>
- <ClInclude Include="namespace.h">
- <Filter>db\core</Filter>
- </ClInclude>
- <ClInclude Include="..\util\ntservice.h">
- <Filter>util\core</Filter>
- </ClInclude>
- <ClInclude Include="..\bson\oid.h">
- <Filter>bson</Filter>
- </ClInclude>
- <ClInclude Include="..\bson\ordering.h">
- <Filter>bson</Filter>
- </ClInclude>
- <ClInclude Include="pdfile.h">
- <Filter>db\core</Filter>
- </ClInclude>
- <ClInclude Include="..\grid\protocol.h">
- <Filter>db\core</Filter>
- </ClInclude>
- <ClInclude Include="query.h">
- <Filter>db\core</Filter>
- </ClInclude>
- <ClInclude Include="queryoptimizer.h">
- <Filter>db\core</Filter>
- </ClInclude>
- <ClInclude Include="..\util\queue.h">
- <Filter>db\core</Filter>
- </ClInclude>
- <ClInclude Include="..\util\ramstore.h">
- <Filter>db\storage engine</Filter>
- </ClInclude>
- <ClInclude Include="recstore.h">
- <Filter>db\storage engine</Filter>
- </ClInclude>
- <ClInclude Include="rec.h">
- <Filter>db\storage engine</Filter>
- </ClInclude>
- <ClInclude Include="reccache.h">
- <Filter>db\storage engine</Filter>
- </ClInclude>
- <ClInclude Include="reci.h">
- <Filter>db\storage engine</Filter>
- </ClInclude>
- <ClInclude Include="repl.h">
- <Filter>repl_old</Filter>
- </ClInclude>
- <ClInclude Include="replpair.h">
- <Filter>repl_old</Filter>
- </ClInclude>
- <ClInclude Include="..\util\sock.h">
- <Filter>util\core</Filter>
- </ClInclude>
- <ClInclude Include="..\client\redef_macros.h">
- <Filter>util\core</Filter>
- </ClInclude>
- <ClInclude Include="update.h">
- <Filter>util\core</Filter>
- </ClInclude>
- <ClInclude Include="resource.h">
- <Filter>util\core</Filter>
- </ClInclude>
- <ClInclude Include="scanandorder.h">
- <Filter>util\core</Filter>
- </ClInclude>
- <ClInclude Include="security.h">
- <Filter>util\core</Filter>
- </ClInclude>
- <ClInclude Include="..\targetver.h">
- <Filter>util\core</Filter>
- </ClInclude>
- <ClInclude Include="..\client\undef_macros.h">
- <Filter>util\core</Filter>
- </ClInclude>
- <ClInclude Include="..\util\unittest.h">
- <Filter>util\core</Filter>
- </ClInclude>
- <ClInclude Include="background.h">
- <Filter>db\core</Filter>
- </ClInclude>
- <ClInclude Include="..\util\background.h">
- <Filter>util\concurrency</Filter>
- </ClInclude>
- <ClInclude Include="..\util\ramlog.h">
- <Filter>util\core</Filter>
- </ClInclude>
- <ClInclude Include="..\util\allocator.h">
- <Filter>util\core</Filter>
- </ClInclude>
- <ClInclude Include="..\util\array.h">
- <Filter>util\core</Filter>
- </ClInclude>
- <ClInclude Include="..\util\assert_util.h">
- <Filter>util\core</Filter>
- </ClInclude>
- <ClInclude Include="..\util\base64.h">
- <Filter>util\core</Filter>
- </ClInclude>
- <ClInclude Include="..\util\builder.h">
- <Filter>util\core</Filter>
- </ClInclude>
- <ClInclude Include="..\util\lruishmap.h">
- <Filter>util\core</Filter>
- </ClInclude>
- <ClInclude Include="..\util\miniwebserver.h">
- <Filter>util\core</Filter>
- </ClInclude>
- <ClInclude Include="..\util\processinfo.h">
- <Filter>util\core</Filter>
- </ClInclude>
- <ClInclude Include="..\pch.h">
- <Filter>db</Filter>
- </ClInclude>
- <ClInclude Include="..\util\text.h">
- <Filter>util\core</Filter>
- </ClInclude>
- <ClInclude Include="geo\core.h">
- <Filter>db\geo</Filter>
- </ClInclude>
- <ClInclude Include="instance.h">
- <Filter>db\core</Filter>
- </ClInclude>
- <ClInclude Include="..\client\distlock.h">
- <Filter>client</Filter>
- </ClInclude>
- <ClInclude Include="..\s\d_logic.h">
- <Filter>sharding</Filter>
- </ClInclude>
- <ClInclude Include="oplogreader.h">
- <Filter>repl_old</Filter>
- </ClInclude>
+ <ClInclude Include="..\client\dbclientcursor.h" />
+ <ClInclude Include="..\client\distlock.h" />
+ <ClInclude Include="..\client\gridfs.h" />
+ <ClInclude Include="..\client\parallel.h" />
+ <ClInclude Include="..\s\d_logic.h" />
+ <ClInclude Include="..\targetver.h" />
+ <ClInclude Include="..\pcre-7.4\config.h" />
+ <ClInclude Include="..\pcre-7.4\pcre.h" />
+ <ClInclude Include="..\util\concurrency\rwlock.h" />
+ <ClInclude Include="..\util\concurrency\msg.h" />
+ <ClInclude Include="..\util\concurrency\mutex.h" />
+ <ClInclude Include="..\util\concurrency\mvar.h" />
+ <ClInclude Include="..\util\concurrency\task.h" />
+ <ClInclude Include="..\util\concurrency\thread_pool.h" />
+ <ClInclude Include="..\util\logfile.h" />
+ <ClInclude Include="..\util\mongoutils\checksum.h" />
+ <ClInclude Include="..\util\mongoutils\html.h" />
+ <ClInclude Include="..\util\mongoutils\str.h" />
+ <ClInclude Include="..\util\paths.h" />
+ <ClInclude Include="..\util\ramlog.h" />
+ <ClInclude Include="..\util\text.h" />
+ <ClInclude Include="..\util\time_support.h" />
+ <ClInclude Include="durop.h" />
+ <ClInclude Include="dur_commitjob.h" />
+ <ClInclude Include="dur_journal.h" />
+ <ClInclude Include="dur_journalformat.h" />
+ <ClInclude Include="dur_stats.h" />
+ <ClInclude Include="geo\core.h" />
+ <ClInclude Include="helpers\dblogger.h" />
+ <ClInclude Include="instance.h" />
+ <ClInclude Include="mongommf.h" />
+ <ClInclude Include="mongomutex.h" />
+ <ClInclude Include="namespace-inl.h" />
+ <ClInclude Include="oplogreader.h" />
+ <ClInclude Include="projection.h" />
+ <ClInclude Include="repl.h" />
+ <ClInclude Include="replpair.h" />
+ <ClInclude Include="repl\connections.h" />
+ <ClInclude Include="repl\multicmd.h" />
+ <ClInclude Include="repl\rsmember.h" />
+ <ClInclude Include="repl\rs_optime.h" />
+ <ClInclude Include="stats\counters.h" />
+ <ClInclude Include="stats\snapshots.h" />
+ <ClInclude Include="stats\top.h" />
+ <ClInclude Include="..\client\connpool.h" />
+ <ClInclude Include="..\client\dbclient.h" />
+ <ClInclude Include="..\client\model.h" />
+ <ClInclude Include="..\client\redef_macros.h" />
+ <ClInclude Include="..\client\syncclusterconnection.h" />
+ <ClInclude Include="..\client\undef_macros.h" />
+ <ClInclude Include="background.h" />
+ <ClInclude Include="client.h" />
+ <ClInclude Include="clientcursor.h" />
+ <ClInclude Include="cmdline.h" />
+ <ClInclude Include="commands.h" />
+ <ClInclude Include="concurrency.h" />
+ <ClInclude Include="curop.h" />
+ <ClInclude Include="cursor.h" />
+ <ClInclude Include="database.h" />
+ <ClInclude Include="db.h" />
+ <ClInclude Include="dbhelpers.h" />
+ <ClInclude Include="dbinfo.h" />
+ <ClInclude Include="dbmessage.h" />
+ <ClInclude Include="diskloc.h" />
+ <ClInclude Include="index.h" />
+ <ClInclude Include="indexkey.h" />
+ <ClInclude Include="introspect.h" />
+ <ClInclude Include="json.h" />
+ <ClInclude Include="matcher.h" />
+ <ClInclude Include="namespace.h" />
+ <ClInclude Include="..\pch.h" />
+ <ClInclude Include="pdfile.h" />
+ <ClInclude Include="..\grid\protocol.h" />
+ <ClInclude Include="query.h" />
+ <ClInclude Include="queryoptimizer.h" />
+ <ClInclude Include="resource.h" />
+ <ClInclude Include="scanandorder.h" />
+ <ClInclude Include="security.h" />
+ <ClInclude Include="update.h" />
+ <ClInclude Include="..\util\allocator.h" />
+ <ClInclude Include="..\util\array.h" />
+ <ClInclude Include="..\util\assert_util.h" />
+ <ClInclude Include="..\util\background.h" />
+ <ClInclude Include="..\util\base64.h" />
+ <ClInclude Include="..\util\builder.h" />
+ <ClInclude Include="..\util\debug_util.h" />
+ <ClInclude Include="..\util\embedded_builder.h" />
+ <ClInclude Include="..\util\file.h" />
+ <ClInclude Include="..\util\file_allocator.h" />
+ <ClInclude Include="..\util\goodies.h" />
+ <ClInclude Include="..\util\hashtab.h" />
+ <ClInclude Include="..\util\hex.h" />
+ <ClInclude Include="lasterror.h" />
+ <ClInclude Include="..\util\log.h" />
+ <ClInclude Include="..\util\lruishmap.h" />
+ <ClInclude Include="..\util\mmap.h" />
+ <ClInclude Include="..\util\ntservice.h" />
+ <ClInclude Include="..\util\optime.h" />
+ <ClInclude Include="..\util\processinfo.h" />
+ <ClInclude Include="..\util\queue.h" />
+ <ClInclude Include="..\util\ramstore.h" />
+ <ClInclude Include="..\util\unittest.h" />
+ <ClInclude Include="..\util\concurrency\list.h" />
+ <ClInclude Include="..\util\concurrency\value.h" />
+ <ClInclude Include="..\util\web\html.h" />
+ <ClInclude Include="..\util\httpclient.h" />
+ <ClInclude Include="..\util\miniwebserver.h" />
+ <ClInclude Include="..\util\md5.h" />
+ <ClInclude Include="..\util\md5.hpp" />
+ <ClInclude Include="..\util\message.h" />
+ <ClInclude Include="..\util\message_server.h" />
+ <ClInclude Include="..\util\sock.h" />
+ <ClInclude Include="..\scripting\engine.h" />
+ <ClInclude Include="..\scripting\engine_spidermonkey.h" />
+ <ClInclude Include="..\scripting\engine_v8.h" />
+ <ClInclude Include="..\scripting\v8_db.h" />
+ <ClInclude Include="..\scripting\v8_utils.h" />
+ <ClInclude Include="..\scripting\v8_wrapper.h" />
+ <ClInclude Include="btree.h" />
+ <ClInclude Include="repl\health.h" />
+ <ClInclude Include="..\util\hostandport.h" />
+ <ClInclude Include="repl\rs.h" />
+ <ClInclude Include="repl\rs_config.h" />
+ <ClInclude Include="..\bson\bsonelement.h" />
+ <ClInclude Include="..\bson\bsoninlines.h" />
+ <ClInclude Include="..\bson\bsonmisc.h" />
+ <ClInclude Include="..\bson\bsonobj.h" />
+ <ClInclude Include="..\bson\bsonobjbuilder.h" />
+ <ClInclude Include="..\bson\bsonobjiterator.h" />
+ <ClInclude Include="..\bson\bsontypes.h" />
+ <ClInclude Include="jsobj.h" />
+ <ClInclude Include="..\bson\oid.h" />
+ <ClInclude Include="..\bson\ordering.h" />
+ <ClInclude Include="dur_journalimpl.h" />
+ <ClInclude Include="..\util\concurrency\race.h" />
</ItemGroup>
<ItemGroup>
- <Filter Include="libs">
- <UniqueIdentifier>{4b29c82d-d30a-4bf1-9c78-19f59c5777ba}</UniqueIdentifier>
- </Filter>
- <Filter Include="util">
- <UniqueIdentifier>{d2c3db88-7fb7-4365-a63b-b7ad45d316ae}</UniqueIdentifier>
- </Filter>
- <Filter Include="util\concurrency">
- <UniqueIdentifier>{8e6fe846-2833-45bb-b13b-c0f0d4d38593}</UniqueIdentifier>
- </Filter>
- <Filter Include="util\mongoutils">
- <UniqueIdentifier>{cc5d96e6-1805-422b-804d-adcb367dc721}</UniqueIdentifier>
- </Filter>
- <Filter Include="util\pcre">
- <UniqueIdentifier>{fa527226-9b03-4f17-8e4c-80d31fb1e449}</UniqueIdentifier>
- </Filter>
- <Filter Include="client">
- <UniqueIdentifier>{932baf83-ba80-49e5-8280-f1b9c8dbbde6}</UniqueIdentifier>
- </Filter>
- <Filter Include="stats">
- <UniqueIdentifier>{88f4374a-9d55-44a2-a234-c758cc4affa9}</UniqueIdentifier>
- </Filter>
- <Filter Include="db">
- <UniqueIdentifier>{6204f40e-3a9c-44e2-a88b-0e1b6fd9a510}</UniqueIdentifier>
- </Filter>
- <Filter Include="db\btree">
- <UniqueIdentifier>{37b238b2-21ec-4788-bdf9-a59b43490454}</UniqueIdentifier>
- </Filter>
- <Filter Include="scripting">
- <UniqueIdentifier>{6b78f34f-e6b0-49e4-b04e-6478c3a3c077}</UniqueIdentifier>
- </Filter>
- <Filter Include="db\storage engine">
- <UniqueIdentifier>{d565a775-7a99-4860-b25f-441e1655b7c6}</UniqueIdentifier>
- </Filter>
- <Filter Include="db\modules">
- <UniqueIdentifier>{466f15bb-4d5b-4634-ba6b-05a282e0a174}</UniqueIdentifier>
- </Filter>
- <Filter Include="db\core">
- <UniqueIdentifier>{d7f08f93-36bf-49cd-9e1c-ba1fec3234ce}</UniqueIdentifier>
- </Filter>
- <Filter Include="repl_old">
- <UniqueIdentifier>{e899caa1-9a90-4604-ac2e-68d5ca12425c}</UniqueIdentifier>
- </Filter>
- <Filter Include="util\core">
- <UniqueIdentifier>{9775f24c-3a29-4e0d-b5de-991c592cf376}</UniqueIdentifier>
- </Filter>
- <Filter Include="Resource Files">
- <UniqueIdentifier>{9aea1b83-cdcb-48a8-97e6-47805cacdc29}</UniqueIdentifier>
- </Filter>
- <Filter Include="bson">
- <UniqueIdentifier>{aff20a87-2efe-4861-930f-8780c08cbea5}</UniqueIdentifier>
- </Filter>
- <Filter Include="db\geo">
- <UniqueIdentifier>{2a0924a5-9bd9-4c86-a149-0df09dcb5548}</UniqueIdentifier>
- </Filter>
- <Filter Include="sharding">
- <UniqueIdentifier>{03b0d798-b13d-48f4-930d-ca827e2a3f00}</UniqueIdentifier>
- </Filter>
- <Filter Include="replSets">
- <UniqueIdentifier>{3b73f786-d352-446f-a5f5-df49384baf7a}</UniqueIdentifier>
- </Filter>
- <Filter Include="replSets\testing">
- <UniqueIdentifier>{4a1ea357-1077-4ad1-85b4-db48a6e1eb46}</UniqueIdentifier>
- </Filter>
+ <ResourceCompile Include="db.rc" />
</ItemGroup>
<ItemGroup>
- <None Include="repl\notes.txt">
- <Filter>replSets</Filter>
- </None>
- <None Include="..\util\mongoutils\README">
- <Filter>util\mongoutils</Filter>
- </None>
- <None Include="..\SConstruct">
- <Filter>db</Filter>
- </None>
- <None Include="mongo.ico">
- <Filter>Resource Files</Filter>
- </None>
- <None Include="..\jstests\replsets\replset_remove_node.js">
- <Filter>replSets\testing</Filter>
- </None>
- <None Include="..\jstests\replsets\replset1.js">
- <Filter>replSets\testing</Filter>
- </None>
- <None Include="..\jstests\replsets\replset2.js">
- <Filter>replSets\testing</Filter>
- </None>
- <None Include="..\jstests\replsets\replset3.js">
- <Filter>replSets\testing</Filter>
- </None>
- <None Include="..\jstests\replsets\replset4.js">
- <Filter>replSets\testing</Filter>
- </None>
- <None Include="..\jstests\replsets\replset5.js">
- <Filter>replSets\testing</Filter>
- </None>
- <None Include="..\jstests\replsets\replsetadd.js">
- <Filter>replSets\testing</Filter>
- </None>
- <None Include="..\jstests\replsets\replsetarb1.js">
- <Filter>replSets\testing</Filter>
- </None>
- <None Include="..\jstests\replsets\replsetarb2.js">
- <Filter>replSets\testing</Filter>
- </None>
- <None Include="..\jstests\replsets\replsetprio1.js">
- <Filter>replSets\testing</Filter>
- </None>
- <None Include="..\jstests\replsets\replsetrestart1.js">
- <Filter>replSets\testing</Filter>
- </None>
- <None Include="..\jstests\replsets\replsetrestart2.js">
- <Filter>replSets\testing</Filter>
- </None>
- <None Include="..\jstests\replsets\rollback.js">
- <Filter>replSets\testing</Filter>
- </None>
- <None Include="..\jstests\replsets\rollback2.js">
- <Filter>replSets\testing</Filter>
- </None>
- <None Include="..\jstests\replsets\sync1.js">
- <Filter>replSets\testing</Filter>
- </None>
- <None Include="..\jstests\replsets\twosets.js">
- <Filter>replSets\testing</Filter>
- </None>
+ <None Include="..\jstests\dur\basic1.sh" />
+ <None Include="..\jstests\dur\dur1.js" />
+ <None Include="..\jstests\replsets\replset1.js" />
+ <None Include="..\jstests\replsets\replset2.js" />
+ <None Include="..\jstests\replsets\replset3.js" />
+ <None Include="..\jstests\replsets\replset4.js" />
+ <None Include="..\jstests\replsets\replset5.js" />
+ <None Include="..\jstests\replsets\replsetadd.js" />
+ <None Include="..\jstests\replsets\replsetarb1.js" />
+ <None Include="..\jstests\replsets\replsetarb2.js" />
+ <None Include="..\jstests\replsets\replsetprio1.js" />
+ <None Include="..\jstests\replsets\replsetrestart1.js" />
+ <None Include="..\jstests\replsets\replsetrestart2.js" />
+ <None Include="..\jstests\replsets\replset_remove_node.js" />
+ <None Include="..\jstests\replsets\rollback.js" />
+ <None Include="..\jstests\replsets\rollback2.js" />
+ <None Include="..\jstests\replsets\sync1.js" />
+ <None Include="..\jstests\replsets\twosets.js" />
+ <None Include="..\SConstruct" />
+ <None Include="..\util\mongoutils\README" />
+ <None Include="mongo.ico" />
+ <None Include="repl\notes.txt" />
</ItemGroup>
<ItemGroup>
- <Library Include="..\..\js\js64r.lib">
- <Filter>libs</Filter>
- </Library>
- <Library Include="..\..\js\js32d.lib">
- <Filter>libs</Filter>
- </Library>
- <Library Include="..\..\js\js32r.lib">
- <Filter>libs</Filter>
- </Library>
- <Library Include="..\..\js\js64d.lib">
- <Filter>libs</Filter>
- </Library>
- </ItemGroup>
- <ItemGroup>
- <ResourceCompile Include="db.rc">
- <Filter>Resource Files</Filter>
- </ResourceCompile>
+ <Library Include="..\..\js\js32d.lib" />
+ <Library Include="..\..\js\js32r.lib" />
+ <Library Include="..\..\js\js64d.lib" />
+ <Library Include="..\..\js\js64r.lib" />
</ItemGroup>
 </Project>
\ No newline at end of file
diff --git a/db/db_10.sln b/db/db_10.sln
index d68d897..f74ac3d 100644..100755
--- a/db/db_10.sln
+++ b/db/db_10.sln
@@ -8,16 +8,8 @@ Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "examples", "examples", "{40
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "tools", "tools", "{2B262D59-9DC7-4BF1-A431-1BD4966899A5}"
ProjectSection(SolutionItems) = preProject
- ..\tools\bridge.cpp = ..\tools\bridge.cpp
- ..\tools\bsondump.cpp = ..\tools\bsondump.cpp
- ..\tools\dump.cpp = ..\tools\dump.cpp
..\tools\export.cpp = ..\tools\export.cpp
- ..\tools\import.cpp = ..\tools\import.cpp
- ..\tools\restore.cpp = ..\tools\restore.cpp
..\tools\sniffer.cpp = ..\tools\sniffer.cpp
- ..\tools\stat.cpp = ..\tools\stat.cpp
- ..\tools\tool.cpp = ..\tools\tool.cpp
- ..\tools\tool.h = ..\tools\tool.h
EndProjectSection
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "unix files", "unix files", "{2F760952-C71B-4865-998F-AABAE96D1373}"
diff --git a/db/dbcommands.cpp b/db/dbcommands.cpp
index 7bd7203..8974bd3 100644
--- a/db/dbcommands.cpp
+++ b/db/dbcommands.cpp
@@ -40,11 +40,13 @@
#include "stats/counters.h"
#include "background.h"
#include "../util/version.h"
+#include "../s/d_writeback.h"
+#include "dur_stats.h"
namespace mongo {
extern int otherTraceLevel;
- void flushOpLog( stringstream &ss );
+ void flushDiagLog();
/* reset any errors so that getlasterror comes back clean.
@@ -54,7 +56,7 @@ namespace mongo {
*/
class CmdResetError : public Command {
public:
- virtual LockType locktype() const { return NONE; }
+ virtual LockType locktype() const { return NONE; }
virtual bool requiresAuth() { return false; }
virtual bool logTheOp() {
return false;
@@ -74,8 +76,8 @@ namespace mongo {
}
} cmdResetError;
- /* set by replica sets if specified in the configuration.
- a pointer is used to avoid any possible locking issues with lockless reading (see below locktype() is NONE
+ /* set by replica sets if specified in the configuration.
+ a pointer is used to avoid any possible locking issues with lockless reading (see below locktype() is NONE
and would like to keep that)
(for now, it simply orphans any old copy as config changes should be extremely rare).
note: once non-null, never goes to null again.
@@ -84,33 +86,38 @@ namespace mongo {
class CmdGetLastError : public Command {
public:
- virtual LockType locktype() const { return NONE; }
- virtual bool requiresAuth() { return false; }
- virtual bool logTheOp() {
- return false;
- }
- virtual bool slaveOk() const {
- return true;
- }
+ CmdGetLastError() : Command("getLastError", false, "getlasterror") { }
+ virtual LockType locktype() const { return NONE; }
+ virtual bool requiresAuth() { return false; }
+ virtual bool logTheOp() { return false; }
+ virtual bool slaveOk() const { return true; }
virtual void help( stringstream& help ) const {
- help << "return error status of the last operation on this connection";
+ help << "return error status of the last operation on this connection\n"
+ << "options:\n"
+ << " fsync - fsync before returning, or wait for journal commit if running with --dur\n"
+ << " w - await replication to w servers (including self) before returning\n"
+ << " wtimeout - timeout for w in milliseconds";
}
- CmdGetLastError() : Command("getLastError", false, "getlasterror") {}
- bool run(const string& dbnamne, BSONObj& _cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ bool run(const string& dbname, BSONObj& _cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
LastError *le = lastError.disableForCommand();
+
+ bool err = false;
+
if ( le->nPrev != 1 )
- LastError::noError.appendSelf( result );
+ err = LastError::noError.appendSelf( result , false );
else
- le->appendSelf( result );
-
+ err = le->appendSelf( result , false );
+
Client& c = cc();
c.appendLastOp( result );
+ result.appendNumber( "connectionId" , c.getConnectionId() );
+
BSONObj cmdObj = _cmdObj;
- {
+ {
BSONObj::iterator i(_cmdObj);
i.next();
- if( !i.more() ) {
+ if( !i.more() ) {
/* empty, use default */
BSONObj *def = getLastErrorDefault;
if( def )
@@ -118,13 +125,27 @@ namespace mongo {
}
}
- if ( cmdObj["fsync"].trueValue() ){
- log() << "fsync from getlasterror" << endl;
- result.append( "fsyncFiles" , MemoryMappedFile::flushAll( true ) );
+ if ( cmdObj["fsync"].trueValue() ) {
+ Timer t;
+ if( !getDur().awaitCommit() ) {
+ // if get here, not running with --dur
+ log() << "fsync from getlasterror" << endl;
+ result.append( "fsyncFiles" , MemoryMappedFile::flushAll( true ) );
+ }
+ else {
+ // this perhaps is temp. how long we wait for the group commit to occur.
+ result.append( "waited", t.millis() );
+ }
}
-
+
+ if ( err ) {
+ // doesn't make sense to wait for replication
+ // if there was an error
+ return true;
+ }
+
BSONElement e = cmdObj["w"];
- if ( e.isNumber() ){
+ if ( e.isNumber() ) {
int timeout = cmdObj["wtimeout"].numberInt();
Timer t;
@@ -132,15 +153,43 @@ namespace mongo {
long long passes = 0;
char buf[32];
- while ( 1 ){
- if ( opReplicatedEnough( c.getLastOp() , w ) )
- break;
+ while ( 1 ) {
+ OpTime op(c.getLastOp());
- if ( timeout > 0 && t.millis() >= timeout ){
+ if ( op.isNull() ) {
+ if ( anyReplEnabled() ) {
+ result.append( "wnote" , "no write has been done on this connection" );
+ }
+ else if ( w <= 1 ) {
+ // don't do anything
+ // w=1 and no repl, so this is fine
+ }
+ else {
+ // w=2 and no repl
+ result.append( "wnote" , "no replication has been enabled, so w=2+ won't work" );
+ result.append( "err", "norepl" );
+ return true;
+ }
+ break;
+ }
+
+ // check this first for w=0 or w=1
+ if ( opReplicatedEnough( op, w ) )
+ break;
+
+ // if replication isn't enabled (e.g., config servers)
+ if ( ! anyReplEnabled() ) {
+ result.append( "err", "norepl" );
+ return true;
+ }
+
+
+ if ( timeout > 0 && t.millis() >= timeout ) {
result.append( "wtimeout" , true );
errmsg = "timed out waiting for slaves";
result.append( "waited" , t.millis() );
- return false;
+ result.append( "err" , "timeout" );
+ return true;
}
assert( sprintf( buf , "w block pass: %lld" , ++passes ) < 30 );
@@ -150,14 +199,15 @@ namespace mongo {
}
result.appendNumber( "wtime" , t.millis() );
}
-
+
+ result.appendNull( "err" );
return true;
}
} cmdGetLastError;
class CmdGetPrevError : public Command {
public:
- virtual LockType locktype() const { return NONE; }
+ virtual LockType locktype() const { return NONE; }
virtual bool requiresAuth() { return false; }
virtual bool logTheOp() {
return false;
@@ -169,7 +219,7 @@ namespace mongo {
return true;
}
CmdGetPrevError() : Command("getPrevError", false, "getpreverror") {}
- bool run(const string& dbnamne, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ bool run(const string& dbname, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
LastError *le = lastError.disableForCommand();
le->appendSelf( result );
if ( le->valid )
@@ -191,16 +241,16 @@ namespace mongo {
virtual bool slaveOk() const {
return false;
}
- virtual LockType locktype() const { return WRITE; }
+ virtual LockType locktype() const { return WRITE; }
CmdDropDatabase() : Command("dropDatabase") {}
- bool run(const string& dbnamne, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ bool run(const string& dbname, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
BSONElement e = cmdObj.firstElement();
- log() << "dropDatabase " << dbnamne << endl;
+ log() << "dropDatabase " << dbname << endl;
int p = (int) e.number();
if ( p != 1 )
return false;
- dropDatabase(dbnamne);
- result.append( "dropped" , dbnamne );
+ dropDatabase(dbname);
+ result.append( "dropped" , dbname );
return true;
}
} cmdDropDatabase;
@@ -216,7 +266,7 @@ namespace mongo {
virtual void help( stringstream& help ) const {
help << "repair database. also compacts. note: slow.";
}
- virtual LockType locktype() const { return WRITE; }
+ virtual LockType locktype() const { return WRITE; }
CmdRepairDatabase() : Command("repairDatabase") {}
bool run(const string& dbname , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
BSONElement e = cmdObj.firstElement();
@@ -231,7 +281,7 @@ namespace mongo {
return repairDatabase( dbname, errmsg, preserveClonedFilesOnFailure, backupOriginalFiles );
}
} cmdRepairDatabase;
-
+
/* set db profiling level
todo: how do we handle profiling information put in the db with replication?
sensibly or not?
@@ -245,9 +295,10 @@ namespace mongo {
help << "enable or disable performance profiling\n";
help << "{ profile : <n> }\n";
help << "0=off 1=log slow ops 2=log all\n";
+ help << "-1 to get current values\n";
help << "http://www.mongodb.org/display/DOCS/Database+Profiler";
}
- virtual LockType locktype() const { return WRITE; }
+ virtual LockType locktype() const { return WRITE; }
CmdProfile() : Command("profile") {}
bool run(const string& dbname, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
BSONElement e = cmdObj.firstElement();
@@ -256,7 +307,7 @@ namespace mongo {
int p = (int) e.number();
bool ok = false;
-
+
if ( p == -1 )
ok = true;
else if ( p >= 0 && p <= 2 ) {
@@ -266,7 +317,7 @@ namespace mongo {
BSONElement slow = cmdObj["slowms"];
if ( slow.isNumber() )
cmdLine.slowMS = slow.numberInt();
-
+
return ok;
}
} cmdProfile;
@@ -279,8 +330,8 @@ namespace mongo {
CmdServerStatus() : Command("serverStatus", true) {
started = time(0);
}
-
- virtual LockType locktype() const { return NONE; }
+
+ virtual LockType locktype() const { return NONE; }
virtual void help( stringstream& help ) const {
help << "returns lots of administrative server statistics";
@@ -291,9 +342,11 @@ namespace mongo {
BSONObjBuilder timeBuilder(128);
- bool authed = cc().getAuthenticationInfo()->isAuthorizedReads("admin");
+ bool authed = cc().getAuthenticationInfo()->isAuthorizedReads("admin");
+ result.append( "host" , prettyHostName() );
result.append("version", versionString);
+ result.append("process","mongod");
result.append("uptime",(double) (time(0)-started));
result.append("uptimeEstimate",(double) (start/1000));
result.appendDate( "localTime" , jsTime() );
@@ -309,27 +362,41 @@ namespace mongo {
t.append("totalTime", tt);
t.append("lockTime", tl);
t.append("ratio", (tt ? tl/tt : 0));
-
- BSONObjBuilder ttt( t.subobjStart( "currentQueue" ) );
- int w=0, r=0;
- Client::recommendedYieldMicros( &w , &r );
- ttt.append( "total" , w + r );
- ttt.append( "readers" , r );
- ttt.append( "writers" , w );
- ttt.done();
+
+ {
+ BSONObjBuilder ttt( t.subobjStart( "currentQueue" ) );
+ int w=0, r=0;
+ Client::recommendedYieldMicros( &w , &r );
+ ttt.append( "total" , w + r );
+ ttt.append( "readers" , r );
+ ttt.append( "writers" , w );
+ ttt.done();
+ }
+
+ {
+ BSONObjBuilder ttt( t.subobjStart( "activeClients" ) );
+ int w=0, r=0;
+ Client::getActiveClientCount( w , r );
+ ttt.append( "total" , w + r );
+ ttt.append( "readers" , r );
+ ttt.append( "writers" , w );
+ ttt.done();
+ }
+
+
result.append( "globalLock" , t.obj() );
}
timeBuilder.appendNumber( "after basic" , Listener::getElapsedTimeMillis() - start );
- if ( authed ){
-
+ {
+
BSONObjBuilder t( result.subobjStart( "mem" ) );
-
+
t.append("bits", ( sizeof(int*) == 4 ? 32 : 64 ) );
ProcessInfo p;
- if ( p.supported() ){
+ if ( p.supported() ) {
t.appendNumber( "resident" , p.getResidentSize() );
t.appendNumber( "virtual" , p.getVirtualMemorySize() );
t.appendBool( "supported" , true );
@@ -338,14 +405,16 @@ namespace mongo {
result.append( "note" , "not all mem info support on this platform" );
t.appendBool( "supported" , false );
}
-
+
+ timeBuilder.appendNumber( "middle of mem" , Listener::getElapsedTimeMillis() - start );
+
t.appendNumber( "mapped" , MemoryMappedFile::totalMappedLength() / ( 1024 * 1024 ) );
t.done();
-
+
}
- timeBuilder.appendNumber( "after is authed" , Listener::getElapsedTimeMillis() - start );
-
+ timeBuilder.appendNumber( "after mem" , Listener::getElapsedTimeMillis() - start );
+
{
BSONObjBuilder bb( result.subobjStart( "connections" ) );
bb.append( "current" , connTicketHolder.used() );
@@ -353,15 +422,15 @@ namespace mongo {
bb.done();
}
timeBuilder.appendNumber( "after connections" , Listener::getElapsedTimeMillis() - start );
-
- if ( authed ){
+
+ {
BSONObjBuilder bb( result.subobjStart( "extra_info" ) );
bb.append("note", "fields vary by platform");
ProcessInfo p;
p.getExtraInfo(bb);
bb.done();
timeBuilder.appendNumber( "after extra info" , Listener::getElapsedTimeMillis() - start );
-
+
}
{
@@ -369,31 +438,43 @@ namespace mongo {
globalIndexCounters.append( bb );
bb.done();
}
-
+
{
BSONObjBuilder bb( result.subobjStart( "backgroundFlushing" ) );
globalFlushCounters.append( bb );
bb.done();
}
-
+
{
BSONObjBuilder bb( result.subobjStart( "cursors" ) );
ClientCursor::appendStats( bb );
bb.done();
}
-
- timeBuilder.appendNumber( "after counters" , Listener::getElapsedTimeMillis() - start );
- if ( anyReplEnabled() ){
+ {
+ BSONObjBuilder bb( result.subobjStart( "network" ) );
+ networkCounter.append( bb );
+ bb.done();
+ }
+
+
+ timeBuilder.appendNumber( "after counters" , Listener::getElapsedTimeMillis() - start );
+
+ if ( anyReplEnabled() ) {
BSONObjBuilder bb( result.subobjStart( "repl" ) );
appendReplicationInfo( bb , authed , cmdObj["repl"].numberInt() );
bb.done();
+
+ if ( ! _isMaster() ) {
+ result.append( "opcountersRepl" , replOpCounters.getObj() );
+ }
+
}
- timeBuilder.appendNumber( "after repl" , Listener::getElapsedTimeMillis() - start );
-
+ timeBuilder.appendNumber( "after repl" , Listener::getElapsedTimeMillis() - start );
+
result.append( "opcounters" , globalOpCounters.getObj() );
-
+
{
BSONObjBuilder asserts( result.subobjStart( "asserts" ) );
asserts.append( "regular" , assertionCount.regular );
@@ -404,12 +485,18 @@ namespace mongo {
asserts.done();
}
- timeBuilder.appendNumber( "after asserts" , Listener::getElapsedTimeMillis() - start );
+ timeBuilder.appendNumber( "after asserts" , Listener::getElapsedTimeMillis() - start );
+
+ result.append( "writeBacksQueued" , ! writeBackManager.queuesEmpty() );
+
+ if( cmdLine.dur ) {
+ result.append("dur", dur::stats.asObj());
+ }
if ( ! authed )
result.append( "note" , "run against admin for more info" );
-
- if ( Listener::getElapsedTimeMillis() - start > 1000 ){
+
+ if ( Listener::getElapsedTimeMillis() - start > 1000 ) {
BSONObj t = timeBuilder.obj();
log() << "serverStatus was very slow: " << t << endl;
result.append( "timing" , t );
@@ -426,7 +513,7 @@ namespace mongo {
return true;
}
virtual void help( stringstream& help ) const { help << "internal"; }
- virtual LockType locktype() const { return NONE; }
+ virtual LockType locktype() const { return NONE; }
CmdGetOpTime() : Command("getoptime") { }
bool run(const string& dbname, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
writelock l( "" );
@@ -456,12 +543,10 @@ namespace mongo {
return true;
}
void help(stringstream& h) const { h << "http://www.mongodb.org/display/DOCS/Monitoring+and+Diagnostics#MonitoringandDiagnostics-DatabaseRecord%2FReplay"; }
- virtual LockType locktype() const { return WRITE; }
+ virtual LockType locktype() const { return WRITE; }
bool run(const string& dbname , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool) {
int was = _diaglog.setLevel( cmdObj.firstElement().numberInt() );
- stringstream ss;
- flushOpLog( ss );
- out() << ss.str() << endl;
+ flushDiagLog();
if ( !cmdLine.quiet )
tlog() << "CMD: diagLogging set to " << _diaglog.level << " from: " << was << endl;
result.append( "was" , was );
@@ -471,7 +556,7 @@ namespace mongo {
/* remove bit from a bit array - actually remove its slot, not a clear
note: this function does not work with x == 63 -- that is ok
- but keep in mind in the future if max indexes were extended to
+ but keep in mind in the future if max indexes were extended to
exactly 64 it would be a problem
*/
unsigned long long removeBit(unsigned long long b, int x) {
@@ -499,6 +584,7 @@ namespace mongo {
BackgroundOperation::assertNoBgOpInProgForNs(ns);
+ d = d->writingWithExtra();
d->aboutToDeleteAnIndex();
/* there may be pointers pointing at keys in the btree(s). kill them. */
@@ -513,7 +599,8 @@ namespace mongo {
for ( int i = 0; i < d->nIndexes; i++ ) {
if ( !mayDeleteIdIndex && d->idx(i).isIdIndex() ) {
idIndex = &d->idx(i);
- } else {
+ }
+ else {
d->idx(i).kill_idx();
}
}
@@ -526,9 +613,9 @@ namespace mongo {
/* assuming here that id index is not multikey: */
d->multiKeyIndexBits = 0;
assureSysIndexesEmptied(ns, idIndex);
- anObjBuilder.append("msg", mayDeleteIdIndex ?
- "indexes dropped for collection" :
- "non-_id indexes dropped for collection");
+ anObjBuilder.append("msg", mayDeleteIdIndex ?
+ "indexes dropped for collection" :
+ "non-_id indexes dropped for collection");
}
else {
// delete just one index
@@ -551,9 +638,10 @@ namespace mongo {
d->nIndexes--;
for ( int i = x; i < d->nIndexes; i++ )
d->idx(i) = d->idx(i+1);
- } else {
+ }
+ else {
int n = removeFromSysIndexes(ns, name); // just in case an orphaned listing there - i.e. should have been repaired but wasn't
- if( n ) {
+ if( n ) {
log() << "info: removeFromSysIndexes cleaned up " << n << " entries" << endl;
}
log() << "dropIndexes: " << name << " not found" << endl;
@@ -578,7 +666,7 @@ namespace mongo {
return false;
}
virtual void help( stringstream& help ) const { help << "drop a collection\n{drop : <collectionName>}"; }
- virtual LockType locktype() const { return WRITE; }
+ virtual LockType locktype() const { return WRITE; }
virtual bool run(const string& dbname , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool) {
string nsToDrop = dbname + '.' + cmdObj.firstElement().valuestr();
NamespaceDetails *d = nsdetails(nsToDrop.c_str());
@@ -597,7 +685,7 @@ namespace mongo {
/* select count(*) */
class CmdCount : public Command {
public:
- virtual LockType locktype() const { return READ; }
+ virtual LockType locktype() const { return READ; }
CmdCount() : Command("count") { }
virtual bool logTheOp() {
return false;
@@ -619,7 +707,7 @@ namespace mongo {
long long n = runCount(ns.c_str(), cmdObj, err);
long long nn = n;
bool ok = true;
- if ( n == -1 ){
+ if ( n == -1 ) {
nn = 0;
result.appendBool( "missing" , true );
}
@@ -647,7 +735,7 @@ namespace mongo {
virtual bool adminOnly() const {
return false;
}
- virtual LockType locktype() const { return WRITE; }
+ virtual LockType locktype() const { return WRITE; }
virtual void help( stringstream& help ) const {
help << "create a collection";
}
@@ -670,7 +758,7 @@ namespace mongo {
virtual bool slaveOk() const {
return false;
}
- virtual LockType locktype() const { return WRITE; }
+ virtual LockType locktype() const { return WRITE; }
virtual void help( stringstream& help ) const {
help << "drop indexes for a collection";
}
@@ -686,9 +774,9 @@ namespace mongo {
if ( f.type() == String ) {
return dropIndexes( d, toDeleteNs.c_str(), f.valuestr(), errmsg, anObjBuilder, false );
}
- else if ( f.type() == Object ){
+ else if ( f.type() == Object ) {
int idxId = d->findIndexByKeyPattern( f.embeddedObject() );
- if ( idxId < 0 ){
+ if ( idxId < 0 ) {
errmsg = "can't find index with key:";
errmsg += f.embeddedObject().toString();
return false;
@@ -715,7 +803,7 @@ namespace mongo {
public:
virtual bool logTheOp() { return false; } // only reindexes on the one node
virtual bool slaveOk() const { return true; } // can reindex on a secondary
- virtual LockType locktype() const { return WRITE; }
+ virtual LockType locktype() const { return WRITE; }
virtual void help( stringstream& help ) const {
help << "re-index a collection";
}
@@ -729,7 +817,7 @@ namespace mongo {
tlog() << "CMD: reIndex " << toDeleteNs << endl;
BackgroundOperation::assertNoBgOpInProgForNs(toDeleteNs.c_str());
- if ( ! d ){
+ if ( ! d ) {
errmsg = "ns not found";
return false;
}
@@ -737,7 +825,7 @@ namespace mongo {
list<BSONObj> all;
auto_ptr<DBClientCursor> i = db.getIndexes( toDeleteNs );
BSONObjBuilder b;
- while ( i->more() ){
+ while ( i->more() ) {
BSONObj o = i->next().getOwned();
b.append( BSONObjBuilder::numStr( all.size() ) , o );
all.push_back( o );
@@ -745,12 +833,12 @@ namespace mongo {
bool ok = dropIndexes( d, toDeleteNs.c_str(), "*" , errmsg, result, true );
- if ( ! ok ){
+ if ( ! ok ) {
errmsg = "dropIndexes failed";
return false;
}
- for ( list<BSONObj>::iterator i=all.begin(); i!=all.end(); i++ ){
+ for ( list<BSONObj>::iterator i=all.begin(); i!=all.end(); i++ ) {
BSONObj o = *i;
theDataFileMgr.insertWithObjMod( Namespace( toDeleteNs.c_str() ).getSisterNS( "system.indexes" ).c_str() , o , true );
}
@@ -773,9 +861,9 @@ namespace mongo {
virtual bool adminOnly() const {
return true;
}
- virtual LockType locktype() const { return READ; }
+ virtual LockType locktype() const { return READ; }
virtual void help( stringstream& help ) const { help << "list databases on this server"; }
- CmdListDatabases() : Command("listDatabases") {}
+ CmdListDatabases() : Command("listDatabases" , true ) {}
bool run(const string& dbname , BSONObj& jsobj, string& errmsg, BSONObjBuilder& result, bool /*fromRepl*/) {
vector< string > dbNames;
getDatabaseNames( dbNames );
@@ -795,11 +883,11 @@ namespace mongo {
seen.insert( i->c_str() );
}
-
+
// TODO: erh 1/1/2010 I think this is broken where path != dbpath ??
set<string> allShortNames;
dbHolder.getAllShortNames( allShortNames );
- for ( set<string>::iterator i = allShortNames.begin(); i != allShortNames.end(); i++ ){
+ for ( set<string>::iterator i = allShortNames.begin(); i != allShortNames.end(); i++ ) {
string name = *i;
if ( seen.count( name ) )
@@ -819,33 +907,45 @@ namespace mongo {
}
} cmdListDatabases;
- /* note an access to a database right after this will open it back up - so this is mainly
- for diagnostic purposes.
+ /* note an access to a database right after this will open it back up - so this is mainly
+ for diagnostic purposes.
*/
class CmdCloseAllDatabases : public Command {
public:
virtual void help( stringstream& help ) const { help << "Close all database files.\nA new request will cause an immediate reopening; thus, this is mostly for testing purposes."; }
virtual bool adminOnly() const { return true; }
virtual bool slaveOk() const { return false; }
- virtual LockType locktype() const { return WRITE; }
+ virtual LockType locktype() const { return WRITE; }
CmdCloseAllDatabases() : Command( "closeAllDatabases" ) {}
bool run(const string& dbname , BSONObj& jsobj, string& errmsg, BSONObjBuilder& result, bool /*fromRepl*/) {
- return dbHolder.closeAll( dbpath , result, false );
+ bool ok;
+ try {
+ ok = dbHolder.closeAll( dbpath , result, false );
+ }
+ catch(DBException&) {
+ throw;
+ }
+ catch(...) {
+ log() << "ERROR uncaught exception in command closeAllDatabases" << endl;
+ errmsg = "unexpected uncaught exception";
+ return false;
+ }
+ return ok;
}
} cmdCloseAllDatabases;
class CmdFileMD5 : public Command {
public:
- CmdFileMD5() : Command( "filemd5" ){}
+ CmdFileMD5() : Command( "filemd5" ) {}
virtual bool slaveOk() const {
return true;
}
virtual void help( stringstream& help ) const {
help << " example: { filemd5 : ObjectId(aaaaaaa) , root : \"fs\" }";
}
- virtual LockType locktype() const { return READ; }
- bool run(const string& dbname, BSONObj& jsobj, string& errmsg, BSONObjBuilder& result, bool fromRepl ){
+ virtual LockType locktype() const { return READ; }
+ bool run(const string& dbname, BSONObj& jsobj, string& errmsg, BSONObjBuilder& result, bool fromRepl ) {
string ns = dbname;
ns += ".";
{
@@ -867,8 +967,8 @@ namespace mongo {
scoped_ptr<ClientCursor> cc (new ClientCursor(QueryOption_NoCursorTimeout, cursor, ns.c_str()));
int n = 0;
- while ( cursor->ok() ){
- if ( ! cursor->matcher()->matchesCurrent( cursor.get() ) ){
+ while ( cursor->ok() ) {
+ if ( ! cursor->matcher()->matchesCurrent( cursor.get() ) ) {
log() << "**** NOT MATCHING ****" << endl;
PRINT(cursor->current());
cursor->advance();
@@ -884,7 +984,7 @@ namespace mongo {
BSONElement ne = obj["n"];
assert(ne.isNumber());
int myn = ne.numberInt();
- if ( n != myn ){
+ if ( n != myn ) {
log() << "should have chunk: " << n << " have:" << myn << endl;
DBDirectClient client;
@@ -902,12 +1002,13 @@ namespace mongo {
md5_append( &st , (const md5_byte_t*)(data) , len );
n++;
- } catch (...) {
+ }
+ catch (...) {
yield.relock(); // needed before yield goes out of scope
throw;
}
- if ( ! yield.stillOk() ){
+ if ( ! yield.stillOk() ) {
uasserted(13281, "File deleted during filemd5 command");
}
}
@@ -932,15 +1033,15 @@ namespace mongo {
public:
CmdDatasize() : Command( "dataSize", false, "datasize" ) {}
virtual bool slaveOk() const { return true; }
- virtual LockType locktype() const { return READ; }
+ virtual LockType locktype() const { return READ; }
virtual void help( stringstream &help ) const {
help <<
- "determine data size for a set of data in a certain range"
- "\nexample: { datasize:\"blog.posts\", keyPattern:{x:1}, min:{x:10}, max:{x:55} }"
- "\nkeyPattern, min, and max parameters are optional."
- "\nnote: This command may take a while to run";
+ "determine data size for a set of data in a certain range"
+ "\nexample: { dataSize:\"blog.posts\", keyPattern:{x:1}, min:{x:10}, max:{x:55} }"
+ "\nkeyPattern, min, and max parameters are optional."
+ "\nnote: This command may take a while to run";
}
- bool run(const string& dbname, BSONObj& jsobj, string& errmsg, BSONObjBuilder& result, bool fromRepl ){
+ bool run(const string& dbname, BSONObj& jsobj, string& errmsg, BSONObjBuilder& result, bool fromRepl ) {
Timer timer;
string ns = jsobj.firstElement().String();
@@ -951,39 +1052,39 @@ namespace mongo {
Client::Context ctx( ns );
NamespaceDetails *d = nsdetails(ns.c_str());
-
- if ( ! d || d->nrecords == 0 ){
+
+ if ( ! d || d->stats.nrecords == 0 ) {
result.appendNumber( "size" , 0 );
result.appendNumber( "numObjects" , 0 );
result.append( "millis" , timer.millis() );
return true;
}
-
+
result.appendBool( "estimate" , estimate );
shared_ptr<Cursor> c;
if ( min.isEmpty() && max.isEmpty() ) {
- if ( estimate ){
- result.appendNumber( "size" , d->datasize );
- result.appendNumber( "numObjects" , d->nrecords );
+ if ( estimate ) {
+ result.appendNumber( "size" , d->stats.datasize );
+ result.appendNumber( "numObjects" , d->stats.nrecords );
result.append( "millis" , timer.millis() );
return 1;
}
c = theDataFileMgr.findAll( ns.c_str() );
- }
+ }
else if ( min.isEmpty() || max.isEmpty() ) {
errmsg = "only one of min or max specified";
return false;
- }
+ }
else {
IndexDetails *idx = cmdIndexDetailsForRange( ns.c_str(), errmsg, min, max, keyPattern );
if ( idx == 0 )
return false;
-
+
c.reset( new BtreeCursor( d, d->idxNo(*idx), *idx, min, max, false, 1 ) );
}
-
- long long avgObjSize = d->datasize / d->nrecords;
+
+ long long avgObjSize = d->stats.datasize / d->stats.nrecords;
long long maxSize = jsobj["maxSize"].numberLong();
long long maxObjects = jsobj["maxObjects"].numberLong();
@@ -996,11 +1097,11 @@ namespace mongo {
size += avgObjSize;
else
size += c->currLoc().rec()->netLength();
-
+
numObjects++;
-
- if ( ( maxSize && size > maxSize ) ||
- ( maxObjects && numObjects > maxObjects ) ){
+
+ if ( ( maxSize && size > maxSize ) ||
+ ( maxObjects && numObjects > maxObjects ) ) {
result.appendBool( "maxReached" , true );
break;
}
@@ -1010,7 +1111,7 @@ namespace mongo {
ostringstream os;
os << "Finding size for ns: " << ns;
- if ( ! min.isEmpty() ){
+ if ( ! min.isEmpty() ) {
os << " between " << min << " and " << max;
}
logIfSlow( timer , os.str() );
@@ -1023,27 +1124,27 @@ namespace mongo {
} cmdDatasize;
namespace {
- long long getIndexSizeForCollection(string db, string ns, BSONObjBuilder* details=NULL, int scale = 1 ){
+ long long getIndexSizeForCollection(string db, string ns, BSONObjBuilder* details=NULL, int scale = 1 ) {
dbMutex.assertAtLeastReadLocked();
NamespaceDetails * nsd = nsdetails( ns.c_str() );
if ( ! nsd )
return 0;
-
- long long totalSize = 0;
+
+ long long totalSize = 0;
NamespaceDetails::IndexIterator ii = nsd->ii();
- while ( ii.more() ){
+ while ( ii.more() ) {
IndexDetails& d = ii.next();
string collNS = d.indexNamespace();
NamespaceDetails * mine = nsdetails( collNS.c_str() );
- if ( ! mine ){
+ if ( ! mine ) {
log() << "error: have index [" << collNS << "] but no NamespaceDetails" << endl;
continue;
}
- totalSize += mine->datasize;
+ totalSize += mine->stats.datasize;
if ( details )
- details->appendNumber( d.indexName() , mine->datasize / scale );
+ details->appendNumber( d.indexName() , mine->stats.datasize / scale );
}
return totalSize;
}
@@ -1053,42 +1154,48 @@ namespace mongo {
public:
CollectionStats() : Command( "collStats", false, "collstats" ) {}
virtual bool slaveOk() const { return true; }
- virtual LockType locktype() const { return READ; }
+ virtual LockType locktype() const { return READ; }
virtual void help( stringstream &help ) const {
help << "{ collStats:\"blog.posts\" , scale : 1 } scale divides sizes e.g. for KB use 1024";
}
- bool run(const string& dbname, BSONObj& jsobj, string& errmsg, BSONObjBuilder& result, bool fromRepl ){
+ bool run(const string& dbname, BSONObj& jsobj, string& errmsg, BSONObjBuilder& result, bool fromRepl ) {
string ns = dbname + "." + jsobj.firstElement().valuestr();
Client::Context cx( ns );
-
+
NamespaceDetails * nsd = nsdetails( ns.c_str() );
- if ( ! nsd ){
+ if ( ! nsd ) {
errmsg = "ns not found";
return false;
}
result.append( "ns" , ns.c_str() );
-
+
int scale = 1;
- if ( jsobj["scale"].isNumber() ){
+ if ( jsobj["scale"].isNumber() ) {
scale = jsobj["scale"].numberInt();
- if ( scale <= 0 ){
+ if ( scale <= 0 ) {
errmsg = "scale has to be > 0";
return false;
}
-
+
}
- else if ( jsobj["scale"].trueValue() ){
+ else if ( jsobj["scale"].trueValue() ) {
errmsg = "scale has to be a number > 0";
return false;
}
- long long size = nsd->datasize / scale;
- result.appendNumber( "count" , nsd->nrecords );
+ bool verbose = jsobj["verbose"].trueValue();
+
+ long long size = nsd->stats.datasize / scale;
+ result.appendNumber( "count" , nsd->stats.nrecords );
result.appendNumber( "size" , size );
- result.append ( "avgObjSize" , double(size) / double(nsd->nrecords) );
+ if( nsd->stats.nrecords )
+ result.append ( "avgObjSize" , double(size) / double(nsd->stats.nrecords) );
+
int numExtents;
- result.appendNumber( "storageSize" , nsd->storageSize( &numExtents ) / scale );
+ BSONArrayBuilder extents;
+
+ result.appendNumber( "storageSize" , nsd->storageSize( &numExtents , verbose ? &extents : 0 ) / scale );
result.append( "numExtents" , numExtents );
result.append( "nindexes" , nsd->nIndexes );
result.append( "lastExtentSize" , nsd->lastExtentSize / scale );
@@ -1098,12 +1205,15 @@ namespace mongo {
BSONObjBuilder indexSizes;
result.appendNumber( "totalIndexSize" , getIndexSizeForCollection(dbname, ns, &indexSizes, scale) / scale );
result.append("indexSizes", indexSizes.obj());
-
- if ( nsd->capped ){
+
+ if ( nsd->capped ) {
result.append( "capped" , nsd->capped );
result.append( "max" , nsd->max );
}
+ if ( verbose )
+ result.appendArray( "extents" , extents.arr() );
+
return true;
}
} cmdCollectionStatis;
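    // Editor's illustration, not part of the patch: the "scale" option divides every
    // size collStats reports, so scale:1024 yields kilobytes (as the help text says).
    // Hypothetical numbers, only to make the arithmetic concrete:
    static long long scaledSizeExample() {
        long long datasizeBytes = 5LL * 1024 * 1024;  // 5 MB of document data
        int scale = 1024;                             // caller asked for KB
        return datasizeBytes / scale;                 // reported "size" would be 5120
    }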
@@ -1112,11 +1222,11 @@ namespace mongo {
public:
DBStats() : Command( "dbStats", false, "dbstats" ) {}
virtual bool slaveOk() const { return true; }
- virtual LockType locktype() const { return READ; }
+ virtual LockType locktype() const { return READ; }
virtual void help( stringstream &help ) const {
- help << " example: { dbstats:1 } ";
+ help << " example: { dbStats:1 } ";
}
- bool run(const string& dbname, BSONObj& jsobj, string& errmsg, BSONObjBuilder& result, bool fromRepl ){
+ bool run(const string& dbname, BSONObj& jsobj, string& errmsg, BSONObjBuilder& result, bool fromRepl ) {
list<string> collections;
Database* d = cc().database();
if ( d )
@@ -1130,19 +1240,19 @@ namespace mongo {
long long indexes = 0;
long long indexSize = 0;
- for (list<string>::const_iterator it = collections.begin(); it != collections.end(); ++it){
+ for (list<string>::const_iterator it = collections.begin(); it != collections.end(); ++it) {
const string ns = *it;
NamespaceDetails * nsd = nsdetails( ns.c_str() );
- if ( ! nsd ){
+ if ( ! nsd ) {
errmsg = "missing ns: ";
errmsg += ns;
return false;
}
ncollections += 1;
- objects += nsd->nrecords;
- size += nsd->datasize;
+ objects += nsd->stats.nrecords;
+ size += nsd->stats.datasize;
int temp;
storageSize += nsd->storageSize( &temp );
@@ -1151,10 +1261,11 @@ namespace mongo {
indexes += nsd->nIndexes;
indexSize += getIndexSizeForCollection(dbname, ns);
}
-
+
+ result.append ( "db" , dbname );
result.appendNumber( "collections" , ncollections );
result.appendNumber( "objects" , objects );
- result.append ( "avgObjSize" , double(size) / double(objects) );
+ result.append ( "avgObjSize" , objects == 0 ? 0 : double(size) / double(objects) );
result.appendNumber( "dataSize" , size );
result.appendNumber( "storageSize" , storageSize);
result.appendNumber( "numExtents" , numExtents );
@@ -1162,7 +1273,7 @@ namespace mongo {
result.appendNumber( "indexSize" , indexSize );
result.appendNumber( "fileSize" , d->fileSize() );
- return true;
+ return true;
}
} cmdDBStats;
@@ -1171,11 +1282,11 @@ namespace mongo {
public:
CmdCloneCollectionAsCapped() : Command( "cloneCollectionAsCapped" ) {}
virtual bool slaveOk() const { return false; }
- virtual LockType locktype() const { return WRITE; }
+ virtual LockType locktype() const { return WRITE; }
virtual void help( stringstream &help ) const {
help << "{ cloneCollectionAsCapped:<fromName>, toCollection:<toName>, size:<sizeInBytes> }";
}
- bool run(const string& dbname, BSONObj& jsobj, string& errmsg, BSONObjBuilder& result, bool fromRepl ){
+ bool run(const string& dbname, BSONObj& jsobj, string& errmsg, BSONObjBuilder& result, bool fromRepl ) {
string from = jsobj.getStringField( "cloneCollectionAsCapped" );
string to = jsobj.getStringField( "toCollection" );
long long size = (long long)jsobj.getField( "size" ).number();
@@ -1189,7 +1300,7 @@ namespace mongo {
string toNs = dbname + "." + to;
NamespaceDetails *nsd = nsdetails( fromNs.c_str() );
massert( 10301 , "source collection " + fromNs + " does not exist", nsd );
- long long excessSize = nsd->datasize - size * 2; // datasize and extentSize can't be compared exactly, so add some padding to 'size'
+ long long excessSize = nsd->stats.datasize - size * 2; // datasize and extentSize can't be compared exactly, so add some padding to 'size'
DiskLoc extent = nsd->firstExtent;
for( ; excessSize > extent.ext()->length && extent != nsd->lastExtent; extent = extent.ext()->xnext ) {
excessSize -= extent.ext()->length;
@@ -1202,7 +1313,7 @@ namespace mongo {
{
shared_ptr<Cursor> c = theDataFileMgr.findAll( fromNs.c_str(), startLoc );
ClientCursor *cc = new ClientCursor(0, c, fromNs.c_str());
- id = cc->cursorid;
+ id = cc->cursorid();
}
DBDirectClient client;
@@ -1223,20 +1334,20 @@ namespace mongo {
}
} cmdCloneCollectionAsCapped;
- /* jan2010:
- Converts the given collection to a capped collection w/ the specified size.
- This command is not highly used, and is not currently supported with sharded
- environments.
+ /* jan2010:
+ Converts the given collection to a capped collection w/ the specified size.
+ This command is not highly used, and is not currently supported with sharded
+ environments.
*/
class CmdConvertToCapped : public Command {
public:
CmdConvertToCapped() : Command( "convertToCapped" ) {}
virtual bool slaveOk() const { return false; }
- virtual LockType locktype() const { return WRITE; }
+ virtual LockType locktype() const { return WRITE; }
virtual void help( stringstream &help ) const {
help << "{ convertToCapped:<fromCollectionName>, size:<sizeInBytes> }";
}
- bool run(const string& dbname, BSONObj& jsobj, string& errmsg, BSONObjBuilder& result, bool fromRepl ){
+ bool run(const string& dbname, BSONObj& jsobj, string& errmsg, BSONObjBuilder& result, bool fromRepl ) {
BackgroundOperation::assertNoBgOpInProgForDb(dbname.c_str());
string from = jsobj.getStringField( "convertToCapped" );
@@ -1247,13 +1358,16 @@ namespace mongo {
return false;
}
+ string shortTmpName = str::stream() << ".tmp.convertToCapped." << from;
+ string longTmpName = str::stream() << dbname << "." << shortTmpName;
+
DBDirectClient client;
- client.dropCollection( dbname + "." + from + ".$temp_convertToCapped" );
+ client.dropCollection( longTmpName );
BSONObj info;
if ( !client.runCommand( dbname ,
- BSON( "cloneCollectionAsCapped" << from << "toCollection" << ( from + ".$temp_convertToCapped" ) << "size" << double( size ) ),
- info ) ) {
+ BSON( "cloneCollectionAsCapped" << from << "toCollection" << shortTmpName << "size" << double( size ) ),
+ info ) ) {
errmsg = "cloneCollectionAsCapped failed: " + info.toString();
return false;
}
@@ -1264,9 +1378,9 @@ namespace mongo {
}
if ( !client.runCommand( "admin",
- BSON( "renameCollection" << ( dbname + "." + from + ".$temp_convertToCapped" )
- << "to" << ( dbname + "." + from ) ),
- info ) ) {
+ BSON( "renameCollection" << longTmpName <<
+ "to" << ( dbname + "." + from ) ),
+ info ) ) {
errmsg = "renameCollection failed: " + info.toString();
return false;
}
@@ -1275,239 +1389,15 @@ namespace mongo {
}
} cmdConvertToCapped;
- class GroupCommand : public Command {
- public:
- GroupCommand() : Command("group"){}
- virtual LockType locktype() const { return READ; }
- virtual bool slaveOk() const { return true; }
- virtual bool slaveOverrideOk() { return true; }
- virtual void help( stringstream &help ) const {
- help << "http://www.mongodb.org/display/DOCS/Aggregation";
- }
-
- BSONObj getKey( const BSONObj& obj , const BSONObj& keyPattern , ScriptingFunction func , double avgSize , Scope * s ){
- if ( func ){
- BSONObjBuilder b( obj.objsize() + 32 );
- b.append( "0" , obj );
- int res = s->invoke( func , b.obj() );
- uassert( 10041 , (string)"invoke failed in $keyf: " + s->getError() , res == 0 );
- int type = s->type("return");
- uassert( 10042 , "return of $key has to be an object" , type == Object );
- return s->getObject( "return" );
- }
- return obj.extractFields( keyPattern , true );
- }
-
- bool group( string realdbname , const string& ns , const BSONObj& query ,
- BSONObj keyPattern , string keyFunctionCode , string reduceCode , const char * reduceScope ,
- BSONObj initial , string finalize ,
- string& errmsg , BSONObjBuilder& result ){
-
-
- auto_ptr<Scope> s = globalScriptEngine->getPooledScope( realdbname );
- s->localConnect( realdbname.c_str() );
-
- if ( reduceScope )
- s->init( reduceScope );
-
- s->setObject( "$initial" , initial , true );
-
- s->exec( "$reduce = " + reduceCode , "reduce setup" , false , true , true , 100 );
- s->exec( "$arr = [];" , "reduce setup 2" , false , true , true , 100 );
- ScriptingFunction f = s->createFunction(
- "function(){ "
- " if ( $arr[n] == null ){ "
- " next = {}; "
- " Object.extend( next , $key ); "
- " Object.extend( next , $initial , true ); "
- " $arr[n] = next; "
- " next = null; "
- " } "
- " $reduce( obj , $arr[n] ); "
- "}" );
-
- ScriptingFunction keyFunction = 0;
- if ( keyFunctionCode.size() ){
- keyFunction = s->createFunction( keyFunctionCode.c_str() );
- }
-
-
- double keysize = keyPattern.objsize() * 3;
- double keynum = 1;
-
- map<BSONObj,int,BSONObjCmp> map;
- list<BSONObj> blah;
-
- shared_ptr<Cursor> cursor = bestGuessCursor(ns.c_str() , query , BSONObj() );
-
- while ( cursor->ok() ){
- if ( cursor->matcher() && ! cursor->matcher()->matchesCurrent( cursor.get() ) ){
- cursor->advance();
- continue;
- }
-
- BSONObj obj = cursor->current();
- cursor->advance();
-
- BSONObj key = getKey( obj , keyPattern , keyFunction , keysize / keynum , s.get() );
- keysize += key.objsize();
- keynum++;
-
- int& n = map[key];
- if ( n == 0 ){
- n = map.size();
- s->setObject( "$key" , key , true );
-
- uassert( 10043 , "group() can't handle more than 10000 unique keys" , n <= 10000 );
- }
-
- s->setObject( "obj" , obj , true );
- s->setNumber( "n" , n - 1 );
- if ( s->invoke( f , BSONObj() , 0 , true ) ){
- throw UserException( 9010 , (string)"reduce invoke failed: " + s->getError() );
- }
- }
-
- if (!finalize.empty()){
- s->exec( "$finalize = " + finalize , "finalize define" , false , true , true , 100 );
- ScriptingFunction g = s->createFunction(
- "function(){ "
- " for(var i=0; i < $arr.length; i++){ "
- " var ret = $finalize($arr[i]); "
- " if (ret !== undefined) "
- " $arr[i] = ret; "
- " } "
- "}" );
- s->invoke( g , BSONObj() , 0 , true );
- }
-
- result.appendArray( "retval" , s->getObject( "$arr" ) );
- result.append( "count" , keynum - 1 );
- result.append( "keys" , (int)(map.size()) );
- s->exec( "$arr = [];" , "reduce setup 2" , false , true , true , 100 );
- s->gc();
-
- return true;
- }
-
- bool run(const string& dbname, BSONObj& jsobj, string& errmsg, BSONObjBuilder& result, bool fromRepl ){
-
- /* db.$cmd.findOne( { group : <p> } ) */
- const BSONObj& p = jsobj.firstElement().embeddedObjectUserCheck();
-
- BSONObj q;
- if ( p["cond"].type() == Object )
- q = p["cond"].embeddedObject();
- else if ( p["condition"].type() == Object )
- q = p["condition"].embeddedObject();
- else
- q = getQuery( p );
-
- if ( p["ns"].type() != String ){
- errmsg = "ns has to be set";
- return false;
- }
-
- string ns = dbname + "." + p["ns"].String();
-
- BSONObj key;
- string keyf;
- if ( p["key"].type() == Object ){
- key = p["key"].embeddedObjectUserCheck();
- if ( ! p["$keyf"].eoo() ){
- errmsg = "can't have key and $keyf";
- return false;
- }
- }
- else if ( p["$keyf"].type() ){
- keyf = p["$keyf"]._asCode();
- }
- else {
- // no key specified, will use entire object as key
- }
-
- BSONElement reduce = p["$reduce"];
- if ( reduce.eoo() ){
- errmsg = "$reduce has to be set";
- return false;
- }
-
- BSONElement initial = p["initial"];
- if ( initial.type() != Object ){
- errmsg = "initial has to be an object";
- return false;
- }
-
-
- string finalize;
- if (p["finalize"].type())
- finalize = p["finalize"]._asCode();
-
- return group( dbname , ns , q ,
- key , keyf , reduce._asCode() , reduce.type() != CodeWScope ? 0 : reduce.codeWScopeScopeData() ,
- initial.embeddedObject() , finalize ,
- errmsg , result );
- }
-
- } cmdGroup;
-
-
- class DistinctCommand : public Command {
- public:
- DistinctCommand() : Command("distinct"){}
- virtual bool slaveOk() const { return true; }
- virtual LockType locktype() const { return READ; }
- virtual void help( stringstream &help ) const {
- help << "{ distinct : 'collection name' , key : 'a.b' , query : {} }";
- }
-
- bool run(const string& dbname, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl ){
- string ns = dbname + '.' + cmdObj.firstElement().valuestr();
-
- string key = cmdObj["key"].valuestrsafe();
- BSONObj keyPattern = BSON( key << 1 );
-
- BSONObj query = getQuery( cmdObj );
-
- BSONElementSet values;
- shared_ptr<Cursor> cursor = bestGuessCursor(ns.c_str() , query , BSONObj() );
- scoped_ptr<ClientCursor> cc (new ClientCursor(QueryOption_NoCursorTimeout, cursor, ns));
-
- while ( cursor->ok() ){
- if ( !cursor->matcher() || cursor->matcher()->matchesCurrent( cursor.get() ) ){
- BSONObj o = cursor->current();
- o.getFieldsDotted( key, values );
- }
-
- cursor->advance();
-
- if (!cc->yieldSometimes())
- break;
- }
-
- BSONArrayBuilder b( result.subarrayStart( "values" ) );
- for ( BSONElementSet::iterator i = values.begin() ; i != values.end(); i++ ){
- b.append( *i );
- }
- BSONObj arr = b.done();
-
- uassert(10044, "distinct too big, 4mb cap",
- (arr.objsize() + 1024) < (4 * 1024 * 1024));
-
- return true;
- }
-
- } distinctCmd;
-
/* Find and Modify an object returning either the old (default) or new value*/
class CmdFindAndModify : public Command {
public:
virtual void help( stringstream &help ) const {
- help <<
- "{ findandmodify: \"collection\", query: {processed:false}, update: {$set: {processed:true}}, new: true}\n"
- "{ findandmodify: \"collection\", query: {processed:false}, remove: true, sort: {priority:-1}}\n"
- "Either update or remove is required, all other fields have default values.\n"
- "Output is in the \"value\" field\n";
+ help <<
+ "{ findAndModify: \"collection\", query: {processed:false}, update: {$set: {processed:true}}, new: true}\n"
+ "{ findAndModify: \"collection\", query: {processed:false}, remove: true, sort: {priority:-1}}\n"
+ "Either update or remove is required, all other fields have default values.\n"
+ "Output is in the \"value\" field\n";
}
CmdFindAndModify() : Command("findAndModify", false, "findandmodify") { }
@@ -1517,7 +1407,7 @@ namespace mongo {
virtual bool slaveOk() const {
return false;
}
- virtual LockType locktype() const { return WRITE; }
+ virtual LockType locktype() const { return WRITE; }
virtual bool run(const string& dbname, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool) {
static DBDirectClient db;
@@ -1535,8 +1425,8 @@ namespace mongo {
const BSONObj* fields = (fieldsHolder.isEmpty() ? NULL : &fieldsHolder);
BSONObj out = db.findOne(ns, q, fields);
- if (out.isEmpty()){
- if (!upsert){
+ if (out.isEmpty()) {
+ if (!upsert) {
errmsg = "No matching object found";
return false;
}
@@ -1546,9 +1436,13 @@ namespace mongo {
uassert(13330, "upsert mode requires query field", !origQuery.isEmpty());
db.update(ns, origQuery, update.embeddedObjectUserCheck(), true);
- if (cmdObj["new"].trueValue()){
- BSONObj gle = db.getLastErrorDetailed();
+ BSONObj gle = db.getLastErrorDetailed();
+ if (gle["err"].type() == String) {
+ errmsg = gle["err"].String();
+ return false;
+ }
+ if (cmdObj["new"].trueValue()) {
BSONElement _id = gle["upserted"];
if (_id.eoo())
_id = origQuery["_id"];
@@ -1556,33 +1450,46 @@ namespace mongo {
out = db.findOne(ns, QUERY("_id" << _id), fields);
}
- } else {
-
- Query idQuery = QUERY( "_id" << out["_id"]);
+ }
+ else {
- if (cmdObj["remove"].trueValue()){
+ if (cmdObj["remove"].trueValue()) {
uassert(12515, "can't remove and update", cmdObj["update"].eoo());
- db.remove(ns, idQuery, 1);
-
- } else { // update
-
- // need to include original query for $ positional operator
- BSONObjBuilder b;
- b.append(out["_id"]);
- BSONObjIterator it(origQuery);
- while (it.more()){
- BSONElement e = it.next();
- if (strcmp(e.fieldName(), "_id"))
- b.append(e);
+ db.remove(ns, QUERY("_id" << out["_id"]), 1);
+
+ }
+ else { // update
+
+ BSONElement queryId = origQuery["_id"];
+ if (queryId.eoo() || getGtLtOp(queryId) != BSONObj::Equality) {
+ // need to include original query for $ positional operator
+
+ BSONObjBuilder b;
+ b.append(out["_id"]);
+ BSONObjIterator it(origQuery);
+ while (it.more()) {
+ BSONElement e = it.next();
+ if (strcmp(e.fieldName(), "_id"))
+ b.append(e);
+ }
+ q = Query(b.obj());
}
- q = Query(b.obj());
+
+ if (q.isComplex()) // update doesn't work with complex queries
+ q = Query(q.getFilter().getOwned());
BSONElement update = cmdObj["update"];
uassert(12516, "must specify remove or update", !update.eoo());
db.update(ns, q, update.embeddedObjectUserCheck());
+ BSONObj gle = db.getLastErrorDetailed();
+ if (gle["err"].type() == String) {
+ errmsg = gle["err"].String();
+ return false;
+ }
+
if (cmdObj["new"].trueValue())
- out = db.findOne(ns, idQuery, fields);
+ out = db.findOne(ns, QUERY("_id" << out["_id"]), fields);
}
}
@@ -1591,7 +1498,7 @@ namespace mongo {
return true;
}
} cmdFindAndModify;
-
+
/* Returns client's uri */
class CmdWhatsMyUri : public Command {
public:
@@ -1599,20 +1506,20 @@ namespace mongo {
virtual bool slaveOk() const {
return true;
}
- virtual LockType locktype() const { return NONE; }
+ virtual LockType locktype() const { return NONE; }
virtual bool requiresAuth() {
return false;
}
virtual void help( stringstream &help ) const {
help << "{whatsmyuri:1}";
- }
+ }
virtual bool run(const string& dbname, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool) {
BSONObj info = cc().curop()->infoNoauth();
result << "you" << info[ "client" ];
return true;
}
} cmdWhatsMyUri;
-
+
/* For testing only, not for general use */
class GodInsert : public Command {
public:
@@ -1629,7 +1536,7 @@ namespace mongo {
}
virtual void help( stringstream &help ) const {
help << "internal. for testing only.";
- }
+ }
virtual bool run(const string& dbname, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool) {
string coll = cmdObj[ "godinsert" ].valuestrsafe();
uassert( 13049, "godinsert must specify a collection", !coll.empty() );
@@ -1642,31 +1549,32 @@ namespace mongo {
class DBHashCmd : public Command {
public:
- DBHashCmd() : Command( "dbHash", false, "dbhash" ){}
+ DBHashCmd() : Command( "dbHash", false, "dbhash" ) {}
virtual bool slaveOk() const { return true; }
virtual LockType locktype() const { return READ; }
- virtual bool run(const string& dbname , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool){
+ virtual bool run(const string& dbname , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool) {
list<string> colls;
Database* db = cc().database();
if ( db )
db->namespaceIndex.getNamespaces( colls );
colls.sort();
-
+
result.appendNumber( "numCollections" , (long long)colls.size() );
-
+ result.append( "host" , prettyHostName() );
+
md5_state_t globalState;
md5_init(&globalState);
BSONObjBuilder bb( result.subobjStart( "collections" ) );
- for ( list<string>::iterator i=colls.begin(); i != colls.end(); i++ ){
+ for ( list<string>::iterator i=colls.begin(); i != colls.end(); i++ ) {
string c = *i;
if ( c.find( ".system.profil" ) != string::npos )
continue;
-
+
shared_ptr<Cursor> cursor;
NamespaceDetails * nsd = nsdetails( c.c_str() );
-
+
// debug SERVER-761
NamespaceDetails::IndexIterator ii = nsd->ii();
while( ii.more() ) {
@@ -1678,15 +1586,15 @@ namespace mongo {
log() << endl;
}
}
-
+
int idNum = nsd->findIdIndex();
- if ( idNum >= 0 ){
+ if ( idNum >= 0 ) {
cursor.reset( new BtreeCursor( nsd , idNum , nsd->idx( idNum ) , BSONObj() , BSONObj() , false , 1 ) );
}
- else if ( c.find( ".system." ) != string::npos ){
+ else if ( c.find( ".system." ) != string::npos ) {
continue;
}
- else if ( nsd->capped ){
+ else if ( nsd->capped ) {
cursor = findTableScan( c.c_str() , BSONObj() );
}
else {
@@ -1697,9 +1605,9 @@ namespace mongo {
md5_state_t st;
md5_init(&st);
-
+
long long n = 0;
- while ( cursor->ok() ){
+ while ( cursor->ok() ) {
BSONObj c = cursor->current();
md5_append( &st , (const md5_byte_t*)c.objdata() , c.objsize() );
n++;
@@ -1708,7 +1616,7 @@ namespace mongo {
md5digest d;
md5_finish(&st, d);
string hash = digestToString( d );
-
+
bb.append( c.c_str() + ( dbname.size() + 1 ) , hash );
md5_append( &globalState , (const md5_byte_t*)hash.c_str() , hash.size() );
@@ -1727,9 +1635,9 @@ namespace mongo {
} dbhashCmd;
/* for diagnostic / testing purposes. */
- class CmdSleep : public Command {
+ class CmdSleep : public Command {
public:
- virtual LockType locktype() const { return NONE; }
+ virtual LockType locktype() const { return NONE; }
virtual bool adminOnly() const { return true; }
virtual bool logTheOp() { return false; }
virtual bool slaveOk() const { return true; }
@@ -1739,46 +1647,43 @@ namespace mongo {
}
CmdSleep() : Command("sleep") { }
bool run(const string& ns, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
- if( cmdObj.getBoolField("w") ) {
+
+
+ int secs = 100;
+ if ( cmdObj["secs"].isNumber() )
+ secs = cmdObj["secs"].numberInt();
+
+ if( cmdObj.getBoolField("w") ) {
writelock lk("");
- sleepsecs(100);
+ sleepsecs(secs);
}
else {
readlock lk("");
- sleepsecs(100);
+ sleepsecs(secs);
}
+
return true;
}
} cmdSleep;
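    // Editor's illustration, not part of the patch: with the change above the test-only
    // sleep command accepts an optional "secs" field (default 100). Hypothetical request
    // objects a test might build:
    static const BSONObj sleepWriteLockCmd = BSON( "sleep" << 1 << "w" << true << "secs" << 5 ); // hold write lock ~5s
    static const BSONObj sleepReadLockCmd  = BSON( "sleep" << 1 << "secs" << 2 );                // hold read lock ~2s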
- class AvailableQueryOptions : public Command {
- public:
- AvailableQueryOptions() : Command( "availablequeryoptions" ){}
- virtual bool slaveOk() const { return true; }
- virtual LockType locktype() const { return NONE; }
- virtual bool requiresAuth() { return false; }
- virtual bool run(const string& dbname , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool){
- result << "options" << QueryOption_AllSupported;
- return true;
- }
- } availableQueryOptionsCmd;
-
// just for testing
class CapTrunc : public Command {
public:
- CapTrunc() : Command( "captrunc" ){}
+ CapTrunc() : Command( "captrunc" ) {}
virtual bool slaveOk() const { return false; }
virtual LockType locktype() const { return WRITE; }
virtual bool requiresAuth() { return true; }
- virtual bool run(const string& dbname , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool){
+ virtual bool run(const string& dbname , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool) {
string coll = cmdObj[ "captrunc" ].valuestrsafe();
uassert( 13416, "captrunc must specify a collection", !coll.empty() );
string ns = dbname + "." + coll;
int n = cmdObj.getIntField( "n" );
+
+ // inclusive range?
bool inc = cmdObj.getBoolField( "inc" );
NamespaceDetails *nsd = nsdetails( ns.c_str() );
ReverseCappedCursor c( nsd );
- massert( 13417, "captrunc invalid collection", c.ok() );
+ massert( 13417, "captrunc collection not found or empty", c.ok() );
for( int i = 0; i < n; ++i ) {
massert( 13418, "captrunc invalid n", c.advance() );
}
@@ -1786,16 +1691,16 @@ namespace mongo {
nsd->cappedTruncateAfter( ns.c_str(), end, inc );
return true;
}
- } capTruncCmd;
-
+ } capTruncCmd;
+
// just for testing
class EmptyCapped : public Command {
public:
- EmptyCapped() : Command( "emptycapped" ){}
+ EmptyCapped() : Command( "emptycapped" ) {}
virtual bool slaveOk() const { return false; }
virtual LockType locktype() const { return WRITE; }
virtual bool requiresAuth() { return true; }
- virtual bool run(const string& dbname , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool){
+ virtual bool run(const string& dbname , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool) {
string coll = cmdObj[ "emptycapped" ].valuestrsafe();
uassert( 13428, "emptycapped must specify a collection", !coll.empty() );
string ns = dbname + "." + coll;
@@ -1804,9 +1709,9 @@ namespace mongo {
nsd->emptyCappedCollection( ns.c_str() );
return true;
}
- } emptyCappedCmd;
-
- /**
+ } emptyCappedCmd;
+
+ /**
* this handles
- auth
- locking
@@ -1814,53 +1719,52 @@ namespace mongo {
then calls run()
*/
bool execCommand( Command * c ,
- Client& client , int queryOptions ,
- const char *cmdns, BSONObj& cmdObj ,
- BSONObjBuilder& result,
- bool fromRepl ){
-
+ Client& client , int queryOptions ,
+ const char *cmdns, BSONObj& cmdObj ,
+ BSONObjBuilder& result,
+ bool fromRepl ) {
+
string dbname = nsToDatabase( cmdns );
-
- AuthenticationInfo *ai = client.getAuthenticationInfo();
- if( c->adminOnly() && c->localHostOnlyIfNoAuth( cmdObj ) && noauth && !ai->isLocalHost ) {
- result.append( "errmsg" ,
+ AuthenticationInfo *ai = client.getAuthenticationInfo();
+
+ if( c->adminOnly() && c->localHostOnlyIfNoAuth( cmdObj ) && noauth && !ai->isLocalHost ) {
+ result.append( "errmsg" ,
"unauthorized: this command must run from localhost when running db without auth" );
log() << "command denied: " << cmdObj.toString() << endl;
return false;
}
-
if ( c->adminOnly() && ! fromRepl && dbname != "admin" ) {
result.append( "errmsg" , "access denied; use admin db" );
log() << "command denied: " << cmdObj.toString() << endl;
return false;
- }
+ }
- if ( cmdObj["help"].trueValue() ){
+ if ( cmdObj["help"].trueValue() ) {
stringstream ss;
ss << "help for: " << c->name << " ";
c->help( ss );
result.append( "help" , ss.str() );
result.append( "lockType" , c->locktype() );
return true;
- }
+ }
- bool canRunHere =
+ bool canRunHere =
isMaster( dbname.c_str() ) ||
c->slaveOk() ||
( c->slaveOverrideOk() && ( queryOptions & QueryOption_SlaveOk ) ) ||
fromRepl;
- if ( ! canRunHere ){
+ if ( ! canRunHere ) {
result.append( "errmsg" , "not master" );
return false;
}
if ( c->adminOnly() )
log( 2 ) << "command: " << cmdObj << endl;
-
- if ( c->locktype() == Command::NONE ){
+
+ if ( c->locktype() == Command::NONE ) {
// we also trust that this won't crash
string errmsg;
int ok = c->run( dbname , cmdObj , errmsg , result , fromRepl );
@@ -1868,35 +1772,35 @@ namespace mongo {
result.append( "errmsg" , errmsg );
return ok;
}
-
+
bool needWriteLock = c->locktype() == Command::WRITE;
-
- if ( ! needWriteLock ){
+
+ if ( ! needWriteLock ) {
assert( ! c->logTheOp() );
}
mongolock lk( needWriteLock );
Client::Context ctx( dbname , dbpath , &lk , c->requiresAuth() );
-
+
try {
string errmsg;
- if ( ! c->run(dbname, cmdObj, errmsg, result, fromRepl ) ){
+ if ( ! c->run(dbname, cmdObj, errmsg, result, fromRepl ) ) {
result.append( "errmsg" , errmsg );
return false;
}
}
- catch ( DBException& e ){
+ catch ( DBException& e ) {
stringstream ss;
ss << "exception: " << e.what();
result.append( "errmsg" , ss.str() );
result.append( "code" , e.getCode() );
return false;
}
-
- if ( c->logTheOp() && ! fromRepl ){
+
+ if ( c->logTheOp() && ! fromRepl ) {
logOp("c", cmdns, cmdObj);
}
-
+
return true;
}
@@ -1912,9 +1816,9 @@ namespace mongo {
cc().curop()->ensureStarted();
string dbname = nsToDatabase( ns );
- if( logLevel >= 1 )
+ if( logLevel >= 1 )
log() << "run command " << ns << ' ' << _cmdobj << endl;
-
+
const char *p = strchr(ns, '.');
if ( !p ) return false;
if ( strcmp(p, ".$cmd") != 0 ) return false;
@@ -1934,14 +1838,14 @@ namespace mongo {
bool ok = false;
BSONElement e = jsobj.firstElement();
-
+
Command * c = e.type() ? Command::findCommand( e.fieldName() ) : 0;
- if ( c ){
+ if ( c ) {
ok = execCommand( c , client , queryOptions , ns , jsobj , anObjBuilder , fromRepl );
}
else {
- anObjBuilder.append("errmsg", "no such cmd");
+ anObjBuilder.append("errmsg", str::stream() << "no such cmd: " << e.fieldName() );
anObjBuilder.append("bad cmd" , _cmdobj );
}
@@ -1953,5 +1857,5 @@ namespace mongo {
return true;
}
-
+
} // namespace mongo
diff --git a/db/dbcommands_admin.cpp b/db/dbcommands_admin.cpp
index 2d08ac8..82a9c91 100644
--- a/db/dbcommands_admin.cpp
+++ b/db/dbcommands_admin.cpp
@@ -25,34 +25,36 @@
#include "pch.h"
#include "jsobj.h"
#include "pdfile.h"
-#include "namespace.h"
+#include "namespace-inl.h"
#include "commands.h"
#include "cmdline.h"
#include "btree.h"
-#include "curop.h"
+#include "curop-inl.h"
#include "../util/background.h"
+#include "../util/logfile.h"
+#include "../util/alignedbuilder.h"
#include "../scripting/engine.h"
namespace mongo {
class CleanCmd : public Command {
public:
- CleanCmd() : Command( "clean" ){}
+ CleanCmd() : Command( "clean" ) {}
virtual bool slaveOk() const { return true; }
- virtual LockType locktype() const { return WRITE; }
-
+ virtual LockType locktype() const { return WRITE; }
+
virtual void help(stringstream& h) const { h << "internal"; }
- bool run(const string& dbname, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl ){
+ bool run(const string& dbname, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl ) {
string dropns = dbname + "." + cmdObj.firstElement().valuestrsafe();
-
+
if ( !cmdLine.quiet )
tlog() << "CMD: clean " << dropns << endl;
-
+
NamespaceDetails *d = nsdetails(dropns.c_str());
-
- if ( ! d ){
+
+ if ( ! d ) {
errmsg = "ns not found";
return 0;
}
@@ -63,39 +65,108 @@ namespace mongo {
result.append("ns", dropns.c_str());
return 1;
}
-
+
} cleanCmd;
-
+
+ namespace dur {
+ filesystem::path getJournalDir();
+ }
+
+ class JournalLatencyTestCmd : public Command {
+ public:
+ JournalLatencyTestCmd() : Command( "journalLatencyTest" ) {}
+
+ virtual bool slaveOk() const { return true; }
+ virtual LockType locktype() const { return NONE; }
+ virtual bool adminOnly() const { return true; }
+ virtual void help(stringstream& h) const { h << "test how long to write and fsync to a test file in the journal/ directory"; }
+
+ bool run(const string& dbname, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl ) {
+ filesystem::path p = dur::getJournalDir();
+ p /= "journalLatencyTest";
+
+ // remove file if already present
+ try {
+ remove(p);
+ }
+ catch(...) { }
+
+ BSONObjBuilder bb[2];
+ for( int pass = 0; pass < 2; pass++ ) {
+ LogFile f(p.string());
+ AlignedBuilder b(1024 * 1024);
+ {
+ Timer t;
+ for( int i = 0 ; i < 100; i++ ) {
+ f.synchronousAppend(b.buf(), 8192);
+ }
+ bb[pass].append("8KB", t.millis() / 100.0);
+ }
+ {
+ const int N = 50;
+ Timer t2;
+ long long x = 0;
+ for( int i = 0 ; i < N; i++ ) {
+ Timer t;
+ f.synchronousAppend(b.buf(), 8192);
+ x += t.micros();
+ sleepmillis(4);
+ }
+ long long y = t2.micros() - 4*N*1000;
+ // not really trusting the timer granularity on all platforms so whichever is higher of x and y
+ bb[pass].append("8KBWithPauses", max(x,y) / (N*1000.0));
+ }
+ {
+ Timer t;
+ for( int i = 0 ; i < 20; i++ ) {
+ f.synchronousAppend(b.buf(), 1024 * 1024);
+ }
+ bb[pass].append("1MB", t.millis() / 20.0);
+ }
+ // second time around, we are prealloced.
+ }
+ result.append("timeMillis", bb[0].obj());
+ result.append("timeMillisWithPrealloc", bb[1].obj());
+
+ try {
+ remove(p);
+ }
+ catch(...) { }
+
+ return 1;
+ }
+ } journalLatencyTestCmd;
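    // Editor's illustration, not part of the patch: how the "8KBWithPauses" figure above
    // reduces to milliseconds per append. Hypothetical numbers for concreteness:
    static double journalLatencyExample() {
        const int N = 50;        // appends measured, as in the command above
        long long x = 250000;    // say the summed per-append timers read 250,000 us
        return x / (N * 1000.0); // -> 5.0 ms per synchronous 8KB journal append
    }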
+
class ValidateCmd : public Command {
public:
- ValidateCmd() : Command( "validate" ){}
+ ValidateCmd() : Command( "validate" ) {}
virtual bool slaveOk() const {
return true;
}
-
+
virtual void help(stringstream& h) const { h << "Validate contents of a namespace by scanning its data structures for correctness. Slow."; }
- virtual LockType locktype() const { return READ; }
+ virtual LockType locktype() const { return READ; }
//{ validate: "collectionnamewithoutthedbpart" [, scandata: <bool>] } */
-
- bool run(const string& dbname , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl ){
+
+ bool run(const string& dbname , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl ) {
string ns = dbname + "." + cmdObj.firstElement().valuestrsafe();
NamespaceDetails * d = nsdetails( ns.c_str() );
if ( !cmdLine.quiet )
tlog() << "CMD: validate " << ns << endl;
- if ( ! d ){
+ if ( ! d ) {
errmsg = "ns not found";
return 0;
}
-
+
result.append( "ns", ns );
result.append( "result" , validateNS( ns.c_str() , d, &cmdObj ) );
return 1;
}
-
-
+
+
string validateNS(const char *ns, NamespaceDetails *d, BSONObj *cmdObj) {
bool scanData = true;
if( cmdObj && cmdObj->hasElement("scandata") && !cmdObj->getBoolField("scandata") )
@@ -106,13 +177,13 @@ namespace mongo {
//ss << " details: " << hex << d << " ofs:" << nsindex(ns)->detailsOffset(d) << dec << endl;
if ( d->capped )
ss << " capped:" << d->capped << " max:" << d->max << '\n';
-
- ss << " firstExtent:" << d->firstExtent.toString() << " ns:" << d->firstExtent.ext()->nsDiagnostic.buf << '\n';
- ss << " lastExtent:" << d->lastExtent.toString() << " ns:" << d->lastExtent.ext()->nsDiagnostic.buf << '\n';
+
+ ss << " firstExtent:" << d->firstExtent.toString() << " ns:" << d->firstExtent.ext()->nsDiagnostic.toString()<< '\n';
+ ss << " lastExtent:" << d->lastExtent.toString() << " ns:" << d->lastExtent.ext()->nsDiagnostic.toString() << '\n';
try {
d->firstExtent.ext()->assertOk();
d->lastExtent.ext()->assertOk();
-
+
DiskLoc el = d->firstExtent;
int ne = 0;
while( !el.isNull() ) {
@@ -123,12 +194,13 @@ namespace mongo {
killCurrentOp.checkForInterrupt();
}
ss << " # extents:" << ne << '\n';
- } catch (...) {
+ }
+ catch (...) {
valid=false;
ss << " extent asserted ";
}
- ss << " datasize?:" << d->datasize << " nrecords?:" << d->nrecords << " lastExtentSize:" << d->lastExtentSize << '\n';
+ ss << " datasize?:" << d->stats.datasize << " nrecords?:" << d->stats.nrecords << " lastExtentSize:" << d->lastExtentSize << '\n';
ss << " padding:" << d->paddingFactor << '\n';
try {
@@ -175,7 +247,7 @@ namespace mongo {
else ss << " (OK)";
ss << '\n';
}
- ss << " " << n << " objects found, nobj:" << d->nrecords << '\n';
+ ss << " " << n << " objects found, nobj:" << d->stats.nrecords << '\n';
ss << " " << len << " bytes data w/headers\n";
ss << " " << nlen << " bytes data wout/headers\n";
}
@@ -198,7 +270,7 @@ namespace mongo {
ndel++;
if ( loc.questionable() ) {
- if( d->capped && !loc.isValid() && i == 1 ) {
+ if( d->capped && !loc.isValid() && i == 1 ) {
/* the constructor for NamespaceDetails intentionally sets deletedList[1] to invalid
see comments in namespace.h
*/
@@ -218,7 +290,8 @@ namespace mongo {
k++;
killCurrentOp.checkForInterrupt();
}
- } catch (...) {
+ }
+ catch (...) {
ss <<" ?exception in deleted chain for bucket " << i << endl;
valid = false;
}
@@ -236,7 +309,7 @@ namespace mongo {
while( i.more() ) {
IndexDetails& id = i.next();
ss << " " << id.indexNamespace() << " keys:" <<
- id.head.btree()->fullValidate(id.head, id.keyPattern()) << endl;
+ id.head.btree()->fullValidate(id.head, id.keyPattern()) << endl;
}
}
catch (...) {
@@ -261,36 +334,36 @@ namespace mongo {
extern unsigned lockedForWriting;
extern mongo::mutex lockedForWritingMutex;
-/*
- class UnlockCommand : public Command {
- public:
- UnlockCommand() : Command( "unlock" ) { }
- virtual bool readOnly() { return true; }
- virtual bool slaveOk() const { return true; }
- virtual bool adminOnly() const { return true; }
- virtual bool run(const char *ns, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
- if( lockedForWriting ) {
- log() << "command: unlock requested" << endl;
- errmsg = "unlock requested";
- unlockRequested = true;
- }
- else {
- errmsg = "not locked, so cannot unlock";
- return 0;
+ /*
+ class UnlockCommand : public Command {
+ public:
+ UnlockCommand() : Command( "unlock" ) { }
+ virtual bool readOnly() { return true; }
+ virtual bool slaveOk() const { return true; }
+ virtual bool adminOnly() const { return true; }
+ virtual bool run(const char *ns, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ if( lockedForWriting ) {
+ log() << "command: unlock requested" << endl;
+ errmsg = "unlock requested";
+ unlockRequested = true;
+ }
+ else {
+ errmsg = "not locked, so cannot unlock";
+ return 0;
+ }
+ return 1;
}
- return 1;
- }
-
- } unlockCommand;
-*/
+
+ } unlockCommand;
+ */
/* see unlockFsync() for unlocking:
db.$cmd.sys.unlock.findOne()
*/
class FSyncCommand : public Command {
- class LockDBJob : public BackgroundJob {
+ class LockDBJob : public BackgroundJob {
protected:
- string name() { return "lockdbjob"; }
- void run() {
+ virtual string name() const { return "lockdbjob"; }
+ void run() {
Client::initThread("fsyncjob");
Client& c = cc();
{
@@ -301,8 +374,8 @@ namespace mongo {
MemoryMappedFile::flushAll(true);
log() << "db is now locked for snapshotting, no writes allowed. use db.$cmd.sys.unlock.findOne() to unlock" << endl;
_ready = true;
- while( 1 ) {
- if( unlockRequested ) {
+ while( 1 ) {
+ if( unlockRequested ) {
unlockRequested = false;
break;
}
@@ -316,54 +389,70 @@ namespace mongo {
}
public:
bool& _ready;
- LockDBJob(bool& ready) : _ready(ready) {
- deleteSelf = true;
+ LockDBJob(bool& ready) : BackgroundJob( true /* delete self */ ), _ready(ready) {
_ready = false;
}
};
public:
- FSyncCommand() : Command( "fsync" ){}
- virtual LockType locktype() const { return WRITE; }
+ FSyncCommand() : Command( "fsync" ) {}
+ virtual LockType locktype() const { return WRITE; }
virtual bool slaveOk() const { return true; }
virtual bool adminOnly() const { return true; }
- /*virtual bool localHostOnlyIfNoAuth(const BSONObj& cmdObj) {
+ /*virtual bool localHostOnlyIfNoAuth(const BSONObj& cmdObj) {
string x = cmdObj["exec"].valuestrsafe();
return !x.empty();
}*/
virtual void help(stringstream& h) const { h << "http://www.mongodb.org/display/DOCS/fsync+Command"; }
virtual bool run(const string& dbname, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
- /* async means do an fsync, but return immediately */
- bool sync = ! cmdObj["async"].trueValue();
+ bool sync = !cmdObj["async"].trueValue(); // async means do an fsync, but return immediately
bool lock = cmdObj["lock"].trueValue();
log() << "CMD fsync: sync:" << sync << " lock:" << lock << endl;
- if( lock ) {
+ if( lock ) {
+ // fsync and lock variation
+
uassert(12034, "fsync: can't lock while an unlock is pending", !unlockRequested);
uassert(12032, "fsync: sync option must be true when using lock", sync);
- /* With releaseEarly(), we must be extremely careful we don't do anything
- where we would have assumed we were locked. profiling is one of those things.
- Perhaps at profile time we could check if we released early -- however,
+ /* With releaseEarly(), we must be extremely careful we don't do anything
+ where we would have assumed we were locked. profiling is one of those things.
+ Perhaps at profile time we could check if we released early -- however,
we need to be careful to keep that code very fast it's a very common code path when on.
*/
uassert(12033, "fsync: profiling must be off to enter locked mode", cc().database()->profile == 0);
+
+ // todo future: Perhaps we could do this in the background thread. As is now, writes may interleave between
+ // the releaseEarly below and the acquisition of the readlock in the background thread.
+ // However the real problem is that it seems complex to unlock here and then have a window for
+ // writes before the bg job -- can be done correctly but harder to reason about correctness.
+ // If this command ran within a read lock in the first place, would it work, and then that
+ // would be quite easy?
+ // Or, could we downgrade the write lock to a read lock, wait for ready, then release?
+ getDur().syncDataAndTruncateJournal();
+
bool ready = false;
LockDBJob *l = new LockDBJob(ready);
+
dbMutex.releaseEarly();
+
l->go();
- // don't return until background thread has acquired the write lock
- while( !ready ) {
+ // don't return until background thread has acquired the read lock
+ while( !ready ) {
sleepmillis(10);
}
result.append("info", "now locked against writes, use db.$cmd.sys.unlock.findOne() to unlock");
}
else {
+ // the simple fsync command case
+
+ if (sync)
+ getDur().commitNow();
result.append( "numFiles" , MemoryMappedFile::flushAll( sync ) );
}
return 1;
}
-
+
} fsyncCmd;
-
+
}
diff --git a/db/dbcommands_generic.cpp b/db/dbcommands_generic.cpp
index 25c6a93..a555b6c 100644
--- a/db/dbcommands_generic.cpp
+++ b/db/dbcommands_generic.cpp
@@ -52,114 +52,192 @@ namespace mongo {
CmdBuildInfo() : Command( "buildInfo", true, "buildinfo" ) {}
virtual bool slaveOk() const { return true; }
virtual bool adminOnly() const { return true; }
- virtual LockType locktype() const { return NONE; }
+ virtual LockType locktype() const { return NONE; }
virtual void help( stringstream &help ) const {
help << "get version #, etc.\n";
help << "{ buildinfo:1 }";
}
- bool run(const string& dbname, BSONObj& jsobj, string& errmsg, BSONObjBuilder& result, bool fromRepl ){
+ bool run(const string& dbname, BSONObj& jsobj, string& errmsg, BSONObjBuilder& result, bool fromRepl ) {
result << "version" << versionString << "gitVersion" << gitVersion() << "sysInfo" << sysInfo();
result << "bits" << ( sizeof( int* ) == 4 ? 32 : 64 );
- result.appendBool( "debug" ,
-#ifdef _DEBUG
- true
-#else
- false
-#endif
- );
+ result.appendBool( "debug" , debug );
+ result.appendNumber("maxBsonObjectSize", BSONObjMaxUserSize);
return true;
}
} cmdBuildInfo;
+ /** experimental. either remove or add support in repl sets also. in a repl set, getting this setting from the
+ repl set config could make sense.
+ */
+ unsigned replApplyBatchSize = 1;
- /* just to check if the db has asserted */
- class CmdAssertInfo : public Command {
+ class CmdGet : public Command {
public:
- virtual bool slaveOk() const {
+ CmdGet() : Command( "getParameter" ) { }
+ virtual bool slaveOk() const { return true; }
+ virtual bool adminOnly() const { return true; }
+ virtual LockType locktype() const { return NONE; }
+ virtual void help( stringstream &help ) const {
+ help << "get administrative option(s)\nexample:\n";
+ help << "{ getParameter:1, notablescan:1 }\n";
+ help << "supported so far:\n";
+ help << " quiet\n";
+ help << " notablescan\n";
+ help << " logLevel\n";
+ help << " syncdelay\n";
+ help << "{ getParameter:'*' } to get everything\n";
+ }
+ bool run(const string& dbname, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl ) {
+ bool all = *cmdObj.firstElement().valuestrsafe() == '*';
+
+ int before = result.len();
+
+ if( all || cmdObj.hasElement("quiet") ) {
+ result.append("quiet", cmdLine.quiet );
+ }
+ if( all || cmdObj.hasElement("notablescan") ) {
+ result.append("notablescan", cmdLine.noTableScan);
+ }
+ if( all || cmdObj.hasElement("logLevel") ) {
+ result.append("logLevel", logLevel);
+ }
+ if( all || cmdObj.hasElement("syncdelay") ) {
+ result.append("syncdelay", cmdLine.syncdelay);
+ }
+ if( all || cmdObj.hasElement("replApplyBatchSize") ) {
+ result.append("replApplyBatchSize", replApplyBatchSize);
+ }
+
+ if ( before == result.len() ) {
+ errmsg = "no option found to get";
+ return false;
+ }
return true;
}
- virtual void help( stringstream& help ) const {
- help << "check if any asserts have occurred on the server";
+ } cmdGet;
+
+ class CmdSet : public Command {
+ public:
+ CmdSet() : Command( "setParameter" ) { }
+ virtual bool slaveOk() const { return true; }
+ virtual bool adminOnly() const { return true; }
+ virtual LockType locktype() const { return NONE; }
+ virtual void help( stringstream &help ) const {
+ help << "set administrative option(s)\nexample:\n";
+ help << "{ setParameter:1, notablescan:true }\n";
+ help << "supported so far:\n";
+ help << " notablescan\n";
+ help << " logLevel\n";
+ help << " quiet\n";
}
- virtual LockType locktype() const { return WRITE; }
- CmdAssertInfo() : Command("assertInfo",true,"assertinfo") {}
- bool run(const string& dbname, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
- result.appendBool("dbasserted", lastAssert[0].isSet() || lastAssert[1].isSet() || lastAssert[2].isSet());
- result.appendBool("asserted", lastAssert[0].isSet() || lastAssert[1].isSet() || lastAssert[2].isSet() || lastAssert[3].isSet());
- result.append("assert", lastAssert[AssertRegular].toString());
- result.append("assertw", lastAssert[AssertW].toString());
- result.append("assertmsg", lastAssert[AssertMsg].toString());
- result.append("assertuser", lastAssert[AssertUser].toString());
+ bool run(const string& dbname, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl ) {
+ int s = 0;
+ if( cmdObj.hasElement("notablescan") ) {
+ result.append("was", cmdLine.noTableScan);
+ cmdLine.noTableScan = cmdObj["notablescan"].Bool();
+ s++;
+ }
+ if( cmdObj.hasElement("quiet") ) {
+ result.append("was", cmdLine.quiet );
+ cmdLine.quiet = cmdObj["quiet"].Bool();
+ s++;
+ }
+ if( cmdObj.hasElement("syncdelay") ) {
+ result.append("was", cmdLine.syncdelay );
+ cmdLine.syncdelay = cmdObj["syncdelay"].Number();
+ s++;
+ }
+ if( cmdObj.hasElement( "logLevel" ) ) {
+ result.append("was", logLevel );
+ logLevel = cmdObj["logLevel"].numberInt();
+ s++;
+ }
+ if( cmdObj.hasElement( "replApplyBatchSize" ) ) {
+ result.append("was", replApplyBatchSize );
+ BSONElement e = cmdObj["replApplyBatchSize"];
+ ParameterValidator * v = ParameterValidator::get( e.fieldName() );
+ assert( v );
+ if ( ! v->isValid( e , errmsg ) )
+ return false;
+ replApplyBatchSize = e.numberInt();
+ s++;
+ }
+
+ if( s == 0 ) {
+ errmsg = "no option found to set, use '*' to get all ";
+ return false;
+ }
+
return true;
}
- } cmdAsserts;
+ } cmdSet;
class PingCommand : public Command {
public:
- PingCommand() : Command( "ping" ){}
+ PingCommand() : Command( "ping" ) {}
virtual bool slaveOk() const { return true; }
virtual void help( stringstream &help ) const { help << "a way to check that the server is alive. responds immediately even if server is in a db lock."; }
virtual LockType locktype() const { return NONE; }
virtual bool requiresAuth() { return false; }
- virtual bool run(const string& badns, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool){
+ virtual bool run(const string& badns, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool) {
// IMPORTANT: Don't put anything in here that might lock db - including authentication
return true;
}
} pingCmd;
-
+
class FeaturesCmd : public Command {
public:
- FeaturesCmd() : Command( "features", true ){}
- void help(stringstream& h) const { h << "return on build level feature settings"; }
+ FeaturesCmd() : Command( "features", true ) {}
+ void help(stringstream& h) const { h << "return build level feature settings"; }
virtual bool slaveOk() const { return true; }
- virtual bool readOnly(){ return true; }
- virtual LockType locktype() const { return READ; }
- virtual bool run(const string& ns, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl){
- if ( globalScriptEngine ){
+ virtual bool readOnly() { return true; }
+ virtual LockType locktype() const { return NONE; }
+ virtual bool run(const string& ns, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ if ( globalScriptEngine ) {
BSONObjBuilder bb( result.subobjStart( "js" ) );
result.append( "utf8" , globalScriptEngine->utf8Ok() );
bb.done();
}
- if ( cmdObj["oidReset"].trueValue() ){
- result.append( "oidMachineOld" , OID::staticMachine() );
- OID::newState();
+ if ( cmdObj["oidReset"].trueValue() ) {
+ result.append( "oidMachineOld" , OID::getMachineId() );
+ OID::regenMachineId();
}
- result.append( "oidMachine" , OID::staticMachine() );
+ result.append( "oidMachine" , OID::getMachineId() );
return true;
}
-
+
} featuresCmd;
class LogRotateCmd : public Command {
public:
- LogRotateCmd() : Command( "logRotate" ){}
- virtual LockType locktype() const { return NONE; }
+ LogRotateCmd() : Command( "logRotate" ) {}
+ virtual LockType locktype() const { return NONE; }
virtual bool slaveOk() const { return true; }
virtual bool adminOnly() const { return true; }
virtual bool run(const string& ns, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
rotateLogs();
return 1;
- }
-
+ }
+
} logRotateCmd;
-
+
class ListCommandsCmd : public Command {
public:
virtual void help( stringstream &help ) const { help << "get a list of all db commands"; }
- ListCommandsCmd() : Command( "listCommands", false ){}
- virtual LockType locktype() const { return NONE; }
+ ListCommandsCmd() : Command( "listCommands", false ) {}
+ virtual LockType locktype() const { return NONE; }
virtual bool slaveOk() const { return true; }
virtual bool adminOnly() const { return false; }
virtual bool run(const string& ns, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
BSONObjBuilder b( result.subobjStart( "commands" ) );
- for ( map<string,Command*>::iterator i=_commands->begin(); i!=_commands->end(); ++i ){
+ for ( map<string,Command*>::iterator i=_commands->begin(); i!=_commands->end(); ++i ) {
Command * c = i->second;
// don't show oldnames
if (i->first != c->name)
continue;
- BSONObjBuilder temp( b.subobjStart( c->name.c_str() ) );
+ BSONObjBuilder temp( b.subobjStart( c->name ) );
{
stringstream help;
@@ -174,10 +252,10 @@ namespace mongo {
b.done();
return 1;
- }
+ }
} listCommandsCmd;
-
+
class CmdShutdown : public Command {
public:
virtual bool requiresAuth() { return true; }
@@ -189,7 +267,7 @@ namespace mongo {
virtual bool slaveOk() const {
return true;
}
- virtual LockType locktype() const { return WRITE; }
+ virtual LockType locktype() const { return NONE; }
virtual void help( stringstream& help ) const {
help << "shutdown the database. must be ran against admin db and either (1) ran from localhost or (2) authenticated.\n";
}
@@ -199,8 +277,11 @@ namespace mongo {
if ( c ) {
c->shutdown();
}
+
log() << "terminating, shutdown command received" << endl;
- dbexit( EXIT_CLEAN ); // this never returns
+
+ dbexit( EXIT_CLEAN , "shutdown called" , true ); // this never returns
+ assert(0);
return true;
}
} cmdShutdown;
@@ -217,7 +298,7 @@ namespace mongo {
virtual bool slaveOk() const {
return true;
}
- virtual LockType locktype() const { return NONE; }
+ virtual LockType locktype() const { return NONE; }
CmdForceError() : Command("forceerror") {}
bool run(const string& dbnamne, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
uassert( 10038 , "forced error", false);
@@ -225,6 +306,17 @@ namespace mongo {
}
} cmdForceError;
-
+ class AvailableQueryOptions : public Command {
+ public:
+ AvailableQueryOptions() : Command( "availableQueryOptions" , false , "availablequeryoptions" ) {}
+ virtual bool slaveOk() const { return true; }
+ virtual LockType locktype() const { return NONE; }
+ virtual bool requiresAuth() { return false; }
+ virtual bool run(const string& dbname , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool) {
+ result << "options" << QueryOption_AllSupported;
+ return true;
+ }
+ } availableQueryOptionsCmd;
+
}
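
getParameter and setParameter are adminOnly, so they are issued against the admin database. A minimal sketch using the old C++ client (the host, includes, and option values are illustrative and not part of this patch):

    #include <iostream>
    #include "client/dbclient.h"
    using namespace mongo;

    int main() {
        DBClientConnection conn;
        string errmsg;
        if ( !conn.connect( "localhost:27017" , errmsg ) ) {
            cout << "connect failed: " << errmsg << endl;
            return 1;
        }
        BSONObj info;
        // flip notablescan at runtime; the reply reports the previous value as "was"
        conn.runCommand( "admin" , BSON( "setParameter" << 1 << "notablescan" << true ) , info );
        // '*' asks getParameter to return every supported option
        conn.runCommand( "admin" , BSON( "getParameter" << "*" ) , info );
        cout << info.jsonString() << endl;
        return 0;
    }
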
diff --git a/db/dbeval.cpp b/db/dbeval.cpp
index e8a42b2..31d5260 100644
--- a/db/dbeval.cpp
+++ b/db/dbeval.cpp
@@ -37,7 +37,7 @@ namespace mongo {
const int edebug=0;
- bool dbEval(const char *ns, BSONObj& cmd, BSONObjBuilder& result, string& errmsg) {
+ bool dbEval(const string& dbName, BSONObj& cmd, BSONObjBuilder& result, string& errmsg) {
BSONElement e = cmd.firstElement();
uassert( 10046 , "eval needs Code" , e.type() == Code || e.type() == CodeWScope || e.type() == String );
@@ -60,16 +60,16 @@ namespace mongo {
return false;
}
- auto_ptr<Scope> s = globalScriptEngine->getPooledScope( ns );
+ auto_ptr<Scope> s = globalScriptEngine->getPooledScope( dbName );
ScriptingFunction f = s->createFunction(code);
if ( f == 0 ) {
errmsg = (string)"compile failed: " + s->getError();
return false;
}
-
+
if ( e.type() == CodeWScope )
s->init( e.codeWScopeScopeData() );
- s->localConnect( cc().database()->name.c_str() );
+ s->localConnect( dbName.c_str() );
BSONObj args;
{
@@ -89,7 +89,7 @@ namespace mongo {
res = s->invoke(f,args, cmdLine.quota ? 10 * 60 * 1000 : 0 );
int m = t.millis();
if ( m > cmdLine.slowMS ) {
- out() << "dbeval slow, time: " << dec << m << "ms " << ns << endl;
+ out() << "dbeval slow, time: " << dec << m << "ms " << dbName << endl;
if ( m >= 1000 ) log() << code << endl;
else OCCASIONALLY log() << code << endl;
}
@@ -100,7 +100,7 @@ namespace mongo {
errmsg += s->getError();
return false;
}
-
+
s->append( result , "retval" , "return" );
return true;
@@ -122,16 +122,19 @@ namespace mongo {
virtual LockType locktype() const { return NONE; }
CmdEval() : Command("eval", false, "$eval") { }
bool run(const string& dbname , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
-
+
AuthenticationInfo *ai = cc().getAuthenticationInfo();
uassert( 12598 , "$eval reads unauthorized", ai->isAuthorizedReads(dbname.c_str()) );
-
+
+ if ( cmdObj["nolock"].trueValue() ) {
+ return dbEval(dbname, cmdObj, result, errmsg);
+ }
+
// write security will be enforced in DBDirectClient
mongolock lk( ai->isAuthorized( dbname.c_str() ) );
Client::Context ctx( dbname );
-
- return dbEval(dbname.c_str(), cmdObj, result, errmsg);
+ return dbEval(dbname, cmdObj, result, errmsg);
}
} cmdeval;
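
The nolock option added above lets an eval that does not need the database skip taking the write lock. A hedged example, reusing the DBClientConnection from the earlier sketch (the function body and db name are illustrative):

    BSONObj info;
    conn.runCommand( "test" ,
                     BSON( "$eval" << "function(){ return 1 + 1; }" << "nolock" << true ) ,
                     info );
    // on success info["retval"] holds 2; without nolock the command takes mongolock as before
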
diff --git a/db/dbhelpers.cpp b/db/dbhelpers.cpp
index 205787e..75db430 100644
--- a/db/dbhelpers.cpp
+++ b/db/dbhelpers.cpp
@@ -28,39 +28,6 @@
namespace mongo {
- CursorIterator::CursorIterator( shared_ptr<Cursor> c , BSONObj filter )
- : _cursor( c ){
- if ( ! filter.isEmpty() )
- _matcher.reset( new CoveredIndexMatcher( filter , BSONObj() ) );
- _advance();
- }
-
- BSONObj CursorIterator::next(){
- BSONObj o = _o;
- _advance();
- return o;
- }
-
- bool CursorIterator::hasNext(){
- return ! _o.isEmpty();
- }
-
- void CursorIterator::_advance(){
- if ( ! _cursor->ok() ){
- _o = BSONObj();
- return;
- }
-
- while ( _cursor->ok() ){
- _o = _cursor->current();
- _cursor->advance();
- if ( _matcher.get() == 0 || _matcher->matches( _o ) )
- return;
- }
-
- _o = BSONObj();
- }
-
void Helpers::ensureIndex(const char *ns, BSONObj keyPattern, bool unique, const char *name) {
NamespaceDetails *d = nsdetails(ns);
if( d == 0 )
@@ -74,7 +41,7 @@ namespace mongo {
}
}
- if( d->nIndexes >= NamespaceDetails::NIndexesMax ) {
+ if( d->nIndexes >= NamespaceDetails::NIndexesMax ) {
problem() << "Helper::ensureIndex fails, MaxIndexes exceeded " << ns << '\n';
return;
}
@@ -91,6 +58,7 @@ namespace mongo {
theDataFileMgr.insert(system_indexes.c_str(), o.objdata(), o.objsize());
}
+ /** Simple QueryOp implementation to return first match. Does not support yielding. */
class FindOne : public QueryOp {
public:
FindOne( bool requireIndex ) : requireIndex_( requireIndex ) {}
@@ -111,10 +79,15 @@ namespace mongo {
one_ = c_->current();
loc_ = c_->currLoc();
setStop();
- } else {
+ }
+ else {
c_->advance();
}
}
+ virtual long long nscanned() {
+ assert( c_.get() );
+ return c_->nscanned();
+ }
virtual bool mayRecordPlan() const { return false; }
virtual QueryOp *_createChild() const { return new FindOne( requireIndex_ ); }
BSONObj one() const { return one_; }
@@ -125,11 +98,11 @@ namespace mongo {
BSONObj one_;
DiskLoc loc_;
};
-
- /* fetch a single object from collection ns that matches query
+
+ /* fetch a single object from collection ns that matches query
set your db SavedContext first
*/
- bool Helpers::findOne(const char *ns, const BSONObj &query, BSONObj& result, bool requireIndex) {
+ bool Helpers::findOne(const char *ns, const BSONObj &query, BSONObj& result, bool requireIndex) {
MultiPlanScanner s( ns, query, BSONObj(), 0, !requireIndex );
FindOne original( requireIndex );
shared_ptr< FindOne > res = s.runOp( original );
@@ -141,10 +114,10 @@ namespace mongo {
return true;
}
- /* fetch a single object from collection ns that matches query
+ /* fetch a single object from collection ns that matches query
set your db SavedContext first
*/
- DiskLoc Helpers::findOne(const char *ns, const BSONObj &query, bool requireIndex) {
+ DiskLoc Helpers::findOne(const char *ns, const BSONObj &query, bool requireIndex) {
MultiPlanScanner s( ns, query, BSONObj(), 0, !requireIndex );
FindOne original( requireIndex );
shared_ptr< FindOne > res = s.runOp( original );
@@ -153,15 +126,8 @@ namespace mongo {
return res->loc();
}
- auto_ptr<CursorIterator> Helpers::find( const char *ns , BSONObj query , bool requireIndex ){
- uassert( 10047 , "requireIndex not supported in Helpers::find yet" , ! requireIndex );
- auto_ptr<CursorIterator> i;
- i.reset( new CursorIterator( DataFileMgr::findAll( ns ) , query ) );
- return i;
- }
-
bool Helpers::findById(Client& c, const char *ns, BSONObj query, BSONObj& result ,
- bool * nsFound , bool * indexFound ){
+ bool * nsFound , bool * indexFound ) {
dbMutex.assertAtLeastReadLocked();
Database *database = c.database();
assert( database );
@@ -170,7 +136,7 @@ namespace mongo {
return false;
if ( nsFound )
*nsFound = 1;
-
+
int idxNo = d->findIdIndex();
if ( idxNo < 0 )
return false;
@@ -178,9 +144,9 @@ namespace mongo {
*indexFound = 1;
IndexDetails& i = d->idx( idxNo );
-
+
BSONObj key = i.getKeyFromQuery( query );
-
+
DiskLoc loc = i.head.btree()->findSingle( i , i.head , key );
if ( loc.isNull() )
return false;
@@ -188,16 +154,16 @@ namespace mongo {
return true;
}
- DiskLoc Helpers::findById(NamespaceDetails *d, BSONObj idquery) {
- int idxNo = d->findIdIndex();
- uassert(13430, "no _id index", idxNo>=0);
- IndexDetails& i = d->idx( idxNo );
- BSONObj key = i.getKeyFromQuery( idquery );
- return i.head.btree()->findSingle( i , i.head , key );
+ DiskLoc Helpers::findById(NamespaceDetails *d, BSONObj idquery) {
+ int idxNo = d->findIdIndex();
+ uassert(13430, "no _id index", idxNo>=0);
+ IndexDetails& i = d->idx( idxNo );
+ BSONObj key = i.getKeyFromQuery( idquery );
+ return i.head.btree()->findSingle( i , i.head , key );
}
- bool Helpers::isEmpty(const char *ns) {
- Client::Context context(ns);
+ bool Helpers::isEmpty(const char *ns, bool doAuth) {
+ Client::Context context(ns, dbpath, NULL, doAuth);
shared_ptr<Cursor> c = DataFileMgr::findAll(ns);
return !c->ok();
}
@@ -221,17 +187,17 @@ namespace mongo {
bool Helpers::getLast(const char *ns, BSONObj& result) {
Client::Context ctx(ns);
shared_ptr<Cursor> c = findTableScan(ns, reverseNaturalObj);
- if( !c->ok() )
+ if( !c->ok() )
return false;
result = c->current();
return true;
}
- void Helpers::upsert( const string& ns , const BSONObj& o ){
+ void Helpers::upsert( const string& ns , const BSONObj& o ) {
BSONElement e = o["_id"];
assert( e.type() );
BSONObj id = e.wrap();
-
+
OpDebug debug;
Client::Context context(ns);
updateObjects(ns.c_str(), o, /*pattern=*/id, /*upsert=*/true, /*multi=*/false , /*logtheop=*/true , debug );
@@ -249,12 +215,12 @@ namespace mongo {
_updateObjects(/*god=*/true, ns, obj, /*pattern=*/BSONObj(), /*upsert=*/true, /*multi=*/false , logTheOp , debug );
}
- BSONObj Helpers::toKeyFormat( const BSONObj& o , BSONObj& key ){
+ BSONObj Helpers::toKeyFormat( const BSONObj& o , BSONObj& key ) {
BSONObjBuilder me;
BSONObjBuilder k;
BSONObjIterator i( o );
- while ( i.more() ){
+ while ( i.more() ) {
BSONElement e = i.next();
k.append( e.fieldName() , 1 );
me.appendAs( e , "" );
@@ -262,8 +228,8 @@ namespace mongo {
key = k.obj();
return me.obj();
}
-
- long long Helpers::removeRange( const string& ns , const BSONObj& min , const BSONObj& max , bool yield , bool maxInclusive , RemoveCallback * callback ){
+
+ long long Helpers::removeRange( const string& ns , const BSONObj& min , const BSONObj& max , bool yield , bool maxInclusive , RemoveCallback * callback ) {
BSONObj keya , keyb;
BSONObj minClean = toKeyFormat( min , keya );
BSONObj maxClean = toKeyFormat( max , keyb );
@@ -276,33 +242,35 @@ namespace mongo {
int ii = nsd->findIndexByKeyPattern( keya );
assert( ii >= 0 );
-
+
long long num = 0;
-
+
IndexDetails& i = nsd->idx( ii );
shared_ptr<Cursor> c( new BtreeCursor( nsd , ii , i , minClean , maxClean , maxInclusive, 1 ) );
auto_ptr<ClientCursor> cc( new ClientCursor( QueryOption_NoCursorTimeout , c , ns ) );
cc->setDoingDeletes( true );
-
- while ( c->ok() ){
+
+ while ( c->ok() ) {
DiskLoc rloc = c->currLoc();
- BSONObj key = c->currKey();
if ( callback )
callback->goingToDelete( c->current() );
-
+
c->advance();
c->noteLocation();
-
+
logOp( "d" , ns.c_str() , rloc.obj()["_id"].wrap() );
theDataFileMgr.deleteRecord(ns.c_str() , rloc.rec(), rloc);
num++;
c->checkLocation();
- if ( yield && ! cc->yieldSometimes() ){
+ getDur().commitIfNeeded();
+
+ if ( yield && ! cc->yieldSometimes() ) {
// cursor got finished by someone else, so we're done
+ cc.release(); // if the collection/db is dropped, cc may be deleted
break;
}
}
@@ -325,11 +293,12 @@ namespace mongo {
BSONObjBuilder result;
dropCollection( name_, errmsg, result );
}
- } catch ( ... ) {
+ }
+ catch ( ... ) {
problem() << "exception cleaning up DbSet" << endl;
}
}
-
+
void DbSet::reset( const string &name, const BSONObj &key ) {
if ( !name.empty() )
name_ = name;
@@ -338,74 +307,77 @@ namespace mongo {
Client::Context c( name_.c_str() );
if ( nsdetails( name_.c_str() ) ) {
Helpers::emptyCollection( name_.c_str() );
- } else {
+ }
+ else {
string err;
massert( 10303 , err, userCreateNS( name_.c_str(), fromjson( "{autoIndexId:false}" ), err, false ) );
}
- Helpers::ensureIndex( name_.c_str(), key_, true, "setIdx" );
+ Helpers::ensureIndex( name_.c_str(), key_, true, "setIdx" );
}
-
+
bool DbSet::get( const BSONObj &obj ) const {
Client::Context c( name_.c_str() );
BSONObj temp;
return Helpers::findOne( name_.c_str(), obj, temp, true );
}
-
+
void DbSet::set( const BSONObj &obj, bool val ) {
Client::Context c( name_.c_str() );
if ( val ) {
try {
BSONObj k = obj;
theDataFileMgr.insertWithObjMod( name_.c_str(), k, false );
- } catch ( DBException& ) {
+ }
+ catch ( DBException& ) {
// dup key - already in set
}
- } else {
+ }
+ else {
deleteObjects( name_.c_str(), obj, true, false, false );
- }
+ }
}
- RemoveSaver::RemoveSaver( const string& a , const string& b , const string& why) : _out(0){
+ RemoveSaver::RemoveSaver( const string& a , const string& b , const string& why) : _out(0) {
static int NUM = 0;
-
+
_root = dbpath;
if ( a.size() )
_root /= a;
if ( b.size() )
_root /= b;
assert( a.size() || b.size() );
-
+
_file = _root;
-
+
stringstream ss;
ss << why << "." << terseCurrentTime(false) << "." << NUM++ << ".bson";
_file /= ss.str();
}
-
- RemoveSaver::~RemoveSaver(){
- if ( _out ){
+
+ RemoveSaver::~RemoveSaver() {
+ if ( _out ) {
_out->close();
delete _out;
_out = 0;
}
}
-
- void RemoveSaver::goingToDelete( const BSONObj& o ){
- if ( ! _out ){
+
+ void RemoveSaver::goingToDelete( const BSONObj& o ) {
+ if ( ! _out ) {
create_directories( _root );
_out = new ofstream();
_out->open( _file.string().c_str() , ios_base::out | ios_base::binary );
- if ( ! _out->good() ){
+ if ( ! _out->good() ) {
log( LL_WARNING ) << "couldn't create file: " << _file.string() << " for remove saving" << endl;
delete _out;
_out = 0;
return;
}
-
+
}
_out->write( o.objdata() , o.objsize() );
}
-
-
+
+
} // namespace mongo
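
removeRange now calls getDur().commitIfNeeded() per deletion and releases the ClientCursor when a yield loses it. A sketch of the usual calling pattern, pairing it with RemoveSaver as the RemoveCallback (the namespace, key bounds, and RemoveSaver path components are illustrative):

    // all Helpers assume locking is handled above them, so hold the write lock here
    RemoveSaver saver( "removeRange" , "test.foo" , "cleaning" );   // archives each doc to a .bson file under dbpath
    long long nRemoved = Helpers::removeRange( "test.foo" ,
                                               BSON( "x" << 0 ) ,     // min key
                                               BSON( "x" << 100 ) ,   // max key
                                               /*yield=*/ true ,
                                               /*maxInclusive=*/ false ,
                                               &saver );
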
diff --git a/db/dbhelpers.h b/db/dbhelpers.h
index ee9a59c..e793d3f 100644
--- a/db/dbhelpers.h
+++ b/db/dbhelpers.h
@@ -33,24 +33,10 @@ namespace mongo {
class Cursor;
class CoveredIndexMatcher;
- class CursorIterator {
- public:
- CursorIterator( shared_ptr<Cursor> c , BSONObj filter = BSONObj() );
- BSONObj next();
- bool hasNext();
-
- private:
- void _advance();
-
- shared_ptr<Cursor> _cursor;
- auto_ptr<CoveredIndexMatcher> _matcher;
- BSONObj _o;
- };
-
/**
all helpers assume locking is handled above them
*/
- struct Helpers {
+ struct Helpers {
/* ensure the specified index exists.
@@ -68,7 +54,7 @@ namespace mongo {
/* fetch a single object from collection ns that matches query.
set your db SavedContext first.
- @param query - the query to perform. note this is the low level portion of query so "orderby : ..."
+ @param query - the query to perform. note this is the low level portion of query so "orderby : ..."
won't work.
@param requireIndex if true, complain if no index for the query. a way to guard against
@@ -77,21 +63,19 @@ namespace mongo {
@return true if object found
*/
static bool findOne(const char *ns, const BSONObj &query, BSONObj& result, bool requireIndex = false);
- static DiskLoc findOne(const char *ns, const BSONObj &query, bool requireIndex);
+ static DiskLoc findOne(const char *ns, const BSONObj &query, bool requireIndex);
/**
* @param foundIndex if passed in will be set to 1 if ns and index found
* @return true if object found
*/
- static bool findById(Client&, const char *ns, BSONObj query, BSONObj& result ,
+ static bool findById(Client&, const char *ns, BSONObj query, BSONObj& result ,
bool * nsFound = 0 , bool * indexFound = 0 );
- /* uasserts if no _id index.
+ /* uasserts if no _id index.
@return null loc if not found */
static DiskLoc findById(NamespaceDetails *d, BSONObj query);
- static auto_ptr<CursorIterator> find( const char *ns , BSONObj query = BSONObj() , bool requireIndex = false );
-
/** Get/put the first (or last) object from a collection. Generally only useful if the collection
only ever has a single object -- which is a "singleton collection".
@@ -103,7 +87,7 @@ namespace mongo {
static void putSingleton(const char *ns, BSONObj obj);
static void putSingletonGod(const char *ns, BSONObj obj, bool logTheOp);
static bool getFirst(const char *ns, BSONObj& result) { return getSingleton(ns, result); }
- static bool getLast(const char *ns, BSONObj& result); // get last object in the collection; e.g. {$natural : -1}
+ static bool getLast(const char *ns, BSONObj& result); // get last object in the collection; e.g. {$natural : -1}
/**
* you have to lock
@@ -115,14 +99,14 @@ namespace mongo {
/** You do not need to set the database before calling.
@return true if collection is empty.
*/
- static bool isEmpty(const char *ns);
+ static bool isEmpty(const char *ns, bool doAuth=true);
// TODO: this should be somewhere else probably
static BSONObj toKeyFormat( const BSONObj& o , BSONObj& key );
class RemoveCallback {
public:
- virtual ~RemoveCallback(){}
+ virtual ~RemoveCallback() {}
virtual void goingToDelete( const BSONObj& o ) = 0;
};
/* removeRange: operation is oplog'd */
@@ -163,13 +147,13 @@ namespace mongo {
~RemoveSaver();
void goingToDelete( const BSONObj& o );
-
+
private:
path _root;
path _file;
ofstream* _out;
-
+
};
-
+
} // namespace mongo
diff --git a/db/dbmessage.h b/db/dbmessage.h
index 2849de8..cc1d1d8 100644
--- a/db/dbmessage.h
+++ b/db/dbmessage.h
@@ -18,7 +18,7 @@
#include "diskloc.h"
#include "jsobj.h"
-#include "namespace.h"
+#include "namespace-inl.h"
#include "../util/message.h"
#include "../client/constants.h"
@@ -35,7 +35,7 @@ namespace mongo {
*/
extern bool objcheck;
-
+
#pragma pack(1)
struct QueryResult : public MsgData {
long long cursorId;
@@ -50,7 +50,7 @@ namespace mongo {
int& _resultFlags() {
return dataAsInt();
}
- void setResultFlagsToOk() {
+ void setResultFlagsToOk() {
_resultFlags() = ResultFlag_AwaitCapable;
}
};
@@ -63,8 +63,7 @@ namespace mongo {
*/
class DbMessage {
public:
- DbMessage(const Message& _m) : m(_m)
- {
+ DbMessage(const Message& _m) : m(_m) , mark(0) {
// for received messages, Message has only one buffer
theEnd = _m.singleData()->_data + _m.header()->dataLen();
char *r = _m.singleData()->_data;
@@ -86,7 +85,7 @@ namespace mongo {
const char * afterNS() const {
return data + strlen( data ) + 1;
}
-
+
int getInt( int num ) const {
const int * foo = (const int*)afterNS();
return foo[num];
@@ -96,7 +95,17 @@ namespace mongo {
return getInt( 1 );
}
- void resetPull(){ nextjsobj = data; }
+ /**
+ * get an int64 at specified offsetBytes after ns
+ */
+ long long getInt64( int offsetBytes ) const {
+ const char * x = afterNS();
+ x += offsetBytes;
+ const long long * ll = (const long long*)x;
+ return ll[0];
+ }
+
+ void resetPull() { nextjsobj = data; }
int pullInt() const { return pullInt(); }
int& pullInt() {
if ( nextjsobj == data )
@@ -140,10 +149,10 @@ namespace mongo {
BSONObj js(nextjsobj);
massert( 10305 , "Client Error: Invalid object size", js.objsize() > 3 );
massert( 10306 , "Client Error: Next object larger than space left in message",
- js.objsize() < ( theEnd - data ) );
+ js.objsize() < ( theEnd - data ) );
if ( objcheck && !js.valid() ) {
massert( 10307 , "Client Error: bad object in message", false);
- }
+ }
nextjsobj += js.objsize();
if ( nextjsobj >= theEnd )
nextjsobj = 0;
@@ -152,11 +161,12 @@ namespace mongo {
const Message& msg() const { return m; }
- void markSet(){
+ void markSet() {
mark = nextjsobj;
}
-
- void markReset(){
+
+ void markReset() {
+ assert( mark );
nextjsobj = mark;
}
@@ -180,7 +190,7 @@ namespace mongo {
int queryOptions;
BSONObj query;
BSONObj fields;
-
+
/* parses the message into the above fields */
QueryMessage(DbMessage& d) {
ns = d.getns();
@@ -232,8 +242,7 @@ namespace mongo {
/* object reply helper. */
inline void replyToQuery(int queryResultFlags,
AbstractMessagingPort* p, Message& requestMsg,
- BSONObj& responseObj)
- {
+ BSONObj& responseObj) {
replyToQuery(queryResultFlags,
p, requestMsg,
(void *) responseObj.objdata(), responseObj.objsize(), 1);
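
The new getInt64 reads a 64-bit value at a byte offset past the namespace string. For instance, in an OP_GET_MORE message the wire layout after the ns is an int32 numberToReturn followed by the int64 cursorID, so (assuming such a message is in hand) the cursor id sits 4 bytes in:

    DbMessage d( m );                    // m is an OP_GET_MORE Message (illustrative)
    const char *ns  = d.getns();
    int ntoreturn   = d.getInt( 0 );     // first int32 after the ns
    long long curId = d.getInt64( 4 );   // the 64-bit cursor id, 4 bytes after the ns
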
diff --git a/db/dbwebserver.cpp b/db/dbwebserver.cpp
index f17a283..7aa6148 100644
--- a/db/dbwebserver.cpp
+++ b/db/dbwebserver.cpp
@@ -32,6 +32,7 @@
#include "../util/version.h"
#include "../util/ramlog.h"
#include <pcrecpp.h>
+#include "../util/admin_access.h"
#include "dbwebserver.h"
#include <boost/date_time/posix_time/posix_time.hpp>
#undef assert
@@ -52,18 +53,20 @@ namespace mongo {
};
bool execCommand( Command * c ,
- Client& client , int queryOptions ,
- const char *ns, BSONObj& cmdObj ,
- BSONObjBuilder& result,
+ Client& client , int queryOptions ,
+ const char *ns, BSONObj& cmdObj ,
+ BSONObjBuilder& result,
bool fromRepl );
class DbWebServer : public MiniWebServer {
public:
- DbWebServer(const string& ip, int port) : MiniWebServer(ip, port) {
+ DbWebServer(const string& ip, int port, const AdminAccess* webUsers)
+ : MiniWebServer(ip, port), _webUsers(webUsers) {
WebStatusPlugin::initAll();
}
private:
+ const AdminAccess* _webUsers; // not owned here
void doUnlockedStuff(stringstream& ss) {
/* this is in the header already ss << "port: " << port << '\n'; */
@@ -75,37 +78,35 @@ namespace mongo {
ss << "</pre>";
}
- private:
-
bool allowed( const char * rq , vector<string>& headers, const SockAddr &from ) {
if ( from.isLocalHost() )
return true;
- if ( ! webHaveAdminUsers() )
+ if ( ! _webUsers->haveAdminUsers() )
return true;
string auth = getHeader( rq , "Authorization" );
- if ( auth.size() > 0 && auth.find( "Digest " ) == 0 ){
+ if ( auth.size() > 0 && auth.find( "Digest " ) == 0 ) {
auth = auth.substr( 7 ) + ", ";
map<string,string> parms;
pcrecpp::StringPiece input( auth );
-
+
string name, val;
pcrecpp::RE re("(\\w+)=\"?(.*?)\"?, ");
- while ( re.Consume( &input, &name, &val) ){
+ while ( re.Consume( &input, &name, &val) ) {
parms[name] = val;
}
- BSONObj user = webGetAdminUser( parms["username"] );
- if ( ! user.isEmpty() ){
+ BSONObj user = _webUsers->getAdminUser( parms["username"] );
+ if ( ! user.isEmpty() ) {
string ha1 = user["pwd"].str();
string ha2 = md5simpledigest( (string)"GET" + ":" + parms["uri"] );
-
+
stringstream r;
r << ha1 << ':' << parms["nonce"];
- if ( parms["nc"].size() && parms["cnonce"].size() && parms["qop"].size() ){
+ if ( parms["nc"].size() && parms["cnonce"].size() && parms["qop"].size() ) {
r << ':';
r << parms["nc"];
r << ':';
@@ -116,22 +117,20 @@ namespace mongo {
r << ':';
r << ha2;
string r1 = md5simpledigest( r.str() );
-
+
if ( r1 == parms["response"] )
return true;
}
-
-
}
-
+
stringstream authHeader;
- authHeader
- << "WWW-Authenticate: "
- << "Digest realm=\"mongo\", "
- << "nonce=\"abc\", "
- << "algorithm=MD5, qop=\"auth\" "
- ;
-
+ authHeader
+ << "WWW-Authenticate: "
+ << "Digest realm=\"mongo\", "
+ << "nonce=\"abc\", "
+ << "algorithm=MD5, qop=\"auth\" "
+ ;
+
headers.push_back( authHeader.str() );
return 0;
}
@@ -144,24 +143,39 @@ namespace mongo {
int& responseCode,
vector<string>& headers, // if completely empty, content-type: text/html will be added
const SockAddr &from
- )
- {
+ ) {
if ( url.size() > 1 ) {
-
+
if ( ! allowed( rq , headers, from ) ) {
responseCode = 401;
headers.push_back( "Content-Type: text/plain" );
responseMsg = "not allowed\n";
return;
- }
+ }
{
+ BSONObj params;
+ const size_t pos = url.find( "?" );
+ if ( pos != string::npos ) {
+ MiniWebServer::parseParams( params , url.substr( pos + 1 ) );
+ url = url.substr(0, pos);
+ }
+
DbWebHandler * handler = DbWebHandler::findHandler( url );
- if ( handler ){
- if ( handler->requiresREST( url ) && ! cmdLine.rest )
+ if ( handler ) {
+ if ( handler->requiresREST( url ) && ! cmdLine.rest ) {
_rejectREST( responseMsg , responseCode , headers );
- else
- handler->handle( rq , url , responseMsg , responseCode , headers , from );
+ }
+ else {
+ string callback = params.getStringField("jsonp");
+ uassert(13453, "server not started with --jsonp", callback.empty() || cmdLine.jsonp);
+
+ handler->handle( rq , url , params , responseMsg , responseCode , headers , from );
+
+ if (responseCode == 200 && !callback.empty()) {
+ responseMsg = callback + '(' + responseMsg + ')';
+ }
+ }
return;
}
}
@@ -171,27 +185,27 @@ namespace mongo {
_rejectREST( responseMsg , responseCode , headers );
return;
}
-
+
responseCode = 404;
headers.push_back( "Content-Type: text/html" );
responseMsg = "<html><body>unknown url</body></html>\n";
return;
}
-
+
// generate home page
- if ( ! allowed( rq , headers, from ) ){
+ if ( ! allowed( rq , headers, from ) ) {
responseCode = 401;
responseMsg = "not allowed\n";
return;
- }
+ }
responseCode = 200;
stringstream ss;
string dbname;
{
stringstream z;
- z << "mongod " << prettyHostName();
+ z << cmdLine.binaryName << ' ' << prettyHostName();
dbname = z.str();
}
ss << start(dbname) << h2(dbname);
@@ -202,12 +216,18 @@ namespace mongo {
{
const map<string, Command*> *m = Command::webCommands();
if( m ) {
- ss << a("", "These read-only context-less commands can be executed from the web interface. Results are json format, unless ?text is appended in which case the result is output as text for easier human viewing", "Commands") << ": ";
- for( map<string, Command*>::const_iterator i = m->begin(); i != m->end(); i++ ) {
+ ss <<
+ a("",
+ "These read-only context-less commands can be executed from the web interface. "
+ "Results are json format, unless ?text=1 is appended in which case the result is output as text "
+ "for easier human viewing",
+ "Commands")
+ << ": ";
+ for( map<string, Command*>::const_iterator i = m->begin(); i != m->end(); i++ ) {
stringstream h;
i->second->help(h);
string help = h.str();
- ss << "<a href=\"/" << i->first << "?text\"";
+ ss << "<a href=\"/" << i->first << "?text=1\"";
if( help != "no help defined" )
ss << " title=\"" << help << '"';
ss << ">" << i->first << "</a> ";
@@ -216,69 +236,67 @@ namespace mongo {
}
}
ss << '\n';
- /*
- ss << "HTTP <a "
- "title=\"click for documentation on this http interface\""
- "href=\"http://www.mongodb.org/display/DOCS/Http+Interface\">admin port</a>:" << _port << "<p>\n";
- */
+ /*
+ ss << "HTTP <a "
+ "title=\"click for documentation on this http interface\""
+ "href=\"http://www.mongodb.org/display/DOCS/Http+Interface\">admin port</a>:" << _port << "<p>\n";
+ */
doUnlockedStuff(ss);
WebStatusPlugin::runAll( ss );
-
+
ss << "</body></html>\n";
responseMsg = ss.str();
-
-
}
- void _rejectREST( string& responseMsg , int& responseCode, vector<string>& headers ){
- responseCode = 403;
- stringstream ss;
- ss << "REST is not enabled. use --rest to turn on.\n";
- ss << "check that port " << _port << " is secured for the network too.\n";
- responseMsg = ss.str();
- headers.push_back( "Content-Type: text/plain" );
+ void _rejectREST( string& responseMsg , int& responseCode, vector<string>& headers ) {
+ responseCode = 403;
+ stringstream ss;
+ ss << "REST is not enabled. use --rest to turn on.\n";
+ ss << "check that port " << _port << " is secured for the network too.\n";
+ responseMsg = ss.str();
+ headers.push_back( "Content-Type: text/plain" );
}
};
// ---
-
- bool prisort( const Prioritizable * a , const Prioritizable * b ){
+
+ bool prisort( const Prioritizable * a , const Prioritizable * b ) {
return a->priority() < b->priority();
}
// -- status framework ---
- WebStatusPlugin::WebStatusPlugin( const string& secionName , double priority , const string& subheader )
+ WebStatusPlugin::WebStatusPlugin( const string& secionName , double priority , const string& subheader )
: Prioritizable(priority), _name( secionName ) , _subHeading( subheader ) {
if ( ! _plugins )
_plugins = new vector<WebStatusPlugin*>();
_plugins->push_back( this );
}
- void WebStatusPlugin::initAll(){
+ void WebStatusPlugin::initAll() {
if ( ! _plugins )
return;
-
+
sort( _plugins->begin(), _plugins->end() , prisort );
-
+
for ( unsigned i=0; i<_plugins->size(); i++ )
(*_plugins)[i]->init();
}
- void WebStatusPlugin::runAll( stringstream& ss ){
+ void WebStatusPlugin::runAll( stringstream& ss ) {
if ( ! _plugins )
return;
-
- for ( unsigned i=0; i<_plugins->size(); i++ ){
+
+ for ( unsigned i=0; i<_plugins->size(); i++ ) {
WebStatusPlugin * p = (*_plugins)[i];
- ss << "<hr>\n"
+ ss << "<hr>\n"
<< "<b>" << p->_name << "</b>";
-
+
ss << " " << p->_subHeading;
ss << "<br>\n";
-
+
p->run(ss);
}
@@ -290,29 +308,30 @@ namespace mongo {
class LogPlugin : public WebStatusPlugin {
public:
- LogPlugin() : WebStatusPlugin( "Log" , 100 ), _log(0){
+ LogPlugin() : WebStatusPlugin( "Log" , 100 ), _log(0) {
}
-
- virtual void init(){
+
+ virtual void init() {
assert( ! _log );
_log = new RamLog();
Logstream::get().addGlobalTee( _log );
}
- virtual void run( stringstream& ss ){
+ virtual void run( stringstream& ss ) {
_log->toHTML( ss );
}
RamLog * _log;
};
-
+
LogPlugin * logPlugin = new LogPlugin();
// -- handler framework ---
DbWebHandler::DbWebHandler( const string& name , double priority , bool requiresREST )
- : Prioritizable(priority), _name(name) , _requiresREST(requiresREST){
+ : Prioritizable(priority), _name(name) , _requiresREST(requiresREST) {
- { // setup strings
+ {
+ // setup strings
_defaultUrl = "/";
_defaultUrl += name;
@@ -320,8 +339,9 @@ namespace mongo {
ss << name << " priority: " << priority << " rest: " << requiresREST;
_toString = ss.str();
}
-
- { // add to handler list
+
+ {
+ // add to handler list
if ( ! _handlers )
_handlers = new vector<DbWebHandler*>();
_handlers->push_back( this );
@@ -329,11 +349,11 @@ namespace mongo {
}
}
- DbWebHandler * DbWebHandler::findHandler( const string& url ){
+ DbWebHandler * DbWebHandler::findHandler( const string& url ) {
if ( ! _handlers )
return 0;
-
- for ( unsigned i=0; i<_handlers->size(); i++ ){
+
+ for ( unsigned i=0; i<_handlers->size(); i++ ) {
DbWebHandler * h = (*_handlers)[i];
if ( h->handles( url ) )
return h;
@@ -341,76 +361,71 @@ namespace mongo {
return 0;
}
-
+
vector<DbWebHandler*> * DbWebHandler::_handlers = 0;
// --- basic handlers ---
class FavIconHandler : public DbWebHandler {
public:
- FavIconHandler() : DbWebHandler( "favicon.ico" , 0 , false ){}
+ FavIconHandler() : DbWebHandler( "favicon.ico" , 0 , false ) {}
- virtual void handle( const char *rq, string url,
+ virtual void handle( const char *rq, string url, BSONObj params,
string& responseMsg, int& responseCode,
- vector<string>& headers, const SockAddr &from ){
+ vector<string>& headers, const SockAddr &from ) {
responseCode = 404;
headers.push_back( "Content-Type: text/plain" );
responseMsg = "no favicon\n";
}
} faviconHandler;
-
+
class StatusHandler : public DbWebHandler {
public:
- StatusHandler() : DbWebHandler( "_status" , 1 , false ){}
-
- virtual void handle( const char *rq, string url,
+ StatusHandler() : DbWebHandler( "_status" , 1 , false ) {}
+
+ virtual void handle( const char *rq, string url, BSONObj params,
string& responseMsg, int& responseCode,
- vector<string>& headers, const SockAddr &from ){
+ vector<string>& headers, const SockAddr &from ) {
headers.push_back( "Content-Type: application/json" );
responseCode = 200;
-
+
static vector<string> commands;
- if ( commands.size() == 0 ){
+ if ( commands.size() == 0 ) {
commands.push_back( "serverStatus" );
commands.push_back( "buildinfo" );
}
-
- BSONObj params;
- if ( url.find( "?" ) != string::npos ) {
- MiniWebServer::parseParams( params , url.substr( url.find( "?" ) + 1 ) );
- }
-
+
BSONObjBuilder buf(1024);
-
- for ( unsigned i=0; i<commands.size(); i++ ){
+
+ for ( unsigned i=0; i<commands.size(); i++ ) {
string cmd = commands[i];
Command * c = Command::findCommand( cmd );
assert( c );
assert( c->locktype() == 0 );
-
+
BSONObj co;
{
BSONObjBuilder b;
b.append( cmd , 1 );
-
- if ( cmd == "serverStatus" && params["repl"].type() ){
+
+ if ( cmd == "serverStatus" && params["repl"].type() ) {
b.append( "repl" , atoi( params["repl"].valuestr() ) );
}
-
+
co = b.obj();
}
-
+
string errmsg;
-
+
BSONObjBuilder sub;
if ( ! c->run( "admin.$cmd" , co , errmsg , sub , false ) )
buf.append( cmd , errmsg );
else
buf.append( cmd , sub.obj() );
}
-
+
responseMsg = buf.obj().jsonString();
}
@@ -419,14 +434,14 @@ namespace mongo {
class CommandListHandler : public DbWebHandler {
public:
- CommandListHandler() : DbWebHandler( "_commands" , 1 , true ){}
-
- virtual void handle( const char *rq, string url,
+ CommandListHandler() : DbWebHandler( "_commands" , 1 , true ) {}
+
+ virtual void handle( const char *rq, string url, BSONObj params,
string& responseMsg, int& responseCode,
- vector<string>& headers, const SockAddr &from ){
+ vector<string>& headers, const SockAddr &from ) {
headers.push_back( "Content-Type: text/html" );
responseCode = 200;
-
+
stringstream ss;
ss << start("Commands List");
ss << p( a("/", "back", "Home") );
@@ -435,41 +450,21 @@ namespace mongo {
ss << "S:slave-ok R:read-lock W:write-lock A:admin-only<br>\n";
ss << table();
ss << "<tr><th>Command</th><th>Attributes</th><th>Help</th></tr>\n";
- for( map<string, Command*>::const_iterator i = m->begin(); i != m->end(); i++ )
+ for( map<string, Command*>::const_iterator i = m->begin(); i != m->end(); i++ )
i->second->htmlHelp(ss);
ss << _table() << _end();
-
+
responseMsg = ss.str();
}
} commandListHandler;
class CommandsHandler : public DbWebHandler {
public:
- CommandsHandler() : DbWebHandler( "DUMMY COMMANDS" , 2 , true ){}
-
- bool _cmd( const string& url , string& cmd , bool& text ) const {
- const char * x = url.c_str();
-
- if ( x[0] != '/' ){
- // this should never happen
- return false;
- }
-
- if ( strchr( x + 1 , '/' ) )
- return false;
-
- x++;
+ CommandsHandler() : DbWebHandler( "DUMMY COMMANDS" , 2 , true ) {}
- const char * end = strstr( x , "?text" );
- if ( end ){
- text = true;
- cmd = string( x , end - x );
- }
- else {
- text = false;
- cmd = string(x);
- }
-
+ bool _cmd( const string& url , string& cmd , bool& text, bo params ) const {
+ cmd = str::after(url, '/');
+ text = params["text"].boolean();
return true;
}
@@ -477,45 +472,43 @@ namespace mongo {
const map<string,Command*> *m = Command::webCommands();
if( ! m )
return 0;
-
+
map<string,Command*>::const_iterator i = m->find(cmd);
if ( i == m->end() )
return 0;
-
+
return i->second;
}
- virtual bool handles( const string& url ) const {
+ virtual bool handles( const string& url ) const {
string cmd;
bool text;
- if ( ! _cmd( url , cmd , text ) )
+ if ( ! _cmd( url , cmd , text, bo() ) )
return false;
-
- return _cmd( cmd );
+ return _cmd(cmd) != 0;
}
-
- virtual void handle( const char *rq, string url,
+
+ virtual void handle( const char *rq, string url, BSONObj params,
string& responseMsg, int& responseCode,
- vector<string>& headers, const SockAddr &from ){
-
+ vector<string>& headers, const SockAddr &from ) {
string cmd;
bool text = false;
- assert( _cmd( url , cmd , text ) );
+ assert( _cmd( url , cmd , text, params ) );
Command * c = _cmd( cmd );
assert( c );
BSONObj cmdObj = BSON( cmd << 1 );
Client& client = cc();
-
+
BSONObjBuilder result;
execCommand(c, client, 0, "admin.", cmdObj , result, false);
-
+
responseCode = 200;
-
- string j = result.done().jsonString(JS, text );
+
+ string j = result.done().jsonString(Strict, text );
responseMsg = j;
-
- if( text ){
+
+ if( text ) {
headers.push_back( "Content-Type: text/plain" );
responseMsg += '\n';
}
@@ -524,23 +517,16 @@ namespace mongo {
}
}
-
+
} commandsHandler;
// --- external ----
- string prettyHostName() {
- stringstream s;
- s << getHostName();
- if( mongo::cmdLine.port != CmdLine::DefaultDBPort )
- s << ':' << mongo::cmdLine.port;
- return s.str();
- }
-
- void webServerThread() {
+ void webServerThread(const AdminAccess* adminAccess) {
+ boost::scoped_ptr<const AdminAccess> adminAccessPtr(adminAccess); // adminAccess is owned here
Client::initThread("websvr");
const int p = cmdLine.port + 1000;
- DbWebServer mini(cmdLine.bind_ip, p);
+ DbWebServer mini(cmdLine.bind_ip, p, adminAccessPtr.get());
log() << "web admin interface listening on port " << p << endl;
mini.initAndListen();
cc().shutdown();
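
doRequest now parses the query string once and hands every DbWebHandler a BSONObj of parameters, which is also what enables the ?jsonp= wrapping above. A hypothetical handler showing the new signature (the name, URL, and field are made up):

    class EchoHandler : public DbWebHandler {
    public:
        EchoHandler() : DbWebHandler( "_echo" , 1 , /*requiresREST=*/ true ) {}
        virtual void handle( const char *rq, string url, BSONObj params,
                             string& responseMsg, int& responseCode,
                             vector<string>& headers, const SockAddr &from ) {
            headers.push_back( "Content-Type: application/json" );
            responseCode = 200;
            // GET /_echo?msg=hi  ->  params["msg"] == "hi"; the query string is already stripped from url
            responseMsg = BSON( "echo" << params["msg"].str() ).jsonString();
        }
    } echoHandler;
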
diff --git a/db/dbwebserver.h b/db/dbwebserver.h
index d1a2f0d..bdbcba2 100644
--- a/db/dbwebserver.h
+++ b/db/dbwebserver.h
@@ -17,20 +17,22 @@
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
+#include "../util/admin_access.h"
+
namespace mongo {
class Prioritizable {
public:
- Prioritizable( double p ) : _priority(p){}
+ Prioritizable( double p ) : _priority(p) {}
double priority() const { return _priority; }
private:
double _priority;
};
-
+
class DbWebHandler : public Prioritizable {
public:
DbWebHandler( const string& name , double priority , bool requiresREST );
- virtual ~DbWebHandler(){}
+ virtual ~DbWebHandler() {}
virtual bool handles( const string& url ) const { return url == _defaultUrl; }
@@ -38,20 +40,21 @@ namespace mongo {
virtual void handle( const char *rq, // the full request
string url,
+ BSONObj params,
// set these and return them:
string& responseMsg,
int& responseCode,
vector<string>& headers, // if completely empty, content-type: text/html will be added
const SockAddr &from
- ) = 0;
-
+ ) = 0;
+
string toString() const { return _toString; }
static DbWebHandler * findHandler( const string& url );
private:
string _name;
bool _requiresREST;
-
+
string _defaultUrl;
string _toString;
@@ -61,8 +64,8 @@ namespace mongo {
class WebStatusPlugin : public Prioritizable {
public:
WebStatusPlugin( const string& secionName , double priority , const string& subheader = "" );
- virtual ~WebStatusPlugin(){}
-
+ virtual ~WebStatusPlugin() {}
+
virtual void run( stringstream& ss ) = 0;
/** called when web server starts up */
virtual void init() = 0;
@@ -73,18 +76,10 @@ namespace mongo {
string _name;
string _subHeading;
static vector<WebStatusPlugin*> * _plugins;
-
+
};
- void webServerThread();
+ void webServerThread( const AdminAccess* admins );
string prettyHostName();
-
- /** @return if there are any admin users. this should not block for long and throw if can't get a lock if needed */
- bool webHaveAdminUsers();
-
- /** @return admin user with this name. this should not block for long and throw if can't get a lock if needed */
- BSONObj webGetAdminUser( const string& username );
};
-
-
diff --git a/db/diskloc.h b/db/diskloc.h
index 2747abd..f356c73 100644
--- a/db/diskloc.h
+++ b/db/diskloc.h
@@ -14,7 +14,7 @@
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
-/* storage.h
+/* @file diskloc.h
Storage subsystem management.
Lays out our datafiles on disk, manages disk space.
@@ -26,7 +26,6 @@
namespace mongo {
-
class Record;
class DeletedRecord;
class Extent;
@@ -34,77 +33,64 @@ namespace mongo {
class MongoDataFile;
#pragma pack(1)
+ /** represents a disk location/offset on disk in a database. 64 bits.
+ it is assumed these will be passed around by value a lot so don't do anything to make them large
+ (such as adding a virtual function)
+ */
class DiskLoc {
- int fileNo; /* this will be volume, file #, etc. */
+ int _a; // this will be volume, file #, etc. but is a logical value could be anything depending on storage engine
int ofs;
+
public:
- // Note: MaxFiles imposes a limit of about 32TB of data per process
- enum SentinelValues { MaxFiles=16000, NullOfs = -1 };
- int a() const {
- return fileNo;
- }
+ enum SentinelValues {
+ NullOfs = -1,
+ MaxFiles=16000 // thus a limit of about 32TB of data per db
+ };
- DiskLoc(int a, int b) : fileNo(a), ofs(b) {
- //assert(ofs!=0);
- }
+ DiskLoc(int a, int b) : _a(a), ofs(b) { }
DiskLoc() { Null(); }
DiskLoc(const DiskLoc& l) {
- fileNo=l.fileNo;
+ _a=l._a;
ofs=l.ofs;
}
- bool questionable() {
+ bool questionable() const {
return ofs < -1 ||
- fileNo < -1 ||
- fileNo > 524288;
+ _a < -1 ||
+ _a > 524288;
}
- bool isNull() const {
- return fileNo == -1;
- // return ofs == NullOfs;
- }
+ bool isNull() const { return _a == -1; }
void Null() {
- fileNo = -1;
- ofs = 0;
- }
- void assertOk() {
- assert(!isNull());
+ _a = -1;
+ ofs = 0; /* note NullOfs is different. todo clean up. see refs to NullOfs in code - use is valid but outside DiskLoc context so confusing as-is. */
}
+ void assertOk() { assert(!isNull()); }
void setInvalid() {
- fileNo = -2;
+ _a = -2;
ofs = 0;
}
- bool isValid() const {
- return fileNo != -2;
- }
+ bool isValid() const { return _a != -2; }
string toString() const {
if ( isNull() )
return "null";
stringstream ss;
- ss << hex << fileNo << ':' << ofs;
+ ss << hex << _a << ':' << ofs;
return ss.str();
}
- BSONObj toBSONObj() const {
- return BSON( "file" << fileNo << "offset" << ofs );
- }
+ BSONObj toBSONObj() const { return BSON( "file" << _a << "offset" << ofs ); }
- int& GETOFS() {
- return ofs;
- }
- int getOfs() const {
- return ofs;
- }
+ int a() const { return _a; }
+
+ int& GETOFS() { return ofs; }
+ int getOfs() const { return ofs; }
void set(int a, int b) {
- fileNo=a;
+ _a=a;
ofs=b;
}
- void setOfs(int _fileNo, int _ofs) {
- fileNo = _fileNo;
- ofs = _ofs;
- }
void inc(int amt) {
assert( !isNull() );
@@ -112,23 +98,23 @@ namespace mongo {
}
bool sameFile(DiskLoc b) {
- return fileNo == b.fileNo;
+ return _a== b._a;
}
bool operator==(const DiskLoc& b) const {
- return fileNo==b.fileNo && ofs == b.ofs;
+ return _a==b._a&& ofs == b.ofs;
}
bool operator!=(const DiskLoc& b) const {
return !(*this==b);
}
const DiskLoc& operator=(const DiskLoc& b) {
- fileNo=b.fileNo;
+ _a=b._a;
ofs = b.ofs;
//assert(ofs!=0);
return *this;
}
int compare(const DiskLoc& b) const {
- int x = fileNo - b.fileNo;
+ int x = _a - b._a;
if ( x )
return x;
return ofs - b.ofs;
@@ -137,18 +123,27 @@ namespace mongo {
return compare(b) < 0;
}
- /* get the "thing" associated with this disk location.
- it is assumed the object is what it is -- you must asure that:
- think of this as an unchecked type cast.
+ /**
+ * Marks this disk loc for writing
+ * @returns a non const reference to this disk loc
+ * This function explicitly signals we are writing and casts away const
+ */
+ DiskLoc& writing() const; // see dur.h
+
+ /* Get the "thing" associated with this disk location.
+ it is assumed the object is what you say it is -- you must assure that
+ (think of this as an unchecked type cast)
+ Note: set your Context first so that the database to which the diskloc applies is known.
*/
BSONObj obj() const;
Record* rec() const;
DeletedRecord* drec() const;
Extent* ext() const;
- BtreeBucket* btree() const;
- BtreeBucket* btreemod() const; // marks modified / dirty
+ const BtreeBucket* btree() const;
+ // Explicitly signals we are writing and casts away const
+ BtreeBucket* btreemod() const;
- MongoDataFile& pdf() const;
+ /*MongoDataFile& pdf() const;*/
};
#pragma pack()
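
The writing() and btreemod() accessors exist so callers declare intent with the durability layer (see dur.h) before mutating memory-mapped data. A sketch of the intended pattern, with the surrounding objects assumed rather than shown:

    // 'loc' is assumed to be a DiskLoc field living inside a memory-mapped structure
    loc.writing() = newLoc;                    // declare write intent, then assign through the returned reference
    const BtreeBucket *b  = loc.btree();       // read-only access
    BtreeBucket       *bw = loc.btreemod();    // writable access; intent is declared for the whole bucket
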
diff --git a/db/driverHelpers.cpp b/db/driverHelpers.cpp
index d8971ad..d98a33b 100644
--- a/db/driverHelpers.cpp
+++ b/db/driverHelpers.cpp
@@ -24,11 +24,11 @@
#include "pch.h"
#include "jsobj.h"
#include "pdfile.h"
-#include "namespace.h"
+#include "namespace-inl.h"
#include "commands.h"
#include "cmdline.h"
#include "btree.h"
-#include "curop.h"
+#include "curop-inl.h"
#include "../util/background.h"
#include "../scripting/engine.h"
@@ -36,18 +36,18 @@ namespace mongo {
class BasicDriverHelper : public Command {
public:
- BasicDriverHelper( const char * name ) : Command( name ){}
-
+ BasicDriverHelper( const char * name ) : Command( name ) {}
+
virtual LockType locktype() const { return NONE; }
virtual bool slaveOk() const { return true; }
- virtual bool slaveOverrideOk(){ return true; }
+ virtual bool slaveOverrideOk() { return true; }
};
class ObjectIdTest : public BasicDriverHelper {
public:
- ObjectIdTest() : BasicDriverHelper( "driverOIDTest" ){}
- virtual bool run(const string& , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl){
- if ( cmdObj.firstElement().type() != jstOID ){
+ ObjectIdTest() : BasicDriverHelper( "driverOIDTest" ) {}
+ virtual bool run(const string& , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ if ( cmdObj.firstElement().type() != jstOID ) {
errmsg = "not oid";
return false;
}
diff --git a/db/dur.cpp b/db/dur.cpp
new file mode 100644
index 0000000..15b4565
--- /dev/null
+++ b/db/dur.cpp
@@ -0,0 +1,635 @@
+// @file dur.cpp durability in the storage engine (crash-safeness / journaling)
+
+/**
+* Copyright (C) 2009 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/*
+ phases
+
+ PREPLOGBUFFER
+ we will build an output buffer ourself and then use O_DIRECT
+ we could be in read lock for this
+ for very large objects write directly to redo log in situ?
+ WRITETOJOURNAL
+ we could be unlocked (the main db lock that is...) for this, with sufficient care, but there is some complexity
+ have to handle falling behind which would use too much ram (going back into a read lock would suffice to stop that).
+ for now (1.7.5/1.8.0) we are in read lock which is not ideal.
+ WRITETODATAFILES
+ apply the writes back to the non-private MMF after they are for certain in redo log
+ REMAPPRIVATEVIEW
+ we could in a write lock quickly flip readers back to the main view, then stay in read lock and do our real
+ remapping. with many files (e.g., 1000), remapping could be time consuming (several ms), so we don't want
+ to be too frequent.
+ there could be a slow down immediately after remapping as fresh copy-on-writes for commonly written pages will
+ be required. so doing these remaps fractionally is helpful.
+
+ @see https://docs.google.com/drawings/edit?id=1TklsmZzm7ohIZkwgeK6rMvsdaR13KjtJYMsfLr175Zc
+*/
+
+#include "pch.h"
+#include "cmdline.h"
+#include "client.h"
+#include "dur.h"
+#include "dur_journal.h"
+#include "dur_commitjob.h"
+#include "dur_recover.h"
+#include "../util/concurrency/race.h"
+#include "../util/mongoutils/hash.h"
+#include "../util/mongoutils/str.h"
+#include "../util/timer.h"
+#include "dur_stats.h"
+
+using namespace mongoutils;
+
+namespace mongo {
+
+ namespace dur {
+
+ void WRITETODATAFILES();
+ void PREPLOGBUFFER();
+
+ /** declared later in this file
+ only used in this file -- use DurableInterface::commitNow() outside
+ */
+ static void groupCommit();
+
+ CommitJob commitJob;
+
+ Stats stats;
+
+ void Stats::S::reset() {
+ memset(this, 0, sizeof(*this));
+ }
+
+ Stats::Stats() {
+ _a.reset();
+ _b.reset();
+ curr = &_a;
+ _intervalMicros = 3000000;
+ }
+
+ Stats::S * Stats::other() {
+ return curr == &_a ? &_b : &_a;
+ }
+
+ BSONObj Stats::S::_asObj() {
+ return BSON(
+ "commits" << _commits <<
+ "journaledMB" << _journaledBytes / 1000000.0 <<
+ "writeToDataFilesMB" << _writeToDataFilesBytes / 1000000.0 <<
+ "commitsInWriteLock" << _commitsInWriteLock <<
+ "earlyCommits" << _earlyCommits <<
+ "timeMs" <<
+ BSON( "dt" << _dtMillis <<
+ "prepLogBuffer" << (unsigned) (_prepLogBufferMicros/1000) <<
+ "writeToJournal" << (unsigned) (_writeToJournalMicros/1000) <<
+ "writeToDataFiles" << (unsigned) (_writeToDataFilesMicros/1000) <<
+ "remapPrivateView" << (unsigned) (_remapPrivateViewMicros/1000)
+ )
+ );
+ }
+
+ BSONObj Stats::asObj() {
+ return other()->_asObj();
+ }
+
+ void Stats::rotate() {
+ unsigned long long now = curTimeMicros64();
+ unsigned long long dt = now - _lastRotate;
+ if( dt >= _intervalMicros && _intervalMicros ) {
+ // rotate
+ curr->_dtMillis = (unsigned) (dt/1000);
+ _lastRotate = now;
+ curr = other();
+ curr->reset();
+ }
+ }
+
+ void NonDurableImpl::setNoJournal(void *dst, void *src, unsigned len) {
+ memcpy(dst, src, len);
+ }
+
+ void DurableImpl::setNoJournal(void *dst, void *src, unsigned len) {
+ MemoryMappedFile::makeWritable(dst, len);
+
+ // we stay in this mutex for everything to work with DurParanoid/validateSingleMapMatches
+ //
+ // this also makes setNoJournal threadsafe, which is good as we call it from a read (not a write) lock
+ // in class SlaveTracking
+ //
+ scoped_lock lk( privateViews._mutex() );
+ size_t ofs;
+ MongoMMF *f = privateViews.find_inlock(dst, ofs);
+ assert(f);
+ void *w = (((char *)f->view_write())+ofs);
+ // first write it to the writable (file) view
+ memcpy(w, src, len);
+ if( memcmp(w, dst, len) ) {
+ // if we get here, a copy-on-write had previously occurred. so write it to the private view too
+ // to keep them in sync. we do this as we do not want to cause a copy on write unnecessarily.
+ memcpy(dst, src, len);
+ }
+ }
+
+ /** base declare write intent function that all the helpers call. */
+ void DurableImpl::declareWriteIntent(void *p, unsigned len) {
+ commitJob.note(p, len);
+ }
+
+ static DurableImpl* durableImpl = new DurableImpl();
+ static NonDurableImpl* nonDurableImpl = new NonDurableImpl();
+ DurableInterface* DurableInterface::_impl = nonDurableImpl;
+
+ void DurableInterface::enableDurability() {
+ assert(_impl == nonDurableImpl);
+ _impl = durableImpl;
+ }
+
+ void DurableInterface::disableDurability() {
+ assert(_impl == durableImpl);
+ massert(13616, "can't disable durability with pending writes", !commitJob.hasWritten());
+ _impl = nonDurableImpl;
+ }
+
+ bool DurableImpl::commitNow() {
+ stats.curr->_earlyCommits++;
+ groupCommit();
+ return true;
+ }
+
+ bool DurableImpl::awaitCommit() {
+ commitJob.awaitNextCommit();
+ return true;
+ }
+
+ /** Declare that a file has been created
+ Normally writes are applied only after journaling, for safety. But here the file
+ is created first, and the journal will just replay the creation if the create didn't
+ happen because of crashing.
+ */
+ void DurableImpl::createdFile(string filename, unsigned long long len) {
+ shared_ptr<DurOp> op( new FileCreatedOp(filename, len) );
+ commitJob.noteOp(op);
+ }
+
+ void* DurableImpl::writingPtr(void *x, unsigned len) {
+ void *p = x;
+ declareWriteIntent(p, len);
+ return p;
+ }
+
+ /** declare intent to write
+ @param ofs offset within buf at which we will write
+ @param len the length at ofs we will write
+ @return new buffer pointer.
+ */
+ void* DurableImpl::writingAtOffset(void *buf, unsigned ofs, unsigned len) {
+ char *p = (char *) buf;
+ declareWriteIntent(p+ofs, len);
+ return p;
+ }
+
+ void* DurableImpl::writingRangesAtOffsets(void *buf, const vector< pair< long long, unsigned > > &ranges ) {
+ char *p = (char *) buf;
+ for( vector< pair< long long, unsigned > >::const_iterator i = ranges.begin();
+ i != ranges.end(); ++i ) {
+ declareWriteIntent( p + i->first, i->second );
+ }
+ return p;
+ }
+
+ bool DurableImpl::commitIfNeeded() {
+ DEV commitJob._nSinceCommitIfNeededCall = 0;
+ if (commitJob.bytes() > UncommittedBytesLimit) { // should this also fire if CmdLine::DurAlwaysCommit?
+ stats.curr->_earlyCommits++;
+ groupCommit();
+ return true;
+ }
+ return false;
+ }
+
+ /** Used in _DEBUG builds to check that we didn't overwrite the last intent
+ that was declared. called just before writelock release. we check a few
+ bytes after the declared region to see if they changed.
+
+ @see MongoMutex::_releasedWriteLock
+
+ SLOW
+ */
+#if 0
+ void DurableImpl::debugCheckLastDeclaredWrite() {
+ static int n;
+ ++n;
+
+ assert(debug && cmdLine.dur);
+ if (commitJob.writes().empty())
+ return;
+ const WriteIntent &i = commitJob.lastWrite();
+ size_t ofs;
+ MongoMMF *mmf = privateViews.find(i.start(), ofs);
+ if( mmf == 0 )
+ return;
+ size_t past = ofs + i.length();
+ if( mmf->length() < past + 8 )
+ return; // too close to end of view
+ char *priv = (char *) mmf->getView();
+ char *writ = (char *) mmf->view_write();
+ unsigned long long *a = (unsigned long long *) (priv+past);
+ unsigned long long *b = (unsigned long long *) (writ+past);
+ if( *a != *b ) {
+ for( set<WriteIntent>::iterator it(commitJob.writes().begin()), end((commitJob.writes().begin())); it != end; ++it ) {
+ const WriteIntent& wi = *it;
+ char *r1 = (char*) wi.start();
+ char *r2 = (char*) wi.end();
+ if( r1 <= (((char*)a)+8) && r2 > (char*)a ) {
+ //log() << "it's ok " << wi.p << ' ' << wi.len << endl;
+ return;
+ }
+ }
+ log() << "dur data after write area " << i.start() << " does not agree" << endl;
+ log() << " was: " << ((void*)b) << " " << hexdump((char*)b, 8) << endl;
+ log() << " now: " << ((void*)a) << " " << hexdump((char*)a, 8) << endl;
+ log() << " n: " << n << endl;
+ log() << endl;
+ }
+ }
+#endif
+
+ /** write the buffer we have built to the journal and fsync it.
+ outside of lock as that could be slow.
+ */
+ static void WRITETOJOURNAL(AlignedBuilder& ab) {
+ Timer t;
+ journal(ab);
+ stats.curr->_writeToJournalMicros += t.micros();
+ }
+
+ // Functor to be called over all MongoFiles
+
+ class validateSingleMapMatches {
+ public:
+ validateSingleMapMatches(unsigned long long& bytes) :_bytes(bytes) {}
+ void operator () (MongoFile *mf) {
+ if( mf->isMongoMMF() ) {
+ MongoMMF *mmf = (MongoMMF*) mf;
+ const char *p = (const char *) mmf->getView();
+ const char *w = (const char *) mmf->view_write();
+
+ if (!p || !w) return; // File not fully opened yet
+
+ _bytes += mmf->length();
+
+ assert( mmf->length() == (unsigned) mmf->length() );
+ {
+ scoped_lock lk( privateViews._mutex() ); // see setNoJournal
+ if (memcmp(p, w, (unsigned) mmf->length()) == 0)
+ return; // next file
+ }
+
+ unsigned low = 0xffffffff;
+ unsigned high = 0;
+ log() << "DurParanoid mismatch in " << mmf->filename() << endl;
+ int logged = 0;
+ unsigned lastMismatch = 0xffffffff;
+ for( unsigned i = 0; i < mmf->length(); i++ ) {
+ if( p[i] != w[i] ) {
+ if( lastMismatch != 0xffffffff && lastMismatch+1 != i )
+ log() << endl; // separate blocks of mismatches
+ lastMismatch= i;
+ if( ++logged < 60 ) {
+ stringstream ss;
+ ss << "mismatch ofs:" << hex << i << "\tfilemap:" << setw(2) << (unsigned) w[i] << "\tprivmap:" << setw(2) << (unsigned) p[i];
+ if( p[i] > 32 && p[i] <= 126 )
+ ss << '\t' << p[i];
+ log() << ss.str() << endl;
+ }
+ if( logged == 60 )
+ log() << "..." << endl;
+ if( i < low ) low = i;
+ if( i > high ) high = i;
+ }
+ }
+ if( low != 0xffffffff ) {
+ std::stringstream ss;
+ ss << "dur error warning views mismatch " << mmf->filename() << ' ' << (hex) << low << ".." << high << " len:" << high-low+1;
+ log() << ss.str() << endl;
+ log() << "priv loc: " << (void*)(p+low) << ' ' << endl;
+ set<WriteIntent>& b = commitJob.writes();
+ (void)b; // mark as unused. Useful for inspection in debugger
+
+ // should we abort() here so this doesn't go unnoticed in some circumstances?
+ massert(13599, "Written data does not match in-memory view. Missing WriteIntent?", false);
+ }
+ }
+ }
+ private:
+ unsigned long long& _bytes;
+ };
+
+ /** (SLOW) diagnostic to check that the private view and the non-private view are in sync.
+ */
+ void debugValidateAllMapsMatch() {
+ if( ! (cmdLine.durOptions & CmdLine::DurParanoid) )
+ return;
+
+ unsigned long long bytes = 0;
+ Timer t;
+ MongoFile::forEach(validateSingleMapMatches(bytes));
+ OCCASIONALLY log() << "DurParanoid map check " << t.millis() << "ms for " << (bytes / (1024*1024)) << "MB" << endl;
+ }
+
+ extern size_t privateMapBytes;
+
+ /** We need to remap the private views periodically; otherwise they would grow very large.
+ Call within the write lock.
+ */
+ void _REMAPPRIVATEVIEW() {
+ static unsigned startAt;
+ static unsigned long long lastRemap;
+
+ dbMutex.assertWriteLocked();
+ dbMutex._remapPrivateViewRequested = false;
+ assert( !commitJob.hasWritten() );
+
+ // we want to remap all private views about every 2 seconds. there could be ~1000 views, so
+ // we do a little each pass. beyond the remap cost itself, more significantly, there will be
+ // copy-on-write faults after remapping, so doing a little at a time avoids big load spikes
+ // from remapping.
+ unsigned long long now = curTimeMicros64();
+ double fraction = (now-lastRemap)/2000000.0;
+ lastRemap = now;
+
+ rwlock lk(MongoFile::mmmutex, false);
+ set<MongoFile*>& files = MongoFile::getAllFiles();
+ unsigned sz = files.size();
+ if( sz == 0 )
+ return;
+
+ {
+ // be careful not to use too much memory if the write rate is
+ // extremely high
+ double f = privateMapBytes / ((double)UncommittedBytesLimit);
+ if( f > fraction ) {
+ fraction = f;
+ }
+ privateMapBytes = 0;
+ }
+
+ unsigned ntodo = (unsigned) (sz * fraction);
+ if( ntodo < 1 ) ntodo = 1;
+ if( ntodo > sz ) ntodo = sz;
+
+ const set<MongoFile*>::iterator b = files.begin();
+ const set<MongoFile*>::iterator e = files.end();
+ set<MongoFile*>::iterator i = b;
+ // skip to our starting position
+ for( unsigned x = 0; x < startAt; x++ ) {
+ i++;
+ if( i == e ) i = b;
+ }
+ startAt = (startAt + ntodo) % sz; // mark where to start next time
+
+ for( unsigned x = 0; x < ntodo; x++ ) {
+ dassert( i != e );
+ if( (*i)->isMongoMMF() ) {
+ MongoMMF *mmf = (MongoMMF*) *i;
+ assert(mmf);
+ if( mmf->willNeedRemap() ) {
+ mmf->willNeedRemap() = false;
+ mmf->remapThePrivateView();
+ }
+ i++;
+ if( i == e ) i = b;
+ }
+ }
+ }
+ void REMAPPRIVATEVIEW() {
+ Timer t;
+ _REMAPPRIVATEVIEW();
+ stats.curr->_remapPrivateViewMicros += t.micros();
+ }
+
+ mutex groupCommitMutex("groupCommit");
+
+ /** locking: in read lock when called. */
+ static void _groupCommit() {
+ stats.curr->_commits++;
+
+ if( !commitJob.hasWritten() ) {
+ // a getlasterror request could have come after the data was already committed
+ commitJob.notifyCommitted();
+ return;
+ }
+
+ // we need to make sure two group commits aren't running at the same time
+ // (and we are only read locked in the dbMutex, so it could happen)
+ scoped_lock lk(groupCommitMutex);
+
+ PREPLOGBUFFER();
+
+ // todo : write to the journal outside locks, as this write can be slow.
+ // however, be careful then about remapprivateview as that cannot be done
+ // if new writes are then pending in the private maps.
+ WRITETOJOURNAL(commitJob._ab);
+
+ // data is now in the journal, which is sufficient for acknowledging getLastError.
+ // (ok to crash after that)
+ commitJob.notifyCommitted();
+
+ WRITETODATAFILES();
+
+ commitJob.reset();
+
+ // REMAPPRIVATEVIEW
+ //
+ // remapping private views must occur after WRITETODATAFILES otherwise
+ // we wouldn't see newly written data on reads.
+ //
+ DEV assert( !commitJob.hasWritten() );
+ if( !dbMutex.isWriteLocked() ) {
+ // this needs to be done in a write lock (as there is a short window during remapping when each view
+ // might not exist), thus we do it on the next acquisition of that lock instead of here (there is no
+ // rush if you aren't writing anyway -- but when it is done, it must happen before any uncommitted
+ // writes occur). If desired, perhaps this can be eliminated on posix, as it may be that the remap
+ // is race-free there.
+ //
+ dbMutex._remapPrivateViewRequested = true;
+ }
+ else {
+ stats.curr->_commitsInWriteLock++;
+ // however, if we are already write locked, we must do it now -- up the call tree someone
+ // may do a write without a new lock acquisition. this can happen when MongoMMF::close() calls
+ // this method when a file (and its views) is about to go away.
+ //
+ REMAPPRIVATEVIEW();
+ }
+ }
+
+ /** locking: in read lock when called.
+ @see MongoMMF::close()
+ */
+ static void groupCommit() {
+ // we need to be at least read locked on the dbMutex so that we know the write intent data
+ // structures are not changing while we work
+ dbMutex.assertAtLeastReadLocked();
+
+ try {
+ _groupCommit();
+ }
+ catch(DBException& e ) {
+ log() << "dbexception in groupCommit causing immediate shutdown: " << e.toString() << endl;
+ abort();
+ }
+ catch(std::ios_base::failure& e) {
+ log() << "ios_base exception in groupCommit causing immediate shutdown: " << e.what() << endl;
+ abort();
+ }
+ catch(std::bad_alloc& e) {
+ log() << "bad_alloc exception in groupCommit causing immediate shutdown: " << e.what() << endl;
+ abort();
+ }
+ catch(std::exception& e) {
+ log() << "exception in dur::groupCommit causing immediate shutdown: " << e.what() << endl;
+ abort(); // based on myTerminate()
+ }
+ }
+
+ static void go() {
+ if( !commitJob.hasWritten() ){
+ commitJob.notifyCommitted();
+ return;
+ }
+
+ {
+ readlocktry lk("", 1000);
+ if( lk.got() ) {
+ groupCommit();
+ return;
+ }
+ }
+
+ // starvation on read locks could occur, so if read lock acquisition is slow, try to get a
+ // write lock instead. otherwise journaling could be delayed too long (too much data will
+ // not accumulate though, as the commitIfNeeded logic will have executed in the meantime if there
+ // have been writes)
+ writelock lk;
+ groupCommit();
+ }
+
+ /** called when a MongoMMF is closing -- we need to go ahead and group commit in that case before its
+ views disappear
+ */
+ void closingFileNotification() {
+ if (!cmdLine.dur)
+ return;
+
+ if( dbMutex.atLeastReadLocked() ) {
+ groupCommit();
+ }
+ else {
+ assert( inShutdown() );
+ if( commitJob.hasWritten() ) {
+ log() << "dur warning files are closing outside locks with writes pending" << endl;
+ }
+ }
+ }
+
+ CodeBlock durThreadMain;
+
+ void durThread() {
+ Client::initThread("dur");
+ const int HowOftenToGroupCommitMs = 90;
+ while( !inShutdown() ) {
+ sleepmillis(10);
+ CodeBlock::Within w(durThreadMain);
+ try {
+ int millis = HowOftenToGroupCommitMs;
+ {
+ stats.rotate();
+ {
+ Timer t;
+ journalRotate(); // note we do this part outside of mongomutex
+ millis -= t.millis();
+ assert( millis <= HowOftenToGroupCommitMs );
+ if( millis < 5 )
+ millis = 5;
+ }
+
+ // we do this in a couple of blocks, which makes it a tiny bit faster (only a little) on throughput,
+ // and is likely also less spiky in our cpu usage, which is good:
+ sleepmillis(millis/2);
+ commitJob.wi()._deferred.invoke();
+ sleepmillis(millis/2);
+ commitJob.wi()._deferred.invoke();
+ }
+
+ go();
+ }
+ catch(std::exception& e) {
+ log() << "exception in durThread causing immediate shutdown: " << e.what() << endl;
+ abort(); // based on myTerminate()
+ }
+ }
+ cc().shutdown();
+ }
+
+ void recover();
+
+ void releasingWriteLock() {
+ // implicit commitIfNeeded check on each write unlock
+ DEV commitJob._nSinceCommitIfNeededCall = 0; // implicit commit if needed
+ if( commitJob.bytes() > UncommittedBytesLimit || cmdLine.durOptions & CmdLine::DurAlwaysCommit ) {
+ stats.curr->_earlyCommits++;
+ groupCommit();
+ }
+ }
+
+ void preallocateFiles();
+
+ /** at startup, recover, and then start the journal threads */
+ void startup() {
+ if( !cmdLine.dur )
+ return;
+
+ DurableInterface::enableDurability();
+
+ journalMakeDir();
+ try {
+ recover();
+ }
+ catch(...) {
+ log() << "exception during recovery" << endl;
+ throw;
+ }
+
+ preallocateFiles();
+
+ boost::thread t(durThread);
+ }
+
+ void DurableImpl::syncDataAndTruncateJournal() {
+ dbMutex.assertWriteLocked();
+
+ groupCommit();
+ MongoFile::flushAll(true);
+ journalCleanup();
+
+ assert(!haveJournalFiles()); // Double check post-conditions
+ }
+
+ } // namespace dur
+
+} // namespace mongo
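
To make the pacing arithmetic in _REMAPPRIVATEVIEW above easier to follow, here is a minimal standalone sketch (RemapPacer and its names are illustrative, not part of the tree): each pass remaps a fraction of the views proportional to the time elapsed since the last pass (targeting a full sweep roughly every 2 seconds), bumped up when many private-map bytes have accumulated, and the walk resumes round-robin where the previous pass stopped.

    // illustrative sketch only -- mirrors the scheduling idea in _REMAPPRIVATEVIEW()
    #include <algorithm>
    #include <cstdio>

    struct RemapPacer {
        unsigned startAt;                 // where the next pass begins (round-robin)
        RemapPacer() : startAt(0) { }

        // decide how many of nviews views to remap on this pass
        unsigned plan(unsigned nviews, double elapsedMicros,
                      double privateMapBytes, double uncommittedLimit) {
            if( nviews == 0 ) return 0;
            // aim for a full sweep about every 2 seconds...
            double fraction = elapsedMicros / 2000000.0;
            // ...but go faster if a lot of dirty private-map bytes have built up
            fraction = std::max(fraction, privateMapBytes / uncommittedLimit);
            unsigned ntodo = (unsigned)(nviews * fraction);
            if( ntodo < 1 ) ntodo = 1;
            if( ntodo > nviews ) ntodo = nviews;
            startAt = (startAt + ntodo) % nviews;   // next pass resumes here
            return ntodo;
        }
    };

    int main() {
        RemapPacer pacer;
        // 1000 views, ~100ms since the last pass, little dirty data -> ~50 views this pass
        printf("%u\n", pacer.plan(1000, 100000.0, 0.0, 100.0*1024*1024));
        // same interval, but 50MB dirty against a 100MB limit -> ~500 views this pass
        printf("%u\n", pacer.plan(1000, 100000.0, 50.0*1024*1024, 100.0*1024*1024));
        return 0;
    }
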
diff --git a/db/dur.h b/db/dur.h
new file mode 100644
index 0000000..a8035e4
--- /dev/null
+++ b/db/dur.h
@@ -0,0 +1,201 @@
+// @file dur.h durability support
+
+#pragma once
+
+#include "diskloc.h"
+#include "mongommf.h"
+
+namespace mongo {
+
+ class NamespaceDetails;
+
+ namespace dur {
+
+ // a smaller limit is likely better on 32 bit
+#if defined(__i386__) || defined(_M_IX86)
+ const unsigned UncommittedBytesLimit = 50 * 1024 * 1024;
+#else
+ const unsigned UncommittedBytesLimit = 100 * 1024 * 1024;
+#endif
+
+ /** Call during startup so durability module can initialize
+ Throws if fatal error
+ Does nothing if cmdLine.dur is false
+ */
+ void startup();
+
+ class DurableInterface : boost::noncopyable {
+ public:
+ virtual ~DurableInterface() { log() << "ERROR warning ~DurableInterface not intended to be called" << endl; }
+
+ /** Declare that a file has been created
+ Normally writes are applied only after journaling, for safety. But here the file
+ is created first, and the journal will simply replay the creation if the create didn't
+ complete because of a crash.
+ */
+ virtual void createdFile(string filename, unsigned long long len) = 0;
+
+ /** Declarations of write intent.
+
+ Use these methods to declare "I'm about to write to x and it should be logged for redo."
+
+ Failure to call writing...() is checked in _DEBUG mode by using a read-only mapped view
+ (i.e., you'll segfault if that code path is exercised in that situation). The _DEBUG check doesn't
+ verify that your length is correct, though.
+ */
+
+ /** declare intent to write to x for up to len
+ @return pointer where to write. this is modified when testIntent is true.
+ */
+ virtual void* writingPtr(void *x, unsigned len) = 0;
+
+ /** declare write intent; should already be in the write view to work correctly when testIntent is true.
+ if you aren't, use writingPtr() instead.
+ */
+ virtual void declareWriteIntent(void *x, unsigned len) = 0;
+
+ /** declare intent to write
+ @param ofs offset within buf at which we will write
+ @param len the length at ofs we will write
+ @return new buffer pointer. this is modified when testIntent is true.
+ */
+ virtual void* writingAtOffset(void *buf, unsigned ofs, unsigned len) = 0;
+
+ /** declare intent to write
+ @param ranges vector of pairs representing ranges. Each pair
+ comprises an offset from buf where a range begins, then the
+ range length.
+ @return new buffer pointer. this is modified when testIntent is true.
+ */
+ virtual void* writingRangesAtOffsets(void *buf, const vector< pair< long long, unsigned > > &ranges ) = 0;
+
+ /** Wait for acknowledgement of the next group commit.
+ @return true if --dur is on. There will be delay.
+ @return false if --dur is off.
+ */
+ virtual bool awaitCommit() = 0;
+
+ /** Commit immediately.
+
+ Generally, you do not want to do this often, as highly granular committing may affect
+ performance.
+
+ Does not return until the commit is complete.
+
+ You must be at least read locked when you call this. Ideally, you are not write locked
+ and then read operations can occur concurrently.
+
+ @return true if --dur is on.
+ @return false if --dur is off (in which case no action is taken).
+ */
+ virtual bool commitNow() = 0;
+
+ /** Commit if enough bytes have been modified. The current threshold is UncommittedBytesLimit
+ (50MB on 32 bit, 100MB otherwise).
+
+ The idea is that long running write operations that don't yield
+ (like creating an index or an update with $atomic) can call this
+ whenever the db is in a sane state, and it will prevent commits
+ from growing too large.
+ @return true if committed
+ */
+ virtual bool commitIfNeeded() = 0;
+
+ /** Declare write intent for a DiskLoc. @see DiskLoc::writing() */
+ inline DiskLoc& writingDiskLoc(DiskLoc& d) { return *((DiskLoc*) writingPtr(&d, sizeof(d))); }
+
+ /** Declare write intent for an int */
+ inline int& writingInt(const int& d) { return *((int*) writingPtr((int*) &d, sizeof(d))); }
+
+ /** "assume i've already indicated write intent, let me write"
+ redeclaration is fine too, but this is faster.
+ */
+ template <typename T>
+ inline
+ T* alreadyDeclared(T *x) {
+#if defined(_TESTINTENT)
+ return (T*) MongoMMF::switchToPrivateView(x);
+#else
+ return x;
+#endif
+ }
+
+ /** declare intent to write to x for sizeof(*x) */
+ template <typename T>
+ inline
+ T* writing(T *x) {
+ return (T*) writingPtr(x, sizeof(T));
+ }
+
+ /** write something that doesn't have to be journaled, as this write is "unimportant".
+ a good example is paddingFactor.
+ can be thought of as memcpy(dst,src,len)
+ the dur implementation acquires a mutex in this method, so do not assume it is faster
+ without measuring!
+ */
+ virtual void setNoJournal(void *dst, void *src, unsigned len) = 0;
+
+ /** Commits pending changes, flushes all changes to main data
+ files, then removes the journal.
+
+ This is useful as a "barrier" to ensure that writes before this
+ call will never go through recovery and be applied to files
+ that have had changes made after this call applied.
+ */
+ virtual void syncDataAndTruncateJournal() = 0;
+
+ static DurableInterface& getDur() { return *_impl; }
+
+ private:
+ /** Intentionally unimplemented method.
+ It's very easy to manipulate Record::data open ended. Thus a call to writing(Record*) is suspect.
+ This will override the templated version and yield an unresolved external.
+ */
+ Record* writing(Record* r);
+ /** Intentionally unimplemented method. BtreeBuckets are allocated in buffers larger than sizeof( BtreeBucket ). */
+ BtreeBucket* writing( BtreeBucket* );
+ /** Intentionally unimplemented method. NamespaceDetails may be based on references to 'Extra' objects. */
+ NamespaceDetails* writing( NamespaceDetails* );
+
+ static DurableInterface* _impl; // NonDurableImpl at startup()
+ static void enableDurability(); // makes _impl a DurableImpl
+ static void disableDurability(); // makes _impl a NonDurableImpl
+
+ // these need to be able to enable/disable Durability
+ friend void startup();
+ friend class TempDisableDurability;
+ }; // class DurableInterface
+
+ class NonDurableImpl : public DurableInterface {
+ void* writingPtr(void *x, unsigned len) { return x; }
+ void* writingAtOffset(void *buf, unsigned ofs, unsigned len) { return buf; }
+ void* writingRangesAtOffsets(void *buf, const vector< pair< long long, unsigned > > &ranges) { return buf; }
+ void declareWriteIntent(void *, unsigned) { }
+ void createdFile(string filename, unsigned long long len) { }
+ bool awaitCommit() { return false; }
+ bool commitNow() { return false; }
+ bool commitIfNeeded() { return false; }
+ void setNoJournal(void *dst, void *src, unsigned len);
+ void syncDataAndTruncateJournal() {}
+ };
+
+ class DurableImpl : public DurableInterface {
+ void* writingPtr(void *x, unsigned len);
+ void* writingAtOffset(void *buf, unsigned ofs, unsigned len);
+ void* writingRangesAtOffsets(void *buf, const vector< pair< long long, unsigned > > &ranges);
+ void declareWriteIntent(void *, unsigned);
+ void createdFile(string filename, unsigned long long len);
+ bool awaitCommit();
+ bool commitNow();
+ bool commitIfNeeded();
+ void setNoJournal(void *dst, void *src, unsigned len);
+ void syncDataAndTruncateJournal();
+ };
+
+ } // namespace dur
+
+ inline dur::DurableInterface& getDur() { return dur::DurableInterface::getDur(); }
+
+ /** declare that we are modifying a diskloc and this is a datafile write. */
+ inline DiskLoc& DiskLoc::writing() const { return getDur().writingDiskLoc(*const_cast< DiskLoc * >( this )); }
+
+}
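
The interface above is easiest to read alongside its intended calling convention: declare intent first, write through the returned pointer, and let the group commit pick the change up later. A minimal sketch, using a trivial stand-in that behaves like NonDurableImpl (DurLike and RecordHeader are hypothetical names, not types from the tree):

    #include <cstdio>

    struct DurLike {                                    // stand-in for DurableInterface
        void* writingPtr(void *x, unsigned len) { return x; }       // real impl notes the intent
        template <typename T> T* writing(T *x) { return (T*) writingPtr(x, sizeof(T)); }
        bool commitIfNeeded() { return false; }                      // real impl may group commit
    };
    DurLike& getDurLike() { static DurLike d; return d; }

    struct RecordHeader { int extentOfs; int nextOfs; };             // pretend mapped structure

    int main() {
        RecordHeader mapped = { 0, 0 };                 // stands in for memory-mapped data
        // declare intent, then write only through the returned pointer:
        RecordHeader *w = getDurLike().writing(&mapped);
        w->nextOfs = 42;
        getDurLike().commitIfNeeded();                  // long-running writers call this periodically
        printf("%d\n", mapped.nextOfs);
        return 0;
    }
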
diff --git a/db/dur_commitjob.cpp b/db/dur_commitjob.cpp
new file mode 100644
index 0000000..aed38e8
--- /dev/null
+++ b/db/dur_commitjob.cpp
@@ -0,0 +1,210 @@
+/* @file dur_commitjob.cpp */
+
+/**
+* Copyright (C) 2009 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+#include "dur_commitjob.h"
+#include "taskqueue.h"
+
+namespace mongo {
+
+ namespace dur {
+
+ BOOST_STATIC_ASSERT( UncommittedBytesLimit > BSONObjMaxInternalSize * 3 );
+ BOOST_STATIC_ASSERT( sizeof(void*)==4 || UncommittedBytesLimit > BSONObjMaxInternalSize * 6 );
+
+ void Writes::D::go(const Writes::D& d) {
+ commitJob.wi()._insertWriteIntent(d.p, d.len);
+ }
+
+ void WriteIntent::absorb(const WriteIntent& other) {
+ dassert(overlaps(other));
+
+ void* newStart = min(start(), other.start());
+ p = max(p, other.p);
+ len = (char*)p - (char*)newStart;
+
+ dassert(contains(other));
+ }
+
+ void Writes::clear() {
+ dbMutex.assertAtLeastReadLocked();
+
+ _alreadyNoted.clear();
+ _writes.clear();
+ _ops.clear();
+ _drained = false;
+#if defined(DEBUG_WRITE_INTENT)
+ cout << "_debug clear\n";
+ _debug.clear();
+#endif
+ }
+
+#if defined(DEBUG_WRITE_INTENT)
+ void assertAlreadyDeclared(void *p, int len) {
+ if( commitJob.wi()._debug[p] >= len )
+ return;
+ log() << "assertAlreadyDeclared fails " << (void*)p << " len:" << len << ' ' << commitJob.wi()._debug[p] << endl;
+ printStackTrace();
+ abort();
+ }
+#endif
+
+ void Writes::_insertWriteIntent(void* p, int len) {
+ WriteIntent wi(p, len);
+
+ if (_writes.empty()) {
+ _writes.insert(wi);
+ return;
+ }
+
+ typedef set<WriteIntent>::const_iterator iterator; // shorter
+
+ iterator closest = _writes.lower_bound(wi);
+ // closest.end() >= wi.end()
+
+ if ((closest != _writes.end() && closest->overlaps(wi)) || // high end
+ (closest != _writes.begin() && (--closest)->overlaps(wi))) { // low end
+ if (closest->contains(wi))
+ return; // nothing to do
+
+ // find overlapping range and merge into wi
+ iterator end(closest);
+ iterator begin(closest);
+ while ( end->overlaps(wi)) { wi.absorb(*end); ++end; if (end == _writes.end()) break; } // look forwards
+ while (begin->overlaps(wi)) { wi.absorb(*begin); if (begin == _writes.begin()) break; --begin; } // look backwards
+ if (!begin->overlaps(wi)) ++begin; // make inclusive
+
+ DEV { // ensure we're not deleting anything we shouldn't
+ for (iterator it(begin); it != end; ++it) {
+ assert(wi.contains(*it));
+ }
+ }
+
+ _writes.erase(begin, end);
+ _writes.insert(wi);
+
+ DEV { // ensure there are no overlaps
+ // this can be very slow - n^2 - so make it RARELY
+ RARELY {
+ for (iterator it(_writes.begin()), end(boost::prior(_writes.end())); it != end; ++it) {
+ assert(!it->overlaps(*boost::next(it)));
+ }
+ }
+ }
+ }
+ else { // no entries overlapping wi
+ _writes.insert(closest, wi);
+ }
+ }
+
+
+ /** note an operation other than a "basic write" */
+ void CommitJob::noteOp(shared_ptr<DurOp> p) {
+ DEV dbMutex.assertWriteLocked();
+ dassert( cmdLine.dur );
+ if( !_hasWritten ) {
+ assert( !dbMutex._remapPrivateViewRequested );
+ _hasWritten = true;
+ }
+ _wi._ops.push_back(p);
+ }
+
+ size_t privateMapBytes = 0; // used by _REMAPPRIVATEVIEW to track how much / how fast to remap
+
+ void CommitJob::reset() {
+ _hasWritten = false;
+ _wi.clear();
+ _ab.reset();
+ privateMapBytes += _bytes;
+ _bytes = 0;
+ _nSinceCommitIfNeededCall = 0;
+ }
+
+ CommitJob::CommitJob() : _ab(4 * 1024 * 1024) , _hasWritten(false),
+ _bytes(0), _nSinceCommitIfNeededCall(0) { }
+
+ void CommitJob::note(void* p, int len) {
+ // from the point of view of the dur module, it would be fine (i think) to only
+ // be read locked here. but must be at least read locked to avoid race with
+ // remapprivateview
+ DEV dbMutex.assertWriteLocked();
+ dassert( cmdLine.dur );
+ if( !_wi._alreadyNoted.checkAndSet(p, len) ) {
+ MemoryMappedFile::makeWritable(p, len);
+
+ if( !_hasWritten ) {
+ // you can't be writing if one of these is pending, so this is a verification.
+ assert( !dbMutex._remapPrivateViewRequested );
+
+ // we don't bother doing a group commit when nothing is written, so we have a var to track that
+ _hasWritten = true;
+ }
+
+ /** tips for debugging:
+ if you have an incorrect diff between data files in different folders
+ (see jstests/dur/quick.js for example),
+ turn this on and see what is logged. if you have a copy of its output from before the
+ regression, a simple diff of these lines would likely tell you a lot.
+ */
+#if 0 && defined(_DEBUG)
+ {
+ static int n;
+ if( ++n < 10000 ) {
+ size_t ofs;
+ MongoMMF *mmf = privateViews._find(p, ofs);
+ if( mmf ) {
+ log() << "DEBUG note write intent " << p << ' ' << mmf->filename() << " ofs:" << hex << ofs << " len:" << len << endl;
+ }
+ else {
+ log() << "DEBUG note write intent " << p << ' ' << len << " NOT FOUND IN privateViews" << endl;
+ }
+ }
+ else if( n == 10000 ) {
+ log() << "DEBUG stopping write intent logging, too much to log" << endl;
+ }
+ }
+#endif
+
+ // remember intent. we will journal it in a bit
+ _wi.insertWriteIntent(p, len);
+ wassert( _wi._writes.size() < 2000000 );
+ assert( _wi._writes.size() < 20000000 );
+
+ {
+ // a bit overconservative in counting page bytes used
+ static size_t lastPos; // note this doesn't reset with each commit, but that is ok -- we aren't being that precise
+ size_t x = ((size_t) p) & ~0xfff; // round off to page address (4KB)
+ if( x != lastPos ) {
+ lastPos = x;
+ unsigned b = (len+4095) & ~0xfff;
+ _bytes += b;
+#if defined(_DEBUG)
+ _nSinceCommitIfNeededCall++;
+ if( _nSinceCommitIfNeededCall >= 80 ) {
+ if( _nSinceCommitIfNeededCall % 40 == 0 )
+ log() << "debug nsincecommitifneeded:" << _nSinceCommitIfNeededCall << " bytes:" << _bytes << endl;
+ }
+#endif
+ uassert(13623, "DR102 too much data written uncommitted", _bytes < UncommittedBytesLimit * 3);
+ }
+ }
+ }
+ }
+
+ }
+}
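
Writes::_insertWriteIntent above coalesces overlapping intents so a region is journaled once per group commit rather than once per declaration. A minimal sketch of the same merge idea, using byte offsets and a simplified Range type instead of WriteIntent; the real code uses lower_bound to touch only neighbouring entries, while this sketch just scans for clarity.

    #include <algorithm>
    #include <cstdio>
    #include <set>

    struct Range {                                     // [start, end) in bytes
        size_t start, end;
        bool operator < (const Range& r) const { return end < r.end; }
        bool overlaps(const Range& r) const { return start <= r.end && end >= r.start; }
    };

    void insertMerged(std::set<Range>& s, Range r) {
        // absorb every existing range that touches r, then insert the union
        for( std::set<Range>::iterator it = s.begin(); it != s.end(); ) {
            if( it->overlaps(r) ) {
                r.start = std::min(r.start, it->start);
                r.end   = std::max(r.end,   it->end);
                s.erase(it++);
            }
            else
                ++it;
        }
        s.insert(r);
    }

    int main() {
        std::set<Range> s;
        Range a = { 100, 120 }; insertMerged(s, a);
        Range b = { 200, 210 }; insertMerged(s, b);
        Range c = { 110, 205 }; insertMerged(s, c);    // bridges both -> single [100, 210)
        for( std::set<Range>::iterator it = s.begin(); it != s.end(); ++it )
            printf("[%zu, %zu)\n", it->start, it->end);
        return 0;
    }
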
diff --git a/db/dur_commitjob.h b/db/dur_commitjob.h
new file mode 100644
index 0000000..104d054
--- /dev/null
+++ b/db/dur_commitjob.h
@@ -0,0 +1,221 @@
+/* @file dur_commitjob.h used by dur.cpp
+*/
+
+/**
+* Copyright (C) 2009 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+#include "../util/alignedbuilder.h"
+#include "../util/mongoutils/hash.h"
+#include "../util/concurrency/synchronization.h"
+#include "cmdline.h"
+#include "durop.h"
+#include "dur.h"
+#include "taskqueue.h"
+
+//#define DEBUG_WRITE_INTENT 1
+
+namespace mongo {
+ namespace dur {
+
+ /** declaration of an intent to write to a region of a memory mapped view
+ *
+ * We store the end rather than the start pointer to make operator< faster
+ * since that is heavily used in set lookup.
+ */
+ struct WriteIntent { /* copyable */
+ WriteIntent() : w_ptr(0), p(0) { }
+ WriteIntent(void *a, unsigned b) : w_ptr(0), p((char*)a+b), len(b) { }
+
+ void* start() const { return (char*)p - len; }
+ void* end() const { return p; }
+ unsigned length() const { return len; }
+
+ bool operator < (const WriteIntent& rhs) const { return end() < rhs.end(); }
+
+ // can they be merged?
+ bool overlaps(const WriteIntent& rhs) const {
+ return (start() <= rhs.end() && end() >= rhs.start());
+ }
+
+ // is merging necessary?
+ bool contains(const WriteIntent& rhs) const {
+ return (start() <= rhs.start() && end() >= rhs.end());
+ }
+
+ // merge into me
+ void absorb(const WriteIntent& other);
+
+ friend ostream& operator << (ostream& out, const WriteIntent& wi) {
+ return (out << "p: " << wi.p << " end: " << wi.end() << " len: " << wi.len);
+ }
+
+ mutable void *w_ptr; // writable mapping of p.
+ // mutable because set::iterator is const but this isn't used in op<
+#if defined(_EXPERIMENTAL)
+ mutable unsigned ofsInJournalBuffer;
+#endif
+ private:
+ void *p; // intent to write up to p
+ unsigned len; // up to this len
+ };
+
+ /** try to remember things we have already marked for journaling. false negatives are ok if infrequent -
+ we will just log them twice.
+ */
+ template<int Prime>
+ class Already : boost::noncopyable {
+ public:
+ Already() { clear(); }
+ void clear() { memset(this, 0, sizeof(*this)); }
+
+ /* see if we have Already recorded/indicated our write intent for this region of memory.
+ automatically upgrades the length if the length was shorter previously.
+ @return true if already indicated.
+ */
+ bool checkAndSet(void* p, int len) {
+ unsigned x = mongoutils::hashPointer(p);
+ pair<void*, int>& nd = nodes[x % N];
+ if( nd.first == p ) {
+ if( nd.second < len ) {
+ nd.second = len;
+ return false; // haven't indicated this len yet
+ }
+ return true; // already indicated
+ }
+ nd.first = p;
+ nd.second = len;
+ return false; // a new set
+ }
+
+ private:
+ enum { N = Prime }; // this should be small; the idea is that it fits in the cpu cache easily
+ pair<void*,int> nodes[N];
+ };
+
+ /** our record of pending/uncommitted write intents */
+ class Writes : boost::noncopyable {
+ struct D {
+ void *p;
+ unsigned len;
+ static void go(const D& d);
+ };
+ public:
+ TaskQueue<D> _deferred;
+ Already<127> _alreadyNoted;
+ set<WriteIntent> _writes;
+ vector< shared_ptr<DurOp> > _ops; // all the ops other than basic writes
+ bool _drained; // _deferred is drained? for asserting/testing
+
+ /** reset the Writes structure (empties all the above) */
+ void clear();
+
+ /** merges into set (ie non-deferred version) */
+ void _insertWriteIntent(void* p, int len);
+
+ void insertWriteIntent(void* p, int len) {
+#if defined(DEBUG_WRITE_INTENT)
+ if( _debug[p] < len )
+ _debug[p] = len;
+#endif
+ D d;
+ d.p = p;
+ d.len = len;
+ _deferred.defer(d);
+ }
+
+#ifdef _DEBUG
+ WriteIntent _last;
+#endif
+#if defined(DEBUG_WRITE_INTENT)
+ map<void*,int> _debug;
+#endif
+ };
+
+#if defined(DEBUG_WRITE_INTENT)
+ void assertAlreadyDeclared(void *, int len);
+#else
+ inline void assertAlreadyDeclared(void *, int len) { }
+#endif
+
+ /** A commit job object for a group commit. Currently there is one instance of this object.
+
+ concurrency: assumption is caller is appropriately locking.
+ for example note() invocations are from the write lock.
+ other uses are in a read lock from a single thread (durThread)
+ */
+ class CommitJob : boost::noncopyable {
+ public:
+ AlignedBuilder _ab; // for direct i/o writes to journal
+
+ CommitJob();
+
+ /** record/note an intent to write */
+ void note(void* p, int len);
+
+ /** note an operation other than a "basic write" */
+ void noteOp(shared_ptr<DurOp> p);
+
+ set<WriteIntent>& writes() {
+ if( !_wi._drained ) {
+ // generally, you don't want to use the set until it is prepared (after deferred ops are applied)
+ // thus this assert here.
+ assert(false);
+ }
+ return _wi._writes;
+ }
+
+ vector< shared_ptr<DurOp> >& ops() { return _wi._ops; }
+
+ /** this method is safe to call outside of locks. when hasWritten() is false we don't do any group commit and avoid even
+ trying to acquire a lock, which might be helpful at times.
+ */
+ bool hasWritten() const { return _hasWritten; }
+
+ /** we use the commitjob object over and over, calling reset() rather than reconstructing */
+ void reset();
+
+ /** the commit code calls this when data reaches the journal (on disk) */
+ void notifyCommitted() { _notify.notifyAll(); }
+
+ /** Wait until the next group commit occurs. That is, wait until someone calls notifyCommitted. */
+ void awaitNextCommit() {
+ if( hasWritten() )
+ _notify.wait();
+ }
+
+ /** we check how much written and if it is getting to be a lot, we commit sooner. */
+ size_t bytes() const { return _bytes; }
+
+#if defined(_DEBUG)
+ const WriteIntent& lastWrite() const { return _wi._last; }
+#endif
+
+ Writes& wi() { return _wi; }
+ private:
+ bool _hasWritten;
+ Writes _wi; // todo: fix name
+ size_t _bytes;
+ NotifyAll _notify; // for getlasterror fsync:true acknowledgements
+ public:
+ unsigned _nSinceCommitIfNeededCall;
+ };
+
+ extern CommitJob commitJob;
+
+ }
+}
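
Already<Prime> above is a small lossy cache: a fixed array indexed by a pointer hash, where a collision simply overwrites the previous slot. A false negative only means an intent gets declared (and journaled) twice, which is harmless. A hedged sketch of the same behaviour (AlreadySketch and hashPtr are illustrative; hashPtr stands in for mongoutils::hashPointer):

    #include <cstddef>
    #include <cstdio>
    #include <cstring>
    #include <utility>

    template <int Prime>
    class AlreadySketch {
    public:
        AlreadySketch() { memset(nodes, 0, sizeof(nodes)); }

        // true if (p, len) was already recorded; records it otherwise
        bool checkAndSet(void* p, int len) {
            std::pair<void*, int>& nd = nodes[hashPtr(p) % Prime];   // reference: update the slot in place
            if( nd.first == p ) {
                if( nd.second < len ) { nd.second = len; return false; }  // longer than before
                return true;
            }
            nd.first = p;             // new entry or collision: overwrite (lossy but safe)
            nd.second = len;
            return false;
        }
    private:
        static unsigned hashPtr(void* p) { return (unsigned)((size_t)p >> 3); }
        std::pair<void*, int> nodes[Prime];
    };

    int main() {
        AlreadySketch<127> a;
        int x;
        printf("%d\n", (int) a.checkAndSet(&x, 8));    // 0: first declaration
        printf("%d\n", (int) a.checkAndSet(&x, 8));    // 1: already recorded
        printf("%d\n", (int) a.checkAndSet(&x, 16));   // 0: longer region, re-declare
        return 0;
    }
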
diff --git a/db/dur_journal.cpp b/db/dur_journal.cpp
new file mode 100644
index 0000000..946f94c
--- /dev/null
+++ b/db/dur_journal.cpp
@@ -0,0 +1,576 @@
+// @file dur_journal.cpp writing to the writeahead logging journal
+
+/**
+* Copyright (C) 2010 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+#include "client.h"
+#include "namespace.h"
+#include "dur_journal.h"
+#include "dur_journalformat.h"
+#include "dur_stats.h"
+#include "../util/logfile.h"
+#include "../util/timer.h"
+#include "../util/alignedbuilder.h"
+#include "../util/message.h" // getelapsedtimemillis
+#include "../util/concurrency/race.h"
+#include <boost/static_assert.hpp>
+#undef assert
+#define assert MONGO_assert
+#include "../util/mongoutils/str.h"
+#include "dur_journalimpl.h"
+#include "../util/file.h"
+
+using namespace mongoutils;
+
+namespace mongo {
+
+ class AlignedBuilder;
+
+ namespace dur {
+ BOOST_STATIC_ASSERT( sizeof(JHeader) == 8192 );
+ BOOST_STATIC_ASSERT( sizeof(JSectHeader) == 20 );
+ BOOST_STATIC_ASSERT( sizeof(JSectFooter) == 32 );
+ BOOST_STATIC_ASSERT( sizeof(JEntry) == 12 );
+ BOOST_STATIC_ASSERT( sizeof(LSNFile) == 88 );
+
+ bool usingPreallocate = false;
+
+ void removeOldJournalFile(path p);
+
+ filesystem::path getJournalDir() {
+ filesystem::path p(dbpath);
+ p /= "journal";
+ return p;
+ }
+
+ path lsnPath() {
+ return getJournalDir()/"lsn";
+ }
+
+ extern CodeBlock durThreadMain;
+
+ /** this should be called when something really bad happens so that we can flag appropriately
+ */
+ void journalingFailure(const char *msg) {
+ /** todo:
+ (1) don't log too much
+ (2) make an indicator in the journal dir that something bad happened.
+ (2b) refuse to do a recovery startup if that is there without manual override.
+ */
+ log() << "journaling error " << msg << endl;
+ assert(false);
+ }
+
+ JHeader::JHeader(string fname) {
+ magic[0] = 'j'; magic[1] = '\n';
+ _version = CurrentVersion;
+ memset(ts, 0, sizeof(ts));
+ time_t t = time(0);
+ strncpy(ts, time_t_to_String_short(t).c_str(), sizeof(ts)-1);
+ memset(dbpath, 0, sizeof(dbpath));
+ strncpy(dbpath, fname.c_str(), sizeof(dbpath)-1);
+ {
+ fileId = t&0xffffffff;
+ fileId |= ((unsigned long long)getRandomNumber()) << 32;
+ }
+ memset(reserved3, 0, sizeof(reserved3));
+ txt2[0] = txt2[1] = '\n';
+ n1 = n2 = n3 = n4 = '\n';
+ }
+
+ // class Journal
+
+ Journal j;
+
+ const unsigned long long LsnShutdownSentinel = ~((unsigned long long)0);
+
+ Journal::Journal() :
+ _curLogFileMutex("JournalLfMutex") {
+ _written = 0;
+ _nextFileNumber = 0;
+ _curLogFile = 0;
+ _curFileId = 0;
+ _preFlushTime = 0;
+ _lastFlushTime = 0;
+ _writeToLSNNeeded = false;
+ }
+
+ path Journal::getFilePathFor(int filenumber) const {
+ filesystem::path p(dir);
+ p /= string(str::stream() << "j._" << filenumber);
+ return p;
+ }
+
+ /** never throws
+ @return true if journal dir is not empty
+ */
+ bool haveJournalFiles() {
+ try {
+ for ( boost::filesystem::directory_iterator i( getJournalDir() );
+ i != boost::filesystem::directory_iterator();
+ ++i ) {
+ string fileName = boost::filesystem::path(*i).leaf();
+ if( str::startsWith(fileName, "j._") )
+ return true;
+ }
+ }
+ catch(...) { }
+ return false;
+ }
+
+ /** throws */
+ void removeJournalFiles() {
+ log() << "removeJournalFiles" << endl;
+ try {
+ for ( boost::filesystem::directory_iterator i( getJournalDir() );
+ i != boost::filesystem::directory_iterator();
+ ++i ) {
+ string fileName = boost::filesystem::path(*i).leaf();
+ if( str::startsWith(fileName, "j._") ) {
+ try {
+ removeOldJournalFile(*i);
+ }
+ catch(std::exception& e) {
+ log() << "couldn't remove " << fileName << ' ' << e.what() << endl;
+ throw;
+ }
+ }
+ }
+ try {
+ boost::filesystem::remove(lsnPath());
+ }
+ catch(...) {
+ log() << "couldn't remove " << lsnPath().string() << endl;
+ throw;
+ }
+ }
+ catch( std::exception& e ) {
+ log() << "error removing journal files " << e.what() << endl;
+ throw;
+ }
+ assert(!haveJournalFiles());
+ log(1) << "removeJournalFiles end" << endl;
+ }
+
+ /** at clean shutdown */
+ bool okToCleanUp = false; // successful recovery would set this to true
+ void Journal::cleanup() {
+ if( !okToCleanUp )
+ return;
+
+ try {
+ scoped_lock lk(_curLogFileMutex);
+ closeCurrentJournalFile();
+ removeJournalFiles();
+ }
+ catch(std::exception& e) {
+ log() << "error couldn't remove journal file during shutdown " << e.what() << endl;
+ throw;
+ }
+ }
+ void journalCleanup() { j.cleanup(); }
+
+ bool _preallocateIsFaster() {
+ bool faster = false;
+ filesystem::path p = getJournalDir() / "tempLatencyTest";
+ try { remove(p); } catch(...) { }
+ try {
+ AlignedBuilder b(8192);
+ int millis[2];
+ const int N = 50;
+ for( int pass = 0; pass < 2; pass++ ) {
+ LogFile f(p.string());
+ Timer t;
+ for( int i = 0 ; i < N; i++ ) {
+ f.synchronousAppend(b.buf(), 8192);
+ }
+ millis[pass] = t.millis();
+ // second time through, file exists and is prealloc case
+ }
+ int diff = millis[0] - millis[1];
+ if( diff > 2 * N ) {
+ // prealloc case at least 2ms faster per append, on average?
+ faster = true;
+ log() << "preallocateIsFaster=true " << diff / (1.0*N) << endl;
+ }
+ }
+ catch(...) {
+ log() << "info preallocateIsFaster couldn't run; returning false" << endl;
+ }
+ try { remove(p); } catch(...) { }
+ return faster;
+ }
+ bool preallocateIsFaster() {
+ return _preallocateIsFaster() && _preallocateIsFaster() && _preallocateIsFaster();
+ }
+
+ // throws
+ void preallocateFile(filesystem::path p, unsigned long long len) {
+ if( exists(p) )
+ return;
+
+ const unsigned BLKSZ = 1024 * 1024;
+ log() << "preallocating a journal file " << p.string() << endl;
+ LogFile f(p.string());
+ AlignedBuilder b(BLKSZ);
+ for( unsigned long long x = 0; x < len; x += BLKSZ ) {
+ f.synchronousAppend(b.buf(), BLKSZ);
+ }
+ }
+
+ // throws
+ void _preallocateFiles() {
+ for( int i = 0; i <= 2; i++ ) {
+ string fn = str::stream() << "prealloc." << i;
+ filesystem::path filepath = getJournalDir() / fn;
+
+ unsigned long long limit = Journal::DataLimit;
+ if( debug && i == 1 ) {
+ // moving 32->64, the prealloc files would be short. that is "ok", but we want to exercise that
+ // case, so we force it here when _DEBUG is set by arbitrarily stopping prealloc at a low
+ // limit for one file. also we want to be able to change the constant in the future without a lot of
+ // work anyway.
+ limit = 16 * 1024 * 1024;
+ }
+ preallocateFile(filepath, limit);
+ }
+ }
+
+ void preallocateFiles() {
+ if( preallocateIsFaster() ||
+ exists(getJournalDir()/"prealloc.0") || // if enabled previously, keep using
+ exists(getJournalDir()/"prealloc.1") ) {
+ usingPreallocate = true;
+ try {
+ _preallocateFiles();
+ }
+ catch(...) {
+ log() << "warning caught exception in preallocateFiles, continuing" << endl;
+ }
+ }
+ j.open();
+ }
+
+ void removeOldJournalFile(path p) {
+ if( usingPreallocate ) {
+ try {
+ for( int i = 0; i <= 2; i++ ) {
+ string fn = str::stream() << "prealloc." << i;
+ filesystem::path filepath = getJournalDir() / fn;
+ if( !filesystem::exists(filepath) ) {
+ // we can recycle this file into this prealloc file location
+ boost::filesystem::rename(p, filepath);
+ return;
+ }
+ }
+ } catch(...) {
+ log() << "warning exception in dur::removeOldJournalFile " << p.string() << endl;
+ // fall through and try to delete the file
+ }
+ }
+
+ // already have 3 prealloc files, so delete this file
+ try {
+ boost::filesystem::remove(p);
+ }
+ catch(...) {
+ log() << "warning exception removing " << p.string() << endl;
+ }
+ }
+
+ // find a prealloc.<n> file, presumably to take and use
+ path findPrealloced() {
+ try {
+ for( int i = 0; i <= 2; i++ ) {
+ string fn = str::stream() << "prealloc." << i;
+ filesystem::path filepath = getJournalDir() / fn;
+ if( filesystem::exists(filepath) )
+ return filepath;
+ }
+ } catch(...) {
+ log() << "warning exception in dur::findPrealloced()" << endl;
+ }
+ return path();
+ }
+
+ /** assure journal/ dir exists. throws. call during startup. */
+ void journalMakeDir() {
+ j.init();
+
+ filesystem::path p = getJournalDir();
+ j.dir = p.string();
+ log() << "journal dir=" << j.dir << endl;
+ if( !exists(j.dir) ) {
+ try {
+ create_directory(j.dir);
+ }
+ catch(std::exception& e) {
+ log() << "error creating directory " << j.dir << ' ' << e.what() << endl;
+ throw;
+ }
+ }
+ }
+
+ void Journal::_open() {
+ _curFileId = 0;
+ assert( _curLogFile == 0 );
+ path fname = getFilePathFor(_nextFileNumber);
+
+ // if we have a prealloced file, use it
+ {
+ path p = findPrealloced();
+ if( !p.empty() ) {
+ try {
+ {
+ // JHeader::fileId must be updated before renaming to be race-safe
+ LogFile f(p.string());
+ JHeader h(p.string());
+ AlignedBuilder b(8192);
+ b.appendStruct(h);
+ f.synchronousAppend(b.buf(), b.len());
+ }
+ boost::filesystem::rename(p, fname);
+ }
+ catch(...) {
+ log() << "warning couldn't write to / rename file " << p.string() << endl;
+ }
+ }
+ }
+
+ _curLogFile = new LogFile(fname.string());
+ _nextFileNumber++;
+ {
+ JHeader h(fname.string());
+ _curFileId = h.fileId;
+ assert(_curFileId);
+ AlignedBuilder b(8192);
+ b.appendStruct(h);
+ _curLogFile->synchronousAppend(b.buf(), b.len());
+ }
+ }
+
+ void Journal::init() {
+ assert( _curLogFile == 0 );
+ MongoFile::notifyPreFlush = preFlush;
+ MongoFile::notifyPostFlush = postFlush;
+ }
+
+ void Journal::open() {
+ assert( MongoFile::notifyPreFlush == preFlush );
+ mutex::scoped_lock lk(_curLogFileMutex);
+ _open();
+ }
+
+ void LSNFile::set(unsigned long long x) {
+ memset(this, 0, sizeof(*this));
+ lsn = x;
+ checkbytes = ~x;
+ }
+
+ /** logs details of the situation and returns 0 if anything surprising is found in the LSNFile.
+ if something highly surprising is found, throws to abort.
+ */
+ unsigned long long LSNFile::get() {
+ uassert(13614, "unexpected version number of lsn file in journal/ directory", ver == 0);
+ if( ~lsn != checkbytes ) {
+ log() << "lsnfile not valid. recovery will be from log start. lsn: " << hex << lsn << " checkbytes: " << hex << checkbytes << endl;
+ return 0;
+ }
+ return lsn;
+ }
+
+ /** called during recovery (the error message text below assumes that)
+ */
+ unsigned long long journalReadLSN() {
+ if( !debug ) {
+ // in a nondebug build, for now, be conservative until more tests are written, and apply the whole journal.
+ // however we will still write the lsn file to exercise that code, and use it in _DEBUG builds.
+ return 0;
+ }
+
+ if( !MemoryMappedFile::exists(lsnPath()) ) {
+ log() << "info no lsn file in journal/ directory" << endl;
+ return 0;
+ }
+
+ try {
+ // os can flush as it likes. if it flushes slowly, we will just do extra work on recovery.
+ // however, given we actually close the file when writing, that seems unlikely.
+ LSNFile L;
+ File f;
+ f.open(lsnPath().string().c_str());
+ assert(f.is_open());
+ f.read(0,(char*)&L, sizeof(L));
+ unsigned long long lsn = L.get();
+ return lsn;
+ }
+ catch(std::exception& e) {
+ uasserted(13611, str::stream() << "can't read lsn file in journal directory : " << e.what());
+ }
+ return 0;
+ }
+
+ unsigned long long getLastDataFileFlushTime() {
+ return j.lastFlushTime();
+ }
+
+ /** remember "last sequence number" to speed recoveries
+ concurrency: called by durThread only.
+ */
+ void Journal::updateLSNFile() {
+ if( !_writeToLSNNeeded )
+ return;
+ durThreadMain.assertWithin();
+ _writeToLSNNeeded = false;
+ try {
+ // os can flush as it likes. if it flushes slowly, we will just do extra work on recovery.
+ // however, given we actually close the file, that seems unlikely.
+ File f;
+ f.open(lsnPath().string().c_str());
+ if( !f.is_open() ) {
+ // can get 0 if an i/o error
+ log() << "warning: open of lsn file failed" << endl;
+ return;
+ }
+ log() << "lsn set " << _lastFlushTime << endl;
+ LSNFile lsnf;
+ lsnf.set(_lastFlushTime);
+ f.write(0, (char*)&lsnf, sizeof(lsnf));
+ }
+ catch(std::exception& e) {
+ log() << "warning: write to lsn file failed " << e.what() << endl;
+ // keep running (ignore the error). recovery will be slow.
+ }
+ }
+
+ void Journal::preFlush() {
+ j._preFlushTime = Listener::getElapsedTimeMillis();
+ }
+
+ void Journal::postFlush() {
+ j._lastFlushTime = j._preFlushTime;
+ j._writeToLSNNeeded = true;
+ }
+
+ // call from within _curLogFileMutex
+ void Journal::closeCurrentJournalFile() {
+ if (!_curLogFile)
+ return;
+
+ JFile jf;
+ jf.filename = _curLogFile->_name;
+ jf.lastEventTimeMs = Listener::getElapsedTimeMillis();
+ _oldJournalFiles.push_back(jf);
+
+ delete _curLogFile; // close
+ _curLogFile = 0;
+ _written = 0;
+ }
+
+ /** remove older journal files.
+ be in _curLogFileMutex but not dbMutex when calling
+ */
+ void Journal::removeUnneededJournalFiles() {
+ while( !_oldJournalFiles.empty() ) {
+ JFile f = _oldJournalFiles.front();
+
+ if( f.lastEventTimeMs < _lastFlushTime + ExtraKeepTimeMs ) {
+ // eligible for deletion
+ path p( f.filename );
+ log() << "old journal file will be removed: " << f.filename << endl;
+ removeOldJournalFile(p);
+ }
+ else {
+ break;
+ }
+
+ _oldJournalFiles.pop_front();
+ }
+ }
+
+ /** check if time to rotate files. assure a file is open.
+ done separately from the journal() call as we can do this part
+ outside of lock.
+ thread: durThread()
+ */
+ void journalRotate() {
+ j.rotate();
+ }
+ void Journal::rotate() {
+ assert( !dbMutex.atLeastReadLocked() );
+ durThreadMain.assertWithin();
+
+ scoped_lock lk(_curLogFileMutex);
+
+ if ( inShutdown() || !_curLogFile )
+ return;
+
+ j.updateLSNFile();
+
+ if( _curLogFile && _written < DataLimit )
+ return;
+
+ if( _curLogFile ) {
+
+ closeCurrentJournalFile();
+
+ removeUnneededJournalFiles();
+ }
+
+ try {
+ Timer t;
+ _open();
+ int ms = t.millis();
+ if( ms >= 200 ) {
+ log() << "DR101 latency warning on journal file open " << ms << "ms" << endl;
+ }
+ }
+ catch(std::exception& e) {
+ log() << "warning exception opening journal file " << e.what() << endl;
+ throw;
+ }
+ }
+
+ /** write to journal
+ */
+ void journal(const AlignedBuilder& b) {
+ j.journal(b);
+ }
+ void Journal::journal(const AlignedBuilder& b) {
+ try {
+ mutex::scoped_lock lk(_curLogFileMutex);
+
+ // must already be open -- so that _curFileId is correct for previous buffer building
+ assert( _curLogFile );
+
+ stats.curr->_journaledBytes += b.len();
+ _written += b.len();
+ _curLogFile->synchronousAppend((void *) b.buf(), b.len());
+ }
+ catch(std::exception& e) {
+ log() << "warning exception in dur::journal " << e.what() << endl;
+ throw;
+ }
+ }
+
+ }
+}
+
+/* todo
+ test (and handle) disk full on journal append. best quick thing to do is to terminate.
+ if we roll back operations, there are nuances such as is ReplSetImpl::lastOpTimeWritten too new in ram then?
+*/
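
LSNFile::set()/get() above protect the stored sequence number with a bitwise-complement check (checkbytes = ~lsn): if the record was torn or zeroed on disk, get() notices and recovery conservatively replays from the start of the journal. A minimal sketch of that check (LsnRecord is illustrative, not the on-disk LSNFile struct):

    #include <cstdio>
    #include <cstring>

    struct LsnRecord {                       // mirrors the lsn/checkbytes pair only
        unsigned long long lsn;
        unsigned long long checkbytes;

        void set(unsigned long long x) { lsn = x; checkbytes = ~x; }

        // the stored lsn, or 0 if the record looks damaged
        unsigned long long get() const {
            if( ~lsn != checkbytes ) {
                printf("lsn record not valid; recovery will start from log begin\n");
                return 0;
            }
            return lsn;
        }
    };

    int main() {
        LsnRecord r;
        r.set(123456789ULL);
        printf("%llu\n", r.get());           // 123456789
        memset(&r, 0, sizeof(r));            // simulate a torn/zeroed write
        printf("%llu\n", r.get());           // 0 -> conservative full-journal recovery
        return 0;
    }
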
diff --git a/db/dur_journal.h b/db/dur_journal.h
new file mode 100644
index 0000000..81957b5
--- /dev/null
+++ b/db/dur_journal.h
@@ -0,0 +1,68 @@
+// @file dur_journal.h
+
+/**
+* Copyright (C) 2010 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+namespace mongo {
+ class AlignedBuilder;
+
+ namespace dur {
+
+ /** true if ok to clean up journal files at termination. otherwise, the journal files will be retained.
+ */
+ extern bool okToCleanUp;
+
+ /** at termination after db files closed & fsynced */
+ void journalCleanup();
+
+ /** assure journal/ dir exists. throws */
+ void journalMakeDir();
+
+ /** check if time to rotate files; assure a file is open.
+ done separately from the journal() call as we can do this part
+ outside of lock.
+ only called by durThread.
+ */
+ void journalRotate();
+
+ /** write/append to journal file.
+ @param buf - a buffer that will be written to the journal.
+ will not return until on disk
+ */
+ void journal(const AlignedBuilder& buf);
+
+ /** flag that something has gone wrong during writing to the journal
+ (not for recovery mode)
+ */
+ void journalingFailure(const char *msg);
+
+ /** read lsn from disk from the last run before doing recovery */
+ unsigned long long journalReadLSN();
+
+ unsigned long long getLastDataFileFlushTime();
+
+ /** never throws.
+ @return true if there are any journal files in the journal dir.
+ */
+ bool haveJournalFiles();
+
+ // in case disk controller buffers writes
+ const long long ExtraKeepTimeMs = 10000;
+
+ }
+}
diff --git a/db/dur_journalformat.h b/db/dur_journalformat.h
new file mode 100644
index 0000000..d29f94d
--- /dev/null
+++ b/db/dur_journalformat.h
@@ -0,0 +1,166 @@
+// @file dur_journalformat.h The format of our journal files.
+
+/**
+* Copyright (C) 2010 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+#include "../util/md5.hpp"
+
+namespace mongo {
+
+ namespace dur {
+
+#pragma pack(1)
+ /** beginning header for a journal/j._<n> file
+ there is nothing important in this header at this time, except perhaps the version #.
+ */
+ struct JHeader {
+ JHeader() { }
+ JHeader(string fname);
+
+ char magic[2]; // "j\n". j means journal, then a linefeed, fwiw if you were to run "less" on the file or something...
+
+ // 0x4142 is ascii-readable if you look at the file with head/less -- thus the starting values were near
+ // that. simply incrementing the version # is safe on a forward basis.
+ enum { CurrentVersion = 0x4147 };
+ unsigned short _version;
+
+ // these are just for diagnostic ease (make header more useful as plain text)
+ char n1; // '\n'
+ char ts[20]; // ascii timestamp of file generation. for user reading, not used by code.
+ char n2; // '\n'
+ char dbpath[128]; // path/filename of this file for human reading and diagnostics. not used by code.
+ char n3, n4; // '\n', '\n'
+
+ unsigned long long fileId; // unique identifier that will be in each JSectHeader. important as we recycle prealloced files
+
+ char reserved3[8026]; // 8KB total for the file header
+ char txt2[2]; // "\n\n" at the end
+
+ bool versionOk() const { return _version == CurrentVersion; }
+ bool valid() const { return magic[0] == 'j' && txt2[1] == '\n' && fileId; }
+ };
+
+ /** "Section" header. A section corresponds to a group commit.
+ len is length of the entire section including header and footer.
+ */
+ struct JSectHeader {
+ unsigned len; // length in bytes of the whole section
+ unsigned long long seqNumber; // sequence number that can be used on recovery to not do too much work
+ unsigned long long fileId; // matches JHeader::fileId
+ };
+
+ /** an individual write operation within a group commit section. Either the entire section should
+ be applied, or nothing. (We check the md5 for the whole section before doing anything on recovery.)
+ */
+ struct JEntry {
+ enum OpCodes {
+ OpCode_Footer = 0xffffffff,
+ OpCode_DbContext = 0xfffffffe,
+ OpCode_FileCreated = 0xfffffffd,
+ OpCode_DropDb = 0xfffffffc,
+ OpCode_Min = 0xfffff000
+ };
+ union {
+ unsigned len; // length in bytes of the data of the JEntry. does not include the JEntry header
+ OpCodes opcode;
+ };
+
+ unsigned ofs; // offset in file
+
+ // sentinel and masks for _fileNo
+ enum {
+ DotNsSuffix = 0x7fffffff, // ".ns" file
+ LocalDbBit = 0x80000000 // assuming "local" db instead of using the JDbContext
+ };
+ int _fileNo; // high bit is set to indicate it should be the <dbpath>/local database
+ // char data[len] follows
+
+ const char * srcData() const {
+ const int *i = &_fileNo;
+ return (const char *) (i+1);
+ }
+
+ int getFileNo() const { return _fileNo & (~LocalDbBit); }
+ void setFileNo(int f) { _fileNo = f; }
+ bool isNsSuffix() const { return getFileNo() == DotNsSuffix; }
+
+ void setLocalDbContextBit() { _fileNo |= LocalDbBit; }
+ bool isLocalDbContext() const { return _fileNo & LocalDbBit; }
+ void clearLocalDbContextBit() { _fileNo = getFileNo(); }
+
+ static string suffix(int fileno) {
+ if( fileno == DotNsSuffix ) return "ns";
+ stringstream ss;
+ ss << fileno;
+ return ss.str();
+ }
+ };
+
+ /** group commit section footer. md5 is a key field. */
+ struct JSectFooter {
+ JSectFooter(const void* begin, int len) { // needs buffer to compute hash
+ sentinel = JEntry::OpCode_Footer;
+ reserved = 0;
+ magic[0] = magic[1] = magic[2] = magic[3] = '\n';
+
+ // skip section header since size modified after hashing
+ (const char*&)begin += sizeof(JSectHeader);
+ len -= sizeof(JSectHeader);
+
+ md5(begin, len, hash);
+ }
+ unsigned sentinel;
+ md5digest hash; // unsigned char[16]
+ unsigned long long reserved;
+ char magic[4]; // "\n\n\n\n"
+
+ bool checkHash(const void* begin, int len) const {
+ // skip section header since size modified after hashing
+ (const char*&)begin += sizeof(JSectHeader);
+ len -= sizeof(JSectHeader);
+ md5digest current;
+ md5(begin, len, current);
+ DEV log() << "checkHash len:" << len << " hash:" << toHex(hash, 16) << " current:" << toHex(current, 16) << endl;
+ return (memcmp(hash, current, sizeof(hash)) == 0);
+ }
+ };
+
+ /** declares "the next entry(s) are for this database / file path prefix" */
+ struct JDbContext {
+ JDbContext() : sentinel(JEntry::OpCode_DbContext) { }
+ const unsigned sentinel; // compare to JEntry::len -- OpCode_DbContext is our sentinel
+ //char dbname[];
+ };
+
+ /** "last sequence number" */
+ struct LSNFile {
+ unsigned ver;
+ unsigned reserved2;
+ unsigned long long lsn;
+ unsigned long long checkbytes;
+ unsigned long long reserved[8];
+
+ void set(unsigned long long lsn);
+ unsigned long long get();
+ };
+
+#pragma pack()
+
+ }
+
+}
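
JEntry above packs routing information into _fileNo: the high bit marks a write to the local database, and the special value 0x7fffffff means the write targets the .ns file rather than a numbered data file. A hedged sketch of that encoding, reproducing the accessor logic as free functions outside the struct:

    #include <cstdio>
    #include <sstream>
    #include <string>

    static const unsigned DotNsSuffix = 0x7fffffff;   // ".ns" file
    static const unsigned LocalDbBit  = 0x80000000;   // entry applies to the "local" db

    static int  getFileNo(unsigned fileNo)        { return (int)(fileNo & ~LocalDbBit); }
    static bool isLocalDbContext(unsigned fileNo) { return (fileNo & LocalDbBit) != 0; }

    static std::string suffix(int fileno) {
        if( fileno == (int) DotNsSuffix ) return "ns";
        std::stringstream ss;
        ss << fileno;
        return ss.str();
    }

    int main() {
        unsigned f = 3;                       // write to data file <db>.3 ...
        f |= LocalDbBit;                      // ... of the local database
        printf("fileNo=%d local=%d suffix=%s\n",
               getFileNo(f), (int) isLocalDbContext(f), suffix(getFileNo(f)).c_str());
        printf("suffix=%s\n", suffix((int) DotNsSuffix).c_str());   // the .ns case
        return 0;
    }
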
diff --git a/db/dur_journalimpl.h b/db/dur_journalimpl.h
new file mode 100644
index 0000000..9566dff
--- /dev/null
+++ b/db/dur_journalimpl.h
@@ -0,0 +1,101 @@
+// @file dur_journalimpl.h
+
+/**
+* Copyright (C) 2010 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+#include "../util/logfile.h"
+
+namespace mongo {
+ namespace dur {
+
+ /** the writeahead journal for durability */
+ class Journal {
+ public:
+ string dir; // set by journalMakeDir() during initialization
+
+ Journal();
+
+ /** call during startup by journalMakeDir() */
+ void init();
+
+ /** check if time to rotate files. assure a file is open.
+ done separately from the journal() call as we can do this part
+ outside of lock.
+ thread: durThread()
+ */
+ void rotate();
+
+ /** write to journal
+ */
+ void journal(const AlignedBuilder& b);
+
+ boost::filesystem::path getFilePathFor(int filenumber) const;
+
+ unsigned long long lastFlushTime() const { return _lastFlushTime; }
+ void cleanup();
+
+ // Rotate after reaching this data size in a journal (j._<n>) file
+ // We use a smaller size for 32 bit as the journal is mmapped during recovery (only)
+ // Note if you take a set of datafiles, including journal files, from 32->64 or vice-versa, it must
+ // work. (and should as-is)
+ static const unsigned long long DataLimit = (sizeof(void*)==4) ? 256 * 1024 * 1024 : 1 * 1024 * 1024 * 1024;
+
+ unsigned long long curFileId() const { return _curFileId; }
+
+ void assureLogFileOpen() {
+ mutex::scoped_lock lk(_curLogFileMutex);
+ if( _curLogFile == 0 )
+ _open();
+ }
+
+ /** open a journal file to journal operations to. */
+ void open();
+
+ private:
+ void _open();
+ void closeCurrentJournalFile();
+ void removeUnneededJournalFiles();
+
+ unsigned long long _written; // bytes written so far to the current journal (log) file
+ unsigned _nextFileNumber;
+
+ mutex _curLogFileMutex;
+
+ LogFile *_curLogFile; // use _curLogFileMutex
+ unsigned long long _curFileId; // current file id see JHeader::fileId
+
+ struct JFile {
+ string filename;
+ unsigned long long lastEventTimeMs;
+ };
+
+ // files which have been closed but not unlinked (rotated out) yet
+ // ordered oldest to newest
+ list<JFile> _oldJournalFiles; // use _curLogFileMutex
+
+ // lsn related
+ static void preFlush();
+ static void postFlush();
+ unsigned long long _preFlushTime;
+ unsigned long long _lastFlushTime; // data < this time is fsynced in the datafiles (unless hard drive controller is caching)
+ bool _writeToLSNNeeded;
+ void updateLSNFile();
+ };
+
+ }
+}
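
Journal::rotate() above starts a new j._<n> file once roughly DataLimit bytes have been appended to the current one (the real rotate() also updates the lsn file and removes unneeded old journal files). A small sketch of the naming and the size-based rotation decision; RotationSketch is illustrative only:

    #include <cstdio>
    #include <sstream>
    #include <string>

    struct RotationSketch {
        unsigned long long written;           // bytes appended to the current j._<n> file
        int nextFileNumber;
        RotationSketch() : written(0), nextFileNumber(0) { }

        // smaller limit on 32 bit since the journal is mmapped during recovery
        static unsigned long long dataLimit() {
            return (sizeof(void*) == 4) ? 256ULL*1024*1024 : 1024ULL*1024*1024;
        }
        std::string fileNameFor(int n) const {
            std::stringstream ss;
            ss << "j._" << n;
            return ss.str();
        }
        void append(unsigned long long len) {
            if( nextFileNumber == 0 || written >= dataLimit() ) {   // rotate: open the next file
                written = 0;
                printf("opening %s\n", fileNameFor(nextFileNumber++).c_str());
            }
            written += len;
        }
    };

    int main() {
        RotationSketch j;
        for( int i = 0; i < 5; i++ )
            j.append(400ULL*1024*1024);       // 400MB appends force a rotation even at the 1GB limit
        return 0;
    }
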
diff --git a/db/dur_preplogbuffer.cpp b/db/dur_preplogbuffer.cpp
new file mode 100644
index 0000000..1648e89
--- /dev/null
+++ b/db/dur_preplogbuffer.cpp
@@ -0,0 +1,192 @@
+// @file dur_preplogbuffer.cpp
+
+/**
+* Copyright (C) 2009 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/*
+ PREPLOGBUFFER
+ we will build an output buffer ourselves and then use O_DIRECT
+ we could be in read lock for this
+ for very large objects write directly to redo log in situ?
+ @see https://docs.google.com/drawings/edit?id=1TklsmZzm7ohIZkwgeK6rMvsdaR13KjtJYMsfLr175Zc
+*/
+
+#include "pch.h"
+#include "cmdline.h"
+#include "dur.h"
+#include "dur_journal.h"
+#include "dur_journalimpl.h"
+#include "dur_commitjob.h"
+#include "../util/mongoutils/hash.h"
+#include "../util/mongoutils/str.h"
+#include "../util/alignedbuilder.h"
+#include "../util/timer.h"
+#include "dur_stats.h"
+
+using namespace mongoutils;
+
+namespace mongo {
+ namespace dur {
+
+ extern Journal j;
+
+ RelativePath local = RelativePath::fromRelativePath("local");
+
+ MongoMMF* findMMF_inlock(void *ptr, size_t &ofs) {
+ MongoMMF *f = privateViews.find_inlock(ptr, ofs);
+ if( f == 0 ) {
+ string s = str::stream() << "view pointer cannot be resolved " << (size_t) ptr;
+ journalingFailure(s.c_str()); // asserts
+ }
+ return f;
+ }
+
+ /** put the basic write operation into the buffer (bb) to be journaled */
+ void prepBasicWrite_inlock(AlignedBuilder&bb, const WriteIntent *i, RelativePath& lastDbPath) {
+ size_t ofs = 1;
+ MongoMMF *mmf = findMMF_inlock(i->start(), /*out*/ofs);
+ dassert( i->w_ptr == 0 );
+
+ if( !mmf->willNeedRemap() ) {
+ // tag this mmf as needing a remap of its private view later.
+ // usually it will already be dirty/already set, so we do the if above first
+ // to avoid possibility of cpu cache line contention
+ mmf->willNeedRemap() = true;
+ }
+
+ // since we have already looked up the mmf, we go ahead and remember the write view location
+ // so we don't have to find the MongoMMF again later in WRITETODATAFILES()
+ dassert( i->w_ptr == 0 );
+ i->w_ptr = ((char*)mmf->view_write()) + ofs;
+
+ JEntry e;
+ e.len = min(i->length(), (unsigned)(mmf->length() - ofs)); // don't write past the end of the file
+ assert( ofs <= 0x80000000 );
+ e.ofs = (unsigned) ofs;
+ e.setFileNo( mmf->fileSuffixNo() );
+ if( mmf->relativePath() == local ) {
+ e.setLocalDbContextBit();
+ }
+ else if( mmf->relativePath() != lastDbPath ) {
+ lastDbPath = mmf->relativePath();
+ JDbContext c;
+ bb.appendStruct(c);
+ bb.appendStr(lastDbPath.toString());
+ }
+ bb.appendStruct(e);
+#if defined(_EXPERIMENTAL)
+ i->ofsInJournalBuffer = bb.len();
+#endif
+ bb.appendBuf(i->start(), e.len);
+
+ if (e.len != (unsigned)i->length()) {
+ log() << "dur info splitting prepBasicWrite at boundary" << endl;
+
+ // This only happens if we write to the last byte in a file and
+ // the first byte in another file that is mapped adjacently. I
+ // think most OSs leave at least a one page gap between
+ // mappings, but better to be safe.
+
+ WriteIntent next ((char*)i->start() + e.len, i->length() - e.len);
+ prepBasicWrite_inlock(bb, &next, lastDbPath);
+ }
+ }
+
+ /** basic write ops / write intents. note there is no particular order to these: if we have
+ two writes to the same location during the group commit interval, it is likely
+ (although not assured) that it will be journaled here only once.
+ */
+ void prepBasicWrites(AlignedBuilder& bb) {
+ scoped_lock lk(privateViews._mutex());
+
+ // each time events switch to a different database we journal a JDbContext
+ RelativePath lastDbPath;
+
+ for( set<WriteIntent>::iterator i = commitJob.writes().begin(); i != commitJob.writes().end(); i++ ) {
+ prepBasicWrite_inlock(bb, &(*i), lastDbPath);
+ }
+ }
+
+ void resetLogBuffer(AlignedBuilder& bb) {
+ bb.reset();
+
+ // JSectHeader
+ JSectHeader h;
+ h.len = (unsigned) 0xffffffff; // total length, will fill in later
+ h.seqNumber = getLastDataFileFlushTime();
+ h.fileId = j.curFileId();
+
+ bb.appendStruct(h);
+ }
+
+ /** we will build an output buffer ourselves and then use O_DIRECT
+ we could be in read lock for this
+ caller handles locking
+ */
+ void _PREPLOGBUFFER() {
+ assert( cmdLine.dur );
+
+ {
+ // now that we are locked, fully drain deferred notes of write intents
+ DEV dbMutex.assertAtLeastReadLocked();
+ Writes& writes = commitJob.wi();
+ writes._deferred.invoke();
+ writes._drained = true;
+ }
+
+ AlignedBuilder& bb = commitJob._ab;
+ resetLogBuffer(bb);
+
+ // ops other than basic writes (DurOp's)
+ {
+ for( vector< shared_ptr<DurOp> >::iterator i = commitJob.ops().begin(); i != commitJob.ops().end(); ++i ) {
+ (*i)->serialize(bb);
+ }
+ }
+
+ {
+ prepBasicWrites(bb);
+ }
+
+ {
+ JSectFooter f(bb.buf(), bb.len());
+ bb.appendStruct(f);
+ }
+
+ {
+ // pad to alignment, and set the total section length in the JSectHeader
+ assert( 0xffffe000 == (~(Alignment-1)) );
+ unsigned L = (bb.len() + Alignment-1) & (~(Alignment-1));
+ dassert( L >= (unsigned) bb.len() );
+
+ *((unsigned*)bb.atOfs(0)) = L;
+
+ unsigned padding = L - bb.len();
+ bb.skip(padding);
+ dassert( bb.len() % Alignment == 0 );
+ }
+
+ return;
+ }
+ void PREPLOGBUFFER() {
+ Timer t;
+ j.assureLogFileOpen(); // so fileId is set
+ _PREPLOGBUFFER();
+ stats.curr->_prepLogBufferMicros += t.micros();
+ }
+
+ }
+}
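The padding block at the end of _PREPLOGBUFFER is plain round-up-to-alignment; a quick worked check with illustrative numbers (Alignment is 8192, as defined in durop.h):

    // illustrative check only, not part of the patch
    const unsigned Alignment = 8192;
    unsigned len = 5000;                                      // builder length before padding
    unsigned L = (len + Alignment - 1) & (~(Alignment - 1));  // = 8192
    unsigned padding = L - len;                               // = 3192 bytes appended via bb.skip()
    // L then overwrites the 0xffffffff placeholder at offset 0 of the JSectHeader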
diff --git a/db/dur_recover.cpp b/db/dur_recover.cpp
new file mode 100644
index 0000000..1480a59
--- /dev/null
+++ b/db/dur_recover.cpp
@@ -0,0 +1,457 @@
+// @file dur_recover.cpp crash recovery via the journal
+
+/**
+* Copyright (C) 2009 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+
+#include "dur.h"
+#include "dur_recover.h"
+#include "dur_journal.h"
+#include "dur_journalformat.h"
+#include "durop.h"
+#include "namespace.h"
+#include "../util/mongoutils/str.h"
+#include "../util/bufreader.h"
+#include "pdfile.h"
+#include "database.h"
+#include "db.h"
+#include "../util/unittest.h"
+#include "cmdline.h"
+#include "curop.h"
+#include "mongommf.h"
+
+#include <sys/stat.h>
+#include <fcntl.h>
+
+using namespace mongoutils;
+
+namespace mongo {
+
+ namespace dur {
+
+ struct ParsedJournalEntry { /*copyable*/
+ ParsedJournalEntry() : e(0) { }
+
+ // relative path of database for the operation.
+ // might be a pointer into the mmapped journal file
+ const char *dbName;
+
+ // these are pointers into the memory mapped journal file
+ const JEntry *e; // local db sentinel is already parsed out here into dbName
+
+ // if not one of the two simple JEntry's above, this is the operation:
+ shared_ptr<DurOp> op;
+ };
+
+ void removeJournalFiles();
+ path getJournalDir();
+
+ /** get journal filenames, in order. throws if unexpected content found */
+ static void getFiles(path dir, vector<path>& files) {
+ map<unsigned,path> m;
+ for ( filesystem::directory_iterator i( dir );
+ i != filesystem::directory_iterator();
+ ++i ) {
+ filesystem::path filepath = *i;
+ string fileName = filesystem::path(*i).leaf();
+ if( str::startsWith(fileName, "j._") ) {
+ unsigned u = str::toUnsigned( str::after(fileName, '_') );
+ if( m.count(u) ) {
+ uasserted(13531, str::stream() << "unexpected files in journal directory " << dir.string() << " : " << fileName);
+ }
+ m.insert( pair<unsigned,path>(u,filepath) );
+ }
+ }
+ for( map<unsigned,path>::iterator i = m.begin(); i != m.end(); ++i ) {
+ if( i != m.begin() && m.count(i->first - 1) == 0 ) {
+ uasserted(13532,
+ str::stream() << "unexpected file in journal directory " << dir.string()
+ << " : " << filesystem::path(i->second).leaf() << " : can't find its preceeding file");
+ }
+ files.push_back(i->second);
+ }
+ }
+
+ /** read through the memory mapped data of a journal file (journal/j._<n> file)
+ throws
+ */
+ class JournalSectionIterator : boost::noncopyable {
+ public:
+ JournalSectionIterator(const void *p, unsigned len, bool doDurOps)
+ : _br(p, len)
+ , _sectHead(static_cast<const JSectHeader*>(_br.skip(sizeof(JSectHeader))))
+ , _lastDbName(NULL)
+ , _doDurOps(doDurOps)
+ {}
+
+ bool atEof() const { return _br.atEof(); }
+
+ unsigned long long seqNumber() const { return _sectHead->seqNumber; }
+
+ /** get the next entry from the log. this function parses and combines JDbContext and JEntry's.
+ * @return true if got an entry. false at successful end of section (and no entry returned).
+ * throws on premature end of section.
+ */
+ bool next(ParsedJournalEntry& e) {
+ unsigned lenOrOpCode;
+ _br.read(lenOrOpCode);
+
+ if (lenOrOpCode > JEntry::OpCode_Min) {
+ switch( lenOrOpCode ) {
+
+ case JEntry::OpCode_Footer: {
+ if (_doDurOps) {
+ const char* pos = (const char*) _br.pos();
+ pos -= sizeof(lenOrOpCode); // rewind to include OpCode
+ const JSectFooter& footer = *(const JSectFooter*)pos;
+ int len = pos - (char*)_sectHead;
+ if (!footer.checkHash(_sectHead, len)) {
+ massert(13594, str::stream() << "Journal checksum doesn't match. recorded: "
+ << toHex(footer.hash, sizeof(footer.hash))
+ << " actual: " << md5simpledigest(_sectHead, len)
+ , false);
+ }
+ }
+ return false; // false return value denotes end of section
+ }
+
+ case JEntry::OpCode_FileCreated:
+ case JEntry::OpCode_DropDb: {
+ e.dbName = 0;
+ boost::shared_ptr<DurOp> op = DurOp::read(lenOrOpCode, _br);
+ if (_doDurOps) {
+ e.op = op;
+ }
+ return true;
+ }
+
+ case JEntry::OpCode_DbContext: {
+ _lastDbName = (const char*) _br.pos();
+ const unsigned limit = std::min((unsigned)Namespace::MaxNsLen, _br.remaining());
+ const unsigned len = strnlen(_lastDbName, limit);
+ massert(13533, "problem processing journal file during recovery", _lastDbName[len] == '\0');
+ _br.skip(len+1); // skip '\0' too
+ _br.read(lenOrOpCode);
+ }
+ // fall through as a basic operation always follows jdbcontext, and we don't have anything to return yet
+
+ default:
+ // fall through
+ ;
+ }
+ }
+
+ // JEntry - a basic write
+ assert( lenOrOpCode && lenOrOpCode < JEntry::OpCode_Min );
+ _br.rewind(4);
+ e.e = (JEntry *) _br.skip(sizeof(JEntry));
+ e.dbName = e.e->isLocalDbContext() ? "local" : _lastDbName;
+ assert( e.e->len == lenOrOpCode );
+ _br.skip(e.e->len);
+ return true;
+ }
+ private:
+ BufReader _br;
+ const JSectHeader* _sectHead;
+ const char *_lastDbName; // pointer into the mmapped journal file
+ const bool _doDurOps;
+ };
+
+ static string fileName(const char* dbName, int fileNo) {
+ stringstream ss;
+ ss << dbName << '.';
+ assert( fileNo >= 0 );
+ if( fileNo == JEntry::DotNsSuffix )
+ ss << "ns";
+ else
+ ss << fileNo;
+
+ // relative name -> full path name
+ path full(dbpath);
+ full /= ss.str();
+ return full.string();
+ }
+
+ RecoveryJob::~RecoveryJob() {
+ DESTRUCTOR_GUARD(
+ if( !_mmfs.empty() )
+ close();
+ )
+ }
+
+ void RecoveryJob::close() {
+ scoped_lock lk(_mx);
+ _close();
+ }
+
+ void RecoveryJob::_close() {
+ MongoFile::flushAll(true);
+ _mmfs.clear();
+ }
+
+ void RecoveryJob::write(const ParsedJournalEntry& entry) {
+ const string fn = fileName(entry.dbName, entry.e->getFileNo());
+ MongoFile* file;
+ {
+ MongoFileFinder finder; // must release lock before creating new MongoMMF
+ file = finder.findByPath(fn);
+ }
+
+ MongoMMF* mmf;
+ if (file) {
+ assert(file->isMongoMMF());
+ mmf = (MongoMMF*)file;
+ }
+ else {
+ assert(_recovering);
+ boost::shared_ptr<MongoMMF> sp (new MongoMMF);
+ assert(sp->open(fn, false));
+ _mmfs.push_back(sp);
+ mmf = sp.get();
+ }
+
+ if ((entry.e->ofs + entry.e->len) <= mmf->length()) {
+ void* dest = (char*)mmf->view_write() + entry.e->ofs;
+ memcpy(dest, entry.e->srcData(), entry.e->len);
+ }
+ else {
+ massert(13622, "Trying to write past end of file in WRITETODATAFILES", _recovering);
+ }
+ }
+
+ void RecoveryJob::applyEntry(const ParsedJournalEntry& entry, bool apply, bool dump) {
+ if( entry.e ) {
+ if( dump ) {
+ stringstream ss;
+ ss << " BASICWRITE " << setw(20) << entry.dbName << '.';
+ if( entry.e->isNsSuffix() )
+ ss << "ns";
+ else
+ ss << setw(2) << entry.e->getFileNo();
+ ss << ' ' << setw(6) << entry.e->len << ' ' << /*hex << setw(8) << (size_t) fqe.srcData << dec <<*/
+ " " << hexdump(entry.e->srcData(), entry.e->len);
+ log() << ss.str() << endl;
+ }
+ if( apply ) {
+ write(entry);
+ }
+ }
+ else if(entry.op) {
+ // a DurOp subclass operation
+ if( dump ) {
+ log() << " OP " << entry.op->toString() << endl;
+ }
+ if( apply ) {
+ if( entry.op->needFilesClosed() ) {
+ _close(); // locked in processSection
+ }
+ entry.op->replay();
+ }
+ }
+ }
+
+ void RecoveryJob::applyEntries(const vector<ParsedJournalEntry> &entries) {
+ bool apply = (cmdLine.durOptions & CmdLine::DurScanOnly) == 0;
+ bool dump = cmdLine.durOptions & CmdLine::DurDumpJournal;
+ if( dump )
+ log() << "BEGIN section" << endl;
+
+ for( vector<ParsedJournalEntry>::const_iterator i = entries.begin(); i != entries.end(); ++i ) {
+ applyEntry(*i, apply, dump);
+ }
+
+ if( dump )
+ log() << "END section" << endl;
+ }
+
+ void RecoveryJob::processSection(const void *p, unsigned len) {
+ scoped_lock lk(_mx);
+
+ vector<ParsedJournalEntry> entries;
+ JournalSectionIterator i(p, len, _recovering);
+
+ //DEV log() << "recovery processSection seq:" << i.seqNumber() << endl;
+ if( _recovering && _lastDataSyncedFromLastRun > i.seqNumber() + ExtraKeepTimeMs ) {
+ if( i.seqNumber() != _lastSeqMentionedInConsoleLog ) {
+ log() << "recover skipping application of section seq:" << i.seqNumber() << " < lsn:" << _lastDataSyncedFromLastRun << endl;
+ _lastSeqMentionedInConsoleLog = i.seqNumber();
+ }
+ return;
+ }
+
+ // first read all entries to make sure this section is valid
+ ParsedJournalEntry e;
+ while( i.next(e) ) {
+ entries.push_back(e);
+ }
+
+ // got all the entries for one group commit. apply them:
+ applyEntries(entries);
+ }
+
+ /** apply a specific journal file, that is already mmap'd
+ @param p start of the memory mapped file
+ @return true if this is detected to be the last file (ends abruptly)
+ */
+ bool RecoveryJob::processFileBuffer(const void *p, unsigned len) {
+ try {
+ unsigned long long fileId;
+ BufReader br(p,len);
+
+ {
+ // read file header
+ JHeader h;
+ br.read(h);
+ if( !h.versionOk() ) {
+ log() << "journal file version number mismatch. recover with old version of mongod, terminate cleanly, then upgrade." << endl;
+ uasserted(13536, str::stream() << "journal version number mismatch " << h._version);
+ }
+ uassert(13537, "journal header invalid", h.valid());
+ fileId = h.fileId;
+ if(cmdLine.durOptions & CmdLine::DurDumpJournal) {
+ log() << "JHeader::fileId=" << fileId << endl;
+ }
+ }
+
+ // read sections
+ while ( !br.atEof() ) {
+ JSectHeader h;
+ br.peek(h);
+ if( h.fileId != fileId ) {
+ if( debug || (cmdLine.durOptions & CmdLine::DurDumpJournal) ) {
+ log() << "Ending processFileBuffer at differing fileId want:" << fileId << " got:" << h.fileId << endl;
+ log() << " sect len:" << h.len << " seqnum:" << h.seqNumber << endl;
+ }
+ return true;
+ }
+ processSection(br.skip(h.len), h.len);
+
+ // ctrl c check
+ killCurrentOp.checkForInterrupt(false);
+ }
+ }
+ catch( BufReader::eof& ) {
+ if( cmdLine.durOptions & CmdLine::DurDumpJournal )
+ log() << "ABRUPT END" << endl;
+ return true; // abrupt end
+ }
+
+ return false; // non-abrupt end
+ }
+
+ /** apply a specific journal file */
+ bool RecoveryJob::processFile(path journalfile) {
+ log() << "recover " << journalfile.string() << endl;
+ MemoryMappedFile f;
+ void *p = f.mapWithOptions(journalfile.string().c_str(), MongoFile::READONLY | MongoFile::SEQUENTIAL);
+ massert(13544, str::stream() << "recover error couldn't open " << journalfile.string(), p);
+ return processFileBuffer(p, (unsigned) f.length());
+ }
+
+ /** @param files all the j._0 style files we need to apply for recovery */
+ void RecoveryJob::go(vector<path>& files) {
+ log() << "recover begin" << endl;
+ _recovering = true;
+
+ // load the last sequence number synced to the datafiles on disk before the last crash
+ _lastDataSyncedFromLastRun = journalReadLSN();
+ log() << "recover lsn: " << _lastDataSyncedFromLastRun << endl;
+
+ for( unsigned i = 0; i != files.size(); ++i ) {
+ /*bool abruptEnd = */processFile(files[i]);
+ /*if( abruptEnd && i+1 < files.size() ) {
+ log() << "recover error: abrupt end to file " << files[i].string() << ", yet it isn't the last journal file" << endl;
+ close();
+ uasserted(13535, "recover abrupt journal file end");
+ }*/
+ }
+
+ close();
+
+ if( cmdLine.durOptions & CmdLine::DurScanOnly ) {
+ uasserted(13545, str::stream() << "--durOptions " << (int) CmdLine::DurScanOnly << " (scan only) specified");
+ }
+
+ log() << "recover cleaning up" << endl;
+ removeJournalFiles();
+ log() << "recover done" << endl;
+ okToCleanUp = true;
+ _recovering = false;
+ }
+
+ void _recover() {
+ assert( cmdLine.dur );
+
+ filesystem::path p = getJournalDir();
+ if( !exists(p) ) {
+ log() << "directory " << p.string() << " does not exist, there will be no recovery startup step" << endl;
+ okToCleanUp = true;
+ return;
+ }
+
+ vector<path> journalFiles;
+ getFiles(p, journalFiles);
+
+ if( journalFiles.empty() ) {
+ log() << "recover : no journal files present, no recovery needed" << endl;
+ okToCleanUp = true;
+ return;
+ }
+
+ RecoveryJob::get().go(journalFiles);
+ }
+
+ extern mutex groupCommitMutex;
+
+ /** recover from a crash
+ called during startup
+ throws on error
+ */
+ void recover() {
+ // we use a lock so that exitCleanly will wait for us
+ // to finish (or at least to notice what is up and stop)
+ writelock lk;
+
+ // this is so the mutexdebugger doesn't get confused. we are actually single threaded
+ // at this point in the program so it wouldn't have been a true problem (I think)
+ scoped_lock lk2(groupCommitMutex);
+
+ _recover(); // throws on interruption
+ }
+
+ struct BufReaderY { int a,b; };
+ class BufReaderUnitTest : public UnitTest {
+ public:
+ void run() {
+ BufReader r((void*) "abcdabcdabcd", 12);
+ char x;
+ BufReaderY y;
+ r.read(x); //cout << x; // a
+ assert( x == 'a' );
+ r.read(y);
+ r.read(x);
+ assert( x == 'b' );
+ }
+ } brunittest;
+
+ // can't free at termination because order of destruction of global vars is arbitrary
+ RecoveryJob &RecoveryJob::_instance = *(new RecoveryJob());
+
+ } // namespace dur
+
+} // namespace mongo
+
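One point worth restating from JournalSectionIterator::next() above: the leading 32-bit word of every entry is overloaded, and the opcode range is what lets the iterator tell the cases apart. A compressed restatement (the exact constant lives in dur_journalformat.h and is assumed here, not shown in this hunk):

    // illustrative classification of the first word read in next()
    // lenOrOpCode >  JEntry::OpCode_Min    -> Footer / FileCreated / DropDb / DbContext opcode
    // 0 < lenOrOpCode < JEntry::OpCode_Min -> it is JEntry::len: rewind 4 bytes, read the JEntry,
    //                                         then skip len bytes of payload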
diff --git a/db/dur_recover.h b/db/dur_recover.h
new file mode 100644
index 0000000..1022fdc
--- /dev/null
+++ b/db/dur_recover.h
@@ -0,0 +1,45 @@
+// @file dur_recover.h crash recovery via the journal
+
+#pragma once
+
+#include "../util/concurrency/mutex.h"
+#include "../util/file.h"
+
+namespace mongo {
+ class MongoMMF;
+
+ namespace dur {
+ struct ParsedJournalEntry;
+
+ /** call go() to execute a recovery from existing journal files.
+ */
+ class RecoveryJob : boost::noncopyable {
+ public:
+ RecoveryJob() :_lastDataSyncedFromLastRun(0), _mx("recovery"), _recovering(false) { _lastSeqMentionedInConsoleLog = 1; }
+ void go(vector<path>& files);
+ ~RecoveryJob();
+ void processSection(const void *, unsigned len);
+ void close(); // locks and calls _close()
+
+ static RecoveryJob & get() { return _instance; }
+ private:
+ void write(const ParsedJournalEntry& entry); // actually writes to the file
+ void applyEntry(const ParsedJournalEntry& entry, bool apply, bool dump);
+ void applyEntries(const vector<ParsedJournalEntry> &entries);
+ bool processFileBuffer(const void *, unsigned len);
+ bool processFile(path journalfile);
+ void _close(); // doesn't lock
+
+ list<boost::shared_ptr<MongoMMF> > _mmfs;
+
+ unsigned long long _lastDataSyncedFromLastRun;
+ unsigned long long _lastSeqMentionedInConsoleLog;
+
+ mongo::mutex _mx; // protects _mmfs
+
+ bool _recovering; // are we in recovery or WRITETODATAFILES
+
+ static RecoveryJob &_instance;
+ };
+ }
+}
diff --git a/db/dur_stats.h b/db/dur_stats.h
new file mode 100644
index 0000000..5f5a188
--- /dev/null
+++ b/db/dur_stats.h
@@ -0,0 +1,46 @@
+// @file dur_stats.h
+
+namespace mongo {
+ namespace dur {
+
+ /** journaling stats. the model here is that the commit thread is the only writer, and that reads are
+ uncommon (from a serverStatus command and such). Thus, there should not be multicore chatter overhead.
+ */
+ struct Stats {
+ Stats();
+ void rotate();
+ BSONObj asObj();
+ unsigned _intervalMicros;
+ struct S {
+ BSONObj _asObj();
+ void reset();
+
+ unsigned _commits;
+ unsigned _earlyCommits; // count of early commits from commitIfNeeded() or from getDur().commitNow()
+ unsigned long long _journaledBytes;
+ unsigned long long _writeToDataFilesBytes;
+
+ unsigned long long _prepLogBufferMicros;
+ unsigned long long _writeToJournalMicros;
+ unsigned long long _writeToDataFilesMicros;
+ unsigned long long _remapPrivateViewMicros;
+
+ // it is undesirable to be in the write lock for the group commit (it can be done in a read lock), so it is
+ // good to have visibility when this happens. it can happen for a couple of reasons:
+ // - read lock starvation
+ // - file being closed
+ // - data being written faster than the normal group commit interval
+ unsigned _commitsInWriteLock;
+
+ unsigned _dtMillis;
+ };
+ S *curr;
+ private:
+ S _a,_b;
+ unsigned long long _lastRotate;
+ S* other();
+ };
+ extern Stats stats;
+
+ }
+}
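The curr/_a/_b arrangement above is a conventional double buffer: the commit thread writes only through curr, and rotate() flips it so a reader (e.g. serverStatus) can report the previously completed interval without racing the writer. A minimal standalone sketch of the pattern (simplified; not the real rotate()):

    struct IntervalStats { unsigned commits; void reset() { commits = 0; } };
    IntervalStats bufA, bufB;
    IntervalStats* curr = &bufA;              // the commit thread writes only through curr
    void rotateSketch() {
        IntervalStats* completed = curr;      // finished interval; safe for a reader to snapshot
        curr = (curr == &bufA) ? &bufB : &bufA;
        curr->reset();                        // start accumulating the new interval
        (void)completed;
    }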
diff --git a/db/dur_writetodatafiles.cpp b/db/dur_writetodatafiles.cpp
new file mode 100644
index 0000000..50797ea
--- /dev/null
+++ b/db/dur_writetodatafiles.cpp
@@ -0,0 +1,99 @@
+// @file dur_writetodatafiles.cpp apply the writes back to the non-private MMF after they are for certain in redo log
+
+/**
+* Copyright (C) 2009 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+#include "dur_commitjob.h"
+#include "dur_stats.h"
+#include "dur_recover.h"
+#include "../util/timer.h"
+
+namespace mongo {
+ namespace dur {
+
+ void debugValidateAllMapsMatch();
+
+ /** apply the writes back to the non-private MMF after they are for certain in redo log
+
+ (1) todo we don't need to write back everything every group commit. we MUST write back
+ that which is going to be remapped on its private view - but that might not be all
+ views.
+
+ (2) todo should we do this using N threads? would be quite easy
+ see Hackenberg paper table 5 and 6. 2 threads might be a good balance.
+
+ (3) with enough work, we could do this outside the read lock. it's a bit tricky though.
+ - we couldn't do it from the private views then as they may be changing. would have to then
+ be from the journal alignedbuffer.
+ - we need to be careful the file isn't unmapped on us -- perhaps a mutex or something
+ with MongoMMF on closes or something to coordinate that.
+
+ locking: in read lock when called
+
+ @see https://docs.google.com/drawings/edit?id=1TklsmZzm7ohIZkwgeK6rMvsdaR13KjtJYMsfLr175Zc&hl=en
+ */
+
+ void WRITETODATAFILES_Impl1() {
+ RecoveryJob::get().processSection(commitJob._ab.buf(), commitJob._ab.len());
+ }
+
+ // the old implementation
+ void WRITETODATAFILES_Impl2() {
+ /* copy each write intent to its mapped write view; the set is walked in order here. */
+ for( set<WriteIntent>::const_iterator it(commitJob.writes().begin()), end(commitJob.writes().end()); it != end; ++it ) {
+ const WriteIntent& intent = *it;
+ stats.curr->_writeToDataFilesBytes += intent.length();
+ dassert(intent.w_ptr);
+ memcpy(intent.w_ptr, intent.start(), intent.length());
+ }
+ }
+
+#if defined(_EXPERIMENTAL)
+ void WRITETODATAFILES_Impl3() {
+ /* copy each write intent to its mapped write view; the set is walked in order here. */
+ for( set<WriteIntent>::const_iterator it(commitJob.writes().begin()), end(commitJob.writes().end()); it != end; ++it ) {
+ const WriteIntent& intent = *it;
+ stats.curr->_writeToDataFilesBytes += intent.length();
+ dassert(intent.w_ptr);
+ memcpy(intent.w_ptr,
+ commitJob._ab.atOfs(intent.ofsInJournalBuffer),
+ intent.length());
+ }
+ }
+#endif
+
+ void WRITETODATAFILES() {
+ dbMutex.assertAtLeastReadLocked();
+
+ MongoFile::markAllWritable(); // for _DEBUG. normally we don't write in a read lock
+
+ Timer t;
+#if defined(_EXPERIMENTAL)
+ WRITETODATAFILES_Impl3();
+#else
+ WRITETODATAFILES_Impl1();
+#endif
+ stats.curr->_writeToDataFilesMicros += t.micros();
+
+ if (!dbMutex.isWriteLocked())
+ MongoFile::unmarkAllWritable();
+
+ debugValidateAllMapsMatch();
+ }
+
+ }
+}
diff --git a/db/durop.cpp b/db/durop.cpp
new file mode 100644
index 0000000..344b21e
--- /dev/null
+++ b/db/durop.cpp
@@ -0,0 +1,160 @@
+// @file durop.cpp
+
+/**
+* Copyright (C) 2010 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+#include "concurrency.h"
+#include "../util/alignedbuilder.h"
+#include "../util/mongoutils/str.h"
+#include "../util/file.h"
+#include "mongommf.h"
+#include "durop.h"
+#include "../util/file_allocator.h"
+
+using namespace mongoutils;
+
+namespace mongo {
+
+ extern string dbpath; // --dbpath parm
+
+ void _deleteDataFiles(const char *);
+
+ namespace dur {
+
+ /** read a durop from journal file referenced by br.
+ @param opcode the opcode which has already been written from the bufreader
+ */
+ shared_ptr<DurOp> DurOp::read(unsigned opcode, BufReader& br) {
+ shared_ptr<DurOp> op;
+ switch( opcode ) {
+ case JEntry::OpCode_FileCreated:
+ op = shared_ptr<DurOp>( new FileCreatedOp(br) );
+ break;
+ case JEntry::OpCode_DropDb:
+ op = shared_ptr<DurOp>( new DropDbOp(br) );
+ break;
+ default:
+ massert(13546, (str::stream() << "dur recover unrecognized opcode in journal " << opcode), false);
+ }
+ return op;
+ }
+
+ void DurOp::serialize(AlignedBuilder& ab) {
+ ab.appendNum(_opcode);
+ _serialize(ab);
+ }
+
+ DropDbOp::DropDbOp(BufReader& log) : DurOp(JEntry::OpCode_DropDb) {
+ unsigned long long reserved;
+ log.read(reserved);
+ log.read(reserved);
+ log.readStr(_db);
+ string reservedStr;
+ log.readStr(reservedStr);
+ }
+
+ void DropDbOp::_serialize(AlignedBuilder& ab) {
+ ab.appendNum((unsigned long long) 0); // reserved for future use
+ ab.appendNum((unsigned long long) 0); // reserved for future use
+ ab.appendStr(_db);
+ ab.appendStr(""); // reserved
+ }
+
+ /** throws */
+ void DropDbOp::replay() {
+ log() << "recover replay drop db " << _db << endl;
+ _deleteDataFiles(_db.c_str());
+ }
+
+ FileCreatedOp::FileCreatedOp(string f, unsigned long long l) :
+ DurOp(JEntry::OpCode_FileCreated) {
+ _p = RelativePath::fromFullPath(f);
+ _len = l;
+ }
+
+ FileCreatedOp::FileCreatedOp(BufReader& log) : DurOp(JEntry::OpCode_FileCreated) {
+ unsigned long long reserved;
+ log.read(reserved);
+ log.read(reserved);
+ log.read(_len); // size of file, not length of name
+ string s;
+ log.readStr(s);
+ _p._p = s;
+ }
+
+ void FileCreatedOp::_serialize(AlignedBuilder& ab) {
+ ab.appendNum((unsigned long long) 0); // reserved for future use
+ ab.appendNum((unsigned long long) 0); // reserved for future use
+ ab.appendNum(_len);
+ ab.appendStr(_p.toString());
+ }
+
+ string FileCreatedOp::toString() {
+ return str::stream() << "FileCreatedOp " << _p.toString() << ' ' << _len/1024.0/1024.0 << "MB";
+ }
+
+ // if an operation deletes or creates a file (or moves etc.), it may need files closed.
+ bool FileCreatedOp::needFilesClosed() {
+ return exists( _p.asFullPath() );
+ }
+
+ void FileCreatedOp::replay() {
+ // i believe the code assumes new files are filled with zeros, so we have to recreate the file (or at
+ // least rewrite it) even if it is already the right length. perhaps one day we should change that,
+ // although it is easier to avoid defects if we assume the file is zeroed.
+ string full = _p.asFullPath();
+ if( exists(full) ) {
+ try {
+ remove(full);
+ }
+ catch(std::exception& e) {
+ log(1) << "recover info FileCreateOp::replay unlink " << e.what() << endl;
+ }
+ }
+
+ log() << "recover create file " << full << ' ' << _len/1024.0/1024.0 << "MB" << endl;
+ if( MemoryMappedFile::exists(full) ) {
+ // first delete if exists.
+ try {
+ remove(full);
+ }
+ catch(...) {
+ log() << "warning could not delete file " << full << endl;
+ }
+ }
+ ensureParentDirCreated(full);
+ File f;
+ f.open(full.c_str());
+ massert(13547, str::stream() << "recover couldn't create file " << full, f.is_open());
+ unsigned long long left = _len;
+ const unsigned blksz = 64 * 1024;
+ scoped_array<char> v( new char[blksz] );
+ memset( v.get(), 0, blksz );
+ fileofs ofs = 0;
+ while( left ) {
+ unsigned long long w = left < blksz ? left : blksz;
+ f.write(ofs, v.get(), (unsigned) w);
+ left -= w;
+ ofs += w;
+ }
+ f.fsync();
+ massert(13628, str::stream() << "recover failure writing file " << full, !f.bad() );
+ }
+
+ }
+
+}
diff --git a/db/durop.h b/db/durop.h
new file mode 100644
index 0000000..c4574c2
--- /dev/null
+++ b/db/durop.h
@@ -0,0 +1,111 @@
+// @file durop.h class DurOp and descendants
+
+/**
+* Copyright (C) 2010 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+#include "dur_journalformat.h"
+#include "../util/bufreader.h"
+#include "../util/paths.h"
+
+namespace mongo {
+
+ class AlignedBuilder;
+
+ namespace dur {
+
+ const unsigned Alignment = 8192;
+
+ /** DurOp - Operations we journal that aren't just basic writes.
+ *
+ * Basic writes are logged as JEntry's, and indicated in ram temporarily as struct dur::WriteIntent.
+ * We don't make WriteIntent inherit from DurOp to keep it as lean as possible as there will be millions of
+ * them (we don't want a vtable for example there).
+ *
+ * For each op we want to journal, we define a subclass.
+ */
+ class DurOp { /* copyable */
+ public:
+ // @param opcode a sentinel value near max unsigned which uniquely identifies the operation.
+ // @see dur::JEntry
+ DurOp(unsigned opcode) : _opcode(opcode) { }
+
+ virtual ~DurOp() { }
+
+ /** serialize the op out to a builder which will then be written (presumably) to the journal */
+ void serialize(AlignedBuilder& ab);
+
+ /** read a durop from journal file referenced by br.
+ @param opcode the opcode which has already been written from the bufreader
+ */
+ static shared_ptr<DurOp> read(unsigned opcode, BufReader& br);
+
+ /** replay the operation (during recovery)
+ throws
+
+ For now, these are not replayed during the normal WRITETODATAFILES phase, since these
+ operations are handled in other parts of the code. At some point this may change.
+ */
+ virtual void replay() = 0;
+
+ virtual string toString() = 0;
+
+ /** if the op requires all files to be closed before doing its work, returns true. */
+ virtual bool needFilesClosed() { return false; }
+
+ protected:
+ /** DurOp will have already written the opcode for you */
+ virtual void _serialize(AlignedBuilder& ab) = 0;
+
+ private:
+ const unsigned _opcode;
+ };
+
+ /** indicates creation of a new file */
+ class FileCreatedOp : public DurOp {
+ public:
+ FileCreatedOp(BufReader& log);
+ /** @param f filename to create, with path */
+ FileCreatedOp(string f, unsigned long long l);
+ virtual void replay();
+ virtual string toString();
+ virtual bool needFilesClosed();
+ protected:
+ virtual void _serialize(AlignedBuilder& ab);
+ private:
+ RelativePath _p;
+ unsigned long long _len; // size of file, not length of name
+ };
+
+ /** record drop of a database */
+ class DropDbOp : public DurOp {
+ public:
+ DropDbOp(BufReader& log);
+ DropDbOp(string db) :
+ DurOp(JEntry::OpCode_DropDb), _db(db) { }
+ virtual void replay();
+ virtual string toString() { return string("DropDbOp ") + _db; }
+ virtual bool needFilesClosed() { return true; }
+ protected:
+ virtual void _serialize(AlignedBuilder& ab);
+ private:
+ string _db;
+ };
+
+ }
+
+}
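Reading DurOp::serialize()/read() in durop.cpp together with this header, the on-journal layout of a DropDbOp works out to the following; this is a reconstruction from the code above, so treat it as descriptive rather than normative:

    // opcode      unsigned             JEntry::OpCode_DropDb   (written by DurOp::serialize)
    // reserved    unsigned long long   0
    // reserved    unsigned long long   0
    // db name     string               via appendStr() / readStr()
    // reserved    string               ""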
diff --git a/db/extsort.cpp b/db/extsort.cpp
index 68e6b52..2e6d8d8 100644
--- a/db/extsort.cpp
+++ b/db/extsort.cpp
@@ -19,160 +19,160 @@
#include "pch.h"
#include "extsort.h"
-#include "namespace.h"
+#include "namespace-inl.h"
#include "../util/file.h"
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
namespace mongo {
-
+
BSONObj BSONObjExternalSorter::extSortOrder;
unsigned long long BSONObjExternalSorter::_compares = 0;
-
+
BSONObjExternalSorter::BSONObjExternalSorter( const BSONObj & order , long maxFileSize )
- : _order( order.getOwned() ) , _maxFilesize( maxFileSize ) ,
- _arraySize(1000000), _cur(0), _curSizeSoFar(0), _sorted(0){
-
+ : _order( order.getOwned() ) , _maxFilesize( maxFileSize ) ,
+ _arraySize(1000000), _cur(0), _curSizeSoFar(0), _sorted(0) {
+
stringstream rootpath;
rootpath << dbpath;
if ( dbpath[dbpath.size()-1] != '/' )
rootpath << "/";
rootpath << "_tmp/esort." << time(0) << "." << rand() << "/";
_root = rootpath.str();
-
+
log(1) << "external sort root: " << _root.string() << endl;
create_directories( _root );
_compares = 0;
}
-
- BSONObjExternalSorter::~BSONObjExternalSorter(){
- if ( _cur ){
+
+ BSONObjExternalSorter::~BSONObjExternalSorter() {
+ if ( _cur ) {
delete _cur;
_cur = 0;
}
-
+
unsigned long removed = remove_all( _root );
wassert( removed == 1 + _files.size() );
}
- void BSONObjExternalSorter::_sortInMem(){
+ void BSONObjExternalSorter::_sortInMem() {
// extSortComp needs to use globals
// qsort_r only seems available on bsd, which is what i really want to use
dblock l;
extSortOrder = _order;
_cur->sort( BSONObjExternalSorter::extSortComp );
}
-
- void BSONObjExternalSorter::sort(){
+
+ void BSONObjExternalSorter::sort() {
uassert( 10048 , "already sorted" , ! _sorted );
-
+
_sorted = true;
- if ( _cur && _files.size() == 0 ){
+ if ( _cur && _files.size() == 0 ) {
_sortInMem();
log(1) << "\t\t not using file. size:" << _curSizeSoFar << " _compares:" << _compares << endl;
return;
}
-
- if ( _cur ){
+
+ if ( _cur ) {
finishMap();
}
-
- if ( _cur ){
+
+ if ( _cur ) {
delete _cur;
_cur = 0;
}
-
+
if ( _files.size() == 0 )
return;
-
+
}
- void BSONObjExternalSorter::add( const BSONObj& o , const DiskLoc & loc ){
+ void BSONObjExternalSorter::add( const BSONObj& o , const DiskLoc & loc ) {
uassert( 10049 , "sorted already" , ! _sorted );
-
- if ( ! _cur ){
+
+ if ( ! _cur ) {
_cur = new InMemory( _arraySize );
}
-
+
Data& d = _cur->getNext();
d.first = o.getOwned();
d.second = loc;
-
+
long size = o.objsize();
_curSizeSoFar += size + sizeof( DiskLoc ) + sizeof( BSONObj );
-
- if ( _cur->hasSpace() == false || _curSizeSoFar > _maxFilesize ){
+
+ if ( _cur->hasSpace() == false || _curSizeSoFar > _maxFilesize ) {
finishMap();
log(1) << "finishing map" << endl;
}
}
-
- void BSONObjExternalSorter::finishMap(){
+
+ void BSONObjExternalSorter::finishMap() {
uassert( 10050 , "bad" , _cur );
-
+
_curSizeSoFar = 0;
if ( _cur->size() == 0 )
return;
-
+
_sortInMem();
-
+
stringstream ss;
ss << _root.string() << "/file." << _files.size();
string file = ss.str();
-
+
ofstream out;
out.open( file.c_str() , ios_base::out | ios_base::binary );
assertStreamGood( 10051 , (string)"couldn't open file: " + file , out );
-
+
int num = 0;
- for ( InMemory::iterator i=_cur->begin(); i != _cur->end(); ++i ){
+ for ( InMemory::iterator i=_cur->begin(); i != _cur->end(); ++i ) {
Data p = *i;
out.write( p.first.objdata() , p.first.objsize() );
out.write( (char*)(&p.second) , sizeof( DiskLoc ) );
num++;
}
-
+
_cur->clear();
-
+
_files.push_back( file );
out.close();
log(2) << "Added file: " << file << " with " << num << "objects for external sort" << endl;
}
-
+
// ---------------------------------
BSONObjExternalSorter::Iterator::Iterator( BSONObjExternalSorter * sorter ) :
- _cmp( sorter->_order ) , _in( 0 ){
-
- for ( list<string>::iterator i=sorter->_files.begin(); i!=sorter->_files.end(); i++ ){
+ _cmp( sorter->_order ) , _in( 0 ) {
+
+ for ( list<string>::iterator i=sorter->_files.begin(); i!=sorter->_files.end(); i++ ) {
_files.push_back( new FileIterator( *i ) );
_stash.push_back( pair<Data,bool>( Data( BSONObj() , DiskLoc() ) , false ) );
}
-
- if ( _files.size() == 0 && sorter->_cur ){
+
+ if ( _files.size() == 0 && sorter->_cur ) {
_in = sorter->_cur;
_it = sorter->_cur->begin();
}
-
+
}
-
- BSONObjExternalSorter::Iterator::~Iterator(){
+
+ BSONObjExternalSorter::Iterator::~Iterator() {
for ( vector<FileIterator*>::iterator i=_files.begin(); i!=_files.end(); i++ )
delete *i;
_files.clear();
}
-
- bool BSONObjExternalSorter::Iterator::more(){
+
+ bool BSONObjExternalSorter::Iterator::more() {
if ( _in )
return _it != _in->end();
-
+
for ( vector<FileIterator*>::iterator i=_files.begin(); i!=_files.end(); i++ )
if ( (*i)->more() )
return true;
@@ -181,34 +181,34 @@ namespace mongo {
return true;
return false;
}
-
- BSONObjExternalSorter::Data BSONObjExternalSorter::Iterator::next(){
-
- if ( _in ){
+
+ BSONObjExternalSorter::Data BSONObjExternalSorter::Iterator::next() {
+
+ if ( _in ) {
Data& d = *_it;
++_it;
return d;
}
-
+
Data best;
int slot = -1;
-
- for ( unsigned i=0; i<_stash.size(); i++ ){
- if ( ! _stash[i].second ){
+ for ( unsigned i=0; i<_stash.size(); i++ ) {
+
+ if ( ! _stash[i].second ) {
if ( _files[i]->more() )
_stash[i] = pair<Data,bool>( _files[i]->next() , true );
else
continue;
}
-
- if ( slot == -1 || _cmp( best , _stash[i].first ) == 0 ){
+
+ if ( slot == -1 || _cmp( best , _stash[i].first ) == 0 ) {
best = _stash[i].first;
slot = i;
}
-
+
}
-
+
assert( slot >= 0 );
_stash[slot].second = false;
@@ -216,27 +216,26 @@ namespace mongo {
}
// -----------------------------------
-
- BSONObjExternalSorter::FileIterator::FileIterator( string file ){
- long length;
+
+ BSONObjExternalSorter::FileIterator::FileIterator( string file ) {
+ unsigned long long length;
_buf = (char*)_file.map( file.c_str() , length , MemoryMappedFile::SEQUENTIAL );
massert( 10308 , "mmap failed" , _buf );
- assert( (unsigned long long)length == (unsigned long long)file_size( file ) );
+ assert( length == (unsigned long long) file_size( file ) );
_end = _buf + length;
}
- BSONObjExternalSorter::FileIterator::~FileIterator(){
- }
-
- bool BSONObjExternalSorter::FileIterator::more(){
+ BSONObjExternalSorter::FileIterator::~FileIterator() {}
+
+ bool BSONObjExternalSorter::FileIterator::more() {
return _buf < _end;
}
-
- BSONObjExternalSorter::Data BSONObjExternalSorter::FileIterator::next(){
+
+ BSONObjExternalSorter::Data BSONObjExternalSorter::FileIterator::next() {
BSONObj o( _buf );
_buf += o.objsize();
DiskLoc * l = (DiskLoc*)_buf;
_buf += 8;
return Data( o , *l );
}
-
+
}
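For orientation, Iterator::next() above is the selection step of a k-way merge over the spilled files: each file contributes at most one buffered element to _stash, the smallest one is returned, and its slot is refilled on a later call. A simplified standalone sketch of just that selection (illustrative; ints instead of BSONObj/DiskLoc, and the refill from the file is omitted):

    #include <vector>
    #include <utility>
    // pick the smallest buffered element across k cursors and mark it consumed;
    // the caller guarantees at least one slot is filled (the real code asserts slot >= 0)
    int mergeSelect(std::vector< std::pair<int,bool> >& stash) {    // pair = (value, filled?)
        int best = 0, slot = -1;
        for (unsigned i = 0; i < stash.size(); i++) {
            if (!stash[i].second) continue;                 // empty slot; the real code refills it here
            if (slot == -1 || stash[i].first <= best) {     // plays the role of _cmp(best, stash[i].first) == 0
                best = stash[i].first;
                slot = (int)i;
            }
        }
        stash[slot].second = false;                         // consumed; will be refilled before the next pick
        return best;
    }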
diff --git a/db/extsort.h b/db/extsort.h
index fa0eca4..c0791db 100644
--- a/db/extsort.h
+++ b/db/extsort.h
@@ -20,8 +20,8 @@
#include "../pch.h"
#include "jsobj.h"
-#include "namespace.h"
-#include "curop.h"
+#include "namespace-inl.h"
+#include "curop-inl.h"
#include "../util/array.h"
namespace mongo {
@@ -32,13 +32,13 @@ namespace mongo {
*/
class BSONObjExternalSorter : boost::noncopyable {
public:
-
+
typedef pair<BSONObj,DiskLoc> Data;
private:
static BSONObj extSortOrder;
- static int extSortComp( const void *lv, const void *rv ){
+ static int extSortComp( const void *lv, const void *rv ) {
RARELY killCurrentOp.checkForInterrupt();
_compares++;
Data * l = (Data*)lv;
@@ -54,7 +54,7 @@ namespace mongo {
FileIterator( string file );
~FileIterator();
bool more();
- Data next();
+ Data next();
private:
MemoryMappedFile _file;
char * _buf;
@@ -63,7 +63,7 @@ namespace mongo {
class MyCmp {
public:
- MyCmp( const BSONObj & order = BSONObj() ) : _order( order ){}
+ MyCmp( const BSONObj & order = BSONObj() ) : _order( order ) {}
bool operator()( const Data &l, const Data &r ) const {
RARELY killCurrentOp.checkForInterrupt();
_compares++;
@@ -78,50 +78,50 @@ namespace mongo {
};
public:
-
+
typedef FastArray<Data> InMemory;
class Iterator : boost::noncopyable {
public:
-
+
Iterator( BSONObjExternalSorter * sorter );
~Iterator();
bool more();
Data next();
-
+
private:
MyCmp _cmp;
vector<FileIterator*> _files;
vector< pair<Data,bool> > _stash;
-
+
InMemory * _in;
InMemory::iterator _it;
-
+
};
-
+
BSONObjExternalSorter( const BSONObj & order = BSONObj() , long maxFileSize = 1024 * 1024 * 100 );
~BSONObjExternalSorter();
-
+
void add( const BSONObj& o , const DiskLoc & loc );
- void add( const BSONObj& o , int a , int b ){
+ void add( const BSONObj& o , int a , int b ) {
add( o , DiskLoc( a , b ) );
}
/* call after adding values, and before fetching the iterator */
void sort();
-
- auto_ptr<Iterator> iterator(){
+
+ auto_ptr<Iterator> iterator() {
uassert( 10052 , "not sorted" , _sorted );
return auto_ptr<Iterator>( new Iterator( this ) );
}
-
- int numFiles(){
+
+ int numFiles() {
return _files.size();
}
-
- long getCurSizeSoFar(){ return _curSizeSoFar; }
- void hintNumObjects( long long numObjects ){
+ long getCurSizeSoFar() { return _curSizeSoFar; }
+
+ void hintNumObjects( long long numObjects ) {
if ( numObjects < _arraySize )
_arraySize = (int)(numObjects + 100);
}
@@ -129,18 +129,18 @@ namespace mongo {
private:
void _sortInMem();
-
+
void sort( string file );
void finishMap();
-
+
BSONObj _order;
long _maxFilesize;
path _root;
-
+
int _arraySize;
InMemory * _cur;
long _curSizeSoFar;
-
+
list<string> _files;
bool _sorted;
diff --git a/db/filever.h b/db/filever.h
index 4aa18d4..e89a824 100644
--- a/db/filever.h
+++ b/db/filever.h
@@ -20,11 +20,11 @@
namespace mongo {
-inline void checkDataFileVersion(NamespaceDetails& d) {
-}
+ inline void checkDataFileVersion(NamespaceDetails& d) {
+ }
-inline void checkIndexFileVersion(NamespaceDetails& d) {
-}
+ inline void checkIndexFileVersion(NamespaceDetails& d) {
+ }
}
diff --git a/db/geo/2d.cpp b/db/geo/2d.cpp
index 60818fc..934ee80 100644
--- a/db/geo/2d.cpp
+++ b/db/geo/2d.cpp
@@ -17,14 +17,14 @@
*/
#include "pch.h"
-#include "../namespace.h"
+#include "../namespace-inl.h"
#include "../jsobj.h"
#include "../index.h"
#include "../../util/unittest.h"
#include "../commands.h"
#include "../pdfile.h"
#include "../btree.h"
-#include "../curop.h"
+#include "../curop-inl.h"
#include "../matcher.h"
#include "core.h"
@@ -33,7 +33,8 @@ namespace mongo {
#if 0
# define GEODEBUG(x) cout << x << endl;
- inline void PREFIXDEBUG(GeoHash prefix, const GeoConvert* g){
+# define GEODEBUGPRINT(x) PRINT(x)
+ inline void PREFIXDEBUG(GeoHash prefix, const GeoConvert* g) {
if (!prefix.constrains()) {
cout << "\t empty prefix" << endl;
return ;
@@ -46,18 +47,29 @@ namespace mongo {
Point center ( (ll._x+tr._x)/2, (ll._y+tr._y)/2 );
double radius = fabs(ll._x - tr._x) / 2;
- cout << "\t ll: " << ll.toString() << " tr: " << tr.toString()
+ cout << "\t ll: " << ll.toString() << " tr: " << tr.toString()
<< " center: " << center.toString() << " radius: " << radius << endl;
}
#else
-# define GEODEBUG(x)
-# define PREFIXDEBUG(x, y)
+# define GEODEBUG(x)
+# define GEODEBUGPRINT(x)
+# define PREFIXDEBUG(x, y)
#endif
- double EARTH_RADIUS_KM = 6371;
- double EARTH_RADIUS_MILES = EARTH_RADIUS_KM * 0.621371192;
+ const double EARTH_RADIUS_KM = 6371;
+ const double EARTH_RADIUS_MILES = EARTH_RADIUS_KM * 0.621371192;
+ enum GeoDistType {
+ GEO_PLAIN,
+ GEO_SPHERE
+ };
+
+ inline double computeXScanDistance(double y, double maxDistDegrees) {
+ // TODO: this overestimates for large maxDistDegrees far from the equator
+ return maxDistDegrees / min(cos(deg2rad(min(+89.0, y + maxDistDegrees))),
+ cos(deg2rad(max(-89.0, y - maxDistDegrees))));
+ }
GeoBitSets geoBitSets;
@@ -66,14 +78,14 @@ namespace mongo {
class Geo2dType : public IndexType , public GeoConvert {
public:
Geo2dType( const IndexPlugin * plugin , const IndexSpec* spec )
- : IndexType( plugin , spec ){
-
+ : IndexType( plugin , spec ) {
+
BSONObjBuilder orderBuilder;
BSONObjIterator i( spec->keyPattern );
- while ( i.more() ){
+ while ( i.more() ) {
BSONElement e = i.next();
- if ( e.type() == String && GEO2DNAME == e.valuestr() ){
+ if ( e.type() == String && GEO2DNAME == e.valuestr() ) {
uassert( 13022 , "can't have 2 geo field" , _geo.size() == 0 );
uassert( 13023 , "2d has to be first in index" , _other.size() == 0 );
_geo = e.fieldName();
@@ -83,16 +95,16 @@ namespace mongo {
}
orderBuilder.append( "" , 1 );
}
-
+
uassert( 13024 , "no geo field specified" , _geo.size() );
-
+
_bits = _configval( spec , "bits" , 26 ); // for lat/long, ~ 1ft
uassert( 13028 , "can't have more than 32 bits in geo index" , _bits <= 32 );
_max = _configval( spec , "max" , 180 );
_min = _configval( spec , "min" , -180 );
-
+
_scaling = (1024*1024*1024*4.0)/(_max-_min);
_order = orderBuilder.obj();
@@ -103,30 +115,30 @@ namespace mongo {
_error = distance(a, b);
}
- int _configval( const IndexSpec* spec , const string& name , int def ){
+ int _configval( const IndexSpec* spec , const string& name , int def ) {
BSONElement e = spec->info[name];
if ( e.isNumber() )
return e.numberInt();
return def;
}
- ~Geo2dType(){
-
+ ~Geo2dType() {
+
}
- virtual BSONObj fixKey( const BSONObj& in ) {
+ virtual BSONObj fixKey( const BSONObj& in ) {
if ( in.firstElement().type() == BinData )
return in;
BSONObjBuilder b(in.objsize()+16);
-
+
if ( in.firstElement().isABSONObj() )
_hash( in.firstElement().embeddedObject() ).append( b , "" );
else if ( in.firstElement().type() == String )
GeoHash( in.firstElement().valuestr() ).append( b , "" );
else if ( in.firstElement().type() == RegEx )
GeoHash( in.firstElement().regex() ).append( b , "" );
- else
+ else
return in;
BSONObjIterator i(in);
@@ -152,19 +164,44 @@ namespace mongo {
_hash( embed ).append( b , "" );
- for ( size_t i=0; i<_other.size(); i++ ){
- BSONElement e = obj[_other[i]];
- if ( e.eoo() )
- e = _spec->missingField();
- b.appendAs( e , "" );
- }
+ // Go through all the other index keys
+ for ( vector<string>::const_iterator i = _other.begin(); i != _other.end(); ++i ){
+
+ // Get *all* fields for the index key
+ BSONElementSet eSet;
+ obj.getFieldsDotted( *i, eSet );
+
+
+ if ( eSet.size() == 0 )
+ b.appendAs( _spec->missingField(), "" );
+ else if ( eSet.size() == 1 )
+ b.appendAs( *(eSet.begin()), "" );
+ else{
+
+ // If we have more than one key, store as an array of the objects
+ // TODO: Store multiple keys?
+
+ BSONArrayBuilder aBuilder;
+
+ for( BSONElementSet::iterator ei = eSet.begin(); ei != eSet.end(); ++ei ){
+ aBuilder.append( *ei );
+ }
+
+ BSONArray arr = aBuilder.arr();
+
+ b.append( "", arr );
+
+ }
+
+ }
+
keys.insert( b.obj() );
}
-
+
GeoHash _tohash( const BSONElement& e ) const {
if ( e.isABSONObj() )
return _hash( e.embeddedObject() );
-
+
return GeoHash( e , _bits );
}
@@ -174,7 +211,7 @@ namespace mongo {
BSONElement x = i.next();
uassert( 13068 , "geo field only has 1 element" , i.more() );
BSONElement y = i.next();
-
+
uassert( 13026 , "geo values have to be numbers: " + o.toString() , x.isNumber() && y.isNumber() );
return hash( x.number() , y.number() );
@@ -192,33 +229,33 @@ namespace mongo {
b.append( "y" , _unconvert( y ) );
return b.obj();
}
-
+
unsigned _convert( double in ) const {
uassert( 13027 , "point not in range" , in <= (_max + _error) && in >= (_min - _error) );
in -= _min;
assert( in > 0 );
return (unsigned)(in * _scaling);
}
-
+
double _unconvert( unsigned in ) const {
double x = in;
x /= _scaling;
x += _min;
return x;
}
-
+
void unhash( const GeoHash& h , double& x , double& y ) const {
unsigned a,b;
h.unhash(a,b);
x = _unconvert( a );
y = _unconvert( b );
}
-
+
double distance( const GeoHash& a , const GeoHash& b ) const {
double ax,ay,bx,by;
unhash( a , ax , ay );
unhash( b , bx , by );
-
+
double dx = bx - ax;
double dy = by - ay;
@@ -237,6 +274,11 @@ namespace mongo {
b.move( 1 , 1 );
unhash( a, ax, ay );
unhash( b, bx, by );
+
+ // _min and _max are a singularity
+ if (bx == _min)
+ bx = _max;
+
return (fabs(ax-bx));
}
@@ -248,10 +290,10 @@ namespace mongo {
virtual IndexSuitability suitability( const BSONObj& query , const BSONObj& order ) const {
BSONElement e = query.getFieldDotted(_geo.c_str());
- switch ( e.type() ){
+ switch ( e.type() ) {
case Object: {
BSONObj sub = e.embeddedObject();
- switch ( sub.firstElement().getGtLtOp() ){
+ switch ( sub.firstElement().getGtLtOp() ) {
case BSONObj::opNEAR:
case BSONObj::opWITHIN:
return OPTIMAL;
@@ -259,6 +301,9 @@ namespace mongo {
}
}
case Array:
+ // Non-geo index data is stored in a non-standard way, cannot use for exact lookups with
+ // additional criteria
+ if ( query.nFields() > 1 ) return USELESS;
return HELPFUL;
default:
return USELESS;
@@ -267,7 +312,7 @@ namespace mongo {
string _geo;
vector<string> _other;
-
+
unsigned _bits;
int _max;
int _min;
@@ -279,38 +324,38 @@ namespace mongo {
class Box {
public:
-
+
Box( const Geo2dType * g , const GeoHash& hash )
- : _min( g , hash ) ,
- _max( _min._x + g->sizeEdge( hash ) , _min._y + g->sizeEdge( hash ) ){
+ : _min( g , hash ) ,
+ _max( _min._x + g->sizeEdge( hash ) , _min._y + g->sizeEdge( hash ) ) {
}
-
+
Box( double x , double y , double size )
- : _min( x , y ) ,
- _max( x + size , y + size ){
+ : _min( x , y ) ,
+ _max( x + size , y + size ) {
}
Box( Point min , Point max )
- : _min( min ) , _max( max ){
+ : _min( min ) , _max( max ) {
}
- Box(){}
+ Box() {}
string toString() const {
StringBuilder buf(64);
buf << _min.toString() << " -->> " << _max.toString();
return buf.str();
}
-
+
bool between( double min , double max , double val , double fudge=0) const {
return val + fudge >= min && val <= max + fudge;
}
-
+
bool mid( double amin , double amax , double bmin , double bmax , bool min , double& res ) const {
assert( amin <= amax );
assert( bmin <= bmax );
- if ( amin < bmin ){
+ if ( amin < bmin ) {
if ( amax < bmin )
return false;
res = min ? bmin : amax;
@@ -323,16 +368,16 @@ namespace mongo {
}
double intersects( const Box& other ) const {
-
+
Point boundMin(0,0);
Point boundMax(0,0);
-
+
if ( mid( _min._x , _max._x , other._min._x , other._max._x , true , boundMin._x ) == false ||
- mid( _min._x , _max._x , other._min._x , other._max._x , false , boundMax._x ) == false ||
- mid( _min._y , _max._y , other._min._y , other._max._y , true , boundMin._y ) == false ||
- mid( _min._y , _max._y , other._min._y , other._max._y , false , boundMax._y ) == false )
+ mid( _min._x , _max._x , other._min._x , other._max._x , false , boundMax._x ) == false ||
+ mid( _min._y , _max._y , other._min._y , other._max._y , true , boundMin._y ) == false ||
+ mid( _min._y , _max._y , other._min._y , other._max._y , false , boundMax._y ) == false )
return 0;
-
+
Box intersection( boundMin , boundMax );
return intersection.area() / ( ( area() + other.area() ) / 2 );
@@ -347,45 +392,49 @@ namespace mongo {
( _min._y + _max._y ) / 2 );
}
- bool inside( Point p , double fudge = 0 ){
+ bool inside( Point p , double fudge = 0 ) {
bool res = inside( p._x , p._y , fudge );
//cout << "is : " << p.toString() << " in " << toString() << " = " << res << endl;
return res;
}
-
- bool inside( double x , double y , double fudge = 0 ){
- return
+
+ bool inside( double x , double y , double fudge = 0 ) {
+ return
between( _min._x , _max._x , x , fudge ) &&
between( _min._y , _max._y , y , fudge );
}
-
+
+ bool contains(const Box& other, double fudge=0) {
+ return inside(other._min, fudge) && inside(other._max, fudge);
+ }
+
Point _min;
Point _max;
};
-
+
class Geo2dPlugin : public IndexPlugin {
public:
- Geo2dPlugin() : IndexPlugin( GEO2DNAME ){
+ Geo2dPlugin() : IndexPlugin( GEO2DNAME ) {
}
-
+
virtual IndexType* generate( const IndexSpec* spec ) const {
return new Geo2dType( this , spec );
}
} geo2dplugin;
-
+
struct GeoUnitTest : public UnitTest {
-
- int round( double d ){
+
+ int round( double d ) {
return (int)(.5+(d*1000));
}
-
+
#define GEOHEQ(a,b) if ( a.toString() != b ){ cout << "[" << a.toString() << "] != [" << b << "]" << endl; assert( a == GeoHash(b) ); }
- void run(){
+ void run() {
assert( ! GeoHash::isBitSet( 0 , 0 ) );
assert( ! GeoHash::isBitSet( 0 , 31 ) );
assert( GeoHash::isBitSet( 1 , 31 ) );
-
+
IndexSpec i( BSON( "loc" << "2d" ) );
Geo2dType g( &geo2dplugin , &i );
{
@@ -411,7 +460,7 @@ namespace mongo {
assert( round( in["x"].number() ) == round( out["x"].number() ) );
assert( round( in["y"].number() ) == round( out["y"].number() ) );
}
-
+
{
GeoHash h( "0000" );
h.move( 0 , 1 );
@@ -424,13 +473,13 @@ namespace mongo {
GEOHEQ( h , "0100" );
h.move( 0 , -1 );
GEOHEQ( h , "0001" );
-
+
h.init( "0000" );
h.move( 1 , 0 );
GEOHEQ( h , "0010" );
}
-
+
{
Box b( 5 , 5 , 2 );
assert( "(5,5) -->> (7,7)" == b.toString() );
@@ -444,7 +493,7 @@ namespace mongo {
b = g.hash( 42 , 44 );
assert( round(10) == round(g.distance( a , b )) );
}
-
+
{
GeoHash x("0000");
assert( 0 == x.getHash() );
@@ -454,7 +503,7 @@ namespace mongo {
assert( GeoHash( "1100").hasPrefix( GeoHash( "11" ) ) );
assert( ! GeoHash( "1000").hasPrefix( GeoHash( "11" ) ) );
}
-
+
{
GeoHash x("1010");
GEOHEQ( x , "1010" );
@@ -462,8 +511,8 @@ namespace mongo {
GEOHEQ( y , "101001" );
}
- {
-
+ {
+
GeoHash a = g.hash( 5 , 5 );
GeoHash b = g.hash( 5 , 7 );
GeoHash c = g.hash( 100 , 100 );
@@ -509,13 +558,13 @@ namespace mongo {
assert( entry.hasPrefix( GeoHash( "1100" ) ) );
assert( entry.hasPrefix( prefix ) );
}
-
+
{
GeoHash a = g.hash( 50 , 50 );
GeoHash b = g.hash( 48 , 54 );
assert( round( 4.47214 ) == round( g.distance( a , b ) ) );
}
-
+
{
Box b( Point( 29.762283 , -95.364271 ) , Point( 29.764283000000002 , -95.36227099999999 ) );
@@ -534,7 +583,7 @@ namespace mongo {
int N = 10000;
{
Timer t;
- for ( int i=0; i<N; i++ ){
+ for ( int i=0; i<N; i++ ) {
unsigned x = (unsigned)rand();
unsigned y = (unsigned)rand();
GeoHash h( x , y );
@@ -548,7 +597,7 @@ namespace mongo {
{
Timer t;
- for ( int i=0; i<N; i++ ){
+ for ( int i=0; i<N; i++ ) {
unsigned x = (unsigned)rand();
unsigned y = (unsigned)rand();
GeoHash h( x , y );
@@ -579,7 +628,7 @@ namespace mongo {
{
Point BNA (-1.5127, 0.6304);
Point LAX (-2.0665, 0.5924);
-
+
double dist1 = spheredist_rad(BNA, LAX);
double dist2 = spheredist_rad(LAX, BNA);
@@ -590,26 +639,42 @@ namespace mongo {
{
Point JFK (-73.77694444, 40.63861111 );
Point LAX (-118.40, 33.94);
-
+
double dist = spheredist_deg(JFK, LAX) * EARTH_RADIUS_MILES;
assert( dist > 2469 && dist < 2470 );
}
+ {
+ Point BNA (-86.67, 36.12);
+ Point LAX (-118.40, 33.94);
+ Point JFK (-73.77694444, 40.63861111 );
+ assert( spheredist_deg(BNA, BNA) < 1e-6);
+ assert( spheredist_deg(LAX, LAX) < 1e-6);
+ assert( spheredist_deg(JFK, JFK) < 1e-6);
+
+ Point zero (0, 0);
+ Point antizero (0,-180);
+
+ // these were known to cause NaN
+ assert( spheredist_deg(zero, zero) < 1e-6);
+ assert( fabs(M_PI-spheredist_deg(zero, antizero)) < 1e-6);
+ assert( fabs(M_PI-spheredist_deg(antizero, zero)) < 1e-6);
+ }
}
}
} geoUnitTest;
-
+
class GeoPoint {
public:
- GeoPoint(){
+ GeoPoint() {
}
GeoPoint( const KeyNode& node , double distance )
- : _key( node.key ) , _loc( node.recordLoc ) , _o( node.recordLoc.obj() ) , _distance( distance ){
+ : _key( node.key ) , _loc( node.recordLoc ) , _o( node.recordLoc.obj() ) , _distance( distance ) {
}
GeoPoint( const BSONObj& key , DiskLoc loc , double distance )
- : _key(key) , _loc(loc) , _o( loc.obj() ) , _distance( distance ){
+ : _key(key) , _loc(loc) , _o( loc.obj() ) , _distance( distance ) {
}
bool operator<( const GeoPoint& other ) const {
@@ -630,44 +695,44 @@ namespace mongo {
public:
GeoAccumulator( const Geo2dType * g , const BSONObj& filter )
: _g(g) , _lookedAt(0) , _objectsLoaded(0) , _found(0) {
- if ( ! filter.isEmpty() ){
+ if ( ! filter.isEmpty() ) {
_matcher.reset( new CoveredIndexMatcher( filter , g->keyPattern() ) );
}
}
- virtual ~GeoAccumulator(){
+ virtual ~GeoAccumulator() {
}
- virtual void add( const KeyNode& node ){
+ virtual void add( const KeyNode& node ) {
// when looking at other boxes, don't want to look at some object twice
pair<set<DiskLoc>::iterator,bool> seenBefore = _seen.insert( node.recordLoc );
- if ( ! seenBefore.second ){
+ if ( ! seenBefore.second ) {
GEODEBUG( "\t\t\t\t already seen : " << node.recordLoc.obj()["_id"] );
return;
}
_lookedAt++;
-
+
// distance check
double d = 0;
- if ( ! checkDistance( GeoHash( node.key.firstElement() ) , d ) ){
+ if ( ! checkDistance( GeoHash( node.key.firstElement() ) , d ) ) {
GEODEBUG( "\t\t\t\t bad distance : " << node.recordLoc.obj() << "\t" << d );
return;
- }
+ }
GEODEBUG( "\t\t\t\t good distance : " << node.recordLoc.obj() << "\t" << d );
-
+
// matcher
MatchDetails details;
- if ( _matcher.get() ){
+ if ( _matcher.get() ) {
bool good = _matcher->matches( node.key , node.recordLoc , &details );
if ( details.loadedObject )
_objectsLoaded++;
-
- if ( ! good ){
+
+ if ( ! good ) {
GEODEBUG( "\t\t\t\t didn't match : " << node.recordLoc.obj()["_id"] );
return;
}
}
-
+
                if ( ! details.loadedObject ) // don't double count
_objectsLoaded++;
@@ -681,7 +746,7 @@ namespace mongo {
long long found() const {
return _found;
}
-
+
const Geo2dType * _g;
set<DiskLoc> _seen;
auto_ptr<CoveredIndexMatcher> _matcher;
@@ -690,82 +755,96 @@ namespace mongo {
long long _objectsLoaded;
long long _found;
};
-
+
class GeoHopper : public GeoAccumulator {
public:
typedef multiset<GeoPoint> Holder;
- GeoHopper( const Geo2dType * g , unsigned max , const GeoHash& n , const BSONObj& filter = BSONObj() , double maxDistance = numeric_limits<double>::max() )
- : GeoAccumulator( g , filter ) , _max( max ) , _near( n ), _maxDistance( maxDistance ) {
- _farthest = -1;
- }
+ GeoHopper( const Geo2dType * g , unsigned max , const Point& n , const BSONObj& filter = BSONObj() , double maxDistance = numeric_limits<double>::max() , GeoDistType type=GEO_PLAIN)
+ : GeoAccumulator( g , filter ) , _max( max ) , _near( n ), _maxDistance( maxDistance ), _type( type ), _farthest(-1)
+ {}
- virtual bool checkDistance( const GeoHash& h , double& d ){
- d = _g->distance( _near , h );
+ virtual bool checkDistance( const GeoHash& h , double& d ) {
+ switch (_type) {
+ case GEO_PLAIN:
+ d = _near.distance( Point(_g, h) );
+ break;
+ case GEO_SPHERE:
+ d = spheredist_deg(_near, Point(_g, h));
+ break;
+ default:
+ assert(0);
+ }
bool good = d < _maxDistance && ( _points.size() < _max || d < farthest() );
- GEODEBUG( "\t\t\t\t\t\t\t checkDistance " << _near << "\t" << h << "\t" << d
+ GEODEBUG( "\t\t\t\t\t\t\t checkDistance " << _near.toString() << "\t" << h << "\t" << d
<< " ok: " << good << " farthest: " << farthest() );
return good;
}
-
- virtual void addSpecific( const KeyNode& node , double d ){
+
+ virtual void addSpecific( const KeyNode& node , double d ) {
GEODEBUG( "\t\t" << GeoHash( node.key.firstElement() ) << "\t" << node.recordLoc.obj() << "\t" << d );
_points.insert( GeoPoint( node.key , node.recordLoc , d ) );
- if ( _points.size() > _max ){
+ if ( _points.size() > _max ) {
_points.erase( --_points.end() );
- }
- Holder::iterator i = _points.end();
- i--;
- _farthest = i->_distance;
+ Holder::iterator i = _points.end();
+ i--;
+ _farthest = i->_distance;
+ }
+ else {
+ if (d > _farthest)
+ _farthest = d;
+ }
}
double farthest() const {
return _farthest;
}
+
unsigned _max;
- GeoHash _near;
+ Point _near;
Holder _points;
double _maxDistance;
+ GeoDistType _type;
double _farthest;
};
-
+
struct BtreeLocation {
int pos;
bool found;
DiskLoc bucket;
-
- BSONObj key(){
+
+ BSONObj key() {
if ( bucket.isNull() )
return BSONObj();
return bucket.btree()->keyNode( pos ).key;
}
-
- bool hasPrefix( const GeoHash& hash ){
+
+ bool hasPrefix( const GeoHash& hash ) {
BSONElement e = key().firstElement();
if ( e.eoo() )
return false;
return GeoHash( e ).hasPrefix( hash );
}
-
- bool advance( int direction , int& totalFound , GeoAccumulator* all ){
+
+ bool advance( int direction , int& totalFound , GeoAccumulator* all ) {
if ( bucket.isNull() )
return false;
bucket = bucket.btree()->advance( bucket , pos , direction , "btreelocation" );
-
+
if ( all )
return checkCur( totalFound , all );
-
+
return ! bucket.isNull();
}
- bool checkCur( int& totalFound , GeoAccumulator* all ){
+ bool checkCur( int& totalFound , GeoAccumulator* all ) {
if ( bucket.isNull() )
return false;
- if ( bucket.btree()->isUsed(pos) ){
+ if ( bucket.btree()->isUsed(pos) ) {
totalFound++;
all->add( bucket.btree()->keyNode( pos ) );
}
@@ -776,51 +855,65 @@ namespace mongo {
return true;
}
- string toString(){
+ string toString() {
stringstream ss;
ss << "bucket: " << bucket.toString() << " pos: " << pos << " found: " << found;
return ss.str();
}
- static bool initial( const IndexDetails& id , const Geo2dType * spec ,
- BtreeLocation& min , BtreeLocation& max ,
+ static bool initial( const IndexDetails& id , const Geo2dType * spec ,
+ BtreeLocation& min , BtreeLocation& max ,
GeoHash start ,
- int & found , GeoAccumulator * hopper )
- {
-
+ int & found , GeoAccumulator * hopper ) {
+
Ordering ordering = Ordering::make(spec->_order);
- min.bucket = id.head.btree()->locate( id , id.head , start.wrap() ,
+ min.bucket = id.head.btree()->locate( id , id.head , start.wrap() ,
ordering , min.pos , min.found , minDiskLoc );
- min.checkCur( found , hopper );
+ if (hopper) min.checkCur( found , hopper );
max = min;
-
- if ( min.bucket.isNull() || ( !(hopper->found()) ) ){
- min.bucket = id.head.btree()->locate( id , id.head , start.wrap() ,
+
+ if ( min.bucket.isNull() || ( hopper && !(hopper->found()) ) ) {
+ min.bucket = id.head.btree()->locate( id , id.head , start.wrap() ,
ordering , min.pos , min.found , minDiskLoc , -1 );
- min.checkCur( found , hopper );
+ if (hopper) min.checkCur( found , hopper );
}
-
+
return ! min.bucket.isNull() || ! max.bucket.isNull();
}
};
class GeoSearch {
public:
- GeoSearch( const Geo2dType * g , const GeoHash& n , int numWanted=100 , BSONObj filter=BSONObj() , double maxDistance = numeric_limits<double>::max() )
- : _spec( g ) , _n( n ) , _start( n ) ,
+ GeoSearch( const Geo2dType * g , const GeoHash& n , int numWanted=100 , BSONObj filter=BSONObj() , double maxDistance = numeric_limits<double>::max() , GeoDistType type=GEO_PLAIN)
+ : _spec( g ) ,_startPt(g,n), _start( n ) ,
_numWanted( numWanted ) , _filter( filter ) , _maxDistance( maxDistance ) ,
- _hopper( new GeoHopper( g , numWanted , n , filter , maxDistance ) )
- {
+ _hopper( new GeoHopper( g , numWanted , _startPt , filter , maxDistance, type ) ), _type(type) {
assert( g->getDetails() );
_nscanned = 0;
_found = 0;
+
+ if (type == GEO_PLAIN) {
+ _scanDistance = maxDistance;
+ }
+ else if (type == GEO_SPHERE) {
+ if (maxDistance == numeric_limits<double>::max()) {
+ _scanDistance = maxDistance;
+ }
+ else {
+ //TODO: consider splitting into x and y scan distances
+ _scanDistance = computeXScanDistance(_startPt._y, rad2deg(maxDistance));
+ }
+ }
+ else {
+ assert(0);
+ }
}
-
- void exec(){
+
+ void exec() {
const IndexDetails& id = *_spec->getDetails();
-
- BtreeBucket * head = id.head.btree();
+
+ const BtreeBucket * head = id.head.btree();
assert( head );
/*
* Search algorithm
@@ -829,144 +922,185 @@ namespace mongo {
* 3) find optimal set of boxes that complete circle
* 4) use regular btree cursors to scan those boxes
*/
-
+
GeoHopper * hopper = _hopper.get();
_prefix = _start;
- { // 1 regular geo hash algorithm
-
+ BtreeLocation min,max;
+ {
+ // 1 regular geo hash algorithm
+
- BtreeLocation min,max;
- if ( ! BtreeLocation::initial( id , _spec , min , max , _n , _found , hopper ) )
+ if ( ! BtreeLocation::initial( id , _spec , min , max , _start , _found , NULL ) )
return;
-
- while ( _hopper->found() < _numWanted ){
+
+ while ( !_prefix.constrains() || // if next pass would cover universe, just keep going
+ ( _hopper->found() < _numWanted && _spec->sizeEdge( _prefix ) <= _scanDistance)) {
GEODEBUG( _prefix << "\t" << _found << "\t DESC" );
- while ( min.hasPrefix( _prefix ) && min.advance( -1 , _found , hopper ) )
+ while ( min.hasPrefix(_prefix) && min.checkCur(_found, hopper) && min.advance(-1, _found, NULL) )
_nscanned++;
GEODEBUG( _prefix << "\t" << _found << "\t ASC" );
- while ( max.hasPrefix( _prefix ) && max.advance( 1 , _found , hopper ) )
+ while ( max.hasPrefix(_prefix) && max.checkCur(_found, hopper) && max.advance(+1, _found, NULL) )
_nscanned++;
- if ( ! _prefix.constrains() )
- break;
+
+ if ( ! _prefix.constrains() ) {
+ GEODEBUG( "done search w/o part 2" )
+ return;
+ }
+
+ _alreadyScanned = Box(_spec, _prefix);
_prefix = _prefix.up();
-
- double temp = _spec->distance( _prefix , _start );
- if ( temp > ( _maxDistance * 2 ) )
- break;
}
}
GEODEBUG( "done part 1" );
- if ( _found && _prefix.constrains() ){
+ {
// 2
- Point center( _spec , _n );
double farthest = hopper->farthest();
- // Phase 1 might not have found any points.
- if (farthest == -1)
- farthest = _spec->sizeDiag( _prefix );
- Box want( center._x - farthest , center._y - farthest , farthest * 2 );
- _prefix = _n;
- while ( _spec->sizeEdge( _prefix ) < ( farthest / 2 ) ){
+ GEODEBUGPRINT(hopper->farthest());
+ if (hopper->found() < _numWanted) {
+ // Not enough found in Phase 1
+ farthest = _scanDistance;
+ }
+ else if (_type == GEO_SPHERE) {
+ farthest = std::min(_scanDistance, computeXScanDistance(_startPt._y, rad2deg(farthest)));
+ }
+ GEODEBUGPRINT(farthest);
+
+ Box want( _startPt._x - farthest , _startPt._y - farthest , farthest * 2 );
+ GEODEBUGPRINT(want.toString());
+
+ _prefix = _start;
+ while (_prefix.constrains() && _spec->sizeEdge( _prefix ) < farthest ) {
_prefix = _prefix.up();
}
-
- if ( logLevel > 0 ){
- log(1) << "want: " << want << " found:" << _found << " nscanned: " << _nscanned << " hash size:" << _spec->sizeEdge( _prefix )
+
+ PREFIXDEBUG(_prefix, _spec);
+
+ if (_prefix.getBits() <= 1) {
+ // TODO consider walking in $natural order
+
+ while ( min.checkCur(_found, hopper) && min.advance(-1, _found, NULL) )
+ _nscanned++;
+ while ( max.checkCur(_found, hopper) && max.advance(+1, _found, NULL) )
+ _nscanned++;
+
+ GEODEBUG( "done search after scanning whole collection" )
+ return;
+ }
+
+ if ( logLevel > 0 ) {
+ log(1) << "want: " << want << " found:" << _found << " nscanned: " << _nscanned << " hash size:" << _spec->sizeEdge( _prefix )
<< " farthest: " << farthest << " using box: " << Box( _spec , _prefix ).toString() << endl;
}
-
- for ( int x=-1; x<=1; x++ ){
- for ( int y=-1; y<=1; y++ ){
+
+ for ( int x=-1; x<=1; x++ ) {
+ for ( int y=-1; y<=1; y++ ) {
GeoHash toscan = _prefix;
toscan.move( x , y );
-
+
// 3 & 4
doBox( id , want , toscan );
}
}
}
GEODEBUG( "done search" )
-
+
}
- void doBox( const IndexDetails& id , const Box& want , const GeoHash& toscan , int depth = 0 ){
+ void doBox( const IndexDetails& id , const Box& want , const GeoHash& toscan , int depth = 0 ) {
Box testBox( _spec , toscan );
- if ( logLevel > 2 ){
+ if ( logLevel > 2 ) {
cout << "\t";
for ( int i=0; i<depth; i++ )
cout << "\t";
cout << " doBox: " << testBox.toString() << "\t" << toscan.toString() << " scanned so far: " << _nscanned << endl;
}
+ else {
+ GEODEBUGPRINT(testBox.toString());
+ }
+
+ if (_alreadyScanned.contains(testBox, _spec->_error)) {
+ GEODEBUG("skipping box: already scanned");
+ return; // been here, done this
+ }
double intPer = testBox.intersects( want );
-
- if ( intPer <= 0 )
+
+ if ( intPer <= 0 ) {
+ GEODEBUG("skipping box: not in want");
return;
-
+ }
+
bool goDeeper = intPer < .5 && depth < 2;
long long myscanned = 0;
-
+
BtreeLocation loc;
- loc.bucket = id.head.btree()->locate( id , id.head , toscan.wrap() , Ordering::make(_spec->_order) ,
- loc.pos , loc.found , minDiskLoc );
+ loc.bucket = id.head.btree()->locate( id , id.head , toscan.wrap() , Ordering::make(_spec->_order) ,
+ loc.pos , loc.found , minDiskLoc );
loc.checkCur( _found , _hopper.get() );
- while ( loc.hasPrefix( toscan ) && loc.advance( 1 , _found , _hopper.get() ) ){
+ while ( loc.hasPrefix( toscan ) && loc.advance( 1 , _found , _hopper.get() ) ) {
_nscanned++;
- if ( ++myscanned > 100 && goDeeper ){
+ if ( ++myscanned > 100 && goDeeper ) {
doBox( id , want , toscan + "00" , depth + 1);
doBox( id , want , toscan + "01" , depth + 1);
doBox( id , want , toscan + "10" , depth + 1);
doBox( id , want , toscan + "11" , depth + 1);
- return;
+ return;
}
}
-
+
}
const Geo2dType * _spec;
- GeoHash _n;
+ Point _startPt;
GeoHash _start;
GeoHash _prefix;
int _numWanted;
BSONObj _filter;
double _maxDistance;
+ double _scanDistance;
shared_ptr<GeoHopper> _hopper;
long long _nscanned;
int _found;
+ GeoDistType _type;
+
+ Box _alreadyScanned;
};
class GeoCursorBase : public Cursor {
public:
GeoCursorBase( const Geo2dType * spec )
- : _spec( spec ), _id( _spec->getDetails() ){
+ : _spec( spec ), _id( _spec->getDetails() ) {
}
- virtual DiskLoc refLoc(){ return DiskLoc(); }
+ virtual DiskLoc refLoc() { return DiskLoc(); }
virtual BSONObj indexKeyPattern() {
return _spec->keyPattern();
}
- virtual void noteLocation() {
- assert(0);
+ virtual void noteLocation() {
+ // no-op since these are meant to be safe
}
/* called before query getmore block is iterated */
virtual void checkLocation() {
- assert(0);
+ // no-op since these are meant to be safe
}
virtual bool supportGetMore() { return false; }
virtual bool supportYields() { return false; }
- virtual bool getsetdup(DiskLoc loc){
- return false;
- }
+ virtual bool getsetdup(DiskLoc loc) { return false; }
+ virtual bool modifiedKeys() const { return true; }
+ virtual bool isMultiKey() const { return false; }
+
+
const Geo2dType * _spec;
const IndexDetails * _id;
@@ -975,20 +1109,23 @@ namespace mongo {
class GeoSearchCursor : public GeoCursorBase {
public:
GeoSearchCursor( shared_ptr<GeoSearch> s )
- : GeoCursorBase( s->_spec ) ,
- _s( s ) , _cur( s->_hopper->_points.begin() ) , _end( s->_hopper->_points.end() ) {
+ : GeoCursorBase( s->_spec ) ,
+ _s( s ) , _cur( s->_hopper->_points.begin() ) , _end( s->_hopper->_points.end() ), _nscanned() {
+ if ( _cur != _end ) {
+ ++_nscanned;
+ }
}
-
+
virtual ~GeoSearchCursor() {}
-
- virtual bool ok(){
+
+ virtual bool ok() {
return _cur != _end;
}
-
- virtual Record* _current(){ assert(ok()); return _cur->_loc.rec(); }
- virtual BSONObj current(){ assert(ok()); return _cur->_o; }
- virtual DiskLoc currLoc(){ assert(ok()); return _cur->_loc; }
- virtual bool advance(){ _cur++; return ok(); }
+
+ virtual Record* _current() { assert(ok()); return _cur->_loc.rec(); }
+ virtual BSONObj current() { assert(ok()); return _cur->_o; }
+ virtual DiskLoc currLoc() { assert(ok()); return _cur->_loc; }
+ virtual bool advance() { _cur++; incNscanned(); return ok(); }
virtual BSONObj currKey() const { return _cur->_key; }
virtual string toString() {
@@ -996,82 +1133,103 @@ namespace mongo {
}
- virtual BSONObj prettyStartKey() const {
- return BSON( _s->_spec->_geo << _s->_prefix.toString() );
+ virtual BSONObj prettyStartKey() const {
+ return BSON( _s->_spec->_geo << _s->_prefix.toString() );
}
- virtual BSONObj prettyEndKey() const {
+ virtual BSONObj prettyEndKey() const {
GeoHash temp = _s->_prefix;
temp.move( 1 , 1 );
- return BSON( _s->_spec->_geo << temp.toString() );
+ return BSON( _s->_spec->_geo << temp.toString() );
}
+ virtual long long nscanned() { return _nscanned; }
shared_ptr<GeoSearch> _s;
GeoHopper::Holder::iterator _cur;
GeoHopper::Holder::iterator _end;
+
+ void incNscanned() { if ( ok() ) { ++_nscanned; } }
+ long long _nscanned;
};
class GeoBrowse : public GeoCursorBase , public GeoAccumulator {
public:
GeoBrowse( const Geo2dType * g , string type , BSONObj filter = BSONObj() )
: GeoCursorBase( g ) ,GeoAccumulator( g , filter ) ,
- _type( type ) , _filter( filter ) , _firstCall(true) {
+ _type( type ) , _filter( filter ) , _firstCall(true), _nscanned() {
}
-
+
virtual string toString() {
return (string)"GeoBrowse-" + _type;
}
- virtual bool ok(){
- if ( _firstCall ){
+ virtual bool ok() {
+ bool first = _firstCall;
+ if ( _firstCall ) {
fillStack();
_firstCall = false;
}
- if ( ! _cur.isEmpty() || _stack.size() )
+ if ( ! _cur.isEmpty() || _stack.size() ) {
+ if ( first ) {
+ ++_nscanned;
+ }
return true;
+ }
- while ( moreToDo() ){
+ while ( moreToDo() ) {
fillStack();
- if ( ! _cur.isEmpty() )
+ if ( ! _cur.isEmpty() ) {
+ if ( first ) {
+ ++_nscanned;
+ }
return true;
+ }
}
-
+
return false;
}
-
- virtual bool advance(){
+
+ virtual bool advance() {
_cur._o = BSONObj();
-
- if ( _stack.size() ){
+
+ if ( _stack.size() ) {
_cur = _stack.front();
_stack.pop_front();
+ ++_nscanned;
return true;
}
-
+
if ( ! moreToDo() )
return false;
-
+
while ( _cur.isEmpty() && moreToDo() )
fillStack();
- return ! _cur.isEmpty();
+ return ! _cur.isEmpty() && ++_nscanned;
}
-
- virtual Record* _current(){ assert(ok()); return _cur._loc.rec(); }
- virtual BSONObj current(){ assert(ok()); return _cur._o; }
- virtual DiskLoc currLoc(){ assert(ok()); return _cur._loc; }
+
+ virtual Record* _current() { assert(ok()); return _cur._loc.rec(); }
+ virtual BSONObj current() { assert(ok()); return _cur._o; }
+ virtual DiskLoc currLoc() { assert(ok()); return _cur._loc; }
virtual BSONObj currKey() const { return _cur._key; }
virtual bool moreToDo() = 0;
virtual void fillStack() = 0;
- virtual void addSpecific( const KeyNode& node , double d ){
+ virtual void addSpecific( const KeyNode& node , double d ) {
if ( _cur.isEmpty() )
_cur = GeoPoint( node , d );
else
_stack.push_back( GeoPoint( node , d ) );
}
+ virtual long long nscanned() {
+ if ( _firstCall ) {
+ ok();
+ }
+ return _nscanned;
+ }
+
string _type;
BSONObj _filter;
list<GeoPoint> _stack;
@@ -1079,25 +1237,28 @@ namespace mongo {
GeoPoint _cur;
bool _firstCall;
+ long long _nscanned;
+
};
class GeoCircleBrowse : public GeoBrowse {
public:
-
+
enum State {
- START ,
+ START ,
DOING_EXPAND ,
DOING_AROUND ,
DONE
} _state;
- GeoCircleBrowse( const Geo2dType * g , const BSONObj& circle , BSONObj filter = BSONObj() )
- : GeoBrowse( g , "circle" , filter ){
-
+ GeoCircleBrowse( const Geo2dType * g , const BSONObj& circle , BSONObj filter = BSONObj() , const string& type="$center")
+ : GeoBrowse( g , "circle" , filter ) {
+
uassert( 13060 , "$center needs 2 fields (middle,max distance)" , circle.nFields() == 2 );
BSONObjIterator i(circle);
- _startPt = Point(i.next());
- _start = _startPt.hash(g);
+ BSONElement center = i.next();
+ _start = g->_tohash(center);
+ _startPt = Point(center);
_prefix = _start;
_maxDistance = i.next().numberDouble();
uassert( 13061 , "need a max distance > 0 " , _maxDistance > 0 );
@@ -1106,17 +1267,42 @@ namespace mongo {
_state = START;
_found = 0;
+ if (type == "$center") {
+ _type = GEO_PLAIN;
+ _xScanDistance = _maxDistance;
+ _yScanDistance = _maxDistance;
+ }
+ else if (type == "$centerSphere") {
+ uassert(13461, "Spherical MaxDistance > PI. Are you sure you are using radians?", _maxDistance < M_PI);
+
+ _type = GEO_SPHERE;
+ _yScanDistance = rad2deg(_maxDistance);
+ _xScanDistance = computeXScanDistance(_startPt._y, _yScanDistance);
+
+ uassert(13462, "Spherical distance would require wrapping, which isn't implemented yet",
+ (_startPt._x + _xScanDistance < 180) && (_startPt._x - _xScanDistance > -180) &&
+ (_startPt._y + _yScanDistance < 90) && (_startPt._y - _yScanDistance > -90));
+
+ GEODEBUGPRINT(_maxDistance);
+ GEODEBUGPRINT(_xScanDistance);
+ GEODEBUGPRINT(_yScanDistance);
+ }
+ else {
+ uassert(13460, "invalid $center query type: " + type, false);
+ }
+
ok();
}
- virtual bool moreToDo(){
+ virtual bool moreToDo() {
return _state != DONE;
}
-
- virtual void fillStack(){
- if ( _state == START ){
- if ( ! BtreeLocation::initial( *_id , _spec , _min , _max ,
- _prefix , _found , this ) ){
+
+ virtual void fillStack() {
+
+ if ( _state == START ) {
+ if ( ! BtreeLocation::initial( *_id , _spec , _min , _max ,
+ _prefix , _found , this ) ) {
_state = DONE;
return;
}
@@ -1124,10 +1310,10 @@ namespace mongo {
}
- if ( _state == DOING_AROUND ){
+ if ( _state == DOING_AROUND ) {
// TODO could rework and return rather than looping
- for (int i=-1; i<=1; i++){
- for (int j=-1; j<=1; j++){
+ for (int i=-1; i<=1; i++) {
+ for (int j=-1; j<=1; j++) {
if (i == 0 && j == 0)
continue; // main box
@@ -1135,10 +1321,11 @@ namespace mongo {
newBox.move(i, j);
PREFIXDEBUG(newBox, _g);
- if (needToCheckBox(newBox)){
+ if (needToCheckBox(newBox)) {
// TODO consider splitting into quadrants
getPointsForPrefix(newBox);
- } else {
+ }
+ else {
GEODEBUG("skipping box");
}
}
@@ -1147,20 +1334,19 @@ namespace mongo {
_state = DONE;
return;
}
-
- if (_state == DOING_EXPAND){
+
+ if (_state == DOING_EXPAND) {
GEODEBUG( "circle prefix [" << _prefix << "]" );
PREFIXDEBUG(_prefix, _g);
while ( _min.hasPrefix( _prefix ) && _min.advance( -1 , _found , this ) );
while ( _max.hasPrefix( _prefix ) && _max.advance( 1 , _found , this ) );
- if ( ! _prefix.constrains() ){
+ if ( ! _prefix.constrains() ) {
GEODEBUG( "\t exhausted the btree" );
_state = DONE;
return;
}
-
Point ll (_g, _prefix);
GeoHash trHash = _prefix;
@@ -1168,50 +1354,52 @@ namespace mongo {
Point tr (_g, trHash);
double sideLen = fabs(tr._x - ll._x);
- if (sideLen > _maxDistance){ // circle must be contained by surrounding squares
- if ( (ll._x + _maxDistance < _startPt._x && ll._y + _maxDistance < _startPt._y) &&
- (tr._x - _maxDistance > _startPt._x && tr._y - _maxDistance > _startPt._y) )
- {
+ if (sideLen > std::max(_xScanDistance, _yScanDistance)) { // circle must be contained by surrounding squares
+ if ( (ll._x + _xScanDistance < _startPt._x && ll._y + _yScanDistance < _startPt._y) &&
+ (tr._x - _xScanDistance > _startPt._x && tr._y - _yScanDistance > _startPt._y) ) {
GEODEBUG("square fully contains circle");
_state = DONE;
- } else if (_prefix.getBits() > 1){
+ }
+ else if (_prefix.getBits() > 1) {
GEODEBUG("checking surrounding squares");
_state = DOING_AROUND;
- } else {
+ }
+ else {
GEODEBUG("using simple search");
_prefix = _prefix.up();
}
- } else {
+ }
+ else {
_prefix = _prefix.up();
}
return;
}
-
+
/* Clients are expected to use moreToDo before calling
* fillStack, so DONE is checked for there. If any more
* State values are defined, you should handle them
- * here. */
+ * here. */
assert(0);
}
- bool needToCheckBox(const GeoHash& prefix){
+ bool needToCheckBox(const GeoHash& prefix) {
Point ll (_g, prefix);
- if (fabs(ll._x - _startPt._x) <= _maxDistance) return true;
- if (fabs(ll._y - _startPt._y) <= _maxDistance) return true;
+ if (fabs(ll._x - _startPt._x) <= _xScanDistance) return true;
+ if (fabs(ll._y - _startPt._y) <= _yScanDistance) return true;
- GeoHash trHash = _prefix;
+ GeoHash trHash = prefix;
trHash.move( 1 , 1 );
Point tr (_g, trHash);
- if (fabs(tr._x - _startPt._x) <= _maxDistance) return true;
- if (fabs(tr._y - _startPt._y) <= _maxDistance) return true;
+ if (fabs(tr._x - _startPt._x) <= _xScanDistance) return true;
+ if (fabs(tr._y - _startPt._y) <= _yScanDistance) return true;
return false;
}
- void getPointsForPrefix(const GeoHash& prefix){
- if ( ! BtreeLocation::initial( *_id , _spec , _min , _max , prefix , _found , this ) ){
+ void getPointsForPrefix(const GeoHash& prefix) {
+ if ( ! BtreeLocation::initial( *_id , _spec , _min , _max , prefix , _found , this ) ) {
return;
}
@@ -1219,37 +1407,50 @@ namespace mongo {
while ( _max.hasPrefix( prefix ) && _max.advance( 1 , _found , this ) );
}
-
- virtual bool checkDistance( const GeoHash& h , double& d ){
- d = _g->distance( _start , h );
+
+ virtual bool checkDistance( const GeoHash& h , double& d ) {
+ switch (_type) {
+ case GEO_PLAIN:
+ d = _g->distance( _start , h );
+ break;
+ case GEO_SPHERE:
+ d = spheredist_deg(_startPt, Point(_g, h));
+ break;
+ default:
+ assert(0);
+ }
+
GEODEBUG( "\t " << h << "\t" << d );
return d <= _maxDistance;
}
+ GeoDistType _type;
GeoHash _start;
Point _startPt;
- double _maxDistance;
-
+ double _maxDistance; // user input
+        double _xScanDistance; // affected by GeoDistType
+        double _yScanDistance; // affected by GeoDistType
+
int _found;
-
- GeoHash _prefix;
+
+ GeoHash _prefix;
BtreeLocation _min;
BtreeLocation _max;
- };
+ };
class GeoBoxBrowse : public GeoBrowse {
public:
-
+
enum State {
- START ,
+ START ,
DOING_EXPAND ,
DONE
} _state;
- GeoBoxBrowse( const Geo2dType * g , const BSONObj& box , BSONObj filter = BSONObj() )
- : GeoBrowse( g , "box" , filter ){
-
+ GeoBoxBrowse( const Geo2dType * g , const BSONObj& box , BSONObj filter = BSONObj() )
+ : GeoBrowse( g , "box" , filter ) {
+
uassert( 13063 , "$box needs 2 fields (bottomLeft,topRight)" , box.nFields() == 2 );
BSONObjIterator i(box);
_bl = g->_tohash( i.next() );
@@ -1265,7 +1466,7 @@ namespace mongo {
Point center = _want.center();
_prefix = _g->hash( center._x , center._y );
-
+
GEODEBUG( "center : " << center.toString() << "\t" << _prefix );
{
@@ -1280,42 +1481,43 @@ namespace mongo {
ok();
}
- virtual bool moreToDo(){
+ virtual bool moreToDo() {
return _state != DONE;
}
-
- virtual void fillStack(){
- if ( _state == START ){
- if ( ! BtreeLocation::initial( *_id , _spec , _min , _max ,
- _prefix , _found , this ) ){
+ virtual void fillStack() {
+ if ( _state == START ) {
+
+ if ( ! BtreeLocation::initial( *_id , _spec , _min , _max ,
+ _prefix , _found , this ) ) {
_state = DONE;
return;
}
_state = DOING_EXPAND;
}
-
- if ( _state == DOING_EXPAND ){
+
+ if ( _state == DOING_EXPAND ) {
int started = _found;
- while ( started == _found || _state == DONE ){
+ while ( started == _found || _state == DONE ) {
GEODEBUG( "box prefix [" << _prefix << "]" );
while ( _min.hasPrefix( _prefix ) && _min.advance( -1 , _found , this ) );
while ( _max.hasPrefix( _prefix ) && _max.advance( 1 , _found , this ) );
-
+
if ( _state == DONE )
return;
- if ( ! _prefix.constrains() ){
+ if ( ! _prefix.constrains() ) {
GEODEBUG( "box exhausted" );
_state = DONE;
return;
}
- if (_g->sizeEdge(_prefix) < _wantLen){
+ if (_g->sizeEdge(_prefix) < _wantLen) {
_prefix = _prefix.up();
- } else {
- for (int i=-1; i<=1; i++){
- for (int j=-1; j<=1; j++){
+ }
+ else {
+ for (int i=-1; i<=1; i++) {
+ for (int j=-1; j<=1; j++) {
if (i == 0 && j == 0)
continue; // main box
@@ -1326,36 +1528,37 @@ namespace mongo {
PREFIXDEBUG(newBox, _g);
Box cur( _g , newBox );
- if (_want.intersects(cur)){
+ if (_want.intersects(cur)) {
// TODO consider splitting into quadrants
getPointsForPrefix(newBox);
- } else {
+ }
+ else {
GEODEBUG("skipping box");
}
}
}
_state = DONE;
}
-
+
}
return;
}
}
- void getPointsForPrefix(const GeoHash& prefix){
- if ( ! BtreeLocation::initial( *_id , _spec , _min , _max , prefix , _found , this ) ){
+ void getPointsForPrefix(const GeoHash& prefix) {
+ if ( ! BtreeLocation::initial( *_id , _spec , _min , _max , prefix , _found , this ) ) {
return;
}
while ( _min.hasPrefix( prefix ) && _min.advance( -1 , _found , this ) );
while ( _max.hasPrefix( prefix ) && _max.advance( 1 , _found , this ) );
}
-
- virtual bool checkDistance( const GeoHash& h , double& d ){
+
+ virtual bool checkDistance( const GeoHash& h , double& d ) {
bool res = _want.inside( Point( _g , h ) , _fudge );
- GEODEBUG( "\t want : " << _want.toString()
- << " point: " << Point( _g , h ).toString()
+ GEODEBUG( "\t want : " << _want.toString()
+ << " point: " << Point( _g , h ).toString()
<< " in : " << res );
return res;
}
@@ -1366,23 +1569,23 @@ namespace mongo {
double _wantLen;
int _found;
-
- GeoHash _prefix;
+
+ GeoHash _prefix;
BtreeLocation _min;
BtreeLocation _max;
double _fudge;
- };
+ };
shared_ptr<Cursor> Geo2dType::newCursor( const BSONObj& query , const BSONObj& order , int numWanted ) const {
if ( numWanted < 0 )
numWanted = numWanted * -1;
else if ( numWanted == 0 )
- numWanted = 100;
-
+ numWanted = 100;
+
BSONObjIterator i(query);
- while ( i.more() ){
+ while ( i.more() ) {
BSONElement e = i.next();
if ( _geo != e.fieldName() )
@@ -1390,13 +1593,27 @@ namespace mongo {
if ( e.type() != Object )
continue;
-
- switch ( e.embeddedObject().firstElement().getGtLtOp() ){
+
+ switch ( e.embeddedObject().firstElement().getGtLtOp() ) {
case BSONObj::opNEAR: {
BSONObj n = e.embeddedObject();
e = n.firstElement();
+
+ const char* suffix = e.fieldName() + 5; // strlen("$near") == 5;
+ GeoDistType type;
+ if (suffix[0] == '\0') {
+ type = GEO_PLAIN;
+ }
+ else if (strcmp(suffix, "Sphere") == 0) {
+ type = GEO_SPHERE;
+ }
+ else {
+ uassert(13464, string("invalid $near search type: ") + e.fieldName(), false);
+ type = GEO_PLAIN; // prevents uninitialized warning
+ }
+
double maxDistance = numeric_limits<double>::max();
- if ( e.isABSONObj() && e.embeddedObject().nFields() > 2 ){
+ if ( e.isABSONObj() && e.embeddedObject().nFields() > 2 ) {
BSONObjIterator i(e.embeddedObject());
i.next();
i.next();
@@ -1409,32 +1626,30 @@ namespace mongo {
if ( e.isNumber() )
maxDistance = e.numberDouble();
}
- shared_ptr<GeoSearch> s( new GeoSearch( this , _tohash(e) , numWanted , query , maxDistance ) );
+ shared_ptr<GeoSearch> s( new GeoSearch( this , _tohash(e) , numWanted , query , maxDistance, type ) );
s->exec();
shared_ptr<Cursor> c;
c.reset( new GeoSearchCursor( s ) );
- return c;
+ return c;
}
case BSONObj::opWITHIN: {
e = e.embeddedObject().firstElement();
uassert( 13057 , "$within has to take an object or array" , e.isABSONObj() );
e = e.embeddedObject().firstElement();
string type = e.fieldName();
- if ( type == "$center" ){
+ if ( startsWith(type, "$center") ) {
uassert( 13059 , "$center has to take an object or array" , e.isABSONObj() );
- shared_ptr<Cursor> c;
- c.reset( new GeoCircleBrowse( this , e.embeddedObjectUserCheck() , query ) );
- return c;
+ shared_ptr<Cursor> c( new GeoCircleBrowse( this , e.embeddedObjectUserCheck() , query , type) );
+ return c;
}
- else if ( type == "$box" ){
+ else if ( type == "$box" ) {
uassert( 13065 , "$box has to take an object or array" , e.isABSONObj() );
- shared_ptr<Cursor> c;
- c.reset( new GeoBoxBrowse( this , e.embeddedObjectUserCheck() , query ) );
- return c;
+ shared_ptr<Cursor> c( new GeoBoxBrowse( this , e.embeddedObjectUserCheck() , query ) );
+ return c;
}
                throw UserException( 13058 , (string)"unknown $within type: " + type );
}
- default:
+ default:
break;
}
}
@@ -1448,41 +1663,41 @@ namespace mongo {
class Geo2dFindNearCmd : public Command {
public:
- Geo2dFindNearCmd() : Command( "geoNear" ){}
- virtual LockType locktype() const { return READ; }
+ Geo2dFindNearCmd() : Command( "geoNear" ) {}
+ virtual LockType locktype() const { return READ; }
bool slaveOk() const { return true; }
void help(stringstream& h) const { h << "http://www.mongodb.org/display/DOCS/Geospatial+Indexing#GeospatialIndexing-geoNearCommand"; }
bool slaveOverrideOk() { return true; }
- bool run(const string& dbname, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl){
+ bool run(const string& dbname, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
string ns = dbname + "." + cmdObj.firstElement().valuestr();
NamespaceDetails * d = nsdetails( ns.c_str() );
- if ( ! d ){
+ if ( ! d ) {
errmsg = "can't find ns";
return false;
}
vector<int> idxs;
d->findIndexByType( GEO2DNAME , idxs );
-
- if ( idxs.size() > 1 ){
+
+ if ( idxs.size() > 1 ) {
errmsg = "more than 1 geo indexes :(";
return false;
}
-
- if ( idxs.size() == 0 ){
+
+ if ( idxs.size() == 0 ) {
errmsg = "no geo index :(";
return false;
}
int geoIdx = idxs[0];
-
+
result.append( "ns" , ns );
IndexDetails& id = d->idx( geoIdx );
Geo2dType * g = (Geo2dType*)id.getSpec().getType();
assert( &id == g->getDetails() );
-
+
int numWanted = 100;
if ( cmdObj["num"].isNumber() )
numWanted = cmdObj["num"].numberInt();
@@ -1499,37 +1714,41 @@ namespace mongo {
if ( cmdObj["maxDistance"].isNumber() )
maxDistance = cmdObj["maxDistance"].number();
- GeoSearch gs( g , n , numWanted , filter , maxDistance );
+ GeoDistType type = GEO_PLAIN;
+ if ( cmdObj["spherical"].trueValue() )
+ type = GEO_SPHERE;
+
+ GeoSearch gs( g , n , numWanted , filter , maxDistance , type);
- if ( cmdObj["start"].type() == String){
+ if ( cmdObj["start"].type() == String) {
GeoHash start ((string) cmdObj["start"].valuestr());
gs._start = start;
}
-
+
gs.exec();
double distanceMultiplier = 1;
if ( cmdObj["distanceMultiplier"].isNumber() )
distanceMultiplier = cmdObj["distanceMultiplier"].number();
-
+
double totalDistance = 0;
BSONObjBuilder arr( result.subarrayStart( "results" ) );
int x = 0;
- for ( GeoHopper::Holder::iterator i=gs._hopper->_points.begin(); i!=gs._hopper->_points.end(); i++ ){
+ for ( GeoHopper::Holder::iterator i=gs._hopper->_points.begin(); i!=gs._hopper->_points.end(); i++ ) {
const GeoPoint& p = *i;
-
+
double dis = distanceMultiplier * p._distance;
totalDistance += dis;
-
- BSONObjBuilder bb( arr.subobjStart( BSONObjBuilder::numStr( x++ ).c_str() ) );
+
+ BSONObjBuilder bb( arr.subobjStart( BSONObjBuilder::numStr( x++ ) ) );
bb.append( "dis" , dis );
bb.append( "obj" , p._o );
bb.done();
}
arr.done();
-
+
BSONObjBuilder stats( result.subobjStart( "stats" ) );
stats.append( "time" , cc().curop()->elapsedMillis() );
stats.appendNumber( "btreelocs" , gs._nscanned );
@@ -1538,23 +1757,23 @@ namespace mongo {
stats.append( "avgDistance" , totalDistance / x );
stats.append( "maxDistance" , gs._hopper->farthest() );
stats.done();
-
+
return true;
}
-
+
} geo2dFindNearCmd;
class GeoWalkCmd : public Command {
public:
- GeoWalkCmd() : Command( "geoWalk" ){}
- virtual LockType locktype() const { return READ; }
+ GeoWalkCmd() : Command( "geoWalk" ) {}
+ virtual LockType locktype() const { return READ; }
bool slaveOk() const { return true; }
bool slaveOverrideOk() { return true; }
- bool run(const string& dbname, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl){
+ bool run(const string& dbname, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
string ns = dbname + "." + cmdObj.firstElement().valuestr();
NamespaceDetails * d = nsdetails( ns.c_str() );
- if ( ! d ){
+ if ( ! d ) {
errmsg = "can't find ns";
return false;
}
@@ -1562,10 +1781,10 @@ namespace mongo {
int geoIdx = -1;
{
NamespaceDetails::IndexIterator ii = d->ii();
- while ( ii.more() ){
+ while ( ii.more() ) {
IndexDetails& id = ii.next();
- if ( id.getSpec().getTypeName() == GEO2DNAME ){
- if ( geoIdx >= 0 ){
+ if ( id.getSpec().getTypeName() == GEO2DNAME ) {
+ if ( geoIdx >= 0 ) {
errmsg = "2 geo indexes :(";
return false;
}
@@ -1573,12 +1792,12 @@ namespace mongo {
}
}
}
-
- if ( geoIdx < 0 ){
+
+ if ( geoIdx < 0 ) {
errmsg = "no geo index :(";
return false;
}
-
+
IndexDetails& id = d->idx( geoIdx );
Geo2dType * g = (Geo2dType*)id.getSpec().getType();
@@ -1587,12 +1806,12 @@ namespace mongo {
int max = 100000;
BtreeCursor c( d , geoIdx , id , BSONObj() , BSONObj() , true , 1 );
- while ( c.ok() && max-- ){
+ while ( c.ok() && max-- ) {
GeoHash h( c.currKey().firstElement() );
int len;
cout << "\t" << h.toString()
- << "\t" << c.current()[g->_geo]
- << "\t" << hex << h.getHash()
+ << "\t" << c.current()[g->_geo]
+ << "\t" << hex << h.getHash()
<< "\t" << hex << ((long long*)c.currKey().firstElement().binData(len))[0]
<< "\t" << c.current()["_id"]
<< endl;
@@ -1601,7 +1820,7 @@ namespace mongo {
return true;
}
-
+
} geoWalkCmd;
}
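
The 2d.cpp changes above hook spherical distance into the existing geo operators: a "$nearSphere" suffix or a "$centerSphere" $within type selects GEO_SPHERE in newCursor(), and the geoNear command gains a "spherical" flag. Below is a minimal sketch of queries that exercise the new code paths, built with the server's usual BSON builder macros; the "places" collection and "loc" field are invented for illustration, and the spherical distances are in radians as the uasserts above require.

    // near-sphere query: { loc : { $nearSphere : [ -118.40, 33.94 ] } }
    BSONObj nearSphere = BSON( "loc" << BSON( "$nearSphere" << BSON_ARRAY( -118.40 << 33.94 ) ) );

    // within a spherical cap: { loc : { $within : { $centerSphere : [ [ -118.40, 33.94 ], 0.05 ] } } }
    BSONObj centerSphere = BSON( "loc" << BSON( "$within" <<
                               BSON( "$centerSphere" << BSON_ARRAY( BSON_ARRAY( -118.40 << 33.94 ) << 0.05 ) ) ) );

    // geoNear command with the new spherical flag
    BSONObj geoNearCmd = BSON( "geoNear" << "places" << "near" << BSON_ARRAY( -118.40 << 33.94 )
                               << "spherical" << true << "maxDistance" << 0.05 );
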
diff --git a/db/geo/core.h b/db/geo/core.h
index 13f3636..602b513 100644
--- a/db/geo/core.h
+++ b/db/geo/core.h
@@ -31,23 +31,23 @@ namespace mongo {
class GeoBitSets {
public:
- GeoBitSets(){
- for ( int i=0; i<32; i++ ){
+ GeoBitSets() {
+ for ( int i=0; i<32; i++ ) {
masks32[i] = ( 1 << ( 31 - i ) );
}
- for ( int i=0; i<64; i++ ){
+ for ( int i=0; i<64; i++ ) {
masks64[i] = ( 1LL << ( 63 - i ) );
}
-
- for ( unsigned i=0; i<16; i++ ){
+
+ for ( unsigned i=0; i<16; i++ ) {
unsigned fixed = 0;
- for ( int j=0; j<4; j++ ){
+ for ( int j=0; j<4; j++ ) {
if ( i & ( 1 << j ) )
fixed |= ( 1 << ( j * 2 ) );
}
hashedToNormal[fixed] = i;
}
-
+
}
int masks32[32];
long long masks64[64];
@@ -56,24 +56,24 @@ namespace mongo {
};
extern GeoBitSets geoBitSets;
-
+
class GeoHash {
public:
GeoHash()
- : _hash(0),_bits(0){
+ : _hash(0),_bits(0) {
}
- explicit GeoHash( const char * hash ){
+ explicit GeoHash( const char * hash ) {
init( hash );
}
- explicit GeoHash( const string& hash ){
+ explicit GeoHash( const string& hash ) {
init( hash );
}
- explicit GeoHash( const BSONElement& e , unsigned bits=32 ){
+ explicit GeoHash( const BSONElement& e , unsigned bits=32 ) {
_bits = bits;
- if ( e.type() == BinData ){
+ if ( e.type() == BinData ) {
int len = 0;
_copy( (char*)&_hash , e.binData( len ) );
assert( len == 8 );
@@ -85,26 +85,26 @@ namespace mongo {
}
_fix();
}
-
- GeoHash( unsigned x , unsigned y , unsigned bits=32){
+
+ GeoHash( unsigned x , unsigned y , unsigned bits=32) {
init( x , y , bits );
}
- GeoHash( const GeoHash& old ){
+ GeoHash( const GeoHash& old ) {
_hash = old._hash;
_bits = old._bits;
}
GeoHash( long long hash , unsigned bits )
- : _hash( hash ) , _bits( bits ){
+ : _hash( hash ) , _bits( bits ) {
_fix();
}
- void init( unsigned x , unsigned y , unsigned bits ){
+ void init( unsigned x , unsigned y , unsigned bits ) {
assert( bits <= 32 );
_hash = 0;
_bits = bits;
- for ( unsigned i=0; i<bits; i++ ){
+ for ( unsigned i=0; i<bits; i++ ) {
if ( isBitSet( x , i ) ) _hash |= geoBitSets.masks64[i*2];
if ( isBitSet( y , i ) ) _hash |= geoBitSets.masks64[(i*2)+1];
}
@@ -114,7 +114,7 @@ namespace mongo {
x = 0;
y = 0;
char * c = (char*)(&_hash);
- for ( int i=0; i<8; i++ ){
+ for ( int i=0; i<8; i++ ) {
unsigned t = (unsigned)(c[i]) & 0x55;
y |= ( geoBitSets.hashedToNormal[t] << (4*(i)) );
@@ -126,7 +126,7 @@ namespace mongo {
void unhash_slow( unsigned& x , unsigned& y ) const {
x = 0;
y = 0;
- for ( unsigned i=0; i<_bits; i++ ){
+ for ( unsigned i=0; i<_bits; i++ ) {
if ( getBitX(i) )
x |= geoBitSets.masks32[i];
if ( getBitY(i) )
@@ -141,14 +141,14 @@ namespace mongo {
/**
* @param 0 = high
*/
- static bool isBitSet( unsigned val , unsigned bit ){
+ static bool isBitSet( unsigned val , unsigned bit ) {
return geoBitSets.masks32[bit] & val;
}
-
+
GeoHash up() const {
return GeoHash( _hash , _bits - 1 );
}
-
+
bool hasPrefix( const GeoHash& other ) const {
assert( other._bits <= _bits );
if ( other._bits == 0 )
@@ -157,9 +157,9 @@ namespace mongo {
x = x >> (64-(other._bits*2));
return x == 0;
}
-
- string toString() const {
+
+ string toString() const {
StringBuilder buf( _bits * 2 );
for ( unsigned x=0; x<_bits*2; x++ )
buf.append( _hash & geoBitSets.masks64[x] ? "1" : "0" );
@@ -172,7 +172,7 @@ namespace mongo {
return ss.str();
}
- void init( const string& s ){
+ void init( const string& s ) {
_hash = 0;
_bits = s.size() / 2;
for ( unsigned pos=0; pos<s.size(); pos++ )
@@ -180,14 +180,14 @@ namespace mongo {
setBit( pos , 1 );
}
- void setBit( unsigned pos , bool one ){
+ void setBit( unsigned pos , bool one ) {
assert( pos < _bits * 2 );
if ( one )
_hash |= geoBitSets.masks64[pos];
else if ( _hash & geoBitSets.masks64[pos] )
_hash &= ~geoBitSets.masks64[pos];
}
-
+
bool getBit( unsigned pos ) const {
return _hash & geoBitSets.masks64[pos];
}
@@ -201,7 +201,7 @@ namespace mongo {
assert( pos < 32 );
return getBit( ( pos * 2 ) + 1 );
}
-
+
BSONObj wrap() const {
BSONObjBuilder b(20);
append( b , "" );
@@ -213,20 +213,20 @@ namespace mongo {
bool constrains() const {
return _bits > 0;
}
-
- void move( int x , int y ){
+
+ void move( int x , int y ) {
assert( _bits );
_move( 0 , x );
_move( 1 , y );
}
- void _move( unsigned offset , int d ){
+ void _move( unsigned offset , int d ) {
if ( d == 0 )
return;
assert( d <= 1 && d>= -1 ); // TEMP
-
+
bool from, to;
- if ( d > 0 ){
+ if ( d > 0 ) {
from = 0;
to = 1;
}
@@ -238,34 +238,34 @@ namespace mongo {
unsigned pos = ( _bits * 2 ) - 1;
if ( offset == 0 )
pos--;
- while ( true ){
- if ( getBit(pos) == from ){
+ while ( true ) {
+ if ( getBit(pos) == from ) {
setBit( pos , to );
return;
}
- if ( pos < 2 ){
+ if ( pos < 2 ) {
// overflow
- for ( ; pos < ( _bits * 2 ) ; pos += 2 ){
+ for ( ; pos < ( _bits * 2 ) ; pos += 2 ) {
setBit( pos , from );
}
return;
}
-
+
setBit( pos , from );
pos -= 2;
}
-
+
assert(0);
}
- GeoHash& operator=(const GeoHash& h) {
+ GeoHash& operator=(const GeoHash& h) {
_hash = h._hash;
_bits = h._bits;
return *this;
}
-
- bool operator==(const GeoHash& h ){
+
+ bool operator==(const GeoHash& h ) {
return _hash == h._hash && _bits == h._bits;
}
@@ -273,7 +273,7 @@ namespace mongo {
unsigned pos = _bits * 2;
_bits += strlen(s) / 2;
assert( _bits <= 32 );
- while ( s[0] ){
+ while ( s[0] ) {
if ( s[0] == '1' )
setBit( pos , 1 );
pos++;
@@ -288,19 +288,19 @@ namespace mongo {
n+=s;
return n;
}
-
- void _fix(){
+
+ void _fix() {
static long long FULL = 0xFFFFFFFFFFFFFFFFLL;
long long mask = FULL << ( 64 - ( _bits * 2 ) );
_hash &= mask;
}
-
+
void append( BSONObjBuilder& b , const char * name ) const {
char buf[8];
_copy( buf , (char*)&_hash );
b.appendBinData( name , 8 , bdtCustom , buf );
}
-
+
long long getHash() const {
return _hash;
}
@@ -311,9 +311,9 @@ namespace mongo {
GeoHash commonPrefix( const GeoHash& other ) const {
unsigned i=0;
- for ( ; i<_bits && i<other._bits; i++ ){
+ for ( ; i<_bits && i<other._bits; i++ ) {
if ( getBitX( i ) == other.getBitX( i ) &&
- getBitY( i ) == other.getBitY( i ) )
+ getBitY( i ) == other.getBitY( i ) )
continue;
break;
}
@@ -323,7 +323,7 @@ namespace mongo {
private:
void _copy( char * dst , const char * src ) const {
- for ( unsigned a=0; a<8; a++ ){
+ for ( unsigned a=0; a<8; a++ ) {
dst[a] = src[7-a];
}
}
@@ -332,14 +332,14 @@ namespace mongo {
unsigned _bits; // bits per field, so 1 to 32
};
- inline ostream& operator<<( ostream &s, const GeoHash &h ){
+ inline ostream& operator<<( ostream &s, const GeoHash &h ) {
s << h.toString();
return s;
- }
+ }
class GeoConvert {
public:
- virtual ~GeoConvert(){}
+ virtual ~GeoConvert() {}
virtual void unhash( const GeoHash& h , double& x , double& y ) const = 0;
virtual GeoHash hash( double x , double y ) const = 0;
@@ -347,31 +347,31 @@ namespace mongo {
class Point {
public:
-
- Point( const GeoConvert * g , const GeoHash& hash ){
+
+ Point( const GeoConvert * g , const GeoHash& hash ) {
g->unhash( hash , _x , _y );
}
-
- explicit Point( const BSONElement& e ){
+
+ explicit Point( const BSONElement& e ) {
BSONObjIterator i(e.Obj());
_x = i.next().number();
_y = i.next().number();
}
- explicit Point( const BSONObj& o ){
+ explicit Point( const BSONObj& o ) {
BSONObjIterator i(o);
_x = i.next().number();
_y = i.next().number();
}
Point( double x , double y )
- : _x( x ) , _y( y ){
+ : _x( x ) , _y( y ) {
}
-
- Point() : _x(0),_y(0){
+
+ Point() : _x(0),_y(0) {
}
- GeoHash hash( const GeoConvert * g ){
+ GeoHash hash( const GeoConvert * g ) {
return g->hash( _x , _y );
}
@@ -380,12 +380,12 @@ namespace mongo {
double b = _y - p._y;
return sqrt( ( a * a ) + ( b * b ) );
}
-
+
string toString() const {
StringBuilder buf(32);
buf << "(" << _x << "," << _y << ")";
return buf.str();
-
+
}
double _x;
@@ -393,8 +393,11 @@ namespace mongo {
};
- extern double EARTH_RADIUS_KM;
- extern double EARTH_RADIUS_MILES;
+ extern const double EARTH_RADIUS_KM;
+ extern const double EARTH_RADIUS_MILES;
+
+ inline double deg2rad(double deg) { return deg * (M_PI/180); }
+ inline double rad2deg(double rad) { return rad * (180/M_PI); }
// WARNING: _x and _y MUST be longitude and latitude in that order
// note: multiply by earth radius for distance
@@ -407,20 +410,26 @@ namespace mongo {
double sin_y1(sin(p1._y)), cos_y1(cos(p1._y));
double sin_x2(sin(p2._x)), cos_x2(cos(p2._x));
double sin_y2(sin(p2._y)), cos_y2(cos(p2._y));
-
- double cross_prod =
+
+ double cross_prod =
(cos_y1*cos_x1 * cos_y2*cos_x2) +
(cos_y1*sin_x1 * cos_y2*sin_x2) +
(sin_y1 * sin_y2);
+ if (cross_prod >= 1 || cross_prod <= -1) {
+ // fun with floats
+ assert( fabs(cross_prod)-1 < 1e-6 );
+ return cross_prod > 0 ? 0 : M_PI;
+ }
+
return acos(cross_prod);
}
// note: return is still in radians as that can be multiplied by radius to get arc length
inline double spheredist_deg( const Point& p1, const Point& p2 ) {
return spheredist_rad(
- Point( p1._x * (M_PI/180), p1._y * (M_PI/180)),
- Point( p2._x * (M_PI/180), p2._y * (M_PI/180))
+ Point( deg2rad(p1._x), deg2rad(p1._y) ),
+ Point( deg2rad(p2._x), deg2rad(p2._y) )
);
}
diff --git a/db/geo/haystack.cpp b/db/geo/haystack.cpp
index 4a1d4a7..7f278ca 100644
--- a/db/geo/haystack.cpp
+++ b/db/geo/haystack.cpp
@@ -17,14 +17,14 @@
*/
#include "pch.h"
-#include "../namespace.h"
+#include "../namespace-inl.h"
#include "../jsobj.h"
#include "../index.h"
#include "../../util/unittest.h"
#include "../commands.h"
#include "../pdfile.h"
#include "../btree.h"
-#include "../curop.h"
+#include "../curop-inl.h"
#include "../matcher.h"
#include "core.h"
@@ -38,29 +38,29 @@
* should not be used for finding the closest restaurants that are open
*/
namespace mongo {
-
+
string GEOSEARCHNAME = "geoHaystack";
-
+
class GeoHaystackSearchHopper {
public:
GeoHaystackSearchHopper( const BSONObj& n , double maxDistance , unsigned limit , const string& geoField )
- : _near( n ) , _maxDistance( maxDistance ) , _limit( limit ) , _geoField(geoField){
-
+ : _near( n ) , _maxDistance( maxDistance ) , _limit( limit ) , _geoField(geoField) {
+
}
-
- void got( const DiskLoc& loc ){
+
+ void got( const DiskLoc& loc ) {
Point p( loc.obj().getFieldDotted( _geoField ) );
if ( _near.distance( p ) > _maxDistance )
return;
_locs.push_back( loc );
}
- int append( BSONArrayBuilder& b ){
+ int append( BSONArrayBuilder& b ) {
for ( unsigned i=0; i<_locs.size() && i<_limit; i++ )
b.append( _locs[i].obj() );
return _locs.size();
}
-
+
Point _near;
double _maxDistance;
unsigned _limit;
@@ -70,22 +70,22 @@ namespace mongo {
};
class GeoHaystackSearchIndex : public IndexType {
-
+
public:
-
+
GeoHaystackSearchIndex( const IndexPlugin* plugin , const IndexSpec* spec )
- : IndexType( plugin , spec ){
-
+ : IndexType( plugin , spec ) {
+
BSONElement e = spec->info["bucketSize"];
uassert( 13321 , "need bucketSize" , e.isNumber() );
_bucketSize = e.numberDouble();
-
+
BSONObjBuilder orderBuilder;
-
+
BSONObjIterator i( spec->keyPattern );
- while ( i.more() ){
+ while ( i.more() ) {
BSONElement e = i.next();
- if ( e.type() == String && GEOSEARCHNAME == e.valuestr() ){
+ if ( e.type() == String && GEOSEARCHNAME == e.valuestr() ) {
uassert( 13314 , "can't have 2 geo fields" , _geo.size() == 0 );
uassert( 13315 , "2d has to be first in index" , _other.size() == 0 );
_geo = e.fieldName();
@@ -95,13 +95,13 @@ namespace mongo {
}
orderBuilder.append( "" , 1 );
}
-
+
uassert( 13316 , "no geo field specified" , _geo.size() );
uassert( 13317 , "no other fields specified" , _other.size() );
uassert( 13326 , "quadrant search can only have 1 other field for now" , _other.size() == 1 );
_order = orderBuilder.obj();
}
-
+
int hash( const BSONElement& e ) const {
uassert( 13322 , "not a number" , e.isNumber() );
return hash( e.numberDouble() );
@@ -126,18 +126,18 @@ namespace mongo {
buf.appendNull( "" );
else
buf.appendAs( e , "" );
-
+
BSONObj key = buf.obj();
GEOQUADDEBUG( obj << "\n\t" << root << "\n\t" << key );
keys.insert( key );
}
void getKeys( const BSONObj &obj, BSONObjSetDefaultOrder &keys ) const {
-
+
BSONElement loc = obj.getFieldDotted( _geo );
if ( loc.eoo() )
return;
-
+
uassert( 13323 , "latlng not an array" , loc.isABSONObj() );
string root;
{
@@ -146,34 +146,34 @@ namespace mongo {
BSONElement y = i.next();
root = makeString( hash(x) , hash(y) );
}
-
-
+
+
assert( _other.size() == 1 );
-
+
BSONElementSet all;
obj.getFieldsDotted( _other[0] , all );
-
- if ( all.size() == 0 ){
+
+ if ( all.size() == 0 ) {
_add( obj , root , BSONElement() , keys );
}
else {
- for ( BSONElementSet::iterator i=all.begin(); i!=all.end(); ++i ){
+ for ( BSONElementSet::iterator i=all.begin(); i!=all.end(); ++i ) {
_add( obj , root , *i , keys );
}
}
-
+
}
-
+
shared_ptr<Cursor> newCursor( const BSONObj& query , const BSONObj& order , int numWanted ) const {
shared_ptr<Cursor> c;
assert(0);
return c;
}
-
- void searchCommand( NamespaceDetails* nsd , int idxNo ,
- const BSONObj& n /*near*/ , double maxDistance , const BSONObj& search ,
- BSONObjBuilder& result , unsigned limit ){
-
+
+ void searchCommand( NamespaceDetails* nsd , int idxNo ,
+ const BSONObj& n /*near*/ , double maxDistance , const BSONObj& search ,
+ BSONObjBuilder& result , unsigned limit ) {
+
Timer t;
log(1) << "SEARCH near:" << n << " maxDistance:" << maxDistance << " search: " << search << endl;
@@ -184,33 +184,33 @@ namespace mongo {
y = hash( i.next() );
}
int scale = (int)ceil( maxDistance / _bucketSize );
-
+
GeoHaystackSearchHopper hopper(n,maxDistance,limit,_geo);
-
+
long long btreeMatches = 0;
- for ( int a=-scale; a<=scale; a++ ){
- for ( int b=-scale; b<=scale; b++ ){
+ for ( int a=-scale; a<=scale; a++ ) {
+ for ( int b=-scale; b<=scale; b++ ) {
BSONObjBuilder bb;
bb.append( "" , makeString( x + a , y + b ) );
- for ( unsigned i=0; i<_other.size(); i++ ){
+ for ( unsigned i=0; i<_other.size(); i++ ) {
BSONElement e = search.getFieldDotted( _other[i] );
if ( e.eoo() )
bb.appendNull( "" );
else
bb.appendAs( e , "" );
}
-
+
BSONObj key = bb.obj();
-
+
GEOQUADDEBUG( "KEY: " << key );
-
+
set<DiskLoc> thisPass;
BtreeCursor cursor( nsd , idxNo , *getDetails() , key , key , true , 1 );
- while ( cursor.ok() ){
+ while ( cursor.ok() ) {
pair<set<DiskLoc>::iterator, bool> p = thisPass.insert( cursor.currLoc() );
- if ( p.second ){
+ if ( p.second ) {
hopper.got( cursor.currLoc() );
GEOQUADDEBUG( "\t" << cursor.current() );
btreeMatches++;
@@ -221,10 +221,10 @@ namespace mongo {
}
- BSONArrayBuilder arr( result.subarrayStart( "results" ) );
+ BSONArrayBuilder arr( result.subarrayStart( "results" ) );
int num = hopper.append( arr );
arr.done();
-
+
{
BSONObjBuilder b( result.subobjStart( "stats" ) );
b.append( "time" , t.millis() );
@@ -237,20 +237,20 @@ namespace mongo {
const IndexDetails* getDetails() const {
return _spec->getDetails();
}
-
+
string _geo;
vector<string> _other;
-
+
BSONObj _order;
double _bucketSize;
};
-
+
class GeoHaystackSearchIndexPlugin : public IndexPlugin {
public:
- GeoHaystackSearchIndexPlugin() : IndexPlugin( GEOSEARCHNAME ){
+ GeoHaystackSearchIndexPlugin() : IndexPlugin( GEOSEARCHNAME ) {
}
-
+
virtual IndexType* generate( const IndexSpec* spec ) const {
return new GeoHaystackSearchIndex( this , spec );
}
@@ -259,38 +259,38 @@ namespace mongo {
class GeoHaystackSearchCommand : public Command {
- public:
- GeoHaystackSearchCommand() : Command( "geoSearch" ){}
- virtual LockType locktype() const { return READ; }
+ public:
+ GeoHaystackSearchCommand() : Command( "geoSearch" ) {}
+ virtual LockType locktype() const { return READ; }
bool slaveOk() const { return true; }
bool slaveOverrideOk() const { return true; }
- bool run(const string& dbname , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl){
-
+ bool run(const string& dbname , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+
string ns = dbname + "." + cmdObj.firstElement().valuestr();
-
+
NamespaceDetails * d = nsdetails( ns.c_str() );
- if ( ! d ){
+ if ( ! d ) {
errmsg = "can't find ns";
return false;
}
-
+
vector<int> idxs;
d->findIndexByType( GEOSEARCHNAME , idxs );
- if ( idxs.size() == 0 ){
+ if ( idxs.size() == 0 ) {
errmsg = "no geoSearch index";
return false;
}
- if ( idxs.size() > 1 ){
+ if ( idxs.size() > 1 ) {
errmsg = "more than 1 geosearch index";
return false;
}
-
+
int idxNum = idxs[0];
-
+
IndexDetails& id = d->idx( idxNum );
GeoHaystackSearchIndex * si = (GeoHaystackSearchIndex*)id.getSpec().getType();
- assert( &id == si->getDetails() );
-
+ assert( &id == si->getDetails() );
+
BSONElement n = cmdObj["near"];
BSONElement maxDistance = cmdObj["maxDistance"];
BSONElement search = cmdObj["search"];
@@ -298,20 +298,20 @@ namespace mongo {
uassert( 13318 , "near needs to be an array" , n.isABSONObj() );
uassert( 13319 , "maxDistance needs a number" , maxDistance.isNumber() );
uassert( 13320 , "search needs to be an object" , search.type() == Object );
-
+
unsigned limit = 50;
if ( cmdObj["limit"].isNumber() )
limit = (unsigned)cmdObj["limit"].numberInt();
si->searchCommand( d , idxNum , n.Obj() , maxDistance.numberDouble() , search.Obj() , result , limit );
-
+
return 1;
}
-
- } nameSearchCommand;
+
+ } nameSearchCommand;
+
+
-
-
}
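
The haystack search above is a flat grid scan: each coordinate is snapped to an integer bucket of width bucketSize, and searchCommand() probes every bucket within ceil(maxDistance / bucketSize) cells of the centre with one exact-match btree lookup per cell. A rough sketch of that bucket arithmetic follows; the hash formula and key format here are assumptions made for illustration, since GeoHaystackSearchIndex::hash(double) and makeString() are outside this hunk.

    #include <cmath>
    #include <set>
    #include <sstream>
    #include <string>

    // assumed bucket hash: snap a coordinate to a cell of width bucketSize
    static int bucketHashSketch( double d, double bucketSize ) {
        return (int) floor( d / bucketSize );
    }

    // enumerate the (2*scale+1)^2 cell keys that the a/b loops above look up
    static std::set<std::string> cellsToProbeSketch( double x, double y,
                                                     double maxDistance, double bucketSize ) {
        int cx = bucketHashSketch( x, bucketSize );
        int cy = bucketHashSketch( y, bucketSize );
        int scale = (int) ceil( maxDistance / bucketSize );
        std::set<std::string> cells;
        for ( int a = -scale; a <= scale; a++ ) {
            for ( int b = -scale; b <= scale; b++ ) {
                std::ostringstream ss;
                ss << ( cx + a ) << "_" << ( cy + b );   // illustrative key format
                cells.insert( ss.str() );
            }
        }
        return cells;
    }
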
diff --git a/db/helpers/dblogger.h b/db/helpers/dblogger.h
index 572169b..4d6ee6d 100644
--- a/db/helpers/dblogger.h
+++ b/db/helpers/dblogger.h
@@ -18,14 +18,14 @@
#pragma once
-namespace mongo {
+namespace mongo {
/** helper to log (and read log) of a capped collection in the database */
class DBLogger {
bool _inited;
public:
const string _ns;
- DBLogger(string ns) : _inited(false), _ns(ns){ }
+ DBLogger(string ns) : _inited(false), _ns(ns) { }
};
}
diff --git a/db/index.cpp b/db/index.cpp
index 04eca73..c696e27 100644
--- a/db/index.cpp
+++ b/db/index.cpp
@@ -17,15 +17,16 @@
*/
#include "pch.h"
-#include "namespace.h"
+#include "namespace-inl.h"
#include "index.h"
#include "btree.h"
#include "query.h"
#include "background.h"
+#include "repl/rs.h"
namespace mongo {
- int removeFromSysIndexes(const char *ns, const char *idxName) {
+ int removeFromSysIndexes(const char *ns, const char *idxName) {
string system_indexes = cc().database()->name + ".system.indexes";
BSONObjBuilder b;
b.append("ns", ns);
@@ -34,24 +35,36 @@ namespace mongo {
return (int) deleteObjects(system_indexes.c_str(), cond, false, false, true);
}
- /* this is just an attempt to clean up old orphaned stuff on a delete all indexes
- call. repair database is the clean solution, but this gives one a lighter weight
+ /* this is just an attempt to clean up old orphaned stuff on a delete all indexes
+ call. repair database is the clean solution, but this gives one a lighter weight
partial option. see dropIndexes()
*/
- void assureSysIndexesEmptied(const char *ns, IndexDetails *idIndex) {
+ void assureSysIndexesEmptied(const char *ns, IndexDetails *idIndex) {
string system_indexes = cc().database()->name + ".system.indexes";
BSONObjBuilder b;
b.append("ns", ns);
- if( idIndex ) {
+ if( idIndex ) {
b.append("name", BSON( "$ne" << idIndex->indexName().c_str() ));
}
BSONObj cond = b.done();
int n = (int) deleteObjects(system_indexes.c_str(), cond, false, false, true);
- if( n ) {
+ if( n ) {
log() << "info: assureSysIndexesEmptied cleaned up " << n << " entries" << endl;
}
}
+ int IndexDetails::keyPatternOffset( const string& key ) const {
+ BSONObjIterator i( keyPattern() );
+ int n = 0;
+ while ( i.more() ) {
+ BSONElement e = i.next();
+ if ( key == e.fieldName() )
+ return n;
+ n++;
+ }
+ return -1;
+ }
+
const IndexSpec& IndexDetails::getSpec() const {
scoped_lock lk(NamespaceDetailsTransient::_qcMutex);
return NamespaceDetailsTransient::get_inlock( info.obj()["ns"].valuestr() ).getIndexSpec( this );
@@ -62,29 +75,35 @@ namespace mongo {
*/
void IndexDetails::kill_idx() {
string ns = indexNamespace(); // e.g. foo.coll.$ts_1
+ try {
- string pns = parentNS(); // note we need a copy, as parentNS() won't work after the drop() below
-
- // clean up parent namespace index cache
- NamespaceDetailsTransient::get_w( pns.c_str() ).deletedIndex();
+ string pns = parentNS(); // note we need a copy, as parentNS() won't work after the drop() below
- string name = indexName();
+ // clean up parent namespace index cache
+ NamespaceDetailsTransient::get_w( pns.c_str() ).deletedIndex();
+
+ string name = indexName();
+
+ /* important to catch exception here so we can finish cleanup below. */
+ try {
+ dropNS(ns.c_str());
+ }
+ catch(DBException& ) {
+ log(2) << "IndexDetails::kill(): couldn't drop ns " << ns << endl;
+ }
+ head.setInvalid();
+ info.setInvalid();
+
+ // clean up in system.indexes. we do this last on purpose.
+ int n = removeFromSysIndexes(pns.c_str(), name.c_str());
+ wassert( n == 1 );
- /* important to catch exception here so we can finish cleanup below. */
- try {
- btreeStore->drop(ns.c_str());
}
- catch(DBException& ) {
- log(2) << "IndexDetails::kill(): couldn't drop ns " << ns << endl;
+ catch ( DBException &e ) {
+ log() << "exception in kill_idx: " << e << ", ns: " << ns << endl;
}
- head.setInvalid();
- info.setInvalid();
-
- // clean up in system.indexes. we do this last on purpose.
- int n = removeFromSysIndexes(pns.c_str(), name.c_str());
- wassert( n == 1 );
}
-
+
void IndexDetails::getKeysFromObject( const BSONObj& obj, BSONObjSetDefaultOrder& keys) const {
getSpec().getKeys( obj, keys );
}
@@ -105,7 +124,7 @@ namespace mongo {
}
}
- void getIndexChanges(vector<IndexChanges>& v, NamespaceDetails& d, BSONObj newObj, BSONObj oldObj, bool &changedId) {
+ void getIndexChanges(vector<IndexChanges>& v, NamespaceDetails& d, BSONObj newObj, BSONObj oldObj, bool &changedId) {
int z = d.nIndexesBeingBuilt();
v.resize(z);
NamespaceDetails::IndexIterator i = d.ii();
@@ -115,7 +134,7 @@ namespace mongo {
IndexChanges& ch = v[i];
idx.getKeysFromObject(oldObj, ch.oldkeys);
idx.getKeysFromObject(newObj, ch.newkeys);
- if( ch.newkeys.size() > 1 )
+ if( ch.newkeys.size() > 1 )
d.setIndexIsMultikey(i);
setDifference(ch.oldkeys, ch.newkeys, ch.removed);
setDifference(ch.newkeys, ch.oldkeys, ch.added);
@@ -133,12 +152,12 @@ namespace mongo {
}
}
- // should be { <something> : <simpletype[1|-1]>, .keyp.. }
- static bool validKeyPattern(BSONObj kp) {
+ // should be { <something> : <simpletype[1|-1]>, .keyp.. }
+ static bool validKeyPattern(BSONObj kp) {
BSONObjIterator i(kp);
- while( i.moreWithEOO() ) {
+ while( i.moreWithEOO() ) {
BSONElement e = i.next();
- if( e.type() == Object || e.type() == Array )
+ if( e.type() == Object || e.type() == Array )
return false;
}
return true;
@@ -154,29 +173,23 @@ namespace mongo {
throws DBException
- @return
- true if ok to continue. when false we stop/fail silently (index already exists)
- sourceNS - source NS we are indexing
- sourceCollection - its details ptr
+ @param sourceNS - source NS we are indexing
+ @param sourceCollection - its details ptr
+ @return true if ok to continue. when false we stop/fail silently (index already exists)
*/
- bool prepareToBuildIndex(const BSONObj& io, bool god, string& sourceNS, NamespaceDetails *&sourceCollection) {
+ bool prepareToBuildIndex(const BSONObj& io, bool god, string& sourceNS, NamespaceDetails *&sourceCollection, BSONObj& fixedIndexObject ) {
sourceCollection = 0;
// logical name of the index. todo: get rid of the name, we don't need it!
- const char *name = io.getStringField("name");
+ const char *name = io.getStringField("name");
uassert(12523, "no index name specified", *name);
// the collection for which we are building an index
- sourceNS = io.getStringField("ns");
+ sourceNS = io.getStringField("ns");
uassert(10096, "invalid ns to index", sourceNS.find( '.' ) != string::npos);
- uassert(10097, "bad table to index name on add index attempt",
- cc().database()->name == nsToDatabase(sourceNS.c_str()));
+ uassert(10097, "bad table to index name on add index attempt",
+ cc().database()->name == nsToDatabase(sourceNS.c_str()));
- /* we can't build a new index for the ns if a build is already in progress in the background -
- EVEN IF this is a foreground build.
- */
- uassert(12588, "cannot add index with a background operation in progress",
- !BackgroundOperation::inProgForNs(sourceNS.c_str()));
BSONObj key = io.getObjectField("key");
uassert(12524, "index key pattern too large", key.objsize() <= 2048);
@@ -187,7 +200,7 @@ namespace mongo {
if ( sourceNS.empty() || key.isEmpty() ) {
log(2) << "bad add index attempt name:" << (name?name:"") << "\n ns:" <<
- sourceNS << "\n idxobj:" << io.toString() << endl;
+ sourceNS << "\n idxobj:" << io.toString() << endl;
string s = "bad add index attempt " + sourceNS + " key:" + key.toString();
uasserted(12504, s);
}
@@ -201,7 +214,7 @@ namespace mongo {
return false;
}
sourceCollection = nsdetails(sourceNS.c_str());
- tlog() << "info: creating collection " << sourceNS << " on add index\n";
+ tlog() << "info: creating collection " << sourceNS << " on add index" << endl;
assert( sourceCollection );
}
@@ -222,24 +235,55 @@ namespace mongo {
uasserted(12505,s);
}
- /* this is because we want key patterns like { _id : 1 } and { _id : <someobjid> } to
+ /* we can't build a new index for the ns if a build is already in progress in the background -
+ EVEN IF this is a foreground build.
+ */
+ uassert(12588, "cannot add index with a background operation in progress",
+ !BackgroundOperation::inProgForNs(sourceNS.c_str()));
+
+ /* this is because we want key patterns like { _id : 1 } and { _id : <someobjid> } to
all be treated as the same pattern.
*/
- if ( !god && IndexDetails::isIdIndexPattern(key) ) {
- ensureHaveIdIndex( sourceNS.c_str() );
- return false;
+ if ( IndexDetails::isIdIndexPattern(key) ) {
+ if( !god ) {
+ ensureHaveIdIndex( sourceNS.c_str() );
+ return false;
+ }
+ }
+ else {
+ /* is buildIndexes:false set for this replica set member?
+ if so we don't build any indexes except _id
+ */
+ if( theReplSet && !theReplSet->buildIndexes() )
+ return false;
+ }
+
+ string pluginName = IndexPlugin::findPluginName( key );
+ IndexPlugin * plugin = pluginName.size() ? IndexPlugin::get( pluginName ) : 0;
+
+ if ( plugin ) {
+ fixedIndexObject = plugin->adjustIndexSpec( io );
+ }
+ else if ( io["v"].eoo() ) {
+ // add "v" if it doesn't exist
+ // if it does - leave whatever value was there
+ // this is for testing and replication
+ BSONObjBuilder b( io.objsize() + 32 );
+ b.appendElements( io );
+ b.append( "v" , 0 );
+ fixedIndexObject = b.obj();
}
return true;
}
- void IndexSpec::reset( const IndexDetails * details ){
+ void IndexSpec::reset( const IndexDetails * details ) {
_details = details;
reset( details->info );
}
- void IndexSpec::reset( const DiskLoc& loc ){
+ void IndexSpec::reset( const DiskLoc& loc ) {
info = loc.obj();
keyPattern = info["key"].embeddedObjectUserCheck();
if ( keyPattern.objsize() == 0 ) {
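prepareToBuildIndex() now hands back a fixedIndexObject: a plugin may rewrite the spec via adjustIndexSpec(), and otherwise a missing "v" field is defaulted to 0 so the index version is explicit for testing and replication. A small sketch of that defaulting, assuming this tree's BSON builder headers; addDefaultVersion is an illustrative name, not a function in the codebase:

    #include "bson/bsonobjbuilder.h"   // assumed include path
    using namespace mongo;

    BSONObj addDefaultVersion( const BSONObj& io ) {
        if ( ! io["v"].eoo() )
            return io;                 // a version was supplied - leave it alone
        BSONObjBuilder b( io.objsize() + 32 );
        b.appendElements( io );
        b.append( "v" , 0 );           // default index version
        return b.obj();
    }

    // addDefaultVersion( BSON( "name" << "a_1" << "ns" << "test.foo" << "key" << BSON( "a" << 1 ) ) )
    //   -> { name: "a_1", ns: "test.foo", key: { a: 1 }, v: 0 }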
diff --git a/db/index.h b/db/index.h
index a2d7e7e..8578ed3 100644
--- a/db/index.h
+++ b/db/index.h
@@ -25,20 +25,27 @@
namespace mongo {
- /* Details about a particular index. There is one of these effectively for each object in
- system.namespaces (although this also includes the head pointer, which is not in that
- collection).
+ /* Details about a particular index. There is one of these effectively for each object in
+ system.namespaces (although this also includes the head pointer, which is not in that
+ collection).
** MemoryMapped Record ** (i.e., this is on disk data)
- */
+ */
class IndexDetails {
public:
- DiskLoc head; /* btree head disk location */
+ /**
+ * btree head disk location
+ * TODO We should make this variable private, since btree operations
+ * may change its value and we don't want clients to rely on an old
+ * value. If we create a btree class, we can provide a btree object
+ * to clients instead of 'head'.
+ */
+ DiskLoc head;
/* Location of index info object. Format:
{ name:"nameofindex", ns:"parentnsname", key: {keypattobject}
- [, unique: <bool>, background: <bool>]
+ [, unique: <bool>, background: <bool>]
}
This object is in the system.indexes collection. Note that since we
@@ -70,6 +77,13 @@ namespace mongo {
return info.obj().getObjectField("key");
}
+ /**
+ * @return offset into keyPattern for key
+ -1 if doesn't exist
+ */
+ int keyPatternOffset( const string& key ) const;
+ bool inKeyPattern( const string& key ) const { return keyPatternOffset( key ) >= 0; }
+
/* true if the specified key is in the index */
bool hasKey(const BSONObj& key);
bool wouldCreateDup(const BSONObj& key, DiskLoc self);
@@ -96,11 +110,11 @@ namespace mongo {
BSONObjIterator i(pattern);
BSONElement e = i.next();
if( strcmp(e.fieldName(), "_id") != 0 ) return false;
- return i.next().eoo();
+ return i.next().eoo();
}
-
+
/* returns true if this is the _id index. */
- bool isIdIndex() const {
+ bool isIdIndex() const {
return isIdIndexPattern( keyPattern() );
}
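isIdIndexPattern() only accepts a pattern whose single field is _id; the value after _id is not inspected, which is why { _id : 1 } and { _id : <someobjid> } are treated as the same pattern in prepareToBuildIndex(). A usage sketch, assuming index.h and this tree's BSON(...) macro; idPatternExamples is just an illustrative wrapper:

    #include "db/index.h"   // assumed include path
    using namespace mongo;

    void idPatternExamples() {
        bool a = IndexDetails::isIdIndexPattern( BSON( "_id" << 1 ) );              // true
        bool b = IndexDetails::isIdIndexPattern( BSON( "_id" << 1 << "x" << 1 ) );  // false: extra field
        bool c = IndexDetails::isIdIndexPattern( BSON( "x" << 1 ) );                // false: first field is not _id
        (void)a; (void)b; (void)c;
    }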
@@ -112,11 +126,11 @@ namespace mongo {
return io.getStringField("ns");
}
- bool unique() const {
+ bool unique() const {
BSONObj io = info.obj();
- return io["unique"].trueValue() ||
- /* temp: can we juse make unique:true always be there for _id and get rid of this? */
- isIdIndex();
+ return io["unique"].trueValue() ||
+                   /* temp: can we just make unique:true always be there for _id and get rid of this? */
+ isIdIndex();
}
/* if set, when building index, if any duplicates, drop the duplicating object */
@@ -128,7 +142,7 @@ namespace mongo {
(system.indexes or system.namespaces) -- only NamespaceIndex.
*/
void kill_idx();
-
+
const IndexSpec& getSpec() const;
string toString() const {
@@ -136,13 +150,13 @@ namespace mongo {
}
};
- struct IndexChanges/*on an update*/ {
+ struct IndexChanges { /*on an update*/
BSONObjSetDefaultOrder oldkeys;
BSONObjSetDefaultOrder newkeys;
vector<BSONObj*> removed; // these keys were removed as part of the change
vector<BSONObj*> added; // these keys were added as part of the change
- /** @curObjLoc - the object we want to add's location. if it is already in the
+        /** @curObjLoc - the location of the object we want to add.  if it is already in the
index, that is allowed here (for bg indexing case).
*/
void dupCheck(IndexDetails& idx, DiskLoc curObjLoc) {
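IndexChanges holds, per index, the key sets produced by the old and new versions of a document; getIndexChanges() fills removed with keys only the old document had and added with keys only the new one has, via setDifference(). A standalone sketch of that bookkeeping, with std::set<std::string> standing in for BSONObjSetDefaultOrder:

    #include <algorithm>
    #include <iostream>
    #include <iterator>
    #include <set>
    #include <string>
    #include <vector>

    int main() {
        std::set<std::string> oldkeys, newkeys;
        oldkeys.insert("apple"); oldkeys.insert("pear");   // keys of the old document
        newkeys.insert("pear");  newkeys.insert("plum");   // keys of the updated document

        std::vector<std::string> removed, added;
        // removed = oldkeys - newkeys : index entries to delete
        std::set_difference( oldkeys.begin(), oldkeys.end(), newkeys.begin(), newkeys.end(),
                             std::back_inserter(removed) );
        // added = newkeys - oldkeys : index entries to insert
        std::set_difference( newkeys.begin(), newkeys.end(), oldkeys.begin(), oldkeys.end(),
                             std::back_inserter(added) );

        std::cout << "removed: " << removed[0] << std::endl;   // apple
        std::cout << "added:   " << added[0]   << std::endl;   // plum
        return 0;
    }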
diff --git a/db/indexkey.cpp b/db/indexkey.cpp
index 70dd770..34f30fa 100644
--- a/db/indexkey.cpp
+++ b/db/indexkey.cpp
@@ -17,7 +17,7 @@
*/
#include "pch.h"
-#include "namespace.h"
+#include "namespace-inl.h"
#include "index.h"
#include "btree.h"
#include "query.h"
@@ -28,98 +28,136 @@ namespace mongo {
map<string,IndexPlugin*> * IndexPlugin::_plugins;
IndexType::IndexType( const IndexPlugin * plugin , const IndexSpec * spec )
- : _plugin( plugin ) , _spec( spec ){
-
+ : _plugin( plugin ) , _spec( spec ) {
+
}
- IndexType::~IndexType(){
+ IndexType::~IndexType() {
}
-
- const BSONObj& IndexType::keyPattern() const {
- return _spec->keyPattern;
+
+ const BSONObj& IndexType::keyPattern() const {
+ return _spec->keyPattern;
}
IndexPlugin::IndexPlugin( const string& name )
- : _name( name ){
+ : _name( name ) {
if ( ! _plugins )
_plugins = new map<string,IndexPlugin*>();
(*_plugins)[name] = this;
}
-
- int IndexType::compare( const BSONObj& l , const BSONObj& r ) const {
- return l.woCompare( r , _spec->keyPattern );
- }
- void IndexSpec::_init(){
- assert( keyPattern.objsize() );
-
+ string IndexPlugin::findPluginName( const BSONObj& keyPattern ) {
string pluginName = "";
BSONObjIterator i( keyPattern );
- BSONObjBuilder nullKeyB;
+
while( i.more() ) {
BSONElement e = i.next();
- _fieldNames.push_back( e.fieldName() );
- _fixed.push_back( BSONElement() );
- nullKeyB.appendNull( "" );
- if ( e.type() == String ){
- uassert( 13007 , "can only have 1 index plugin / bad index key pattern" , pluginName.size() == 0 );
- pluginName = e.valuestr();
- }
-
+ if ( e.type() != String )
+ continue;
+
+ uassert( 13007 , "can only have 1 index plugin / bad index key pattern" , pluginName.size() == 0 || pluginName == e.String() );
+ pluginName = e.String();
}
-
- _nullKey = nullKeyB.obj();
-
- BSONObjBuilder b;
- b.appendNull( "" );
- _nullObj = b.obj();
- _nullElt = _nullObj.firstElement();
-
- if ( pluginName.size() ){
- IndexPlugin * plugin = IndexPlugin::get( pluginName );
- if ( ! plugin ){
- log() << "warning: can't find plugin [" << pluginName << "]" << endl;
+
+ return pluginName;
+ }
+
+ int IndexType::compare( const BSONObj& l , const BSONObj& r ) const {
+ return l.woCompare( r , _spec->keyPattern );
+ }
+
+ void IndexSpec::_init() {
+ assert( keyPattern.objsize() );
+
+ // some basics
+ _nFields = keyPattern.nFields();
+ _sparse = info["sparse"].trueValue();
+ uassert( 13529 , "sparse only works for single field keys" , ! _sparse || _nFields );
+
+
+ {
+ // build _nullKey
+
+ BSONObjBuilder b;
+ BSONObjIterator i( keyPattern );
+
+ while( i.more() ) {
+ BSONElement e = i.next();
+ _fieldNames.push_back( e.fieldName() );
+ _fixed.push_back( BSONElement() );
+ b.appendNull( "" );
}
- else {
- _indexType.reset( plugin->generate( this ) );
+ _nullKey = b.obj();
+ }
+
+ {
+ // _nullElt
+ BSONObjBuilder b;
+ b.appendNull( "" );
+ _nullObj = b.obj();
+ _nullElt = _nullObj.firstElement();
+ }
+
+ {
+ // handle plugins
+ string pluginName = IndexPlugin::findPluginName( keyPattern );
+ if ( pluginName.size() ) {
+ IndexPlugin * plugin = IndexPlugin::get( pluginName );
+ if ( ! plugin ) {
+ log() << "warning: can't find plugin [" << pluginName << "]" << endl;
+ }
+ else {
+ _indexType.reset( plugin->generate( this ) );
+ }
}
}
+
_finishedInit = true;
}
-
+
void IndexSpec::getKeys( const BSONObj &obj, BSONObjSetDefaultOrder &keys ) const {
- if ( _indexType.get() ){
+ if ( _indexType.get() ) {
_indexType->getKeys( obj , keys );
return;
}
vector<const char*> fieldNames( _fieldNames );
vector<BSONElement> fixed( _fixed );
_getKeys( fieldNames , fixed , obj, keys );
- if ( keys.empty() )
+ if ( keys.empty() && ! _sparse )
keys.insert( _nullKey );
}
void IndexSpec::_getKeys( vector<const char*> fieldNames , vector<BSONElement> fixed , const BSONObj &obj, BSONObjSetDefaultOrder &keys ) const {
BSONElement arrElt;
unsigned arrIdx = ~0;
+ int numNotFound = 0;
+
for( unsigned i = 0; i < fieldNames.size(); ++i ) {
if ( *fieldNames[ i ] == '\0' )
continue;
+
BSONElement e = obj.getFieldDottedOrArray( fieldNames[ i ] );
- if ( e.eoo() )
+
+ if ( e.eoo() ) {
e = _nullElt; // no matching field
+ numNotFound++;
+ }
+
if ( e.type() != Array )
fieldNames[ i ] = ""; // no matching field or non-array match
+
if ( *fieldNames[ i ] == '\0' )
fixed[ i ] = e; // no need for further object expansion (though array expansion still possible)
+
if ( e.type() == Array && arrElt.eoo() ) { // we only expand arrays on a single path -- track the path here
arrIdx = i;
arrElt = e;
}
+
// enforce single array path here
- if ( e.type() == Array && e.rawdata() != arrElt.rawdata() ){
+ if ( e.type() == Array && e.rawdata() != arrElt.rawdata() ) {
stringstream ss;
ss << "cannot index parallel arrays [" << e.fieldName() << "] [" << arrElt.fieldName() << "]";
uasserted( 10088 , ss.str() );
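findPluginName() scans the key pattern for a string-valued field: that value names the plugin (e.g. "2d"), every other field is a plain ascending/descending key, and two different plugin names trip uassert 13007. A usage sketch, assuming indexkey.h and this tree's BSON(...) macro; pluginNameExamples is just an illustrative wrapper:

    #include "db/indexkey.h"   // assumed include path
    using namespace mongo;

    void pluginNameExamples() {
        string a = IndexPlugin::findPluginName( BSON( "loc" << "2d" << "name" << 1 ) );  // "2d"
        string b = IndexPlugin::findPluginName( BSON( "age" << 1 ) );                    // "" - no plugin
        // two different string values, e.g. { a : "2d", b : "other" }, trip uassert 13007
        (void)a; (void)b;
    }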
@@ -127,13 +165,19 @@ namespace mongo {
}
bool allFound = true; // have we found elements for all field names in the key spec?
- for( vector<const char*>::const_iterator i = fieldNames.begin(); i != fieldNames.end(); ++i ){
- if ( **i != '\0' ){
+ for( vector<const char*>::const_iterator i = fieldNames.begin(); i != fieldNames.end(); ++i ) {
+ if ( **i != '\0' ) {
allFound = false;
break;
}
}
+ if ( _sparse && numNotFound == _nFields ) {
+ // we didn't find any fields
+ // so we're not going to index this document
+ return;
+ }
+
bool insertArrayNull = false;
if ( allFound ) {
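The sparse handling added to _getKeys() means a sparse index simply skips documents in which none of the indexed fields are present, whereas a non-sparse index falls back to inserting _nullKey. A standalone sketch of that decision; generatesIndexEntry is an illustrative stand-in, not a function in the codebase:

    #include <iostream>

    bool generatesIndexEntry( bool sparse, int nFields, int numNotFound ) {
        if ( sparse && numNotFound == nFields )
            return false;          // sparse: the document gets no entry in this index
        return true;               // non-sparse: a null key is inserted instead
    }

    int main() {
        // single-field index { phone : 1 }, document without a "phone" field
        std::cout << generatesIndexEntry( true,  1, 1 ) << std::endl;   // 0 - skipped when sparse
        std::cout << generatesIndexEntry( false, 1, 1 ) << std::endl;   // 1 - indexed under null
        return 0;
    }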
@@ -143,11 +187,11 @@ namespace mongo {
for( vector< BSONElement >::iterator i = fixed.begin(); i != fixed.end(); ++i )
b.appendAs( *i, "" );
keys.insert( b.obj() );
- }
+ }
else {
// terminal array element to expand, so generate all keys
BSONObjIterator i( arrElt.embeddedObject() );
- if ( i.more() ){
+ if ( i.more() ) {
while( i.more() ) {
BSONObjBuilder b(_sizeTracker);
for( unsigned j = 0; j < fixed.size(); ++j ) {
@@ -159,18 +203,19 @@ namespace mongo {
keys.insert( b.obj() );
}
}
- else if ( fixed.size() > 1 ){
+ else if ( fixed.size() > 1 ) {
insertArrayNull = true;
}
}
- } else {
+ }
+ else {
// nonterminal array element to expand, so recurse
assert( !arrElt.eoo() );
BSONObjIterator i( arrElt.embeddedObject() );
- if ( i.more() ){
+ if ( i.more() ) {
while( i.more() ) {
BSONElement e = i.next();
- if ( e.type() == Object ){
+ if ( e.type() == Object ) {
_getKeys( fieldNames, fixed, e.embeddedObject(), keys );
}
}
@@ -179,12 +224,12 @@ namespace mongo {
insertArrayNull = true;
}
}
-
+
if ( insertArrayNull ) {
// x : [] - need to insert undefined
BSONObjBuilder b(_sizeTracker);
for( unsigned j = 0; j < fixed.size(); ++j ) {
- if ( j == arrIdx ){
+ if ( j == arrIdx ) {
b.appendUndefined( "" );
}
else {
@@ -199,12 +244,12 @@ namespace mongo {
}
}
- bool anyElementNamesMatch( const BSONObj& a , const BSONObj& b ){
+ bool anyElementNamesMatch( const BSONObj& a , const BSONObj& b ) {
BSONObjIterator x(a);
- while ( x.more() ){
+ while ( x.more() ) {
BSONElement e = x.next();
BSONObjIterator y(b);
- while ( y.more() ){
+ while ( y.more() ) {
BSONElement f = y.next();
FieldCompareResult res = compareDottedFieldNames( e.fieldName() , f.fieldName() );
if ( res == SAME || res == LEFT_SUBFIELD || res == RIGHT_SUBFIELD )
@@ -213,13 +258,13 @@ namespace mongo {
}
return false;
}
-
+
IndexSuitability IndexSpec::suitability( const BSONObj& query , const BSONObj& order ) const {
if ( _indexType.get() )
return _indexType->suitability( query , order );
return _suitability( query , order );
}
-
+
IndexSuitability IndexSpec::_suitability( const BSONObj& query , const BSONObj& order ) const {
// TODO: optimize
if ( anyElementNamesMatch( keyPattern , query ) == 0 && anyElementNamesMatch( keyPattern , order ) == 0 )
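_getKeys() expands exactly one array-valued path per document: the non-array parts of the key are held fixed while one key is emitted per array element, and a second parallel array raises error 10088. A standalone sketch of the expansion for a compound index { user : 1, tags : 1 }:

    #include <iostream>
    #include <string>
    #include <utility>
    #include <vector>

    int main() {
        // document { user : "u1", tags : [ "a", "b" ] }
        std::string user = "u1";
        std::vector<std::string> tags;
        tags.push_back("a"); tags.push_back("b");

        // the fixed (non-array) part stays constant while the single array path is expanded
        std::vector< std::pair<std::string, std::string> > keys;
        for ( size_t i = 0; i < tags.size(); i++ )
            keys.push_back( std::make_pair( user, tags[i] ) );

        for ( size_t i = 0; i < keys.size(); i++ )    // 2 keys -> index is marked multikey
            std::cout << "{ \"\" : \"" << keys[i].first << "\", \"\" : \"" << keys[i].second << "\" }" << std::endl;
        return 0;
    }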
diff --git a/db/indexkey.h b/db/indexkey.h
index e73d9de..be73171 100644
--- a/db/indexkey.h
+++ b/db/indexkey.h
@@ -46,16 +46,16 @@ namespace mongo {
virtual void getKeys( const BSONObj &obj, BSONObjSetDefaultOrder &keys ) const = 0;
virtual shared_ptr<Cursor> newCursor( const BSONObj& query , const BSONObj& order , int numWanted ) const = 0;
-
+
/** optional op : changes query to match what's in the index */
virtual BSONObj fixKey( const BSONObj& in ) { return in; }
/** optional op : compare 2 objects with regards to this index */
- virtual int compare( const BSONObj& l , const BSONObj& r ) const;
+ virtual int compare( const BSONObj& l , const BSONObj& r ) const;
/** @return plugin */
const IndexPlugin * getPlugin() const { return _plugin; }
-
+
const BSONObj& keyPattern() const;
virtual IndexSuitability suitability( const BSONObj& query , const BSONObj& order ) const ;
@@ -66,7 +66,7 @@ namespace mongo {
const IndexPlugin * _plugin;
const IndexSpec * _spec;
};
-
+
/**
* this represents a plugin
* a plugin could be something like full text search, sparse index, etc...
@@ -76,11 +76,21 @@ namespace mongo {
class IndexPlugin : boost::noncopyable {
public:
IndexPlugin( const string& name );
- virtual ~IndexPlugin(){}
-
+ virtual ~IndexPlugin() {}
+
virtual IndexType* generate( const IndexSpec * spec ) const = 0;
- static IndexPlugin* get( const string& name ){
+ string getName() const { return _name; }
+
+ /**
+ * @return new keyPattern
+ * if nothing changes, should return keyPattern
+ */
+ virtual BSONObj adjustIndexSpec( const BSONObj& spec ) const { return spec; }
+
+ // ------- static below -------
+
+ static IndexPlugin* get( const string& name ) {
if ( ! _plugins )
return 0;
map<string,IndexPlugin*>::iterator i = _plugins->find( name );
@@ -89,7 +99,12 @@ namespace mongo {
return i->second;
}
- string getName() const { return _name; }
+ /**
+ * @param keyPattern { x : "fts" }
+ * @return "" or the name
+ */
+ static string findPluginName( const BSONObj& keyPattern );
+
private:
string _name;
static map<string,IndexPlugin*> * _plugins;
@@ -102,31 +117,31 @@ namespace mongo {
public:
BSONObj keyPattern; // e.g., { name : 1 }
BSONObj info; // this is the same as IndexDetails::info.obj()
-
+
IndexSpec()
- : _details(0) , _finishedInit(false){
+ : _details(0) , _finishedInit(false) {
}
IndexSpec( const BSONObj& k , const BSONObj& m = BSONObj() )
- : keyPattern(k) , info(m) , _details(0) , _finishedInit(false){
+ : keyPattern(k) , info(m) , _details(0) , _finishedInit(false) {
_init();
}
-
+
/**
            this is a DiskLoc of an IndexDetails info
- should have a key field
+ should have a key field
*/
- IndexSpec( const DiskLoc& loc ){
+ IndexSpec( const DiskLoc& loc ) {
reset( loc );
}
-
+
void reset( const DiskLoc& loc );
void reset( const IndexDetails * details );
-
+
void getKeys( const BSONObj &obj, BSONObjSetDefaultOrder &keys ) const;
BSONElement missingField() const { return _nullElt; }
-
+
string getTypeName() const {
if ( _indexType.get() )
return _indexType->getPlugin()->getName();
@@ -148,20 +163,24 @@ namespace mongo {
IndexSuitability _suitability( const BSONObj& query , const BSONObj& order ) const ;
void _getKeys( vector<const char*> fieldNames , vector<BSONElement> fixed , const BSONObj &obj, BSONObjSetDefaultOrder &keys ) const;
-
+
BSONSizeTracker _sizeTracker;
vector<const char*> _fieldNames;
vector<BSONElement> _fixed;
- BSONObj _nullKey;
-
- BSONObj _nullObj;
- BSONElement _nullElt;
-
+
+ BSONObj _nullKey; // a full key with all fields null
+
+ BSONObj _nullObj; // only used for _nullElt
+ BSONElement _nullElt; // jstNull
+
+ int _nFields; // number of fields in the index
+ bool _sparse; // if the index is sparse
+
shared_ptr<IndexType> _indexType;
const IndexDetails * _details;
-
+
void _init();
public:
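The new adjustIndexSpec() hook lets a plugin rewrite the user-supplied index spec before prepareToBuildIndex() stores it; registration happens as a side effect of constructing a global IndexPlugin instance. A hypothetical sketch assuming indexkey.h from this tree; the plugin name "example" and the class are made up for illustration, and a real plugin would return a concrete IndexType from generate():

    #include "db/indexkey.h"   // assumed include path
    namespace mongo {

        class ExamplePlugin : public IndexPlugin {
        public:
            ExamplePlugin() : IndexPlugin( "example" ) {}

            // sketch only: a real plugin returns a concrete IndexType here
            virtual IndexType* generate( const IndexSpec * spec ) const { return 0; }

            // rewrite the user-supplied spec, e.g. stamp a default option onto it
            virtual BSONObj adjustIndexSpec( const BSONObj& spec ) const {
                if ( ! spec["exampleOption"].eoo() )
                    return spec;                    // nothing to add
                BSONObjBuilder b;
                b.appendElements( spec );
                b.append( "exampleOption" , 1 );
                return b.obj();
            }
        } examplePlugin;                            // global instance registers itself

    } // namespace mongo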
diff --git a/db/instance.cpp b/db/instance.cpp
index a6873f2..3b668ee 100644
--- a/db/instance.cpp
+++ b/db/instance.cpp
@@ -27,7 +27,6 @@
#include "lasterror.h"
#include "security.h"
#include "json.h"
-//#include "reccache.h"
#include "replpair.h"
#include "../s/d_logic.h"
#include "../util/file_allocator.h"
@@ -38,6 +37,8 @@
#endif
#include "stats/counters.h"
#include "background.h"
+#include "dur_journal.h"
+#include "dur_recover.h"
namespace mongo {
@@ -61,29 +62,30 @@ namespace mongo {
bool useCursors = true;
bool useHints = true;
-
- void flushOpLog( stringstream &ss ) {
+
+ void flushDiagLog() {
if( _diaglog.f && _diaglog.f->is_open() ) {
- ss << "flushing op log and files\n";
+ log() << "flushing diag log" << endl;
_diaglog.flush();
}
}
- int ctr = 0;
-
KillCurrentOp killCurrentOp;
-
+
int lockFile = 0;
+#ifdef WIN32
+ HANDLE lockFileHandle;
+#endif
// see FSyncCommand:
- unsigned lockedForWriting;
+ unsigned lockedForWriting;
mongo::mutex lockedForWritingMutex("lockedForWriting");
bool unlockRequested = false;
void inProgCmd( Message &m, DbResponse &dbresponse ) {
BSONObjBuilder b;
- if( ! cc().isAdmin() ){
+ if( ! cc().isAdmin() ) {
BSONObjBuilder b;
b.append("err", "unauthorized");
}
@@ -95,12 +97,13 @@ namespace mongo {
{
Client& me = cc();
scoped_lock bl(Client::clientsMutex);
- for( set<Client*>::iterator i = Client::clients.begin(); i != Client::clients.end(); i++ ) {
+ for( set<Client*>::iterator i = Client::clients.begin(); i != Client::clients.end(); i++ ) {
Client *c = *i;
assert( c );
- if ( c == &me )
- continue;
CurOp* co = c->curop();
+ if ( c == &me && !co ) {
+ continue;
+ }
assert( co );
if( all || co->active() )
vals.push_back( co->infoNoauth() );
@@ -113,26 +116,26 @@ namespace mongo {
b.append("info", "use db.$cmd.sys.unlock.findOne() to terminate the fsync write/snapshot lock");
}
}
-
+
replyToQuery(0, m, dbresponse, b.obj());
}
-
+
void killOp( Message &m, DbResponse &dbresponse ) {
BSONObj obj;
- if( ! cc().isAdmin() ){
+ if( ! cc().isAdmin() ) {
obj = fromjson("{\"err\":\"unauthorized\"}");
}
- /*else if( !dbMutexInfo.isLocked() )
+ /*else if( !dbMutexInfo.isLocked() )
obj = fromjson("{\"info\":\"no op in progress/not locked\"}");
*/
else {
DbMessage d(m);
QueryMessage q(d);
BSONElement e = q.query.getField("op");
- if( !e.isNumber() ) {
+ if( !e.isNumber() ) {
obj = fromjson("{\"err\":\"no op number field specified?\"}");
}
- else {
+ else {
log() << "going to kill op: " << e << endl;
obj = fromjson("{\"info\":\"attempting to kill op\"}");
killCurrentOp.kill( (unsigned) e.number() );
@@ -143,23 +146,23 @@ namespace mongo {
void unlockFsync(const char *ns, Message& m, DbResponse &dbresponse) {
BSONObj obj;
- if( ! cc().isAdmin() || strncmp(ns, "admin.", 6) != 0 ) {
+ if( ! cc().isAdmin() || strncmp(ns, "admin.", 6) != 0 ) {
obj = fromjson("{\"err\":\"unauthorized\"}");
}
else {
- if( lockedForWriting ) {
- log() << "command: unlock requested" << endl;
+ if( lockedForWriting ) {
+ log() << "command: unlock requested" << endl;
obj = fromjson("{ok:1,\"info\":\"unlock requested\"}");
unlockRequested = true;
}
- else {
+ else {
obj = fromjson("{ok:0,\"errmsg\":\"not locked\"}");
}
}
replyToQuery(0, m, dbresponse, obj);
}
- static bool receivedQuery(Client& c, DbResponse& dbresponse, Message& m ){
+ static bool receivedQuery(Client& c, DbResponse& dbresponse, Message& m ) {
bool ok = true;
MSGID responseTo = m.header()->id;
@@ -168,7 +171,7 @@ namespace mongo {
auto_ptr< Message > resp( new Message() );
CurOp& op = *(c.curop());
-
+
try {
dbresponse.exhaust = runQuery(m, q, op, *resp);
assert( !resp->empty() );
@@ -176,9 +179,9 @@ namespace mongo {
catch ( AssertionException& e ) {
ok = false;
op.debug().str << " exception ";
- LOGSOME {
+ LOGSOME {
log() << "assertion " << e.toString() << " ns:" << q.ns << " query:" <<
- (q.query.valid() ? q.query.toString() : "query object is corrupt") << endl;
+ (q.query.valid() ? q.query.toString() : "query object is corrupt") << endl;
if( q.ntoskip || q.ntoreturn )
log() << " ntoskip:" << q.ntoskip << " ntoreturn:" << q.ntoreturn << endl;
}
@@ -207,18 +210,18 @@ namespace mongo {
resp->setData( msgdata, true );
}
- if ( op.shouldDBProfile( 0 ) ){
+ if ( op.shouldDBProfile( 0 ) ) {
op.debug().str << " bytes:" << resp->header()->dataLen();
}
-
+
dbresponse.response = resp.release();
dbresponse.responseTo = responseTo;
-
+
return ok;
}
// Returns false when request includes 'end'
- bool assembleResponse( Message &m, DbResponse &dbresponse, const SockAddr &client ) {
+ void assembleResponse( Message &m, DbResponse &dbresponse, const SockAddr &client ) {
// before we lock...
int op = m.operation();
@@ -228,18 +231,18 @@ namespace mongo {
if( strstr(ns, ".$cmd") ) {
isCommand = true;
opwrite(m);
- if( strstr(ns, ".$cmd.sys.") ) {
+ if( strstr(ns, ".$cmd.sys.") ) {
if( strstr(ns, "$cmd.sys.inprog") ) {
inProgCmd(m, dbresponse);
- return true;
+ return;
}
- if( strstr(ns, "$cmd.sys.killop") ) {
+ if( strstr(ns, "$cmd.sys.killop") ) {
killOp(m, dbresponse);
- return true;
+ return;
}
- if( strstr(ns, "$cmd.sys.unlock") ) {
+ if( strstr(ns, "$cmd.sys.unlock") ) {
unlockFsync(ns, m, dbresponse);
- return true;
+ return;
}
}
}
@@ -253,30 +256,30 @@ namespace mongo {
else {
opwrite(m);
}
-
+
globalOpCounters.gotOp( op , isCommand );
-
+
Client& c = cc();
-
+
auto_ptr<CurOp> nestedOp;
CurOp* currentOpP = c.curop();
- if ( currentOpP->active() ){
+ if ( currentOpP->active() ) {
nestedOp.reset( new CurOp( &c , currentOpP ) );
currentOpP = nestedOp.get();
}
CurOp& currentOp = *currentOpP;
currentOp.reset(client,op);
-
+
OpDebug& debug = currentOp.debug();
StringBuilder& ss = debug.str;
ss << opToString( op ) << " ";
int logThreshold = cmdLine.slowMS;
bool log = logLevel >= 1;
-
+
if ( op == dbQuery ) {
if ( handlePossibleShardedMessage( m , &dbresponse ) )
- return true;
+ return;
receivedQuery(c , dbresponse, m );
}
else if ( op == dbGetMore ) {
@@ -289,7 +292,7 @@ namespace mongo {
int len = strlen(p);
if ( len > 400 )
out() << curTimeMillis() % 10000 <<
- " long msg received, len:" << len << endl;
+ " long msg received, len:" << len << endl;
Message *resp = new Message();
if ( strcmp( "end" , p ) == 0 )
@@ -304,7 +307,7 @@ namespace mongo {
const char *ns = m.singleData()->_data + 4;
char cl[256];
nsToDatabase(ns, cl);
- if( ! c.getAuthenticationInfo()->isAuthorized(cl) ) {
+ if( ! c.getAuthenticationInfo()->isAuthorized(cl) ) {
uassert_nothrow("unauthorized");
}
else {
@@ -330,37 +333,40 @@ namespace mongo {
log = true;
}
}
+ catch ( UserException& ue ) {
+ tlog(3) << " Caught Assertion in " << opToString(op) << ", continuing " << ue.toString() << endl;
+ ss << " exception " << ue.toString();
+ }
catch ( AssertionException& e ) {
- static int n;
- tlog(3) << " Caught Assertion in " << opToString(op) << ", continuing" << endl;
- ss << " exception " + e.toString();
- log = ++n < 10;
+ tlog(3) << " Caught Assertion in " << opToString(op) << ", continuing " << e.toString() << endl;
+ ss << " exception " << e.toString();
+ log = true;
}
}
}
currentOp.ensureStarted();
currentOp.done();
int ms = currentOp.totalTimeMillis();
-
- log = log || (logLevel >= 2 && ++ctr % 512 == 0);
- //DEV log = true;
+
+ //DEV log = true;
if ( log || ms > logThreshold ) {
if( logLevel < 3 && op == dbGetMore && strstr(ns, ".oplog.") && ms < 3000 && !log ) {
/* it's normal for getMore on the oplog to be slow because of use of awaitdata flag. */
- } else {
+ }
+ else {
ss << ' ' << ms << "ms";
mongo::tlog() << ss.str() << endl;
}
}
-
- if ( currentOp.shouldDBProfile( ms ) ){
+
+ if ( currentOp.shouldDBProfile( ms ) ) {
// performance profiling is on
- if ( dbMutex.getState() < 0 ){
+ if ( dbMutex.getState() < 0 ) {
mongo::log(1) << "note: not profiling because recursive read lock" << endl;
}
else {
- mongolock lk(true);
- if ( dbHolder.isLoaded( nsToDatabase( currentOp.getNS() ) , dbpath ) ){
+ writelock lk;
+ if ( dbHolder.isLoaded( nsToDatabase( currentOp.getNS() ) , dbpath ) ) {
Client::Context c( currentOp.getNS() );
profile(ss.str().c_str(), ms);
}
@@ -370,37 +376,44 @@ namespace mongo {
}
}
- return true;
} /* assembleResponse() */
- void killCursors(int n, long long *ids);
void receivedKillCursors(Message& m) {
int *x = (int *) m.singleData()->_data;
x++; // reserved
int n = *x++;
+
+ assert( m.dataSize() == 8 + ( 8 * n ) );
+
uassert( 13004 , "sent 0 cursors to kill" , n >= 1 );
if ( n > 2000 ) {
log( n < 30000 ? LL_WARNING : LL_ERROR ) << "receivedKillCursors, n=" << n << endl;
assert( n < 30000 );
}
- killCursors(n, (long long *) x);
+
+ int found = ClientCursor::erase(n, (long long *) x);
+
+ if ( logLevel > 0 || found != n ) {
+ log( found == n ) << "killcursors: found " << found << " of " << n << endl;
+ }
+
}
/* db - database name
path - db directory
*/
- void closeDatabase( const char *db, const string& path ) {
+ /*static*/ void Database::closeDatabase( const char *db, const string& path ) {
assertInWriteLock();
-
+
Client::Context * ctx = cc().getContext();
assert( ctx );
assert( ctx->inDB( db , path ) );
Database *database = ctx->db();
assert( database->name == db );
-
- oplogCheckCloseDatabase( database );
- if( BackgroundOperation::inProgForDb(db) ) {
+ oplogCheckCloseDatabase( database ); // oplog caches some things, dirty its caches
+
+ if( BackgroundOperation::inProgForDb(db) ) {
log() << "warning: bg op in prog during close db? " << db << endl;
}
@@ -412,8 +425,8 @@ namespace mongo {
NamespaceDetailsTransient::clearForPrefix( prefix.c_str() );
dbHolder.erase( db, path );
- delete database; // closes files
ctx->clear();
+ delete database; // closes files
}
void receivedUpdate(Message& m, CurOp& op) {
@@ -428,7 +441,7 @@ namespace mongo {
assert( d.moreJSObjs() );
assert( query.objsize() < m.header()->dataLen() );
BSONObj toupdate = d.nextJsObj();
- uassert( 10055 , "update object too large", toupdate.objsize() <= MaxBSONObjectSize);
+ uassert( 10055 , "update object too large", toupdate.objsize() <= BSONObjMaxUserSize);
assert( toupdate.objsize() < m.header()->dataLen() );
assert( query.objsize() + toupdate.objsize() < m.header()->dataLen() );
bool upsert = flags & UpdateOption_Upsert;
@@ -436,15 +449,15 @@ namespace mongo {
bool broadcast = flags & UpdateOption_Broadcast;
{
string s = query.toString();
- /* todo: we shouldn't do all this ss stuff when we don't need it, it will slow us down.
- instead, let's just story the query BSON in the debug object, and it can toString()
+ /* todo: we shouldn't do all this ss stuff when we don't need it, it will slow us down.
+           instead, let's just store the query BSON in the debug object, and it can toString()
lazily
*/
op.debug().str << " query: " << s;
op.setQuery(query);
- }
+ }
- mongolock lk(1);
+ writelock lk;
// if this ever moves to outside of lock, need to adjust check Client::Context::_finishInit
if ( ! broadcast && handlePossibleShardedMessage( m , 0 ) )
@@ -461,6 +474,7 @@ namespace mongo {
const char *ns = d.getns();
assert(*ns);
uassert( 10056 , "not master", isMasterNs( ns ) );
+ op.debug().str << ns << ' ';
int flags = d.pullInt();
bool justOne = flags & RemoveOption_JustOne;
bool broadcast = flags & RemoveOption_Broadcast;
@@ -470,63 +484,63 @@ namespace mongo {
string s = pattern.toString();
op.debug().str << " query: " << s;
op.setQuery(pattern);
- }
+ }
writelock lk(ns);
// if this ever moves to outside of lock, need to adjust check Client::Context::_finishInit
if ( ! broadcast & handlePossibleShardedMessage( m , 0 ) )
return;
-
+
Client::Context ctx(ns);
-
+
long long n = deleteObjects(ns, pattern, justOne, true);
lastError.getSafe()->recordDelete( n );
}
-
+
QueryResult* emptyMoreResult(long long);
bool receivedGetMore(DbResponse& dbresponse, Message& m, CurOp& curop ) {
StringBuilder& ss = curop.debug().str;
bool ok = true;
-
+
DbMessage d(m);
const char *ns = d.getns();
int ntoreturn = d.pullInt();
long long cursorid = d.pullInt64();
-
+
ss << ns << " cid:" << cursorid;
- if( ntoreturn )
+ if( ntoreturn )
ss << " ntoreturn:" << ntoreturn;
- time_t start = 0;
- int pass = 0;
+ time_t start = 0;
+ int pass = 0;
bool exhaust = false;
QueryResult* msgdata;
while( 1 ) {
try {
- mongolock lk(false);
+ readlock lk;
Client::Context ctx(ns);
msgdata = processGetMore(ns, ntoreturn, cursorid, curop, pass, exhaust);
}
- catch ( GetMoreWaitException& ) {
+ catch ( GetMoreWaitException& ) {
exhaust = false;
massert(13073, "shutting down", !inShutdown() );
- if( pass == 0 ) {
- start = time(0);
- }
- else {
- if( time(0) - start >= 4 ) {
- // after about 4 seconds, return. this is a sanity check. pass stops at 1000 normally
- // for DEV this helps and also if sleep is highly inaccurate on a platform. we want to
- // return occasionally so slave can checkpoint.
- pass = 10000;
- }
- }
+ if( pass == 0 ) {
+ start = time(0);
+ }
+ else {
+ if( time(0) - start >= 4 ) {
+ // after about 4 seconds, return. this is a sanity check. pass stops at 1000 normally
+ // for DEV this helps and also if sleep is highly inaccurate on a platform. we want to
+ // return occasionally so slave can checkpoint.
+ pass = 10000;
+ }
+ }
pass++;
- DEV
- sleepmillis(20);
- else
+ DEV
+ sleepmillis(20);
+ else
sleepmillis(2);
continue;
}
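When an awaitData getMore has nothing to return, receivedGetMore() retries under GetMoreWaitException with a short sleep, but caps the total wait at roughly four seconds (by forcing pass to 10000) so the client gets a response and a replication slave can checkpoint. A standalone sketch of that retry shape; tryGetMore is an illustrative stand-in:

    #include <ctime>
    #include <iostream>

    bool tryGetMore( int pass ) { return pass >= 3; }   // stand-in: data shows up on the 3rd try

    int main() {
        time_t start = 0;
        int pass = 0;
        while ( true ) {
            if ( tryGetMore( pass ) )
                break;                               // got a batch (or gave up) - reply now
            if ( pass == 0 )
                start = time(0);                     // remember when we began waiting
            else if ( time(0) - start >= 4 )
                pass = 10000;                        // sanity cap: stop retrying after ~4s
            pass++;
            // the real code sleeps 2ms (20ms in DEV builds) between retries
        }
        std::cout << "replied after " << pass << " passes" << std::endl;
        return 0;
    }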
@@ -545,8 +559,8 @@ namespace mongo {
ss << " nreturned:" << msgdata->nReturned;
dbresponse.response = resp;
dbresponse.responseTo = m.header()->id;
- if( exhaust ) {
- ss << " exhaust ";
+ if( exhaust ) {
+ ss << " exhaust ";
dbresponse.exhaust = ns;
}
return ok;
@@ -554,8 +568,8 @@ namespace mongo {
void receivedInsert(Message& m, CurOp& op) {
DbMessage d(m);
- const char *ns = d.getns();
- assert(*ns);
+ const char *ns = d.getns();
+ assert(*ns);
uassert( 10058 , "not master", isMasterNs( ns ) );
op.debug().str << ns;
@@ -564,31 +578,32 @@ namespace mongo {
if ( handlePossibleShardedMessage( m , 0 ) )
return;
- Client::Context ctx(ns);
+ Client::Context ctx(ns);
+ int n = 0;
while ( d.moreJSObjs() ) {
BSONObj js = d.nextJsObj();
- uassert( 10059 , "object to insert too large", js.objsize() <= MaxBSONObjectSize);
+ uassert( 10059 , "object to insert too large", js.objsize() <= BSONObjMaxUserSize);
+
+ {
+ // check no $ modifiers
+ BSONObjIterator i( js );
+ while ( i.more() ) {
+ BSONElement e = i.next();
+ uassert( 13511 , "object to insert can't have $ modifiers" , e.fieldName()[0] != '$' );
+ }
+ }
+
theDataFileMgr.insertWithObjMod(ns, js, false);
logOp("i", ns, js);
- globalOpCounters.gotInsert();
+
+ if( ++n % 4 == 0 ) {
+ // if we are inserting quite a few, we may need to commit along the way
+ getDur().commitIfNeeded();
+ }
}
+ globalOpCounters.incInsertInWriteLock(n);
}
- class JniMessagingPort : public AbstractMessagingPort {
- public:
- JniMessagingPort(Message& _container) : container(_container) { }
- void reply(Message& received, Message& response, MSGID) {
- container = response;
- }
- void reply(Message& received, Message& response) {
- container = response;
- }
- unsigned remotePort(){
- return 1;
- }
- Message & container;
- };
-
void getDatabaseNames( vector< string > &names , const string& usePath ) {
boost::filesystem::path path( usePath );
for ( boost::filesystem::directory_iterator i( path );
@@ -599,7 +614,8 @@ namespace mongo {
p /= ( dbName + ".ns" );
if ( MMF::exists( p ) )
names.push_back( dbName );
- } else {
+ }
+ else {
string fileName = boost::filesystem::path(*i).leaf();
if ( fileName.length() > 3 && fileName.substr( fileName.length() - 3, 3 ) == ".ns" )
names.push_back( fileName.substr( 0, fileName.length() - 3 ) );
@@ -607,14 +623,14 @@ namespace mongo {
}
}
- /* returns true if there is data on this server. useful when starting replication.
+ /* returns true if there is data on this server. useful when starting replication.
local database does NOT count except for rsoplog collection.
*/
- bool replHasDatabases() {
+ bool replHasDatabases() {
vector<string> names;
getDatabaseNames(names);
if( names.size() >= 2 ) return true;
- if( names.size() == 1 ){
+ if( names.size() == 1 ) {
if( names[0] != "local" )
return true;
// we have a local database. return true if oplog isn't empty
@@ -628,7 +644,7 @@ namespace mongo {
return false;
}
- bool DBDirectClient::call( Message &toSend, Message &response, bool assertOk ) {
+ bool DBDirectClient::call( Message &toSend, Message &response, bool assertOk , string * actualServer ) {
if ( lastError._get() )
lastError.startRequest( toSend, lastError._get() );
DbResponse dbResponse;
@@ -636,6 +652,7 @@ namespace mongo {
assert( dbResponse.response );
dbResponse.response->concat(); // can get rid of this if we make response handling smarter
response = *dbResponse.response;
+ getDur().commitIfNeeded();
return true;
}
@@ -644,11 +661,12 @@ namespace mongo {
lastError.startRequest( toSend, lastError._get() );
DbResponse dbResponse;
assembleResponse( toSend, dbResponse );
+ getDur().commitIfNeeded();
}
auto_ptr<DBClientCursor> DBDirectClient::query(const string &ns, Query query, int nToReturn , int nToSkip ,
- const BSONObj *fieldsToReturn , int queryOptions ){
-
+ const BSONObj *fieldsToReturn , int queryOptions ) {
+
//if ( ! query.obj.isEmpty() || nToReturn != 0 || nToSkip != 0 || fieldsToReturn || queryOptions )
return DBClientBase::query( ns , query , nToReturn , nToSkip , fieldsToReturn , queryOptions );
//
@@ -656,128 +674,181 @@ namespace mongo {
//throw UserException( (string)"yay:" + ns );
}
- void DBDirectClient::killCursor( long long id ){
+ void DBDirectClient::killCursor( long long id ) {
ClientCursor::erase( id );
}
- DBClientBase * createDirectClient(){
- return new DBDirectClient();
+ unsigned long long DBDirectClient::count(const string &ns, const BSONObj& query, int options, int limit, int skip ) {
+ readlock lk( ns );
+ string errmsg;
+ long long res = runCount( ns.c_str() , _countCmd( ns , query , options , limit , skip ) , errmsg );
+ if ( res == -1 )
+ return 0;
+ uassert( 13637 , str::stream() << "count failed in DBDirectClient: " << errmsg , res >= 0 );
+ return (unsigned long long )res;
}
- //void recCacheCloseAll();
+ DBClientBase * createDirectClient() {
+ return new DBDirectClient();
+ }
mongo::mutex exitMutex("exit");
int numExitCalls = 0;
- void shutdown();
- bool inShutdown(){
+ bool inShutdown() {
return numExitCalls > 0;
}
- void tryToOutputFatal( const string& s ){
+ void tryToOutputFatal( const string& s ) {
try {
rawOut( s );
return;
}
- catch ( ... ){}
+ catch ( ... ) {}
try {
cerr << s << endl;
return;
}
- catch ( ... ){}
-
+ catch ( ... ) {}
+
// uh - oh, not sure there is anything else we can do...
}
+ /** also called by ntservice.cpp */
+ void shutdownServer() {
+
+ log() << "shutdown: going to close listening sockets..." << endl;
+ ListeningSockets::get()->closeAll();
+
+ log() << "shutdown: going to flush diaglog..." << endl;
+ flushDiagLog();
+
+ /* must do this before unmapping mem or you may get a seg fault */
+ log() << "shutdown: going to close sockets..." << endl;
+ boost::thread close_socket_thread( boost::bind(MessagingPort::closeAllSockets, 0) );
+
+ // wait until file preallocation finishes
+ // we would only hang here if the file_allocator code generates a
+ // synchronous signal, which we don't expect
+ log() << "shutdown: waiting for fs preallocator..." << endl;
+ FileAllocator::get()->waitUntilFinished();
+
+ if( cmdLine.dur ) {
+ log() << "shutdown: lock for final commit..." << endl;
+ {
+ int n = 10;
+ while( 1 ) {
+ // we may already be in a read lock from earlier in the call stack, so do read lock here
+ // to be consistent with that.
+ readlocktry w("", 20000);
+ if( w.got() ) {
+ log() << "shutdown: final commit..." << endl;
+ getDur().commitNow();
+ break;
+ }
+ if( --n <= 0 ) {
+ log() << "shutdown: couldn't acquire write lock, aborting" << endl;
+ abort();
+ }
+ log() << "shutdown: waiting for write lock..." << endl;
+ }
+ }
+ MemoryMappedFile::flushAll(true);
+ }
+
+ log() << "shutdown: closing all files..." << endl;
+ stringstream ss3;
+ MemoryMappedFile::closeAllFiles( ss3 );
+ rawOut( ss3.str() );
+
+ if( cmdLine.dur ) {
+ log() << "shutdown: journalCleanup..." << endl;
+ dur::journalCleanup();
+ }
+
+#if !defined(__sunos__)
+ if ( lockFile ) {
+ log() << "shutdown: removing fs lock..." << endl;
+ /* This ought to be an unlink(), but Eliot says the last
+ time that was attempted, there was a race condition
+ with acquirePathLock(). */
+#ifdef WIN32
+ if( _chsize( lockFile , 0 ) )
+ log() << "couldn't remove fs lock " << getLastError() << endl;
+ CloseHandle(lockFileHandle);
+#else
+ if( ftruncate( lockFile , 0 ) )
+ log() << "couldn't remove fs lock " << errnoWithDescription() << endl;
+ flock( lockFile, LOCK_UN );
+#endif
+ }
+#endif
+ }
+
/* not using log() herein in case we are already locked */
- void dbexit( ExitCode rc, const char *why) {
+ void dbexit( ExitCode rc, const char *why, bool tryToGetLock ) {
+
+ auto_ptr<writelocktry> wlt;
+ if ( tryToGetLock ) {
+ wlt.reset( new writelocktry( "" , 2 * 60 * 1000 ) );
+ uassert( 13455 , "dbexit timed out getting lock" , wlt->got() );
+ }
+
Client * c = currentClient.get();
{
scoped_lock lk( exitMutex );
if ( numExitCalls++ > 0 ) {
- if ( numExitCalls > 5 ){
+ if ( numExitCalls > 5 ) {
// this means something horrible has happened
::_exit( rc );
}
stringstream ss;
- ss << "dbexit: " << why << "; exiting immediately" << endl;
+ ss << "dbexit: " << why << "; exiting immediately";
tryToOutputFatal( ss.str() );
if ( c ) c->shutdown();
- ::exit( rc );
+ ::exit( rc );
}
}
-
- stringstream ss;
- ss << "dbexit: " << why << endl;
- tryToOutputFatal( ss.str() );
-
+
+ {
+ stringstream ss;
+ ss << "dbexit: " << why;
+ tryToOutputFatal( ss.str() );
+ }
+
try {
- shutdown(); // gracefully shutdown instance
+ shutdownServer(); // gracefully shutdown instance
}
- catch ( ... ){
+ catch ( ... ) {
tryToOutputFatal( "shutdown failed with exception" );
}
- try {
+ try {
mutexDebugger.programEnding();
}
catch (...) { }
-
+
tryToOutputFatal( "dbexit: really exiting now" );
if ( c ) c->shutdown();
::exit(rc);
}
-
- void shutdown() {
-
- log() << "shutdown: going to close listening sockets..." << endl;
- ListeningSockets::get()->closeAll();
- log() << "shutdown: going to flush oplog..." << endl;
- stringstream ss2;
- flushOpLog( ss2 );
- rawOut( ss2.str() );
-
- /* must do this before unmapping mem or you may get a seg fault */
- log() << "shutdown: going to close sockets..." << endl;
- boost::thread close_socket_thread( boost::bind(MessagingPort::closeAllSockets, 0) );
-
- // wait until file preallocation finishes
- // we would only hang here if the file_allocator code generates a
- // synchronous signal, which we don't expect
- log() << "shutdown: waiting for fs preallocator..." << endl;
- theFileAllocator().waitUntilFinished();
-
- log() << "shutdown: closing all files..." << endl;
- stringstream ss3;
- MemoryMappedFile::closeAllFiles( ss3 );
- rawOut( ss3.str() );
-
- // should we be locked here? we aren't. might be ok as-is.
- //recCacheCloseAll();
-
-#if !defined(_WIN32) && !defined(__sunos__)
- if ( lockFile ){
- log() << "shutdown: removing fs lock..." << endl;
- if( ftruncate( lockFile , 0 ) )
- log() << "couldn't remove fs lock " << errnoWithDescription() << endl;
- flock( lockFile, LOCK_UN );
- }
-#endif
- }
-
-#if !defined(_WIN32) && !defined(__sunos__)
+#if !defined(__sunos__)
void writePid(int fd) {
stringstream ss;
ss << getpid() << endl;
string s = ss.str();
const char * data = s.c_str();
+#ifdef WIN32
+ assert ( _write( fd, data, strlen( data ) ) );
+#else
assert ( write( fd, data, strlen( data ) ) );
+#endif
}
void acquirePathLock() {
- string name = ( boost::filesystem::path( dbpath ) / "mongod.lock" ).native_file_string();
+ string name = ( boost::filesystem::path( dbpath ) / "mongod.lock" ).native_file_string();
bool oldFile = false;
@@ -785,37 +856,117 @@ namespace mongo {
oldFile = true;
}
+#ifdef WIN32
+ lockFileHandle = CreateFileA( name.c_str(), GENERIC_READ | GENERIC_WRITE,
+ 0 /* do not allow anyone else access */, NULL,
+ OPEN_ALWAYS /* success if fh can open */, 0, NULL );
+
+ if (lockFileHandle == INVALID_HANDLE_VALUE) {
+ DWORD code = GetLastError();
+ char *msg;
+ FormatMessageA(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM,
+ NULL, code, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT),
+ (LPSTR)&msg, 0, NULL);
+ uasserted( 13627 , msg );
+ }
+ lockFile = _open_osfhandle((intptr_t)lockFileHandle, 0);
+#else
lockFile = open( name.c_str(), O_RDWR | O_CREAT , S_IRWXU | S_IRWXG | S_IRWXO );
- if( lockFile <= 0 ) {
- uasserted( 10309 , str::stream() << "Unable to create / open lock file for lockfilepath: " << name << ' ' << errnoWithDescription());
+ if( lockFile <= 0 ) {
+ uasserted( 10309 , str::stream() << "Unable to create / open lock file for lockfilepath: " << name << ' ' << errnoWithDescription());
}
if (flock( lockFile, LOCK_EX | LOCK_NB ) != 0) {
close ( lockFile );
lockFile = 0;
uassert( 10310 , "Unable to acquire lock for lockfilepath: " + name, 0 );
}
+#endif
- if ( oldFile ){
+ if ( oldFile ) {
// we check this here because we want to see if we can get the lock
// if we can't, then its probably just another mongod running
- cout << "************** \n"
- << "old lock file: " << name << ". probably means unclean shutdown\n"
- << "recommend removing file and running --repair\n"
- << "see: http://dochub.mongodb.org/core/repair for more information\n"
- << "*************" << endl;
- close ( lockFile );
- lockFile = 0;
- uassert( 12596 , "old lock file" , 0 );
+
+ string errmsg;
+ if (cmdLine.dur) {
+ if (!dur::haveJournalFiles()) {
+
+ vector<string> dbnames;
+ getDatabaseNames( dbnames );
+
+ if ( dbnames.size() == 0 ) {
+ // this means that mongod crashed
+ // between initial startup and when journaling was initialized
+ // it is safe to continue
+ }
+ else {
+ errmsg = str::stream()
+ << "************** \n"
+ << "old lock file: " << name << ". probably means unclean shutdown,\n"
+ << "but there are no journal files to recover.\n"
+ << "this is likely human error or filesystem corruption.\n"
+ << "found " << dbnames.size() << " dbs.\n"
+ << "see: http://dochub.mongodb.org/core/repair for more information\n"
+ << "*************";
+ }
+
+
+ }
+ }
+ else {
+ errmsg = str::stream()
+ << "************** \n"
+ << "old lock file: " << name << ". probably means unclean shutdown\n"
+ << "recommend removing file and running --repair\n"
+ << "see: http://dochub.mongodb.org/core/repair for more information\n"
+ << "*************";
+ }
+
+ if (!errmsg.empty()) {
+ cout << errmsg << endl;
+#ifdef WIN32
+ CloseHandle( lockFileHandle );
+#else
+ close ( lockFile );
+#endif
+ lockFile = 0;
+ uassert( 12596 , "old lock file" , 0 );
+ }
+ }
+
+ // Not related to lock file, but this is where we handle unclean shutdown
+ if( !cmdLine.dur && dur::haveJournalFiles() ) {
+ cout << "**************" << endl;
+ cout << "Error: journal files are present in journal directory, yet starting without --dur enabled." << endl;
+ cout << "It is recommended that you start with journaling enabled so that recovery may occur." << endl;
+ cout << "Alternatively (not recommended), you can backup everything, then delete the journal files, and run --repair" << endl;
+ cout << "**************" << endl;
+ uasserted(13597, "can't start without --dur enabled when journal/ files are present");
}
+#ifdef WIN32
+ uassert( 13625, "Unable to truncate lock file", _chsize(lockFile, 0) == 0);
+ writePid( lockFile );
+ _commit( lockFile );
+#else
uassert( 13342, "Unable to truncate lock file", ftruncate(lockFile, 0) == 0);
writePid( lockFile );
fsync( lockFile );
+#endif
}
#else
void acquirePathLock() {
- // TODO - this is very bad
+        // TODO - it is very bad that the code above is not running here.
+
+ // Not related to lock file, but this is where we handle unclean shutdown
+ if( !cmdLine.dur && dur::haveJournalFiles() ) {
+ cout << "**************" << endl;
+ cout << "Error: journal files are present in journal directory, yet starting without --dur enabled." << endl;
+ cout << "It is recommended that you start with journaling enabled so that recovery may occur." << endl;
+ cout << "Alternatively (not recommended), you can backup everything, then delete the journal files, and run --repair" << endl;
+ cout << "**************" << endl;
+ uasserted(13618, "can't start without --dur enabled when journal/ files are present");
+ }
}
-#endif
-
+#endif
+
} // namespace mongo
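acquirePathLock() now interprets an existing mongod.lock file in the light of --dur: with journaling on and journal files present (or an empty dbpath) startup continues and recovery runs later, while an old lock file without a journal, or journal files without --dur, refuses to start. A standalone sketch of that decision table; startupAction is an illustrative stand-in:

    #include <iostream>
    #include <string>

    std::string startupAction( bool oldLockFile, bool durEnabled, bool journalFiles, bool haveDbs ) {
        if ( oldLockFile ) {
            if ( ! durEnabled )
                return "refuse to start: unclean shutdown, recommend --repair";
            if ( ! journalFiles && haveDbs )
                return "refuse to start: unclean shutdown but no journal files to recover from";
            // with --dur plus journal files (or an empty dbpath) startup continues;
            // journal recovery, if needed, runs later in startup
        }
        if ( ! durEnabled && journalFiles )
            return "refuse to start: journal/ files present but --dur is off";
        return "start normally";
    }

    int main() {
        std::cout << startupAction( true, true, true, true )   << std::endl;  // recover via journal
        std::cout << startupAction( true, false, false, true ) << std::endl;  // recommend --repair
        return 0;
    }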
diff --git a/db/instance.h b/db/instance.h
index 5458fc1..2516aec 100644
--- a/db/instance.h
+++ b/db/instance.h
@@ -21,7 +21,7 @@
#include "../client/dbclient.h"
-#include "curop.h"
+#include "curop-inl.h"
#include "security.h"
#include "cmdline.h"
#include "client.h"
@@ -40,7 +40,7 @@ namespace mongo {
DiagLog() : f(0) , level(0), mutex("DiagLog") { }
void init() {
- if ( ! f && level ){
+ if ( ! f && level ) {
log() << "diagLogging = " << level << endl;
stringstream ss;
ss << dbpath << "/diaglog." << hex << time(0);
@@ -55,20 +55,20 @@ namespace mongo {
/**
* @return old
*/
- int setLevel( int newLevel ){
+ int setLevel( int newLevel ) {
int old = level;
level = newLevel;
init();
return old;
}
void flush() {
- if ( level ){
+ if ( level ) {
scoped_lock lk(mutex);
f->flush();
}
}
void write(char *data,int len) {
- if ( level & 1 ){
+ if ( level & 1 ) {
scoped_lock lk(mutex);
f->write(data,len);
}
@@ -77,7 +77,7 @@ namespace mongo {
if ( level & 2 ) {
bool log = (level & 4) == 0;
OCCASIONALLY log = true;
- if ( log ){
+ if ( log ) {
scoped_lock lk(mutex);
assert( f );
f->write(data,len);
@@ -102,52 +102,56 @@ namespace mongo {
}
~DbResponse() { delete response; }
};
-
- bool assembleResponse( Message &m, DbResponse &dbresponse, const SockAddr &client = unknownAddress );
+
+ void assembleResponse( Message &m, DbResponse &dbresponse, const SockAddr &client = unknownAddress );
void getDatabaseNames( vector< string > &names , const string& usePath = dbpath );
- /* returns true if there is no data on this server. useful when starting replication.
- local database does NOT count.
+ /* returns true if there is no data on this server. useful when starting replication.
+ local database does NOT count.
*/
bool replHasDatabases();
-// --- local client ---
-
+ /** "embedded" calls to the local server directly.
+ Caller does not need to lock, that is handled within.
+ */
class DBDirectClient : public DBClientBase {
-
public:
virtual auto_ptr<DBClientCursor> query(const string &ns, Query query, int nToReturn = 0, int nToSkip = 0,
const BSONObj *fieldsToReturn = 0, int queryOptions = 0);
-
+
virtual bool isFailed() const {
return false;
}
virtual string toString() {
return "DBDirectClient";
}
- virtual string getServerAddress() const{
+ virtual string getServerAddress() const {
return "localhost"; // TODO: should this have the port?
}
- virtual bool call( Message &toSend, Message &response, bool assertOk=true );
+ virtual bool call( Message &toSend, Message &response, bool assertOk=true , string * actualServer = 0 );
virtual void say( Message &toSend );
virtual void sayPiggyBack( Message &toSend ) {
// don't need to piggy back when connected locally
return say( toSend );
}
-
+
virtual void killCursor( long long cursorID );
-
- virtual bool callRead( Message& toSend , Message& response ){
+
+ virtual bool callRead( Message& toSend , Message& response ) {
return call( toSend , response );
}
-
- virtual ConnectionString::ConnectionType type() const { return ConnectionString::MASTER; }
- virtual bool isMember( const DBConnector * conn ) const { return this == conn; };
+
+ virtual unsigned long long count(const string &ns, const BSONObj& query = BSONObj(), int options=0, int limit=0, int skip=0 );
+
+ virtual ConnectionString::ConnectionType type() const { return ConnectionString::MASTER; }
};
extern int lockFile;
+#ifdef WIN32
+ extern HANDLE lockFileHandle;
+#endif
void acquirePathLock();
void maybeCreatePidFile();
-
+
} // namespace mongo
diff --git a/db/introspect.cpp b/db/introspect.cpp
index d72bb3f..cee0da8 100644
--- a/db/introspect.cpp
+++ b/db/introspect.cpp
@@ -26,8 +26,7 @@
namespace mongo {
- void profile( const char *str, int millis)
- {
+ void profile( const char *str, int millis) {
BSONObjBuilder b;
b.appendDate("ts", jsTime());
b.append("info", str);
diff --git a/db/jsobj.cpp b/db/jsobj.cpp
index 9f613c7..25ab8a8 100644
--- a/db/jsobj.cpp
+++ b/db/jsobj.cpp
@@ -18,6 +18,7 @@
*/
#include "pch.h"
+#include "../bson/oid.h"
#include "jsobj.h"
#include "nonce.h"
#include "../bson/util/atomic_int.h"
@@ -34,6 +35,7 @@
#define assert MONGO_assert
// make sure our assumptions are valid
+BOOST_STATIC_ASSERT( sizeof(short) == 2 );
BOOST_STATIC_ASSERT( sizeof(int) == 4 );
BOOST_STATIC_ASSERT( sizeof(long long) == 8 );
BOOST_STATIC_ASSERT( sizeof(double) == 8 );
@@ -48,6 +50,9 @@ namespace mongo {
DateNowLabeler DATENOW;
+ MinKeyLabeler MINKEY;
+ MaxKeyLabeler MAXKEY;
+
string escape( string s , bool escape_slash=false) {
StringBuilder ret;
for ( string::iterator i = s.begin(); i != s.end(); ++i ) {
@@ -81,7 +86,8 @@ namespace mongo {
//TODO: these should be utf16 code-units not bytes
char c = *i;
ret << "\\u00" << toHexLower(&c, 1);
- } else {
+ }
+ else {
ret << *i;
}
}
@@ -111,7 +117,8 @@ namespace mongo {
number() <= numeric_limits< double >::max() ) {
s.precision( 16 );
s << number();
- } else {
+ }
+ else {
StringBuilder ss;
ss << "Number " << number() << " cannot be represented in JSON";
string message = ss.str();
@@ -170,13 +177,15 @@ namespace mongo {
case jstOID:
if ( format == TenGen ) {
s << "ObjectId( ";
- } else {
+ }
+ else {
s << "{ \"$oid\" : ";
}
s << '"' << __oid() << '"';
if ( format == TenGen ) {
s << " )";
- } else {
+ }
+ else {
s << " }";
}
break;
@@ -203,7 +212,8 @@ namespace mongo {
if( d == 0 ) s << '0';
else
s << '"' << date().toString() << '"';
- } else
+ }
+ else
s << date();
if ( format == Strict )
s << " }";
@@ -211,13 +221,14 @@ namespace mongo {
s << " )";
break;
case RegEx:
- if ( format == Strict ){
+ if ( format == Strict ) {
s << "{ \"$regex\" : \"" << escape( regex() );
s << "\", \"$options\" : \"" << regexFlags() << "\" }";
- } else {
+ }
+ else {
s << "/" << escape( regex() , true ) << "/";
// FIXME Worry about alpha order?
- for ( const char *f = regexFlags(); *f; ++f ){
+ for ( const char *f = regexFlags(); *f; ++f ) {
switch ( *f ) {
case 'g':
case 'i':
@@ -232,7 +243,7 @@ namespace mongo {
case CodeWScope: {
BSONObj scope = codeWScopeObject();
- if ( ! scope.isEmpty() ){
+ if ( ! scope.isEmpty() ) {
s << "{ \"$code\" : " << _asCode() << " , "
<< " \"$scope\" : " << scope.jsonString() << " }";
break;
@@ -243,7 +254,7 @@ namespace mongo {
case Code:
s << _asCode();
break;
-
+
case Timestamp:
s << "{ \"t\" : " << timestampTime() << " , \"i\" : " << timestampInc() << " }";
break;
@@ -259,7 +270,7 @@ namespace mongo {
default:
StringBuilder ss;
ss << "Cannot create a properly formatted JSON string with "
- << "element: " << toString() << " of type: " << type();
+ << "element: " << toString() << " of type: " << type();
string message = ss.str();
massert( 10312 , message.c_str(), false );
}
@@ -279,13 +290,13 @@ namespace mongo {
else if ( fn[3] == 'e' && fn[4] == 0 ) return BSONObj::LTE;
}
}
- else if ( fn[1] == 'n' && fn[2] == 'e' ){
+ else if ( fn[1] == 'n' && fn[2] == 'e' ) {
if ( fn[3] == 0 )
return BSONObj::NE;
- if ( fn[3] == 'a' && fn[4] == 'r' && fn[5] == 0 )
+ if ( fn[3] == 'a' && fn[4] == 'r') // matches anything with $near prefix
return BSONObj::opNEAR;
}
- else if ( fn[1] == 'm' ){
+ else if ( fn[1] == 'm' ) {
if ( fn[2] == 'o' && fn[3] == 'd' && fn[4] == 0 )
return BSONObj::opMOD;
if ( fn[2] == 'a' && fn[3] == 'x' && fn[4] == 'D' && fn[5] == 'i' && fn[6] == 's' && fn[7] == 't' && fn[8] == 'a' && fn[9] == 'n' && fn[10] == 'c' && fn[11] == 'e' && fn[12] == 0 )
@@ -301,7 +312,7 @@ namespace mongo {
return BSONObj::opALL;
else if ( fn[1] == 's' && fn[2] == 'i' && fn[3] == 'z' && fn[4] == 'e' && fn[5] == 0 )
return BSONObj::opSIZE;
- else if ( fn[1] == 'e' ){
+ else if ( fn[1] == 'e' ) {
if ( fn[2] == 'x' && fn[3] == 'i' && fn[4] == 's' && fn[5] == 't' && fn[6] == 's' && fn[7] == 0 )
return BSONObj::opEXISTS;
if ( fn[2] == 'l' && fn[3] == 'e' && fn[4] == 'm' && fn[5] == 'M' && fn[6] == 'a' && fn[7] == 't' && fn[8] == 'c' && fn[9] == 'h' && fn[10] == 0 )
@@ -370,22 +381,24 @@ namespace mongo {
double left = l.number();
double right = r.number();
bool lNan = !( left <= numeric_limits< double >::max() &&
- left >= -numeric_limits< double >::max() );
+ left >= -numeric_limits< double >::max() );
bool rNan = !( right <= numeric_limits< double >::max() &&
- right >= -numeric_limits< double >::max() );
+ right >= -numeric_limits< double >::max() );
if ( lNan ) {
if ( rNan ) {
return 0;
- } else {
+ }
+ else {
return -1;
}
- } else if ( rNan ) {
+ }
+ else if ( rNan ) {
return 1;
}
x = left - right;
if ( x < 0 ) return -1;
return x == 0 ? 0 : 1;
- }
+ }
case jstOID:
return memcmp(l.value(), r.value(), 12);
case Code:
@@ -408,8 +421,7 @@ namespace mongo {
if ( lsz - rsz != 0 ) return lsz - rsz;
return memcmp(l.value()+4, r.value()+4, lsz+1);
}
- case RegEx:
- {
+ case RegEx: {
int c = strcmp(l.regex(), r.regex());
if ( c )
return c;
@@ -462,11 +474,14 @@ namespace mongo {
return fe.getGtLtOp();
}
- FieldCompareResult compareDottedFieldNames( const string& l , const string& r ){
+ FieldCompareResult compareDottedFieldNames( const string& l , const string& r ) {
+ static int maxLoops = 1024 * 1024;
+
size_t lstart = 0;
size_t rstart = 0;
- while ( 1 ){
- if ( lstart >= l.size() ){
+
+ for ( int i=0; i<maxLoops; i++ ) {
+ if ( lstart >= l.size() ) {
if ( rstart >= r.size() )
return SAME;
return RIGHT_SUBFIELD;
@@ -493,6 +508,10 @@ namespace mongo {
lstart = lend + 1;
rstart = rend + 1;
}
+
+ log() << "compareDottedFieldNames ERROR l: " << l << " r: " << r << " TOO MANY LOOPS" << endl;
+ assert(0);
+ return SAME; // will never get here
}
/* BSONObj ------------------------------------------------------------*/
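
The guard added to compareDottedFieldNames above caps a walk that previously ran as while(1); each pass compares one dotted segment. A self-contained sketch of the same walk, with the segment comparison simplified to std::string::compare and a local enum, illustrative only:

    #include <cassert>
    #include <string>

    enum FieldCompareResult { LEFT_BEFORE , LEFT_SUBFIELD , SAME , RIGHT_SUBFIELD , RIGHT_BEFORE };

    // Walk two dotted names ("a.b.c" vs "a.b") one segment at a time, with the
    // same kind of bounded loop the patch introduces instead of while(1).
    FieldCompareResult compareDotted( const std::string& l , const std::string& r ) {
        static const int maxLoops = 1024 * 1024;
        size_t lstart = 0 , rstart = 0;
        for ( int i = 0; i < maxLoops; i++ ) {
            if ( lstart >= l.size() )
                return rstart >= r.size() ? SAME : RIGHT_SUBFIELD;
            if ( rstart >= r.size() )
                return LEFT_SUBFIELD;
            size_t lend = l.find( '.' , lstart );
            size_t rend = r.find( '.' , rstart );
            if ( lend == std::string::npos ) lend = l.size();
            if ( rend == std::string::npos ) rend = r.size();
            int x = l.compare( lstart , lend - lstart , r , rstart , rend - rstart );
            if ( x < 0 ) return LEFT_BEFORE;
            if ( x > 0 ) return RIGHT_BEFORE;
            lstart = lend + 1;
            rstart = rend + 1;
        }
        assert( 0 ); // mirrors the "TOO MANY LOOPS" bail-out above
        return SAME;
    }
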
@@ -534,33 +553,35 @@ namespace mongo {
return s.str();
}
-// todo: can be a little faster if we don't use toString() here.
bool BSONObj::valid() const {
- try{
+ try {
BSONObjIterator it(*this);
- while( it.moreWithEOO() ){
+ while( it.moreWithEOO() ) {
// both throw exception on failure
BSONElement e = it.next(true);
e.validate();
- if (e.eoo()){
+ if (e.eoo()) {
if (it.moreWithEOO())
return false;
return true;
- }else if (e.isABSONObj()){
+ }
+ else if (e.isABSONObj()) {
if(!e.embeddedObject().valid())
return false;
- }else if (e.type() == CodeWScope){
+ }
+ else if (e.type() == CodeWScope) {
if(!e.codeWScopeObject().valid())
return false;
}
}
- } catch (...) {
+ }
+ catch (...) {
}
return false;
}
- int BSONObj::woCompare(const BSONObj& r, const Ordering &o, bool considerFieldName) const {
+ int BSONObj::woCompare(const BSONObj& r, const Ordering &o, bool considerFieldName) const {
if ( isEmpty() )
return r.isEmpty() ? 0 : -1;
if ( r.isEmpty() )
@@ -619,13 +640,13 @@ namespace mongo {
return 1;
int x;
-/*
- if( ordered && o.type() == String && strcmp(o.valuestr(), "ascii-proto") == 0 &&
- l.type() == String && r.type() == String ) {
- // note: no negative support yet, as this is just sort of a POC
- x = _stricmp(l.valuestr(), r.valuestr());
- }
- else*/ {
+ /*
+ if( ordered && o.type() == String && strcmp(o.valuestr(), "ascii-proto") == 0 &&
+ l.type() == String && r.type() == String ) {
+ // note: no negative support yet, as this is just sort of a POC
+ x = _stricmp(l.valuestr(), r.valuestr());
+ }
+ else*/ {
x = l.woCompare( r, considerFieldName );
if ( ordered && o.number() < 0 )
x = -x;
@@ -639,7 +660,7 @@ namespace mongo {
BSONObj staticNull = fromjson( "{'':null}" );
/* well ordered compare */
- int BSONObj::woSortOrder(const BSONObj& other, const BSONObj& sortKey , bool useDotted ) const{
+ int BSONObj::woSortOrder(const BSONObj& other, const BSONObj& sortKey , bool useDotted ) const {
if ( isEmpty() )
return other.isEmpty() ? 0 : -1;
if ( other.isEmpty() )
@@ -648,7 +669,7 @@ namespace mongo {
uassert( 10060 , "woSortOrder needs a non-empty sortKey" , ! sortKey.isEmpty() );
BSONObjIterator i(sortKey);
- while ( 1 ){
+ while ( 1 ) {
BSONElement f = i.next();
if ( f.eoo() )
return 0;
@@ -678,36 +699,41 @@ namespace mongo {
const char* next = p+1;
BSONElement e = getField( left.c_str() );
- if (e.type() == Object){
+ if (e.type() == Object) {
e.embeddedObject().getFieldsDotted(next, ret);
- } else if (e.type() == Array) {
+ }
+ else if (e.type() == Array) {
bool allDigits = false;
- if ( isdigit( *next ) ){
+ if ( isdigit( *next ) ) {
const char * temp = next + 1;
while ( isdigit( *temp ) )
temp++;
- allDigits = *temp == '.';
+ allDigits = (*temp == '.' || *temp == '\0');
}
if (allDigits) {
e.embeddedObject().getFieldsDotted(next, ret);
- } else {
+ }
+ else {
BSONObjIterator i(e.embeddedObject());
- while ( i.more() ){
+ while ( i.more() ) {
BSONElement e2 = i.next();
if (e2.type() == Object || e2.type() == Array)
e2.embeddedObject().getFieldsDotted(next, ret);
}
}
- } else {
+ }
+ else {
// do nothing: no match
}
}
- } else {
- if (e.type() == Array){
+ }
+ else {
+ if (e.type() == Array) {
BSONObjIterator i(e.embeddedObject());
while ( i.more() )
ret.insert(i.next());
- } else {
+ }
+ else {
ret.insert(e);
}
}
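
The changed condition above (*temp == '.' || *temp == '\0') lets a trailing all-digit component such as "arr.0" count as an array index, where previously only an interior one like "arr.0.x" did. The check on its own as a tiny helper, illustrative rather than the mongo API:

    #include <cctype>

    // True when the next dotted-path component is all digits, i.e. should be
    // treated as an array index. With the fix above the digits may also sit at
    // the very end of the path ("arr.0"), not only before another dot ("arr.0.x").
    static bool nextComponentIsIndex( const char * next ) {
        if ( ! isdigit( (unsigned char)*next ) )
            return false;
        const char * temp = next + 1;
        while ( isdigit( (unsigned char)*temp ) )
            temp++;
        return *temp == '.' || *temp == '\0';
    }
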
@@ -715,15 +741,18 @@ namespace mongo {
BSONElement BSONObj::getFieldDottedOrArray(const char *&name) const {
const char *p = strchr(name, '.');
- string left;
+
+ BSONElement sub;
+
if ( p ) {
- left = string(name, p-name);
+ sub = getField( string(name, p-name) );
name = p + 1;
- } else {
- left = string(name);
+ }
+ else {
+ sub = getField( name );
name = name + strlen(name);
}
- BSONElement sub = getField(left.c_str());
+
if ( sub.eoo() )
return nullElement;
else if ( sub.type() == Array || name[0] == '\0')
@@ -778,7 +807,7 @@ namespace mongo {
break;
BSONElement x = filter.getField( e.fieldName() );
if ( ( x.eoo() && !inFilter ) ||
- ( !x.eoo() && inFilter ) )
+ ( !x.eoo() && inFilter ) )
b.append( e );
}
return b.obj();
@@ -858,7 +887,8 @@ namespace mongo {
gotId = gotId || strcmp(fname, "_id")==0;
if ( n == N && gotId )
break;
- } else if ( strcmp(fname, "_id")==0 ) {
+ }
+ else if ( strcmp(fname, "_id")==0 ) {
b.append(e);
gotId = true;
if ( n == N && gotId )
@@ -882,20 +912,20 @@ namespace mongo {
if ( e.eoo() )
break;
switch( e.type() ) {
- case MinKey: {
- BSONObjBuilder m;
- m.append( "$minElement", 1 );
- b.append( e.fieldName(), m.done() );
- break;
- }
- case MaxKey: {
- BSONObjBuilder m;
- m.append( "$maxElement", 1 );
- b.append( e.fieldName(), m.done() );
- break;
- }
- default:
- b.append( e );
+ case MinKey: {
+ BSONObjBuilder m;
+ m.append( "$minElement", 1 );
+ b.append( e.fieldName(), m.done() );
+ break;
+ }
+ case MaxKey: {
+ BSONObjBuilder m;
+ m.append( "$maxElement", 1 );
+ b.append( e.fieldName(), m.done() );
+ break;
+ }
+ default:
+ b.append( e );
}
}
return b.obj();
@@ -913,7 +943,8 @@ namespace mongo {
if ( !f.eoo() ) {
b.appendAs( e, f.fieldName() );
f = j.next();
- } else {
+ }
+ else {
b.append( e );
}
}
@@ -922,20 +953,20 @@ namespace mongo {
bool BSONObj::okForStorage() const {
BSONObjIterator i( *this );
- while ( i.more() ){
+ while ( i.more() ) {
BSONElement e = i.next();
const char * name = e.fieldName();
-
+
if ( strchr( name , '.' ) ||
- strchr( name , '$' ) ){
- return
+ strchr( name , '$' ) ) {
+ return
strcmp( name , "$ref" ) == 0 ||
strcmp( name , "$id" ) == 0
;
}
-
- if ( e.mayEncapsulate() ){
- switch ( e.type() ){
+
+ if ( e.mayEncapsulate() ) {
+ switch ( e.type() ) {
case Object:
case Array:
if ( ! e.embeddedObject().okForStorage() )
@@ -948,7 +979,7 @@ namespace mongo {
default:
uassert( 12579, "unhandled cases in BSONObj okForStorage" , 0 );
}
-
+
}
}
return true;
@@ -982,25 +1013,26 @@ namespace mongo {
return ss.str();
}
- void nested2dotted(BSONObjBuilder& b, const BSONObj& obj, const string& base){
+ void nested2dotted(BSONObjBuilder& b, const BSONObj& obj, const string& base) {
BSONObjIterator it(obj);
- while (it.more()){
+ while (it.more()) {
BSONElement e = it.next();
- if (e.type() == Object){
+ if (e.type() == Object) {
string newbase = base + e.fieldName() + ".";
nested2dotted(b, e.embeddedObject(), newbase);
- }else{
+ }
+ else {
string newbase = base + e.fieldName();
b.appendAs(e, newbase);
}
}
}
- void dotted2nested(BSONObjBuilder& b, const BSONObj& obj){
+ void dotted2nested(BSONObjBuilder& b, const BSONObj& obj) {
//use map to sort fields
BSONMap sorted = bson2map(obj);
EmbeddedBuilder eb(&b);
- for(BSONMap::const_iterator it=sorted.begin(); it!=sorted.end(); ++it){
+ for(BSONMap::const_iterator it=sorted.begin(); it!=sorted.end(); ++it) {
eb.appendAs(it->second, it->first);
}
eb.done();
@@ -1037,16 +1069,16 @@ namespace mongo {
} minkeydata;
BSONObj minKey((const char *) &minkeydata);
-/*
- struct JSObj0 {
- JSObj0() {
- totsize = 5;
- eoo = EOO;
- }
- int totsize;
- char eoo;
- } js0;
-*/
+ /*
+ struct JSObj0 {
+ JSObj0() {
+ totsize = 5;
+ eoo = EOO;
+ }
+ int totsize;
+ char eoo;
+ } js0;
+ */
#pragma pack()
struct BsonUnitTest : public UnitTest {
@@ -1078,7 +1110,7 @@ namespace mongo {
assert( b == id );
}
- void testbounds(){
+ void testbounds() {
BSONObj l , r;
{
BSONObjBuilder b;
@@ -1101,7 +1133,7 @@ namespace mongo {
assert( r.woCompare( l ) > 0 );
}
- void testorder(){
+ void testorder() {
{
BSONObj x,y,z;
{ BSONObjBuilder b; b.append( "x" , (long long)2 ); x = b.obj(); }
@@ -1176,84 +1208,6 @@ namespace mongo {
}
} bson_unittest;
-/*
- BSONObjBuilder& BSONObjBuilderValueStream::operator<<( const char * value ) {
- _builder->append( _fieldName , value );
- return *_builder;
- }
-
- BSONObjBuilder& BSONObjBuilderValueStream::operator<<( const int value ) {
- _builder->append( _fieldName , value );
- return *_builder;
- }
-
- BSONObjBuilder& BSONObjBuilderValueStream::operator<<( const double value ) {
- _builder->append( _fieldName , value );
- return *_builder;
- }
-*/
-
- void OID::init() {
- static AtomicUInt inc = getRandomNumber();
- unsigned t = (unsigned) time(0);
- char *T = (char *) &t;
- data[0] = T[3];
- data[1] = T[2];
- data[2] = T[1];
- data[3] = T[0];
-
- (unsigned&) data[4] = _machine;
-
- int new_inc = inc++;
- T = (char *) &new_inc;
- char * raw = (char*)&b;
- raw[0] = T[3];
- raw[1] = T[2];
- raw[2] = T[1];
- raw[3] = T[0];
- }
-
- unsigned OID::_machine = (unsigned) security.getNonceInitSafe();
- void OID::newState(){
- unsigned before = _machine;
- // using fresh Security object to avoid buffered devrandom
- _machine = (unsigned)security.getNonce();
- assert( _machine != before );
- }
-
- void OID::init( string s ){
- assert( s.size() == 24 );
- const char *p = s.c_str();
- for( int i = 0; i < 12; i++ ) {
- data[i] = fromHex(p);
- p += 2;
- }
- }
-
- void OID::init(Date_t date, bool max){
- int time = (int) (date / 1000);
- char* T = (char *) &time;
- data[0] = T[3];
- data[1] = T[2];
- data[2] = T[1];
- data[3] = T[0];
-
- if (max)
- *(long long*)(data + 4) = 0xFFFFFFFFFFFFFFFFll;
- else
- *(long long*)(data + 4) = 0x0000000000000000ll;
- }
-
- time_t OID::asTimeT(){
- int time;
- char* T = (char *) &time;
- T[0] = data[3];
- T[1] = data[2];
- T[2] = data[1];
- T[3] = data[0];
- return time;
- }
-
Labeler::Label GT( "$gt" );
Labeler::Label GTE( "$gte" );
Labeler::Label LT( "$lt" );
@@ -1268,21 +1222,20 @@ namespace mongo {
timestamp = OpTime::now().asDate();
}
- void BSONObjBuilder::appendMinForType( const StringData& fieldName , int t ){
- switch ( t ){
+ void BSONObjBuilder::appendMinForType( const StringData& fieldName , int t ) {
+ switch ( t ) {
case MinKey: appendMinKey( fieldName ); return;
case MaxKey: appendMinKey( fieldName ); return;
case NumberInt:
case NumberDouble:
case NumberLong:
append( fieldName , - numeric_limits<double>::max() ); return;
- case jstOID:
- {
- OID o;
- memset(&o, 0, sizeof(o));
- appendOID( fieldName , &o);
- return;
- }
+ case jstOID: {
+ OID o;
+ memset(&o, 0, sizeof(o));
+ appendOID( fieldName , &o);
+ return;
+ }
case Bool: appendBool( fieldName , false); return;
case Date: appendDate( fieldName , 0); return;
case jstNULL: appendNull( fieldName ); return;
@@ -1296,13 +1249,12 @@ namespace mongo {
case Undefined:
appendUndefined( fieldName ); return;
case RegEx: appendRegex( fieldName , "" ); return;
- case DBRef:
- {
- OID o;
- memset(&o, 0, sizeof(o));
- appendDBRef( fieldName , "" , o );
- return;
- }
+ case DBRef: {
+ OID o;
+ memset(&o, 0, sizeof(o));
+ appendDBRef( fieldName , "" , o );
+ return;
+ }
case Code: appendCode( fieldName , "" ); return;
case CodeWScope: appendCodeWScope( fieldName , "" , BSONObj() ); return;
case Timestamp: appendTimestamp( fieldName , 0); return;
@@ -1312,8 +1264,8 @@ namespace mongo {
uassert( 10061 , "type not supported for appendMinElementForType" , false );
}
- void BSONObjBuilder::appendMaxForType( const StringData& fieldName , int t ){
- switch ( t ){
+ void BSONObjBuilder::appendMaxForType( const StringData& fieldName , int t ) {
+ switch ( t ) {
case MinKey: appendMaxKey( fieldName ); break;
case MaxKey: appendMaxKey( fieldName ); break;
case NumberInt:
@@ -1324,13 +1276,12 @@ namespace mongo {
case BinData:
appendMinForType( fieldName , jstOID );
break;
- case jstOID:
- {
- OID o;
- memset(&o, 0xFF, sizeof(o));
- appendOID( fieldName , &o);
- break;
- }
+ case jstOID: {
+ OID o;
+ memset(&o, 0xFF, sizeof(o));
+ appendOID( fieldName , &o);
+ break;
+ }
case Undefined:
case jstNULL:
appendMinForType( fieldName , NumberInt );
@@ -1349,7 +1300,7 @@ namespace mongo {
}
const string BSONObjBuilder::numStrs[] = {
- "0", "1", "2", "3", "4", "5", "6", "7", "8", "9",
+ "0", "1", "2", "3", "4", "5", "6", "7", "8", "9",
"10", "11", "12", "13", "14", "15", "16", "17", "18", "19",
"20", "21", "22", "23", "24", "25", "26", "27", "28", "29",
"30", "31", "32", "33", "34", "35", "36", "37", "38", "39",
@@ -1361,77 +1312,77 @@ namespace mongo {
"90", "91", "92", "93", "94", "95", "96", "97", "98", "99",
};
- bool BSONObjBuilder::appendAsNumber( const StringData& fieldName , const string& data ){
+ bool BSONObjBuilder::appendAsNumber( const StringData& fieldName , const string& data ) {
if ( data.size() == 0 || data == "-")
return false;
-
+
unsigned int pos=0;
if ( data[0] == '-' )
pos++;
-
+
bool hasDec = false;
-
- for ( ; pos<data.size(); pos++ ){
+
+ for ( ; pos<data.size(); pos++ ) {
if ( isdigit(data[pos]) )
continue;
- if ( data[pos] == '.' ){
+ if ( data[pos] == '.' ) {
if ( hasDec )
return false;
hasDec = true;
continue;
}
-
+
return false;
}
-
- if ( hasDec ){
+
+ if ( hasDec ) {
double d = atof( data.c_str() );
append( fieldName , d );
return true;
}
-
- if ( data.size() < 8 ){
+
+ if ( data.size() < 8 ) {
append( fieldName , atoi( data.c_str() ) );
return true;
}
-
+
try {
long long num = boost::lexical_cast<long long>( data );
append( fieldName , num );
return true;
}
- catch(bad_lexical_cast &){
+ catch(bad_lexical_cast &) {
return false;
}
}
- void BSONObjBuilder::appendKeys( const BSONObj& keyPattern , const BSONObj& values ){
+ void BSONObjBuilder::appendKeys( const BSONObj& keyPattern , const BSONObj& values ) {
BSONObjIterator i(keyPattern);
BSONObjIterator j(values);
-
- while ( i.more() && j.more() ){
+
+ while ( i.more() && j.more() ) {
appendAs( j.next() , i.next().fieldName() );
}
-
+
assert( ! i.more() );
assert( ! j.more() );
}
- int BSONElementFieldSorter( const void * a , const void * b ){
+ int BSONElementFieldSorter( const void * a , const void * b ) {
const char * x = *((const char**)a);
const char * y = *((const char**)b);
x++; y++;
return lexNumCmp( x , y );
}
-
- BSONObjIteratorSorted::BSONObjIteratorSorted( const BSONObj& o ){
+
+ BSONObjIteratorSorted::BSONObjIteratorSorted( const BSONObj& o ) {
_nfields = o.nFields();
_fields = new const char*[_nfields];
int x = 0;
BSONObjIterator i( o );
- while ( i.more() ){
+ while ( i.more() ) {
_fields[x++] = i.next().rawdata();
assert( _fields[x-1] );
}
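
appendAsNumber, in the hunk ending here, decides how a numeric string should be stored: anything other than digits with at most one decimal point is rejected, decimals become doubles, integer strings shorter than 8 characters become ints, and longer ones go through lexical_cast to long long. A standalone sketch of that classification, returning a tag instead of appending to a builder:

    #include <cctype>
    #include <string>

    enum NumKind { NotNumber , AsInt , AsLong , AsDouble };

    // Same decision tree as appendAsNumber above, minus the builder call.
    static NumKind classifyNumber( const std::string& data ) {
        if ( data.size() == 0 || data == "-" )
            return NotNumber;
        unsigned pos = 0;
        if ( data[0] == '-' )
            pos++;
        bool hasDec = false;
        for ( ; pos < data.size(); pos++ ) {
            if ( isdigit( (unsigned char)data[pos] ) )
                continue;
            if ( data[pos] == '.' ) {
                if ( hasDec )
                    return NotNumber;
                hasDec = true;
                continue;
            }
            return NotNumber;
        }
        if ( hasDec )
            return AsDouble;
        return data.size() < 8 ? AsInt : AsLong;
    }
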
@@ -1441,10 +1392,10 @@ namespace mongo {
}
/** transform a BSON array into a vector of BSONElements.
- we match array # positions with their vector position, and ignore
- any non-numeric fields.
+ we match array # positions with their vector position, and ignore
+ any fields with non-numeric field names.
*/
- vector<BSONElement> BSONElement::Array() const {
+ vector<BSONElement> BSONElement::Array() const {
chk(mongo::Array);
vector<BSONElement> v;
BSONObjIterator i(Obj());
@@ -1453,7 +1404,7 @@ namespace mongo {
const char *f = e.fieldName();
try {
unsigned u = stringToNum(f);
- assert( u < 4096 );
+ assert( u < 1000000 );
if( u >= v.size() )
v.resize(u+1);
v[u] = e;
diff --git a/db/jsobj.h b/db/jsobj.h
index 258a952..a6472d5 100644
--- a/db/jsobj.h
+++ b/db/jsobj.h
@@ -1,4 +1,4 @@
-/** @file jsobj.h
+/** @file jsobj.h
BSON classes
*/
@@ -40,7 +40,7 @@
#include "../bson/bsonmisc.h"
#include "../bson/bsonobjbuilder.h"
#include "../bson/bsonobjiterator.h"
-#include "../bson/bsoninlines.h"
+#include "../bson/bson-inl.h"
#include "../bson/ordering.h"
#include "../bson/stringdata.h"
diff --git a/db/jsobjmanipulator.h b/db/jsobjmanipulator.h
index c43e876..0b3c0c2 100644
--- a/db/jsobjmanipulator.h
+++ b/db/jsobjmanipulator.h
@@ -19,6 +19,7 @@
#pragma once
#include "jsobj.h"
+#include "dur.h"
namespace mongo {
@@ -35,41 +36,68 @@ namespace mongo {
OpTime::now().asDate()
*/
void initTimestamp();
-
+
/** Change the value, in place, of the number. */
void setNumber(double d) {
if ( _element.type() == NumberDouble ) *reinterpret_cast< double * >( value() ) = d;
else if ( _element.type() == NumberInt ) *reinterpret_cast< int * >( value() ) = (int) d;
else assert(0);
}
- void setLong(long long n) {
+ void SetNumber(double d) {
+ if ( _element.type() == NumberDouble )
+ *getDur().writing( reinterpret_cast< double * >( value() ) ) = d;
+ else if ( _element.type() == NumberInt )
+ *getDur().writing( reinterpret_cast< int * >( value() ) ) = (int) d;
+ else assert(0);
+ }
+ void setLong(long long n) {
assert( _element.type() == NumberLong );
*reinterpret_cast< long long * >( value() ) = n;
}
- void setInt(int n) {
+ void SetLong(long long n) {
+ assert( _element.type() == NumberLong );
+ *getDur().writing( reinterpret_cast< long long * >(value()) ) = n;
+ }
+ void setInt(int n) {
assert( _element.type() == NumberInt );
*reinterpret_cast< int * >( value() ) = n;
}
+ void SetInt(int n) {
+ assert( _element.type() == NumberInt );
+ getDur().writingInt( *reinterpret_cast< int * >( value() ) ) = n;
+ }
+
-
/** Replace the type and value of the element with the type and value of e,
preserving the original fieldName */
void replaceTypeAndValue( const BSONElement &e ) {
*data() = e.type();
memcpy( value(), e.value(), e.valuesize() );
}
-
- static void lookForTimestamps( const BSONObj& obj ){
+
+ /* dur:: version */
+ void ReplaceTypeAndValue( const BSONElement &e ) {
+ char *d = data();
+ char *v = value();
+ int valsize = e.valuesize();
+ int ofs = (int) (v-d);
+ dassert( ofs > 0 );
+ char *p = (char *) getDur().writingPtr(d, valsize + ofs);
+ *p = e.type();
+ memcpy( p + ofs, e.value(), valsize );
+ }
+
+ static void lookForTimestamps( const BSONObj& obj ) {
// If we have a Timestamp field as the first or second element,
// update it to a Date field set to OpTime::now().asDate(). The
// replacement policy is a work in progress.
-
+
BSONObjIterator i( obj );
for( int j = 0; i.moreWithEOO() && j < 2; ++j ) {
BSONElement e = i.next();
if ( e.eoo() )
break;
- if ( e.type() == Timestamp ){
+ if ( e.type() == Timestamp ) {
BSONElementManipulator( e ).initTimestamp();
break;
}
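
The new SetNumber/SetLong/SetInt/ReplaceTypeAndValue methods above differ from their lower-case counterparts only in routing the in-place write through getDur() (writing, writingInt, writingPtr) so the journal sees it first. A rough stand-in for that pattern with a stubbed durability interface; the stub names are placeholders, not mongo's dur API beyond what the hunk itself shows:

    // "Declare intent, then write through the returned pointer": the real
    // getDur().writing() registers the address and length with the commit job.
    struct DurStub {
        template <typename T>
        T* writing( T * x ) {
            // real implementation: remember (x, sizeof(T)) for journalling
            return x;
        }
    };

    inline DurStub& getDurStub() {
        static DurStub d;
        return d;
    }

    inline void setLongJournaled( long long * valueInPlace , long long n ) {
        *getDurStub().writing( valueInPlace ) = n;   // journaled write
    }

    inline void setLongRaw( long long * valueInPlace , long long n ) {
        *valueInPlace = n;                           // direct write, no journal entry
    }
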
diff --git a/db/json.cpp b/db/json.cpp
index 185a8ca..4a6fad8 100644
--- a/db/json.cpp
+++ b/db/json.cpp
@@ -43,12 +43,12 @@ using namespace boost::spirit;
namespace mongo {
struct ObjectBuilder : boost::noncopyable {
- ~ObjectBuilder(){
+ ~ObjectBuilder() {
unsigned i = builders.size();
- if ( i ){
+ if ( i ) {
i--;
- for ( ; i>=1; i-- ){
- if ( builders[i] ){
+ for ( ; i>=1; i-- ) {
+ if ( builders[i] ) {
builders[i]->done();
}
}
@@ -205,7 +205,8 @@ namespace mongo {
else if ( first < 0x08 ) {
b.ss << char( 0xc0 | ( ( first << 2 ) | ( second >> 6 ) ) );
b.ss << char( 0x80 | ( ~0xc0 & second ) );
- } else {
+ }
+ else {
b.ss << char( 0xe0 | ( first >> 4 ) );
b.ss << char( 0x80 | ( ~0xc0 & ( ( first << 2 ) | ( second >> 6 ) ) ) );
b.ss << char( 0x80 | ( ~0xc0 & second ) );
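
The branch above emits the two- and three-byte UTF-8 forms for a \uXXXX escape, where first and second are the high and low bytes of the code point. The same encoding as a self-contained function, with the one-byte case (which sits just before this hunk) filled in from the standard UTF-8 rule:

    #include <string>

    // Encode a BMP code point given as two bytes (from a \uXXXX escape) into
    // UTF-8, mirroring the three branches in the parser above.
    static std::string utf8FromEscape( unsigned first , unsigned second ) {
        std::string out;
        if ( first == 0 && second < 0x80 ) {          // < U+0080: one byte
            out += char( second );
        }
        else if ( first < 0x08 ) {                    // < U+0800: two bytes
            out += char( 0xc0 | ( ( first << 2 ) | ( second >> 6 ) ) );
            out += char( 0x80 | ( ~0xc0 & second ) );
        }
        else {                                        // up to U+FFFF: three bytes
            out += char( 0xe0 | ( first >> 4 ) );
            out += char( 0x80 | ( ~0xc0 & ( ( first << 2 ) | ( second >> 6 ) ) ) );
            out += char( 0x80 | ( ~0xc0 & second ) );
        }
        return out;
    }
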
@@ -342,7 +343,7 @@ namespace mongo {
struct dbrefEnd {
dbrefEnd( ObjectBuilder &_b ) : b( _b ) {}
void operator() ( const char *start, const char *end ) const {
- b.back()->appendDBRef( b.fieldName(), b.ns.c_str(), b.oid );
+ b.back()->appendDBRef( b.fieldName(), b.ns, b.oid );
}
ObjectBuilder &b;
};
@@ -417,8 +418,7 @@ namespace mongo {
struct regexEnd {
regexEnd( ObjectBuilder &_b ) : b( _b ) {}
void operator() ( const char *start, const char *end ) const {
- b.back()->appendRegex( b.fieldName(), b.regex.c_str(),
- b.regexOptions.c_str() );
+ b.back()->appendRegex( b.fieldName(), b.regex, b.regexOptions );
}
ObjectBuilder &b;
};
@@ -438,7 +438,7 @@ namespace mongo {
// in the original z example on line 3, if the input was "ab", foo() would only
// be called once.
struct JsonGrammar : public grammar< JsonGrammar > {
-public:
+ public:
JsonGrammar( ObjectBuilder &_b ) : b( _b ) {}
template < typename ScannerT >
@@ -472,32 +472,32 @@ public:
str = lexeme_d[ ch_p( '"' )[ chClear( self.b ) ] >>
*( ( ch_p( '\\' ) >>
(
- ch_p( 'b' )[ chE( self.b ) ] |
- ch_p( 'f' )[ chE( self.b ) ] |
- ch_p( 'n' )[ chE( self.b ) ] |
- ch_p( 'r' )[ chE( self.b ) ] |
- ch_p( 't' )[ chE( self.b ) ] |
- ch_p( 'v' )[ chE( self.b ) ] |
- ( ch_p( 'u' ) >> ( repeat_p( 4 )[ xdigit_p ][ chU( self.b ) ] ) ) |
- ( ~ch_p('x') & (~range_p('0','9'))[ ch( self.b ) ] ) // hex and octal aren't supported
+ ch_p( 'b' )[ chE( self.b ) ] |
+ ch_p( 'f' )[ chE( self.b ) ] |
+ ch_p( 'n' )[ chE( self.b ) ] |
+ ch_p( 'r' )[ chE( self.b ) ] |
+ ch_p( 't' )[ chE( self.b ) ] |
+ ch_p( 'v' )[ chE( self.b ) ] |
+ ( ch_p( 'u' ) >> ( repeat_p( 4 )[ xdigit_p ][ chU( self.b ) ] ) ) |
+ ( ~ch_p('x') & (~range_p('0','9'))[ ch( self.b ) ] ) // hex and octal aren't supported
)
) |
( ~range_p( 0x00, 0x1f ) & ~ch_p( '"' ) & ( ~ch_p( '\\' ) )[ ch( self.b ) ] ) ) >> '"' ];
singleQuoteStr = lexeme_d[ ch_p( '\'' )[ chClear( self.b ) ] >>
- *( ( ch_p( '\\' ) >>
- (
- ch_p( 'b' )[ chE( self.b ) ] |
- ch_p( 'f' )[ chE( self.b ) ] |
- ch_p( 'n' )[ chE( self.b ) ] |
- ch_p( 'r' )[ chE( self.b ) ] |
- ch_p( 't' )[ chE( self.b ) ] |
- ch_p( 'v' )[ chE( self.b ) ] |
- ( ch_p( 'u' ) >> ( repeat_p( 4 )[ xdigit_p ][ chU( self.b ) ] ) ) |
- ( ~ch_p('x') & (~range_p('0','9'))[ ch( self.b ) ] ) // hex and octal aren't supported
- )
- ) |
- ( ~range_p( 0x00, 0x1f ) & ~ch_p( '\'' ) & ( ~ch_p( '\\' ) )[ ch( self.b ) ] ) ) >> '\'' ];
+ *( ( ch_p( '\\' ) >>
+ (
+ ch_p( 'b' )[ chE( self.b ) ] |
+ ch_p( 'f' )[ chE( self.b ) ] |
+ ch_p( 'n' )[ chE( self.b ) ] |
+ ch_p( 'r' )[ chE( self.b ) ] |
+ ch_p( 't' )[ chE( self.b ) ] |
+ ch_p( 'v' )[ chE( self.b ) ] |
+ ( ch_p( 'u' ) >> ( repeat_p( 4 )[ xdigit_p ][ chU( self.b ) ] ) ) |
+ ( ~ch_p('x') & (~range_p('0','9'))[ ch( self.b ) ] ) // hex and octal aren't supported
+ )
+ ) |
+ ( ~range_p( 0x00, 0x1f ) & ~ch_p( '\'' ) & ( ~ch_p( '\\' ) )[ ch( self.b ) ] ) ) >> '\'' ];
// real_p accepts numbers with nonsignificant zero prefixes, which
// aren't allowed in JSON. Oh well.
@@ -548,8 +548,8 @@ public:
>> ( *( ch_p( 'i' ) | ch_p( 'g' ) | ch_p( 'm' ) ) )[ regexOptions( self.b ) ] ];
}
rule< ScannerT > object, members, array, elements, value, str, number, integer,
- dbref, dbrefS, dbrefT, oid, oidS, oidT, bindata, date, dateS, dateT,
- regex, regexS, regexT, quotedOid, fieldName, unquotedFieldName, singleQuoteStr;
+ dbref, dbrefS, dbrefT, oid, oidS, oidT, bindata, date, dateS, dateT,
+ regex, regexS, regexT, quotedOid, fieldName, unquotedFieldName, singleQuoteStr;
const rule< ScannerT > &start() const {
return object;
}
@@ -558,7 +558,7 @@ public:
};
BSONObj fromjson( const char *str , int* len) {
- if ( str[0] == '\0' ){
+ if ( str[0] == '\0' ) {
if (len) *len = 0;
return BSONObj();
}
@@ -568,7 +568,8 @@ public:
parse_info<> result = parse( str, parser, space_p );
if (len) {
*len = result.stop - str;
- } else if ( !result.full ) {
+ }
+ else if ( !result.full ) {
int limit = strnlen(result.stop , 10);
if (limit == -1) limit = 10;
msgasserted(10340, "Failure parsing JSON string near: " + string( result.stop, limit ));
diff --git a/db/lasterror.cpp b/db/lasterror.cpp
index 12fc694..ba52111 100644
--- a/db/lasterror.cpp
+++ b/db/lasterror.cpp
@@ -34,28 +34,37 @@ namespace mongo {
void raiseError(int code , const char *msg) {
LastError *le = lastError.get();
if ( le == 0 ) {
- /* might be intentional (non-user thread) */
+ /* might be intentional (non-user thread) */
DEV {
static unsigned n;
if( ++n < 4 && !isShell ) log() << "dev: lastError==0 won't report:" << msg << endl;
}
- } else if ( le->disabled ) {
+ }
+ else if ( le->disabled ) {
log() << "lastError disabled, can't report: " << code << ":" << msg << endl;
- } else {
+ }
+ else {
le->raiseError(code, msg);
}
}
-
- void LastError::appendSelf( BSONObjBuilder &b ) {
+
+ bool LastError::appendSelf( BSONObjBuilder &b , bool blankErr ) {
if ( !valid ) {
- b.appendNull( "err" );
+ if ( blankErr )
+ b.appendNull( "err" );
b.append( "n", 0 );
- return;
+ return false;
}
- if ( msg.empty() )
- b.appendNull( "err" );
- else
+
+ if ( msg.empty() ) {
+ if ( blankErr ) {
+ b.appendNull( "err" );
+ }
+ }
+ else {
b.append( "err", msg );
+ }
+
if ( code )
b.append( "code" , code );
if ( updatedExisting != NotUpdate )
@@ -65,13 +74,24 @@ namespace mongo {
if ( writebackId.isSet() )
b.append( "writeback" , writebackId );
b.appendNumber( "n", nObjects );
+
+ return ! msg.empty();
+ }
+
+ LastErrorHolder::~LastErrorHolder() {
+ for ( IDMap::iterator i = _ids.begin(); i != _ids.end(); ++i ) {
+ delete i->second.lerr;
+ i->second.lerr = 0;
+ }
+ _ids.clear();
}
- void LastErrorHolder::setID( int id ){
+
+ void LastErrorHolder::setID( int id ) {
_id.set( id );
}
-
- int LastErrorHolder::getID(){
+
+ int LastErrorHolder::getID() {
return _id.get();
}
@@ -89,24 +109,24 @@ namespace mongo {
return ret;
return 0;
}
-
- LastError * LastErrorHolder::_get( bool create ){
+
+ LastError * LastErrorHolder::_get( bool create ) {
int id = _id.get();
- if ( id == 0 ){
+ if ( id == 0 ) {
LastError * le = _tl.get();
- if ( ! le && create ){
+ if ( ! le && create ) {
le = new LastError();
_tl.reset( le );
}
return le;
}
- scoped_lock lock(_idsmutex);
+ scoped_lock lock(_idsmutex);
map<int,Status>::iterator i = _ids.find( id );
- if ( i == _ids.end() ){
+ if ( i == _ids.end() ) {
if ( ! create )
return 0;
-
+
LastError * le = new LastError();
Status s;
s.time = time(0);
@@ -114,42 +134,42 @@ namespace mongo {
_ids[id] = s;
return le;
}
-
+
Status &status = i->second;
status.time = time(0);
return status.lerr;
}
- void LastErrorHolder::remove( int id ){
+ void LastErrorHolder::remove( int id ) {
scoped_lock lock(_idsmutex);
map<int,Status>::iterator i = _ids.find( id );
if ( i == _ids.end() )
return;
-
+
delete i->second.lerr;
_ids.erase( i );
}
- void LastErrorHolder::release(){
+ void LastErrorHolder::release() {
int id = _id.get();
- if ( id == 0 ){
+ if ( id == 0 ) {
_tl.release();
return;
}
-
+
remove( id );
}
/** ok to call more than once. */
- void LastErrorHolder::initThread() {
+ void LastErrorHolder::initThread() {
if( _tl.get() ) return;
assert( _id.get() == 0 );
_tl.reset( new LastError() );
}
-
- void LastErrorHolder::reset( LastError * le ){
+
+ void LastErrorHolder::reset( LastError * le ) {
int id = _id.get();
- if ( id == 0 ){
+ if ( id == 0 ) {
_tl.reset( le );
return;
}
@@ -159,17 +179,18 @@ namespace mongo {
status.time = time(0);
status.lerr = le;
}
-
+
void prepareErrForNewRequest( Message &m, LastError * err ) {
// a killCursors message shouldn't affect last error
if ( m.operation() == dbKillCursors ) {
err->disabled = true;
- } else {
+ }
+ else {
err->disabled = false;
err->nPrev++;
- }
+ }
}
-
+
LastError * LastErrorHolder::startRequest( Message& m , int clientId ) {
assert( clientId );
setID( clientId );
@@ -183,33 +204,33 @@ namespace mongo {
prepareErrForNewRequest( m, connectionOwned );
}
- void LastErrorHolder::disconnect( int clientId ){
+ void LastErrorHolder::disconnect( int clientId ) {
if ( clientId )
remove(clientId);
}
struct LastErrorHolderTest : public UnitTest {
public:
-
- void test( int i ){
+
+ void test( int i ) {
_tl.set( i );
assert( _tl.get() == i );
}
-
- void tlmaptest(){
+
+ void tlmaptest() {
test( 1 );
test( 12123123 );
test( -123123 );
test( numeric_limits<int>::min() );
test( numeric_limits<int>::max() );
}
-
- void run(){
+
+ void run() {
tlmaptest();
LastError * a = new LastError();
LastError * b = new LastError();
-
+
LastErrorHolder holder;
holder.reset( a );
assert( a == holder.get() );
@@ -219,10 +240,10 @@ namespace mongo {
assert( b == holder.get() );
holder.setID( 0 );
assert( a == holder.get() );
-
+
holder.remove( 1 );
}
-
+
ThreadLocalValue<int> _tl;
} lastErrorHolderTest;
diff --git a/db/lasterror.h b/db/lasterror.h
index 2006f1c..c77ec74 100644
--- a/db/lasterror.h
+++ b/db/lasterror.h
@@ -33,7 +33,7 @@ namespace mongo {
int nPrev;
bool valid;
bool disabled;
- void writeback( OID& oid ){
+ void writeback( OID& oid ) {
reset( true );
writebackId = oid;
}
@@ -42,13 +42,13 @@ namespace mongo {
code = _code;
msg = _msg;
}
- void recordUpdate( bool _updateObjects , long long _nObjects , OID _upsertedId ){
+ void recordUpdate( bool _updateObjects , long long _nObjects , OID _upsertedId ) {
reset( true );
nObjects = _nObjects;
updatedExisting = _updateObjects ? True : False;
if ( _upsertedId.isSet() )
upsertedId = _upsertedId;
-
+
}
void recordDelete( long long nDeleted ) {
reset( true );
@@ -68,20 +68,25 @@ namespace mongo {
upsertedId.clear();
writebackId.clear();
}
- void appendSelf( BSONObjBuilder &b );
+
+ /**
+ * @return if there is an err
+ */
+ bool appendSelf( BSONObjBuilder &b , bool blankErr = true );
struct Disabled : boost::noncopyable {
- Disabled( LastError * le ){
+ Disabled( LastError * le ) {
_le = le;
- if ( _le ){
+ if ( _le ) {
_prev = _le->disabled;
_le->disabled = true;
- } else {
+ }
+ else {
_prev = false;
}
}
-
- ~Disabled(){
+
+ ~Disabled() {
if ( _le )
_le->disabled = _prev;
}
@@ -89,18 +94,19 @@ namespace mongo {
LastError * _le;
bool _prev;
};
-
+
static LastError noError;
};
extern class LastErrorHolder {
public:
LastErrorHolder() : _id( 0 ) {}
+ ~LastErrorHolder();
LastError * get( bool create = false );
- LastError * getSafe(){
+ LastError * getSafe() {
LastError * le = get(false);
- if ( ! le ){
+ if ( ! le ) {
log( LL_ERROR ) << " no LastError! id: " << getID() << endl;
assert( le );
}
@@ -122,11 +128,11 @@ namespace mongo {
void remove( int id );
void release();
-
+
/** when db receives a message/request, call this */
void startRequest( Message& m , LastError * connectionOwned );
LastError * startRequest( Message& m , int clientId );
-
+
void disconnect( int clientId );
// used to disable lastError reporting while processing a killCursors message
@@ -135,13 +141,15 @@ namespace mongo {
private:
ThreadLocalValue<int> _id;
boost::thread_specific_ptr<LastError> _tl;
-
+
struct Status {
time_t time;
LastError *lerr;
};
+ typedef map<int,Status> IDMap;
+
static mongo::mutex _idsmutex;
- map<int,Status> _ids;
+ IDMap _ids;
} lastError;
void raiseError(int code , const char *msg);
diff --git a/db/matcher.cpp b/db/matcher.cpp
index cd62563..38e8e05 100644
--- a/db/matcher.cpp
+++ b/db/matcher.cpp
@@ -30,7 +30,7 @@
#include "pdfile.h"
namespace {
- inline pcrecpp::RE_Options flags2options(const char* flags){
+ inline pcrecpp::RE_Options flags2options(const char* flags) {
pcrecpp::RE_Options options;
options.set_utf8(true);
while ( flags && *flags ) {
@@ -52,7 +52,7 @@ namespace {
namespace mongo {
extern BSONObj staticNull;
-
+
class Where {
public:
Where() {
@@ -64,22 +64,22 @@ namespace mongo {
if ( scope.get() )
scope->execSetup( "_mongo.readOnly = false;" , "make not read only" );
- if ( jsScope ){
+ if ( jsScope ) {
delete jsScope;
jsScope = 0;
}
func = 0;
}
-
+
auto_ptr<Scope> scope;
ScriptingFunction func;
BSONObj *jsScope;
-
+
void setFunc(const char *code) {
massert( 10341 , "scope has to be created first!" , scope.get() );
func = scope->createFunction( code );
}
-
+
};
Matcher::~Matcher() {
@@ -87,37 +87,48 @@ namespace mongo {
where = 0;
}
- ElementMatcher::ElementMatcher( BSONElement _e , int _op, bool _isNot ) : toMatch( _e ) , compareOp( _op ), isNot( _isNot ) {
- if ( _op == BSONObj::opMOD ){
+ ElementMatcher::ElementMatcher( BSONElement _e , int _op, bool _isNot )
+ : toMatch( _e ) , compareOp( _op ), isNot( _isNot ), subMatcherOnPrimitives(false) {
+ if ( _op == BSONObj::opMOD ) {
BSONObj o = _e.embeddedObject();
mod = o["0"].numberInt();
modm = o["1"].numberInt();
-
+
uassert( 10073 , "mod can't be 0" , mod );
}
- else if ( _op == BSONObj::opTYPE ){
+ else if ( _op == BSONObj::opTYPE ) {
type = (BSONType)(_e.numberInt());
}
- else if ( _op == BSONObj::opELEM_MATCH ){
+ else if ( _op == BSONObj::opELEM_MATCH ) {
BSONElement m = _e;
uassert( 12517 , "$elemMatch needs an Object" , m.type() == Object );
- subMatcher.reset( new Matcher( m.embeddedObject() ) );
+ BSONObj x = m.embeddedObject();
+ if ( x.firstElement().getGtLtOp() == 0 ) {
+ subMatcher.reset( new Matcher( x ) );
+ subMatcherOnPrimitives = false;
+ }
+ else {
+ // meant to act on primitives
+ subMatcher.reset( new Matcher( BSON( "" << x ) ) );
+ subMatcherOnPrimitives = true;
+ }
}
}
- ElementMatcher::ElementMatcher( BSONElement _e , int _op , const BSONObj& array, bool _isNot )
- : toMatch( _e ) , compareOp( _op ), isNot( _isNot ) {
-
+ ElementMatcher::ElementMatcher( BSONElement _e , int _op , const BSONObj& array, bool _isNot )
+ : toMatch( _e ) , compareOp( _op ), isNot( _isNot ), subMatcherOnPrimitives(false) {
+
myset.reset( new set<BSONElement,element_lt>() );
-
+
BSONObjIterator i( array );
while ( i.more() ) {
BSONElement ie = i.next();
- if ( _op == BSONObj::opALL && ie.type() == Object && ie.embeddedObject().firstElement().getGtLtOp() == BSONObj::opELEM_MATCH ){
+ if ( _op == BSONObj::opALL && ie.type() == Object && ie.embeddedObject().firstElement().getGtLtOp() == BSONObj::opELEM_MATCH ) {
shared_ptr<Matcher> s;
s.reset( new Matcher( ie.embeddedObject().firstElement().embeddedObjectUserCheck() ) );
allMatchers.push_back( s );
- } else if ( ie.type() == RegEx ) {
+ }
+ else if ( ie.type() == RegEx ) {
if ( !myregex.get() ) {
myregex.reset( new vector< RegexMatcher >() );
}
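
The opELEM_MATCH branch above now distinguishes two query shapes: when the first element inside $elemMatch is a plain field name the sub-matcher runs against embedded objects as before, and when it is an operator the object is wrapped as { "" : { ... } } and flagged subMatcherOnPrimitives so primitive array members can match. A small sketch of the two shapes built with the BSON() macro; it assumes mongo's jsobj.h and is purely illustrative:

    #include "jsobj.h"   // mongo's BSON builder macros, as used in this tree

    void elemMatchShapes() {
        // first element is a field name -> matches embedded objects in the array,
        // e.g. { arr : [ { a : 1 , b : 2 } ] }
        mongo::BSONObj onObjects =
            BSON( "arr" << BSON( "$elemMatch" << BSON( "a" << 1 << "b" << 2 ) ) );

        // first element is an operator -> wrapped as { "" : {...} } internally and
        // applied to primitive members, e.g. { arr : [ 3 , 7 , 12 ] }
        mongo::BSONObj onPrimitives =
            BSON( "arr" << BSON( "$elemMatch" << BSON( "$gt" << 5 << "$lt" << 10 ) ) );
    }
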
@@ -132,19 +143,20 @@ namespace mongo {
string prefix = simpleRegex(rm.regex, rm.flags, &purePrefix);
if (purePrefix)
rm.prefix = prefix;
- } else {
+ }
+ else {
myset->insert(ie);
}
}
-
- if ( allMatchers.size() ){
+
+ if ( allMatchers.size() ) {
uassert( 13020 , "with $all, can't mix $elemMatch and others" , myset->size() == 0 && !myregex.get());
}
-
+
}
-
-
- void Matcher::addRegex(const char *fieldName, const char *regex, const char *flags, bool isNot){
+
+
+ void Matcher::addRegex(const char *fieldName, const char *regex, const char *flags, bool isNot) {
if ( nRegex >= 4 ) {
out() << "ERROR: too many regexes in query" << endl;
@@ -158,106 +170,106 @@ namespace mongo {
rm.isNot = isNot;
nRegex++;
- if (!isNot){ //TODO something smarter
+ if (!isNot) { //TODO something smarter
bool purePrefix;
string prefix = simpleRegex(regex, flags, &purePrefix);
if (purePrefix)
rm.prefix = prefix;
}
- }
+ }
}
-
+
bool Matcher::addOp( const BSONElement &e, const BSONElement &fe, bool isNot, const char *& regex, const char *&flags ) {
const char *fn = fe.fieldName();
int op = fe.getGtLtOp( -1 );
- if ( op == -1 ){
- if ( !isNot && fn[1] == 'r' && fn[2] == 'e' && fn[3] == 'f' && fn[4] == 0 ){
+ if ( op == -1 ) {
+ if ( !isNot && fn[1] == 'r' && fn[2] == 'e' && fn[3] == 'f' && fn[4] == 0 ) {
return false; // { $ref : xxx } - treat as normal object
}
uassert( 10068 , (string)"invalid operator: " + fn , op != -1 );
}
-
- switch ( op ){
- case BSONObj::GT:
- case BSONObj::GTE:
- case BSONObj::LT:
- case BSONObj::LTE:{
- shared_ptr< BSONObjBuilder > b( new BSONObjBuilder() );
- _builders.push_back( b );
- b->appendAs(fe, e.fieldName());
- addBasic(b->done().firstElement(), op, isNot);
- break;
- }
- case BSONObj::NE:{
- haveNeg = true;
- shared_ptr< BSONObjBuilder > b( new BSONObjBuilder() );
- _builders.push_back( b );
- b->appendAs(fe, e.fieldName());
- addBasic(b->done().firstElement(), BSONObj::NE, isNot);
- break;
- }
- case BSONObj::opALL:
- all = true;
- case BSONObj::opIN:
- uassert( 13276 , "$in needs an array" , fe.isABSONObj() );
- basics.push_back( ElementMatcher( e , op , fe.embeddedObject(), isNot ) );
- break;
- case BSONObj::NIN:
- uassert( 13277 , "$nin needs an array" , fe.isABSONObj() );
- haveNeg = true;
- basics.push_back( ElementMatcher( e , op , fe.embeddedObject(), isNot ) );
- break;
- case BSONObj::opMOD:
- case BSONObj::opTYPE:
- case BSONObj::opELEM_MATCH: {
- shared_ptr< BSONObjBuilder > b( new BSONObjBuilder() );
- _builders.push_back( b );
- b->appendAs(fe, e.fieldName());
- // these are types where ElementMatcher has all the info
- basics.push_back( ElementMatcher( b->done().firstElement() , op, isNot ) );
- break;
- }
- case BSONObj::opSIZE:{
- shared_ptr< BSONObjBuilder > b( new BSONObjBuilder() );
- _builders.push_back( b );
- b->appendAs(fe, e.fieldName());
- addBasic(b->done().firstElement(), BSONObj::opSIZE, isNot);
- haveSize = true;
- break;
- }
- case BSONObj::opEXISTS:{
- shared_ptr< BSONObjBuilder > b( new BSONObjBuilder() );
- _builders.push_back( b );
- b->appendAs(fe, e.fieldName());
- addBasic(b->done().firstElement(), BSONObj::opEXISTS, isNot);
- break;
- }
- case BSONObj::opREGEX:{
- uassert( 13032, "can't use $not with $regex, use BSON regex type instead", !isNot );
- if ( fe.type() == RegEx ){
- regex = fe.regex();
- flags = fe.regexFlags();
- }
- else {
- regex = fe.valuestrsafe();
- }
- break;
- }
- case BSONObj::opOPTIONS:{
- uassert( 13029, "can't use $not with $options, use BSON regex type instead", !isNot );
- flags = fe.valuestrsafe();
- break;
- }
- case BSONObj::opNEAR:
- case BSONObj::opWITHIN:
- case BSONObj::opMAX_DISTANCE:
- break;
- default:
- uassert( 10069 , (string)"BUG - can't operator for: " + fn , 0 );
- }
+
+ switch ( op ) {
+ case BSONObj::GT:
+ case BSONObj::GTE:
+ case BSONObj::LT:
+ case BSONObj::LTE: {
+ shared_ptr< BSONObjBuilder > b( new BSONObjBuilder() );
+ _builders.push_back( b );
+ b->appendAs(fe, e.fieldName());
+ addBasic(b->done().firstElement(), op, isNot);
+ break;
+ }
+ case BSONObj::NE: {
+ haveNeg = true;
+ shared_ptr< BSONObjBuilder > b( new BSONObjBuilder() );
+ _builders.push_back( b );
+ b->appendAs(fe, e.fieldName());
+ addBasic(b->done().firstElement(), BSONObj::NE, isNot);
+ break;
+ }
+ case BSONObj::opALL:
+ all = true;
+ case BSONObj::opIN:
+ uassert( 13276 , "$in needs an array" , fe.isABSONObj() );
+ basics.push_back( ElementMatcher( e , op , fe.embeddedObject(), isNot ) );
+ break;
+ case BSONObj::NIN:
+ uassert( 13277 , "$nin needs an array" , fe.isABSONObj() );
+ haveNeg = true;
+ basics.push_back( ElementMatcher( e , op , fe.embeddedObject(), isNot ) );
+ break;
+ case BSONObj::opMOD:
+ case BSONObj::opTYPE:
+ case BSONObj::opELEM_MATCH: {
+ shared_ptr< BSONObjBuilder > b( new BSONObjBuilder() );
+ _builders.push_back( b );
+ b->appendAs(fe, e.fieldName());
+ // these are types where ElementMatcher has all the info
+ basics.push_back( ElementMatcher( b->done().firstElement() , op, isNot ) );
+ break;
+ }
+ case BSONObj::opSIZE: {
+ shared_ptr< BSONObjBuilder > b( new BSONObjBuilder() );
+ _builders.push_back( b );
+ b->appendAs(fe, e.fieldName());
+ addBasic(b->done().firstElement(), BSONObj::opSIZE, isNot);
+ haveSize = true;
+ break;
+ }
+ case BSONObj::opEXISTS: {
+ shared_ptr< BSONObjBuilder > b( new BSONObjBuilder() );
+ _builders.push_back( b );
+ b->appendAs(fe, e.fieldName());
+ addBasic(b->done().firstElement(), BSONObj::opEXISTS, isNot);
+ break;
+ }
+ case BSONObj::opREGEX: {
+ uassert( 13032, "can't use $not with $regex, use BSON regex type instead", !isNot );
+ if ( fe.type() == RegEx ) {
+ regex = fe.regex();
+ flags = fe.regexFlags();
+ }
+ else {
+ regex = fe.valuestrsafe();
+ }
+ break;
+ }
+ case BSONObj::opOPTIONS: {
+ uassert( 13029, "can't use $not with $options, use BSON regex type instead", !isNot );
+ flags = fe.valuestrsafe();
+ break;
+ }
+ case BSONObj::opNEAR:
+ case BSONObj::opWITHIN:
+ case BSONObj::opMAX_DISTANCE:
+ break;
+ default:
+ uassert( 10069 , (string)"BUG - can't operator for: " + fn , 0 );
+ }
return true;
}
-
+
void Matcher::parseOr( const BSONElement &e, bool subMatcher, list< shared_ptr< Matcher > > &matchers ) {
uassert( 13090, "nested $or/$nor not allowed", !subMatcher );
uassert( 13086, "$or/$nor must be a nonempty array", e.type() == Array && e.embeddedObject().nFields() > 0 );
@@ -276,14 +288,16 @@ namespace mongo {
return false;
if ( ef[ 1 ] == 'o' && ef[ 2 ] == 'r' && ef[ 3 ] == 0 ) {
parseOr( e, subMatcher, _orMatchers );
- } else if ( ef[ 1 ] == 'n' && ef[ 2 ] == 'o' && ef[ 3 ] == 'r' && ef[ 4 ] == 0 ) {
+ }
+ else if ( ef[ 1 ] == 'n' && ef[ 2 ] == 'o' && ef[ 3 ] == 'r' && ef[ 4 ] == 0 ) {
parseOr( e, subMatcher, _norMatchers );
- } else {
+ }
+ else {
return false;
}
return true;
}
-
+
/* _jsobj - the query pattern
*/
Matcher::Matcher(const BSONObj &_jsobj, bool subMatcher) :
@@ -293,6 +307,8 @@ namespace mongo {
while ( i.more() ) {
BSONElement e = i.next();
+ uassert( 13629 , "can't have undefined in a query expression" , e.type() != Undefined );
+
if ( parseOrNor( e, subMatcher ) ) {
continue;
}
@@ -301,7 +317,7 @@ namespace mongo {
// $where: function()...
uassert( 10066 , "$where occurs twice?", where == 0 );
uassert( 10067 , "$where query, but no script engine", globalScriptEngine );
- massert( 13089 , "no current client needed for $where" , haveClient() );
+ massert( 13089 , "no current client needed for $where" , haveClient() );
where = new Where();
where->scope = globalScriptEngine->getPooledScope( cc().ns() );
where->scope->localConnect( cc().database()->name.c_str() );
@@ -314,7 +330,7 @@ namespace mongo {
const char *code = e.valuestr();
where->setFunc(code);
}
-
+
where->scope->execSetup( "_mongo.readOnly = true;" , "make read only" );
continue;
@@ -324,7 +340,7 @@ namespace mongo {
addRegex( e.fieldName(), e.regex(), e.regexFlags() );
continue;
}
-
+
// greater than / less than...
// e.g., e == { a : { $gt : 3 } }
// or
@@ -333,35 +349,36 @@ namespace mongo {
// support {$regex:"a|b", $options:"imx"}
const char* regex = NULL;
const char* flags = "";
-
+
// e.g., fe == { $gt : 3 }
BSONObjIterator j(e.embeddedObject());
bool isOperator = false;
while ( j.more() ) {
BSONElement fe = j.next();
const char *fn = fe.fieldName();
-
+
if ( fn[0] == '$' && fn[1] ) {
isOperator = true;
-
+
if ( fn[1] == 'n' && fn[2] == 'o' && fn[3] == 't' && fn[4] == 0 ) {
haveNeg = true;
switch( fe.type() ) {
- case Object: {
- BSONObjIterator k( fe.embeddedObject() );
- uassert( 13030, "$not cannot be empty", k.more() );
- while( k.more() ) {
- addOp( e, k.next(), true, regex, flags );
- }
- break;
+ case Object: {
+ BSONObjIterator k( fe.embeddedObject() );
+ uassert( 13030, "$not cannot be empty", k.more() );
+ while( k.more() ) {
+ addOp( e, k.next(), true, regex, flags );
}
- case RegEx:
- addRegex( e.fieldName(), fe.regex(), fe.regexFlags(), true );
- break;
- default:
- uassert( 13031, "invalid use of $not", false );
+ break;
+ }
+ case RegEx:
+ addRegex( e.fieldName(), fe.regex(), fe.regexFlags(), true );
+ break;
+ default:
+ uassert( 13031, "invalid use of $not", false );
}
- } else {
+ }
+ else {
if ( !addOp( e, fe, false, regex, flags ) ) {
isOperator = false;
break;
@@ -373,43 +390,43 @@ namespace mongo {
break;
}
}
- if (regex){
+ if (regex) {
addRegex(e.fieldName(), regex, flags);
}
if ( isOperator )
continue;
}
- if ( e.type() == Array ){
+ if ( e.type() == Array ) {
hasArray = true;
}
else if( strcmp(e.fieldName(), "$atomic") == 0 ) {
_atomic = e.trueValue();
continue;
}
-
+
// normal, simple case e.g. { a : "foo" }
addBasic(e, BSONObj::Equality, false);
}
}
-
+
Matcher::Matcher( const Matcher &other, const BSONObj &key ) :
- where(0), constrainIndexKey_( key ), haveSize(), all(), hasArray(0), haveNeg(), _atomic(false), nRegex(0) {
+ where(0), constrainIndexKey_( key ), haveSize(), all(), hasArray(0), haveNeg(), _atomic(false), nRegex(0) {
// do not include fields which would make keyMatch() false
for( vector< ElementMatcher >::const_iterator i = other.basics.begin(); i != other.basics.end(); ++i ) {
if ( key.hasField( i->toMatch.fieldName() ) ) {
switch( i->compareOp ) {
- case BSONObj::opSIZE:
- case BSONObj::opALL:
- case BSONObj::NE:
- case BSONObj::NIN:
- break;
- default: {
- if ( !i->isNot && i->toMatch.type() != Array ) {
- basics.push_back( *i );
- }
+ case BSONObj::opSIZE:
+ case BSONObj::opALL:
+ case BSONObj::NE:
+ case BSONObj::NIN:
+ break;
+ default: {
+ if ( !i->isNot && i->toMatch.type() != Array ) {
+ basics.push_back( *i );
}
}
+ }
}
}
for( int i = 0; i < other.nRegex; ++i ) {
@@ -421,29 +438,29 @@ namespace mongo {
_orMatchers.push_back( shared_ptr< Matcher >( new Matcher( **i, key ) ) );
}
}
-
+
inline bool regexMatches(const RegexMatcher& rm, const BSONElement& e) {
- switch (e.type()){
- case String:
- case Symbol:
- if (rm.prefix.empty())
- return rm.re->PartialMatch(e.valuestr());
- else
- return !strncmp(e.valuestr(), rm.prefix.c_str(), rm.prefix.size());
- case RegEx:
- return !strcmp(rm.regex, e.regex()) && !strcmp(rm.flags, e.regexFlags());
- default:
- return false;
+ switch (e.type()) {
+ case String:
+ case Symbol:
+ if (rm.prefix.empty())
+ return rm.re->PartialMatch(e.valuestr());
+ else
+ return !strncmp(e.valuestr(), rm.prefix.c_str(), rm.prefix.size());
+ case RegEx:
+ return !strcmp(rm.regex, e.regex()) && !strcmp(rm.flags, e.regexFlags());
+ default:
+ return false;
}
}
-
+
inline int Matcher::valuesMatch(const BSONElement& l, const BSONElement& r, int op, const ElementMatcher& bm) {
assert( op != BSONObj::NE && op != BSONObj::NIN );
-
+
if ( op == BSONObj::Equality ) {
return l.valuesEqual(r);
}
-
+
if ( op == BSONObj::opIN ) {
// { $in : [1,2,3] }
int count = bm.myset->count(l);
@@ -471,15 +488,15 @@ namespace mongo {
}
return count == r.number();
}
-
- if ( op == BSONObj::opMOD ){
+
+ if ( op == BSONObj::opMOD ) {
if ( ! l.isNumber() )
return false;
-
+
return l.numberLong() % bm.mod == bm.modm;
}
-
- if ( op == BSONObj::opTYPE ){
+
+ if ( op == BSONObj::opTYPE ) {
return bm.type == l.type();
}
@@ -506,7 +523,7 @@ namespace mongo {
return 0;
return bm.toMatch.boolean() ? -1 : 1;
}
-
+
/* Check if a particular field matches.
fieldName - field to match "a.b" if we are reaching into an embedded object.
@@ -519,8 +536,8 @@ namespace mongo {
{ "a.b" : 3 } means obj.a.b == 3
{ a : { $lt : 3 } } means obj.a < 3
- { a : { $in : [1,2] } } means [1,2].contains(obj.a)
-
+ { a : { $in : [1,2] } } means [1,2].contains(obj.a)
+
return value
-1 mismatch
0 missing element
@@ -529,20 +546,20 @@ namespace mongo {
int Matcher::matchesDotted(const char *fieldName, const BSONElement& toMatch, const BSONObj& obj, int compareOp, const ElementMatcher& em , bool isArr, MatchDetails * details ) {
DEBUGMATCHER( "\t matchesDotted : " << fieldName << " hasDetails: " << ( details ? "yes" : "no" ) );
if ( compareOp == BSONObj::opALL ) {
-
- if ( em.allMatchers.size() ){
+
+ if ( em.allMatchers.size() ) {
BSONElement e = obj.getFieldDotted( fieldName );
uassert( 13021 , "$all/$elemMatch needs to be applied to array" , e.type() == Array );
-
- for ( unsigned i=0; i<em.allMatchers.size(); i++ ){
+
+ for ( unsigned i=0; i<em.allMatchers.size(); i++ ) {
bool found = false;
BSONObjIterator x( e.embeddedObject() );
- while ( x.more() ){
+ while ( x.more() ) {
BSONElement f = x.next();
if ( f.type() != Object )
continue;
- if ( em.allMatchers[i]->matches( f.embeddedObject() ) ){
+ if ( em.allMatchers[i]->matches( f.embeddedObject() ) ) {
found = true;
break;
}
@@ -551,36 +568,32 @@ namespace mongo {
if ( ! found )
return -1;
}
-
+
return 1;
}
-
+
if ( em.myset->size() == 0 && !em.myregex.get() )
return -1; // is this desired?
-
- BSONObjSetDefaultOrder actualKeys;
- IndexSpec( BSON( fieldName << 1 ) ).getKeys( obj, actualKeys );
- if ( actualKeys.size() == 0 )
- return 0;
-
+
+ BSONElementSet myValues;
+ obj.getFieldsDotted( fieldName , myValues );
+
for( set< BSONElement, element_lt >::const_iterator i = em.myset->begin(); i != em.myset->end(); ++i ) {
// ignore nulls
if ( i->type() == jstNULL )
continue;
- // parallel traversal would be faster worst case I guess
- BSONObjBuilder b;
- b.appendAs( *i, "" );
- if ( !actualKeys.count( b.done() ) )
+
+ if ( myValues.count( *i ) == 0 )
return -1;
}
if ( !em.myregex.get() )
return 1;
-
+
for( vector< RegexMatcher >::const_iterator i = em.myregex->begin(); i != em.myregex->end(); ++i ) {
bool match = false;
- for( BSONObjSetDefaultOrder::const_iterator j = actualKeys.begin(); j != actualKeys.end(); ++j ) {
- if ( regexMatches( *i, j->firstElement() ) ) {
+ for( BSONElementSet::const_iterator j = myValues.begin(); j != myValues.end(); ++j ) {
+ if ( regexMatches( *i, *j ) ) {
match = true;
break;
}
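
The $all branch above now gathers candidate values with getFieldsDotted() into a BSONElementSet and tests each $all operand (and each regex) against that set, instead of building index keys through IndexSpec. A sketch of the gathering step; it assumes mongo's jsobj.h and json.h and is illustrative only:

    #include "jsobj.h"
    #include "json.h"

    void gatherDottedValues() {
        mongo::BSONObj obj = mongo::fromjson( "{ a : [ { b : 1 } , { b : 2 } ] }" );
        mongo::BSONElementSet values;
        obj.getFieldsDotted( "a.b" , values );      // collects the elements 1 and 2
        // each $all operand is then tested with values.count( operand ),
        // and each regex operand against every member of 'values'
    }
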
@@ -588,10 +601,10 @@ namespace mongo {
if ( !match )
return -1;
}
-
+
return 1;
} // end opALL
-
+
if ( compareOp == BSONObj::NE )
return matchesNe( fieldName, toMatch, obj, em , details );
if ( compareOp == BSONObj::NIN ) {
@@ -613,18 +626,19 @@ namespace mongo {
}
return 1;
}
-
+
BSONElement e;
bool indexed = !constrainIndexKey_.isEmpty();
if ( indexed ) {
e = obj.getFieldUsingIndexNames(fieldName, constrainIndexKey_);
- if( e.eoo() ){
+ if( e.eoo() ) {
cout << "obj: " << obj << endl;
cout << "fieldName: " << fieldName << endl;
cout << "constrainIndexKey_: " << constrainIndexKey_ << endl;
assert( !e.eoo() );
}
- } else {
+ }
+ else {
const char *p = strchr(fieldName, '.');
if ( p ) {
@@ -662,7 +676,7 @@ namespace mongo {
if ( details )
details->elemMatchKey = z.fieldName();
return 1;
- }
+ }
else if ( cmp < 0 ) {
found = true;
}
@@ -671,7 +685,7 @@ namespace mongo {
return found ? -1 : retMissing( em );
}
- if( p ) {
+ if( p ) {
return retMissing( em );
}
else {
@@ -681,21 +695,31 @@ namespace mongo {
if ( compareOp == BSONObj::opEXISTS ) {
return ( e.eoo() ^ ( toMatch.boolean() ^ em.isNot ) ) ? 1 : -1;
- } else if ( ( e.type() != Array || indexed || compareOp == BSONObj::opSIZE ) &&
- valuesMatch(e, toMatch, compareOp, em ) ) {
+ }
+ else if ( ( e.type() != Array || indexed || compareOp == BSONObj::opSIZE ) &&
+ valuesMatch(e, toMatch, compareOp, em ) ) {
return 1;
- } else if ( e.type() == Array && compareOp != BSONObj::opSIZE ) {
+ }
+ else if ( e.type() == Array && compareOp != BSONObj::opSIZE ) {
BSONObjIterator ai(e.embeddedObject());
while ( ai.moreWithEOO() ) {
BSONElement z = ai.next();
-
- if ( compareOp == BSONObj::opELEM_MATCH ){
- // SERVER-377
- if ( z.type() == Object && em.subMatcher->matches( z.embeddedObject() ) ){
- if ( details )
- details->elemMatchKey = z.fieldName();
- return 1;
+
+ if ( compareOp == BSONObj::opELEM_MATCH ) {
+ if ( z.type() == Object ) {
+ if ( em.subMatcher->matches( z.embeddedObject() ) ) {
+ if ( details )
+ details->elemMatchKey = z.fieldName();
+ return 1;
+ }
+ }
+ else if ( em.subMatcherOnPrimitives ) {
+ if ( z.type() && em.subMatcher->matches( z.wrap( "" ) ) ) {
+ if ( details )
+ details->elemMatchKey = z.fieldName();
+ return 1;
+ }
}
}
else {
@@ -707,12 +731,12 @@ namespace mongo {
}
}
-
- if ( compareOp == BSONObj::Equality && e.woCompare( toMatch , false ) == 0 ){
+
+ if ( compareOp == BSONObj::Equality && e.woCompare( toMatch , false ) == 0 ) {
// match an entire array to itself
return 1;
}
-
+
}
else if ( e.eoo() ) {
// 0 indicates "missing element"
@@ -745,7 +769,8 @@ namespace mongo {
if ( ( bm.compareOp == BSONObj::NE ) ^ bm.isNot ) {
return false;
}
- } else {
+ }
+ else {
if ( !bm.isNot ) {
return false;
}
@@ -760,7 +785,8 @@ namespace mongo {
BSONElement e = jsobj.getFieldUsingIndexNames(rm.fieldName, constrainIndexKey_);
if ( !e.eoo() )
s.insert( e );
- } else {
+ }
+ else {
jsobj.getFieldsDotted( rm.fieldName, s );
}
bool match = false;
@@ -770,11 +796,11 @@ namespace mongo {
if ( !match ^ rm.isNot )
return false;
}
-
+
if ( _orMatchers.size() > 0 ) {
bool match = false;
for( list< shared_ptr< Matcher > >::const_iterator i = _orMatchers.begin();
- i != _orMatchers.end(); ++i ) {
+ i != _orMatchers.end(); ++i ) {
// SERVER-205 don't submit details - we don't want to track field
// matched within $or, and at this point we've already loaded the
// whole document
@@ -787,55 +813,56 @@ namespace mongo {
return false;
}
}
-
+
if ( _norMatchers.size() > 0 ) {
for( list< shared_ptr< Matcher > >::const_iterator i = _norMatchers.begin();
- i != _norMatchers.end(); ++i ) {
+ i != _norMatchers.end(); ++i ) {
// SERVER-205 don't submit details - we don't want to track field
// matched within $nor, and at this point we've already loaded the
// whole document
if ( (*i)->matches( jsobj ) ) {
return false;
}
- }
+ }
}
-
+
for( vector< shared_ptr< FieldRangeVector > >::const_iterator i = _orConstraints.begin();
- i != _orConstraints.end(); ++i ) {
+ i != _orConstraints.end(); ++i ) {
if ( (*i)->matches( jsobj ) ) {
return false;
}
}
-
+
if ( where ) {
if ( where->func == 0 ) {
uassert( 10070 , "$where compile error", false);
return false; // didn't compile
}
-
- if ( where->jsScope ){
+
+ if ( where->jsScope ) {
where->scope->init( where->jsScope );
}
where->scope->setThis( const_cast< BSONObj * >( &jsobj ) );
where->scope->setObject( "obj", const_cast< BSONObj & >( jsobj ) );
where->scope->setBoolean( "fullObject" , true ); // this is a hack b/c fullObject used to be relevant
-
+
int err = where->scope->invoke( where->func , BSONObj() , 1000 * 60 , false );
where->scope->setThis( 0 );
if ( err == -3 ) { // INVOKE_ERROR
stringstream ss;
- ss << "error on invocation of $where function:\n"
+ ss << "error on invocation of $where function:\n"
<< where->scope->getError();
uassert( 10071 , ss.str(), false);
return false;
- } else if ( err != 0 ) { // ! INVOKE_SUCCESS
+ }
+ else if ( err != 0 ) { // ! INVOKE_SUCCESS
uassert( 10072 , "unknown error in invocation of $where function", false);
- return false;
+ return false;
}
return where->scope->getBoolean( "return" ) != 0;
}
-
+
return true;
}
@@ -880,9 +907,9 @@ namespace mongo {
}
}
return true;
- }
-
-
+ }
+
+
/*- just for testing -- */
#pragma pack(1)
struct JSObj1 {
@@ -946,7 +973,7 @@ namespace mongo {
assert( !n.matches(j2) );
BSONObj j0 = BSONObj();
-// BSONObj j0((const char *) &js0);
+// BSONObj j0((const char *) &js0);
Matcher p(j0);
assert( p.matches(j1) );
assert( p.matches(j2) );
@@ -959,7 +986,7 @@ namespace mongo {
RXTest() {
}
-
+
void run() {
/*
static const boost::regex e("(\\d{4}[- ]){3}\\d{4}");
@@ -969,7 +996,7 @@ namespace mongo {
*/
int ret = 0;
-
+
pcre_config( PCRE_CONFIG_UTF8 , &ret );
massert( 10342 , "pcre not compiled with utf8 support" , ret );
@@ -987,7 +1014,7 @@ namespace mongo {
pcre_config( PCRE_CONFIG_UNICODE_PROPERTIES , &ret );
if ( ! ret )
cout << "warning: some regex utf8 things will not work. pcre build doesn't have --enable-unicode-properties" << endl;
-
+
}
} rxtest;
diff --git a/db/matcher.h b/db/matcher.h
index a4e1667..d242df6 100644
--- a/db/matcher.h
+++ b/db/matcher.h
@@ -24,7 +24,7 @@
#include <pcrecpp.h>
namespace mongo {
-
+
class Cursor;
class CoveredIndexMatcher;
class Matcher;
@@ -40,11 +40,9 @@ namespace mongo {
bool isNot;
RegexMatcher() : isNot() {}
};
-
- struct element_lt
- {
- bool operator()(const BSONElement& l, const BSONElement& r) const
- {
+
+ struct element_lt {
+ bool operator()(const BSONElement& l, const BSONElement& r) const {
int x = (int) l.canonicalType() - (int) r.canonicalType();
if ( x < 0 ) return true;
else if ( x > 0 ) return false;
@@ -52,17 +50,17 @@ namespace mongo {
}
};
-
+
class ElementMatcher {
public:
-
+
ElementMatcher() {
}
-
+
ElementMatcher( BSONElement _e , int _op, bool _isNot );
-
+
ElementMatcher( BSONElement _e , int _op , const BSONObj& array, bool _isNot );
-
+
~ElementMatcher() { }
BSONElement toMatch;
@@ -70,13 +68,14 @@ namespace mongo {
bool isNot;
shared_ptr< set<BSONElement,element_lt> > myset;
shared_ptr< vector<RegexMatcher> > myregex;
-
+
// these are for specific operators
int mod;
int modm;
BSONType type;
shared_ptr<Matcher> subMatcher;
+ bool subMatcherOnPrimitives ;
vector< shared_ptr<Matcher> > allMatchers;
};
@@ -85,15 +84,15 @@ namespace mongo {
class DiskLoc;
struct MatchDetails {
- MatchDetails(){
+ MatchDetails() {
reset();
}
-
- void reset(){
+
+ void reset() {
loadedObject = false;
elemMatchKey = 0;
}
-
+
string toString() const {
stringstream ss;
ss << "loadedObject: " << loadedObject << " ";
@@ -129,7 +128,7 @@ namespace mongo {
const char *fieldName,
const BSONElement &toMatch, const BSONObj &obj,
const ElementMatcher&bm, MatchDetails * details );
-
+
public:
static int opDirection(int op) {
return op <= BSONObj::LTE ? -1 : 1;
@@ -140,14 +139,14 @@ namespace mongo {
~Matcher();
bool matches(const BSONObj& j, MatchDetails * details = 0 );
-
+
// fast rough check to see if we must load the real doc - we also
// compare field counts against covered index matcher; for $or clauses
// we just compare field counts
bool keyMatch() const { return !all && !haveSize && !hasArray && !haveNeg; }
bool atomic() const { return _atomic; }
-
+
bool hasType( BSONObj::MatchType type ) const;
string toString() const {
@@ -157,18 +156,18 @@ namespace mongo {
void addOrConstraint( const shared_ptr< FieldRangeVector > &frv ) {
_orConstraints.push_back( frv );
}
-
+
void popOrClause() {
_orMatchers.pop_front();
}
-
+
bool sameCriteriaCount( const Matcher &other ) const;
-
+
private:
// Only specify constrainIndexKey if matches() will be called with
// index keys having empty string field names.
Matcher( const Matcher &other, const BSONObj &constrainIndexKey );
-
+
void addBasic(const BSONElement &e, int c, bool isNot) {
// TODO May want to selectively ignore these element types based on op type.
if ( e.type() == MinKey || e.type() == MaxKey )
@@ -178,7 +177,7 @@ namespace mongo {
void addRegex(const char *fieldName, const char *regex, const char *flags, bool isNot = false);
bool addOp( const BSONElement &e, const BSONElement &fe, bool isNot, const char *& regex, const char *&flags );
-
+
int valuesMatch(const BSONElement& l, const BSONElement& r, int op, const ElementMatcher& bm);
bool parseOrNor( const BSONElement &e, bool subMatcher );
@@ -194,7 +193,7 @@ namespace mongo {
bool haveNeg;
/* $atomic - if true, a multi document operation (some removes, updates)
- should be done atomically. in that case, we do not yield -
+ should be done atomically. in that case, we do not yield -
i.e. we stay locked the whole time.
            http://www.mongodb.org/display/DOCS/Removing
*/
@@ -211,26 +210,27 @@ namespace mongo {
friend class CoveredIndexMatcher;
};
-
+
// If match succeeds on index key, then attempt to match full document.
class CoveredIndexMatcher : boost::noncopyable {
public:
CoveredIndexMatcher(const BSONObj &pattern, const BSONObj &indexKeyPattern , bool alwaysUseRecord=false );
- bool matches(const BSONObj &o){ return _docMatcher->matches( o ); }
- bool matches(const BSONObj &key, const DiskLoc &recLoc , MatchDetails * details = 0 );
+ bool matches(const BSONObj &o) { return _docMatcher->matches( o ); }
+ bool matches(const BSONObj &key, const DiskLoc &recLoc , MatchDetails * details = 0 , bool keyUsable = true );
bool matchesCurrent( Cursor * cursor , MatchDetails * details = 0 );
- bool needRecord(){ return _needRecord; }
-
+ bool needRecord() { return _needRecord; }
+
Matcher& docMatcher() { return *_docMatcher; }
// once this is called, shouldn't use this matcher for matching any more
void advanceOrClause( const shared_ptr< FieldRangeVector > &frv ) {
_docMatcher->addOrConstraint( frv );
- // TODO this is not an optimal optimization, since we could skip an entire
+ // TODO this is not yet optimal. Since we could skip an entire
// or clause (if a match is impossible) between calls to advanceOrClause()
+ // we may not pop all the clauses we can.
_docMatcher->popOrClause();
}
-
+
CoveredIndexMatcher *nextClauseMatcher( const BSONObj &indexKeyPattern, bool alwaysUseRecord=false ) {
return new CoveredIndexMatcher( _docMatcher, indexKeyPattern, alwaysUseRecord );
}
@@ -239,7 +239,10 @@ namespace mongo {
void init( bool alwaysUseRecord );
shared_ptr< Matcher > _docMatcher;
Matcher _keyMatcher;
- bool _needRecord;
+
+ bool _needRecord; // if the key itself isn't good enough to determine a positive match
+ bool _needRecordReject; // if the key itself isn't good enough to determine a negative match
+ bool _useRecordOnly;
};
-
+
} // namespace mongo
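
element_lt above is a two-level strict weak ordering: coarse canonical type first, then the value itself. A generic sketch of the same comparator pattern for std::set, using an illustrative Item type rather than the real BSONElement API:

    #include <set>

    struct Item { int typeClass; int value; };

    struct item_lt {
        bool operator()(const Item& l, const Item& r) const {
            int x = l.typeClass - r.typeClass;   // order by coarse type class first
            if (x < 0) return true;
            if (x > 0) return false;
            return l.value < r.value;            // then by value within the same class
        }
    };

    typedef std::set<Item, item_lt> ItemSet;
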
diff --git a/db/matcher_covered.cpp b/db/matcher_covered.cpp
index 5866505..18892be 100644
--- a/db/matcher_covered.cpp
+++ b/db/matcher_covered.cpp
@@ -33,48 +33,51 @@ namespace mongo {
CoveredIndexMatcher::CoveredIndexMatcher( const BSONObj &jsobj, const BSONObj &indexKeyPattern, bool alwaysUseRecord) :
_docMatcher( new Matcher( jsobj ) ),
- _keyMatcher( *_docMatcher, indexKeyPattern )
- {
+ _keyMatcher( *_docMatcher, indexKeyPattern ) {
init( alwaysUseRecord );
}
-
+
CoveredIndexMatcher::CoveredIndexMatcher( const shared_ptr< Matcher > &docMatcher, const BSONObj &indexKeyPattern , bool alwaysUseRecord ) :
_docMatcher( docMatcher ),
- _keyMatcher( *_docMatcher, indexKeyPattern )
- {
+ _keyMatcher( *_docMatcher, indexKeyPattern ) {
init( alwaysUseRecord );
}
void CoveredIndexMatcher::init( bool alwaysUseRecord ) {
- _needRecord =
- alwaysUseRecord ||
- ! ( _docMatcher->keyMatch() &&
- _keyMatcher.sameCriteriaCount( *_docMatcher ) &&
- ! _keyMatcher.hasType( BSONObj::opEXISTS ) );
- ;
+ _needRecord =
+ alwaysUseRecord ||
+ ! ( _docMatcher->keyMatch() &&
+ _keyMatcher.sameCriteriaCount( *_docMatcher ) );
+
+ _needRecordReject = _keyMatcher.hasType( BSONObj::opEXISTS );
}
-
- bool CoveredIndexMatcher::matchesCurrent( Cursor * cursor , MatchDetails * details ){
- return matches( cursor->currKey() , cursor->currLoc() , details );
+
+ bool CoveredIndexMatcher::matchesCurrent( Cursor * cursor , MatchDetails * details ) {
+ // bool keyUsable = ! cursor->isMultiKey() && check for $orish like conditions in matcher SERVER-1264
+ return matches( cursor->currKey() , cursor->currLoc() , details );
}
-
- bool CoveredIndexMatcher::matches(const BSONObj &key, const DiskLoc &recLoc , MatchDetails * details ) {
+
+ bool CoveredIndexMatcher::matches(const BSONObj &key, const DiskLoc &recLoc , MatchDetails * details , bool keyUsable ) {
if ( details )
details->reset();
-
- if ( !_keyMatcher.matches(key, details ) ){
- return false;
- }
-
- if ( ! _needRecord ){
- return true;
+
+ if ( _needRecordReject == false && keyUsable ) {
+
+ if ( !_keyMatcher.matches(key, details ) ) {
+ return false;
+ }
+
+ if ( ! _needRecord ) {
+ return true;
+ }
+
}
if ( details )
details->loadedObject = true;
- return _docMatcher->matches(recLoc.rec() , details );
+ return _docMatcher->matches(recLoc.obj() , details );
}
-
+
}
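
The new matches() overload only trusts the index key when the key is usable and no reject-side check such as $exists is present; otherwise it falls back to the full document. A minimal standalone sketch of that decision flow, with the keyMatches/docMatches callbacks standing in for the real Matcher calls (the names are illustrative, not the actual API):

    #include <functional>

    // Illustrative only: tracks the same flags the covered matcher keeps.
    struct CoveredMatchSketch {
        bool needRecord;        // index key alone cannot confirm a positive match
        bool needRecordReject;  // index key alone cannot confirm a negative match (e.g. $exists)

        bool matches(bool keyUsable,
                     const std::function<bool()>& keyMatches,
                     const std::function<bool()>& docMatches) const {
            if (!needRecordReject && keyUsable) {
                if (!keyMatches())
                    return false;   // the key is enough to reject
                if (!needRecord)
                    return true;    // the key is enough to accept
            }
            return docMatches();    // otherwise load and match the full document
        }
    };
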
diff --git a/db/minilex.h b/db/minilex.h
index ba8df26..677514a 100644
--- a/db/minilex.h
+++ b/db/minilex.h
@@ -17,37 +17,39 @@
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
+#error does anything use this?
+
namespace mongo {
#if defined(_WIN32)
-
+
} // namespace mongo
#include <hash_map>
using namespace stdext;
namespace mongo {
-
+
typedef const char * MyStr;
struct less_str {
bool operator()(const MyStr & x, const MyStr & y) const {
if ( strcmp(x, y) > 0)
return true;
-
+
return false;
}
};
-
+
typedef hash_map<const char*, int, hash_compare<const char *, less_str> > strhashmap;
-
+
#else
-
+
} // namespace mongo
#include <ext/hash_map>
namespace mongo {
-
+
using namespace __gnu_cxx;
typedef const char * MyStr;
@@ -55,106 +57,108 @@ namespace mongo {
bool operator()(const MyStr & x, const MyStr & y) const {
if ( strcmp(x, y) == 0)
return true;
-
+
return false;
}
};
-
+
typedef hash_map<const char*, int, hash<const char *>, eq_str > strhashmap;
-
+
#endif
-
- struct MiniLex {
+
+ /*
+ struct MiniLexNotUsed {
strhashmap reserved;
bool ic[256]; // ic=Identifier Character
bool starter[256];
// dm: very dumb about comments and escaped quotes -- but we are faster then at least,
// albeit returning too much (which is ok for jsbobj current usage).
- void grabVariables(char *code /*modified and must stay in scope*/, strhashmap& vars) {
- char *p = code;
- char last = 0;
- while ( *p ) {
- if ( starter[*p] ) {
- char *q = p+1;
- while ( *q && ic[*q] ) q++;
- const char *identifier = p;
- bool done = *q == 0;
- *q = 0;
- if ( !reserved.count(identifier) ) {
- // we try to be smart about 'obj' but have to be careful as obj.obj
- // can happen; this is so that nFields is right for simplistic where cases
- // so we can stop scanning in jsobj when we find the field of interest.
- if ( strcmp(identifier,"obj")==0 && p>code && p[-1] != '.' )
- ;
- else
- vars[identifier] = 1;
- }
- if ( done )
- break;
- p = q + 1;
- continue;
- }
-
- if ( *p == '\'' ) {
- p++;
- while ( *p && *p != '\'' ) p++;
- }
- else if ( *p == '"' ) {
- p++;
- while ( *p && *p != '"' ) p++;
- }
- p++;
+ void grabVariables(char *code , strhashmap& vars) { // 'code' modified and must stay in scope*/
+ char *p = code;
+ char last = 0;
+ while ( *p ) {
+ if ( starter[*p] ) {
+ char *q = p+1;
+ while ( *q && ic[*q] ) q++;
+ const char *identifier = p;
+ bool done = *q == 0;
+ *q = 0;
+ if ( !reserved.count(identifier) ) {
+ // we try to be smart about 'obj' but have to be careful as obj.obj
+ // can happen; this is so that nFields is right for simplistic where cases
+ // so we can stop scanning in jsobj when we find the field of interest.
+ if ( strcmp(identifier,"obj")==0 && p>code && p[-1] != '.' )
+ ;
+ else
+ vars[identifier] = 1;
}
+ if ( done )
+ break;
+ p = q + 1;
+ continue;
}
- MiniLex() {
- strhashmap atest;
- atest["foo"] = 3;
- assert( atest.count("bar") == 0 );
- assert( atest.count("foo") == 1 );
- assert( atest["foo"] == 3 );
-
- for ( int i = 0; i < 256; i++ ) {
- ic[i] = starter[i] = false;
- }
- for ( int i = 'a'; i <= 'z'; i++ )
- ic[i] = starter[i] = true;
- for ( int i = 'A'; i <= 'Z'; i++ )
- ic[i] = starter[i] = true;
- for ( int i = '0'; i <= '9'; i++ )
- ic[i] = true;
- for ( int i = 128; i < 256; i++ )
- ic[i] = starter[i] = true;
- ic['$'] = starter['$'] = true;
- ic['_'] = starter['_'] = true;
-
- reserved["break"] = true;
- reserved["case"] = true;
- reserved["catch"] = true;
- reserved["continue"] = true;
- reserved["default"] = true;
- reserved["delete"] = true;
- reserved["do"] = true;
- reserved["else"] = true;
- reserved["finally"] = true;
- reserved["for"] = true;
- reserved["function"] = true;
- reserved["if"] = true;
- reserved["in"] = true;
- reserved["instanceof"] = true;
- reserved["new"] = true;
- reserved["return"] = true;
- reserved["switch"] = true;
- reserved["this"] = true;
- reserved["throw"] = true;
- reserved["try"] = true;
- reserved["typeof"] = true;
- reserved["var"] = true;
- reserved["void"] = true;
- reserved["while"] = true;
- reserved["with "] = true;
+ if ( *p == '\'' ) {
+ p++;
+ while ( *p && *p != '\'' ) p++;
}
- };
+ else if ( *p == '"' ) {
+ p++;
+ while ( *p && *p != '"' ) p++;
+ }
+ p++;
+ }
+}
+
+MiniLex() {
+ strhashmap atest;
+ atest["foo"] = 3;
+ assert( atest.count("bar") == 0 );
+ assert( atest.count("foo") == 1 );
+ assert( atest["foo"] == 3 );
+
+ for ( int i = 0; i < 256; i++ ) {
+ ic[i] = starter[i] = false;
+ }
+ for ( int i = 'a'; i <= 'z'; i++ )
+ ic[i] = starter[i] = true;
+ for ( int i = 'A'; i <= 'Z'; i++ )
+ ic[i] = starter[i] = true;
+ for ( int i = '0'; i <= '9'; i++ )
+ ic[i] = true;
+ for ( int i = 128; i < 256; i++ )
+ ic[i] = starter[i] = true;
+ ic['$'] = starter['$'] = true;
+ ic['_'] = starter['_'] = true;
+
+ reserved["break"] = true;
+ reserved["case"] = true;
+ reserved["catch"] = true;
+ reserved["continue"] = true;
+ reserved["default"] = true;
+ reserved["delete"] = true;
+ reserved["do"] = true;
+ reserved["else"] = true;
+ reserved["finally"] = true;
+ reserved["for"] = true;
+ reserved["function"] = true;
+ reserved["if"] = true;
+ reserved["in"] = true;
+ reserved["instanceof"] = true;
+ reserved["new"] = true;
+ reserved["return"] = true;
+ reserved["switch"] = true;
+ reserved["this"] = true;
+ reserved["throw"] = true;
+ reserved["try"] = true;
+ reserved["typeof"] = true;
+ reserved["var"] = true;
+ reserved["void"] = true;
+ reserved["while"] = true;
+ reserved["with "] = true;
+}
+};
+*/
} // namespace mongo
diff --git a/db/module.cpp b/db/module.cpp
index 1e4f511..6a182f2 100644
--- a/db/module.cpp
+++ b/db/module.cpp
@@ -24,29 +24,29 @@ namespace mongo {
std::list<Module*> * Module::_all;
Module::Module( const string& name )
- : _name( name ) , _options( (string)"Module " + name + " options" ){
+ : _name( name ) , _options( (string)"Module " + name + " options" ) {
if ( ! _all )
_all = new list<Module*>();
_all->push_back( this );
}
- Module::~Module(){}
+ Module::~Module() {}
- void Module::addOptions( program_options::options_description& options ){
+ void Module::addOptions( program_options::options_description& options ) {
if ( ! _all ) {
return;
}
- for ( list<Module*>::iterator i=_all->begin(); i!=_all->end(); i++ ){
+ for ( list<Module*>::iterator i=_all->begin(); i!=_all->end(); i++ ) {
Module* m = *i;
options.add( m->_options );
}
}
- void Module::configAll( program_options::variables_map& params ){
+ void Module::configAll( program_options::variables_map& params ) {
if ( ! _all ) {
return;
}
- for ( list<Module*>::iterator i=_all->begin(); i!=_all->end(); i++ ){
+ for ( list<Module*>::iterator i=_all->begin(); i!=_all->end(); i++ ) {
Module* m = *i;
m->config( params );
}
@@ -54,11 +54,11 @@ namespace mongo {
}
- void Module::initAll(){
+ void Module::initAll() {
if ( ! _all ) {
return;
}
- for ( list<Module*>::iterator i=_all->begin(); i!=_all->end(); i++ ){
+ for ( list<Module*>::iterator i=_all->begin(); i!=_all->end(); i++ ) {
Module* m = *i;
m->init();
}
diff --git a/db/module.h b/db/module.h
index d4939dd..e90923a 100644
--- a/db/module.h
+++ b/db/module.h
@@ -34,8 +34,8 @@ namespace mongo {
public:
Module( const string& name );
virtual ~Module();
-
- boost::program_options::options_description_easy_init add_options(){
+
+ boost::program_options::options_description_easy_init add_options() {
return _options.add_options();
}
@@ -54,10 +54,10 @@ namespace mongo {
*/
virtual void shutdown() = 0;
- const string& getName(){ return _name; }
-
+ const string& getName() { return _name; }
+
// --- static things
-
+
static void addOptions( program_options::options_description& options );
static void configAll( program_options::variables_map& params );
static void initAll();
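
Module relies on the usual static self-registration pattern: each instance's constructor pushes itself onto a lazily created global list, and the static addOptions/configAll/initAll helpers walk that list. A stripped-down sketch of just that pattern (ModuleSketch is illustrative, not the real class):

    #include <list>
    #include <string>

    class ModuleSketch {
    public:
        explicit ModuleSketch(const std::string& name) : _name(name) {
            if (!_all)
                _all = new std::list<ModuleSketch*>();  // created lazily so static init order is safe
            _all->push_back(this);
        }
        virtual ~ModuleSketch() {}
        virtual void init() = 0;

        static void initAll() {
            if (!_all) return;
            for (std::list<ModuleSketch*>::iterator i = _all->begin(); i != _all->end(); ++i)
                (*i)->init();
        }

    private:
        std::string _name;
        static std::list<ModuleSketch*>* _all;
    };

    std::list<ModuleSketch*>* ModuleSketch::_all = 0;
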
diff --git a/db/modules/mms.cpp b/db/modules/mms.cpp
index 40e9001..b180262 100644
--- a/db/modules/mms.cpp
+++ b/db/modules/mms.cpp
@@ -37,54 +37,54 @@ namespace mongo {
MMS()
: Module( "mms" ) , _baseurl( "" ) ,
_secsToSleep(1) , _token( "" ) , _name( "" ) {
-
+
add_options()
- ( "mms-url" , po::value<string>()->default_value("http://mms.10gen.com/ping") , "url for mongo monitoring server" )
- ( "mms-token" , po::value<string>() , "account token for mongo monitoring server" )
- ( "mms-name" , po::value<string>() , "server name for mongo monitoring server" )
- ( "mms-interval" , po::value<int>()->default_value(30) , "ping interval (in seconds) for mongo monitoring server" )
- ;
- }
-
- ~MMS(){}
-
- void config( program_options::variables_map& params ){
+ ( "mms-url" , po::value<string>()->default_value("http://mms.10gen.com/ping") , "url for mongo monitoring server" )
+ ( "mms-token" , po::value<string>() , "account token for mongo monitoring server" )
+ ( "mms-name" , po::value<string>() , "server name for mongo monitoring server" )
+ ( "mms-interval" , po::value<int>()->default_value(30) , "ping interval (in seconds) for mongo monitoring server" )
+ ;
+ }
+
+ ~MMS() {}
+
+ void config( program_options::variables_map& params ) {
_baseurl = params["mms-url"].as<string>();
- if ( params.count( "mms-token" ) ){
+ if ( params.count( "mms-token" ) ) {
_token = params["mms-token"].as<string>();
}
- if ( params.count( "mms-name" ) ){
+ if ( params.count( "mms-name" ) ) {
_name = params["mms-name"].as<string>();
}
_secsToSleep = params["mms-interval"].as<int>();
}
-
- void run(){
- if ( _token.size() == 0 && _name.size() == 0 ){
+
+ void run() {
+ if ( _token.size() == 0 && _name.size() == 0 ) {
log(1) << "mms not configured" << endl;
return;
}
- if ( _token.size() == 0 ){
+ if ( _token.size() == 0 ) {
log() << "no token for mms - not running" << endl;
return;
}
-
- if ( _name.size() == 0 ){
+
+ if ( _name.size() == 0 ) {
log() << "no name for mms - not running" << endl;
return;
}
-
+
            log() << "mms monitor starting... token:" << _token << " name:" << _name << " interval: " << _secsToSleep << endl;
Client::initThread( "mms" );
Client& c = cc();
-
-
+
+
// TODO: using direct client is bad, but easy for now
-
- while ( ! inShutdown() ){
+
+ while ( ! inShutdown() ) {
sleepsecs( _secsToSleep );
-
+
try {
stringstream url;
url << _baseurl << "?"
@@ -92,47 +92,47 @@ namespace mongo {
<< "name=" << _name << "&"
<< "ts=" << time(0)
;
-
+
BSONObjBuilder bb;
// duplicated so the post has everything
bb.append( "token" , _token );
bb.append( "name" , _name );
bb.appendDate( "ts" , jsTime() );
-
+
// any commands
_add( bb , "buildinfo" );
_add( bb , "serverStatus" );
-
+
BSONObj postData = bb.obj();
-
+
log(1) << "mms url: " << url.str() << "\n\t post: " << postData << endl;;
-
+
HttpClient c;
HttpClient::Result r;
int rc = c.post( url.str() , postData.jsonString() , &r );
log(1) << "\t response code: " << rc << endl;
- if ( rc != 200 ){
+ if ( rc != 200 ) {
log() << "mms error response code:" << rc << endl;
log(1) << "mms error body:" << r.getEntireResponse() << endl;
}
}
- catch ( std::exception& e ){
+ catch ( std::exception& e ) {
log() << "mms exception: " << e.what() << endl;
}
}
-
+
c.shutdown();
}
-
- void _add( BSONObjBuilder& postData , const char* cmd ){
+
+ void _add( BSONObjBuilder& postData , const char* cmd ) {
Command * c = Command::findCommand( cmd );
- if ( ! c ){
+ if ( ! c ) {
log() << "MMS can't find command: " << cmd << endl;
postData.append( cmd , "can't find command" );
return;
}
-
- if ( c->locktype() ){
+
+ if ( c->locktype() ) {
log() << "MMS can only use noLocking commands not: " << cmd << endl;
postData.append( cmd , "not noLocking" );
return;
@@ -147,24 +147,24 @@ namespace mongo {
else
postData.append( cmd , sub.obj() );
}
-
- void init(){ go(); }
- void shutdown(){
+ void init() { go(); }
+
+ void shutdown() {
// TODO
}
private:
string _baseurl;
int _secsToSleep;
-
+
string _token;
string _name;
-
+
} /*mms*/ ;
}
-
+
diff --git a/db/mongommf.cpp b/db/mongommf.cpp
new file mode 100644
index 0000000..5ae573d
--- /dev/null
+++ b/db/mongommf.cpp
@@ -0,0 +1,391 @@
+// @file mongommf.cpp
+
+/**
+* Copyright (C) 2010 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/* this module adds some of our layers atop memory mapped files - specifically our handling of private views & such
+ if you don't care about journaling/durability (temp sort files & such) use MemoryMappedFile class, not this.
+*/
+
+#include "pch.h"
+#include "cmdline.h"
+#include "mongommf.h"
+#include "dur.h"
+#include "dur_journalformat.h"
+#include "../util/mongoutils/str.h"
+
+using namespace mongoutils;
+
+namespace mongo {
+
+#if defined(_WIN32)
+ extern mutex mapViewMutex;
+
+ __declspec(noinline) void makeChunkWritable(size_t chunkno) {
+ scoped_lock lk(mapViewMutex);
+
+ if( writable.get(chunkno) ) // double check lock
+ return;
+
+ // remap all maps in this chunk. common case is a single map, but could have more than one with smallfiles or .ns files
+ size_t chunkStart = chunkno * MemoryMappedFile::ChunkSize;
+ size_t chunkNext = chunkStart + MemoryMappedFile::ChunkSize;
+
+ scoped_lock lk2(privateViews._mutex());
+ map<void*,MongoMMF*>::iterator i = privateViews.finditer_inlock((void*) (chunkNext-1));
+ while( 1 ) {
+ const pair<void*,MongoMMF*> x = *(--i);
+ MongoMMF *mmf = x.second;
+ if( mmf == 0 )
+ break;
+
+ size_t viewStart = (size_t) x.first;
+ size_t viewEnd = viewStart + mmf->length();
+ if( viewEnd <= chunkStart )
+ break;
+
+ size_t protectStart = max(viewStart, chunkStart);
+ dassert(protectStart<chunkNext);
+
+ size_t protectEnd = min(viewEnd, chunkNext);
+ size_t protectSize = protectEnd - protectStart;
+ dassert(protectSize>0&&protectSize<=MemoryMappedFile::ChunkSize);
+
+ DWORD old;
+ bool ok = VirtualProtect((void*)protectStart, protectSize, PAGE_WRITECOPY, &old);
+ if( !ok ) {
+ DWORD e = GetLastError();
+ log() << "VirtualProtect failed " << chunkno << hex << protectStart << ' ' << protectSize << ' ' << errnoWithDescription(e) << endl;
+ assert(false);
+ }
+ }
+
+ writable.set(chunkno);
+ }
+
+ __declspec(noinline) void makeChunkWritableOld(size_t chunkno) {
+ scoped_lock lk(mapViewMutex);
+
+ if( writable.get(chunkno) )
+ return;
+
+ size_t loc = chunkno * MemoryMappedFile::ChunkSize;
+ void *Loc = (void*) loc;
+ size_t ofs;
+ MongoMMF *mmf = privateViews.find( (void *) (loc), ofs );
+ MemoryMappedFile *f = (MemoryMappedFile*) mmf;
+ assert(f);
+
+ size_t len = MemoryMappedFile::ChunkSize;
+ assert( mmf->getView() <= Loc );
+ if( ofs + len > f->length() ) {
+ // at the very end of the map
+ len = f->length() - ofs;
+ }
+ else {
+ ;
+ }
+
+ // todo: check this goes away on remap
+ DWORD old;
+ bool ok = VirtualProtect(Loc, len, PAGE_WRITECOPY, &old);
+ if( !ok ) {
+ DWORD e = GetLastError();
+ log() << "VirtualProtect failed " << Loc << ' ' << len << ' ' << errnoWithDescription(e) << endl;
+ assert(false);
+ }
+
+ writable.set(chunkno);
+ }
+
+ // align so that there is only one map per chunksize so our bitset works right
+ void* mapaligned(HANDLE h, unsigned long long _len) {
+ void *loc = 0;
+ int n = 0;
+ while( 1 ) {
+ n++;
+ void *m = MapViewOfFileEx(h, FILE_MAP_READ, 0, 0, 0, loc);
+ if( m == 0 ) {
+ DWORD e = GetLastError();
+ if( n == 0 ) {
+ // if first fails, it isn't going to work
+ log() << "mapaligned errno: " << e << endl;
+ break;
+ }
+ if( debug && n == 1 ) {
+ log() << "mapaligned info e:" << e << " at n=1" << endl;
+ }
+ if( n > 98 ) {
+ log() << "couldn't align mapped view of file len:" << _len/1024.0/1024.0 << "MB errno:" << e << endl;
+ break;
+ }
+ loc = (void*) (((size_t)loc)+MemoryMappedFile::ChunkSize);
+ continue;
+ }
+
+ size_t x = (size_t) m;
+ if( x % MemoryMappedFile::ChunkSize == 0 ) {
+ void *end = (void*) (x+_len);
+ DEV log() << "mapaligned " << m << '-' << end << " len:" << _len << endl;
+ return m;
+ }
+
+ UnmapViewOfFile(m);
+ x = ((x+MemoryMappedFile::ChunkSize-1) / MemoryMappedFile::ChunkSize) * MemoryMappedFile::ChunkSize;
+ loc = (void*) x;
+ if( n % 20 == 0 ) {
+ log() << "warning mapaligned n=20" << endl;
+ }
+ if( n > 100 ) {
+ log() << "couldn't align mapped view of file len:" << _len/1024.0/1024.0 << "MB" << endl;
+ break;
+ }
+ }
+ return 0;
+ }
+
+ void* MemoryMappedFile::createPrivateMap() {
+ assert( maphandle );
+ scoped_lock lk(mapViewMutex);
+ //void *p = mapaligned(maphandle, len);
+ void *p = MapViewOfFile(maphandle, FILE_MAP_READ, 0, 0, 0);
+ if ( p == 0 ) {
+ DWORD e = GetLastError();
+ log() << "createPrivateMap failed " << filename() << " " << errnoWithDescription(e) << endl;
+ }
+ else {
+ clearWritableBits(p);
+ views.push_back(p);
+ }
+ return p;
+ }
+
+ void* MemoryMappedFile::remapPrivateView(void *oldPrivateAddr) {
+ dbMutex.assertWriteLocked(); // short window where we are unmapped so must be exclusive
+
+ // the mapViewMutex is to assure we get the same address on the remap
+ scoped_lock lk(mapViewMutex);
+
+ clearWritableBits(oldPrivateAddr);
+
+ if( !UnmapViewOfFile(oldPrivateAddr) ) {
+ DWORD e = GetLastError();
+ log() << "UnMapViewOfFile failed " << filename() << ' ' << errnoWithDescription(e) << endl;
+ assert(false);
+ }
+
+ // we want the new address to be the same as the old address in case things keep pointers around (as namespaceindex does).
+ void *p = MapViewOfFileEx(maphandle, FILE_MAP_READ, 0, 0,
+ /*dwNumberOfBytesToMap 0 means to eof*/0 /*len*/,
+ oldPrivateAddr);
+
+ if ( p == 0 ) {
+ DWORD e = GetLastError();
+ log() << "MapViewOfFileEx failed " << filename() << " " << errnoWithDescription(e) << endl;
+ assert(p);
+ }
+ assert(p == oldPrivateAddr);
+ return p;
+ }
+#endif
+
+ void MongoMMF::remapThePrivateView() {
+ assert( cmdLine.dur );
+
+ // todo 1.9 : it turns out we require that we always remap to the same address.
+ // so the remove / add isn't necessary and can be removed
+ privateViews.remove(_view_private);
+ _view_private = remapPrivateView(_view_private);
+ privateViews.add(_view_private, this);
+ }
+
+ /** register view. threadsafe */
+ void PointerToMMF::add(void *view, MongoMMF *f) {
+ assert(view);
+ assert(f);
+ mutex::scoped_lock lk(_m);
+ _views.insert( pair<void*,MongoMMF*>(view,f) );
+ }
+
+ /** de-register view. threadsafe */
+ void PointerToMMF::remove(void *view) {
+ if( view ) {
+ mutex::scoped_lock lk(_m);
+ _views.erase(view);
+ }
+ }
+
+ PointerToMMF::PointerToMMF() : _m("PointerToMMF") {
+#if defined(SIZE_MAX)
+ size_t max = SIZE_MAX;
+#else
+ size_t max = ~((size_t)0);
+#endif
+ assert( max > (size_t) this ); // just checking that no one redef'd SIZE_MAX and that it is sane
+
+ // this way we don't need any boundary checking in _find()
+ _views.insert( pair<void*,MongoMMF*>((void*)0,(MongoMMF*)0) );
+ _views.insert( pair<void*,MongoMMF*>((void*)max,(MongoMMF*)0) );
+ }
+
+ /** underscore version of find is for when you are already locked
+ @param ofs out return our offset in the view
+ @return the MongoMMF to which this pointer belongs
+ */
+ MongoMMF* PointerToMMF::find_inlock(void *p, /*out*/ size_t& ofs) {
+ //
+ // .................memory..........................
+ // v1 p v2
+ // [--------------------] [-------]
+ //
+ // e.g., _find(p) == v1
+ //
+ const pair<void*,MongoMMF*> x = *(--_views.upper_bound(p));
+ MongoMMF *mmf = x.second;
+ if( mmf ) {
+ size_t o = ((char *)p) - ((char*)x.first);
+ if( o < mmf->length() ) {
+ ofs = o;
+ return mmf;
+ }
+ }
+ return 0;
+ }
+
+ /** find associated MMF object for a given pointer.
+ threadsafe
+ @param ofs out returns offset into the view of the pointer, if found.
+ @return the MongoMMF to which this pointer belongs. null if not found.
+ */
+ MongoMMF* PointerToMMF::find(void *p, /*out*/ size_t& ofs) {
+ mutex::scoped_lock lk(_m);
+ return find_inlock(p, ofs);
+ }
+
+ PointerToMMF privateViews;
+
+ /* void* MongoMMF::switchToPrivateView(void *readonly_ptr) {
+ assert( cmdLine.dur );
+ assert( testIntent );
+
+ void *p = readonly_ptr;
+
+ {
+ size_t ofs=0;
+ MongoMMF *mmf = ourReadViews.find(p, ofs);
+ if( mmf ) {
+ void *res = ((char *)mmf->_view_private) + ofs;
+ return res;
+ }
+ }
+
+ {
+ size_t ofs=0;
+ MongoMMF *mmf = privateViews.find(p, ofs);
+ if( mmf ) {
+ log() << "dur: perf warning p=" << p << " is already in the writable view of " << mmf->filename() << endl;
+ return p;
+ }
+ }
+
+ // did you call writing() with a pointer that isn't into a datafile?
+ log() << "dur error switchToPrivateView " << p << endl;
+ return p;
+ }*/
+
+ /* switch to _view_write. normally, this is a bad idea since your changes will not
+ show up in _view_private if there have been changes there; thus the leading underscore
+ as a tad of a "warning". but useful when done with some care, such as during
+ initialization.
+ */
+ void* MongoMMF::_switchToWritableView(void *p) {
+ size_t ofs;
+ MongoMMF *f = privateViews.find(p, ofs);
+ assert( f );
+ return (((char *)f->_view_write)+ofs);
+ }
+
+ extern string dbpath;
+
+ // here so that it is precomputed...
+ void MongoMMF::setPath(string f) {
+ string suffix;
+ string prefix;
+ bool ok = str::rSplitOn(f, '.', prefix, suffix);
+ uassert(13520, str::stream() << "MongoMMF only supports filenames in a certain format " << f, ok);
+ if( suffix == "ns" )
+ _fileSuffixNo = dur::JEntry::DotNsSuffix;
+ else
+ _fileSuffixNo = (int) str::toUnsigned(suffix);
+
+ _p = RelativePath::fromFullPath(prefix);
+ }
+
+ bool MongoMMF::open(string fname, bool sequentialHint) {
+ setPath(fname);
+ _view_write = mapWithOptions(fname.c_str(), sequentialHint ? SEQUENTIAL : 0);
+ return finishOpening();
+ }
+
+ bool MongoMMF::create(string fname, unsigned long long& len, bool sequentialHint) {
+ setPath(fname);
+ _view_write = map(fname.c_str(), len, sequentialHint ? SEQUENTIAL : 0);
+ return finishOpening();
+ }
+
+ bool MongoMMF::finishOpening() {
+ if( _view_write ) {
+ if( cmdLine.dur ) {
+ _view_private = createPrivateMap();
+ if( _view_private == 0 ) {
+ massert( 13636 , "createPrivateMap failed (look in log for error)" , false );
+ }
+ privateViews.add(_view_private, this); // note that testIntent builds use this, even though it points to view_write then...
+ }
+ else {
+ _view_private = _view_write;
+ }
+ return true;
+ }
+ return false;
+ }
+
+ MongoMMF::MongoMMF() : _willNeedRemap(false) {
+ _view_write = _view_private = 0;
+ }
+
+ MongoMMF::~MongoMMF() {
+ close();
+ }
+
+ namespace dur {
+ void closingFileNotification();
+ }
+
+ /*virtual*/ void MongoMMF::close() {
+ {
+ if( cmdLine.dur && _view_write/*actually was opened*/ ) {
+ if( debug )
+                log() << "closingFileNotification:" << filename() << endl;
+ dur::closingFileNotification();
+ }
+ privateViews.remove(_view_private);
+ }
+ _view_write = _view_private = 0;
+ MemoryMappedFile::close();
+ }
+
+}
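
find_inlock() above needs no bounds checks because PointerToMMF seeds the map with sentinel entries at address 0 and SIZE_MAX, so stepping back from upper_bound() always lands on a valid entry. A standalone sketch of that lookup over a plain std::map, with Region standing in for MongoMMF and no locking shown:

    #include <cstddef>
    #include <map>

    struct Region { size_t length; };              // stands in for MongoMMF

    class RegionIndex {                            // stands in for PointerToMMF
    public:
        RegionIndex() {
            // sentinel entries at 0 and SIZE_MAX: upper_bound() can always be decremented safely
            _views[(void*)0] = (Region*)0;
            _views[(void*)(~(size_t)0)] = (Region*)0;
        }
        void add(void* base, Region* r) { _views[base] = r; }
        Region* find(void* p, size_t& ofs) const {
            std::map<void*, Region*>::const_iterator i = _views.upper_bound(p);
            --i;                                   // nearest view starting at or before p
            Region* r = i->second;
            if (r) {
                size_t o = (char*)p - (char*)i->first;
                if (o < r->length) { ofs = o; return r; }
            }
            return 0;                              // p is not inside any registered view
        }
    private:
        std::map<void*, Region*> _views;
    };
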
diff --git a/db/mongommf.h b/db/mongommf.h
new file mode 100644
index 0000000..5da46fc
--- /dev/null
+++ b/db/mongommf.h
@@ -0,0 +1,140 @@
+/** @file mongommf.h
+*
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+#include "../util/mmap.h"
+#include "../util/paths.h"
+
+namespace mongo {
+
+ /** MongoMMF adds some layers atop memory mapped files - specifically our handling of private views & such.
+ if you don't care about journaling/durability (temp sort files & such) use MemoryMappedFile class,
+ not this.
+ */
+ class MongoMMF : private MemoryMappedFile {
+ public:
+ MongoMMF();
+ virtual ~MongoMMF();
+ virtual void close();
+
+ /** @return true if opened ok. */
+ bool open(string fname, bool sequentialHint);
+
+ /** @return file length */
+ unsigned long long length() const { return MemoryMappedFile::length(); }
+
+ string filename() const { return MemoryMappedFile::filename(); }
+
+ void flush(bool sync) { MemoryMappedFile::flush(sync); }
+
+    /* Creates the file with the given length if it does not exist; otherwise uses the
+       existing file length and writes it back into the passed length reference.
+ @param sequentialHint if true will be sequentially accessed
+ @return true for ok
+ */
+ bool create(string fname, unsigned long long& len, bool sequentialHint);
+
+ /* Get the "standard" view (which is the private one).
+ @return the private view.
+ */
+ void* getView() const { return _view_private; }
+
+ /* Get the "write" view (which is required for writing).
+ @return the write view.
+ */
+ void* view_write() const { return _view_write; }
+
+
+ /* switch to _view_write. normally, this is a bad idea since your changes will not
+ show up in _view_private if there have been changes there; thus the leading underscore
+ as a tad of a "warning". but useful when done with some care, such as during
+ initialization.
+ */
+ static void* _switchToWritableView(void *private_ptr);
+
+ /** for a filename a/b/c.3
+ filePath() is "a/b/c"
+ fileSuffixNo() is 3
+        if the suffix is "ns", fileSuffixNo() is -1
+ */
+ RelativePath relativePath() const {
+ DEV assert( !_p._p.empty() );
+ return _p;
+ }
+
+ int fileSuffixNo() const { return _fileSuffixNo; }
+
+ /** true if we have written.
+ set in PREPLOGBUFFER, it is NOT set immediately on write intent declaration.
+ reset to false in REMAPPRIVATEVIEW
+ */
+ bool& willNeedRemap() { return _willNeedRemap; }
+
+ void remapThePrivateView();
+
+ virtual bool isMongoMMF() { return true; }
+
+ private:
+
+ void *_view_write;
+ void *_view_private;
+ bool _willNeedRemap;
+ RelativePath _p; // e.g. "somepath/dbname"
+ int _fileSuffixNo; // e.g. 3. -1="ns"
+
+ void setPath(string pathAndFileName);
+ bool finishOpening();
+ };
+
+ /** for durability support we want to be able to map pointers to specific MongoMMF objects.
+ */
+ class PointerToMMF : boost::noncopyable {
+ public:
+ PointerToMMF();
+
+        /** register view.
+ threadsafe
+ */
+ void add(void *view, MongoMMF *f);
+
+ /** de-register view.
+ threadsafe
+ */
+ void remove(void *view);
+
+ /** find associated MMF object for a given pointer.
+ threadsafe
+ @param ofs out returns offset into the view of the pointer, if found.
+ @return the MongoMMF to which this pointer belongs. null if not found.
+ */
+ MongoMMF* find(void *p, /*out*/ size_t& ofs);
+
+ /** for doing many finds in a row with one lock operation */
+ mutex& _mutex() { return _m; }
+ MongoMMF* find_inlock(void *p, /*out*/ size_t& ofs);
+
+ map<void*,MongoMMF*>::iterator finditer_inlock(void *p) { return _views.upper_bound(p); }
+
+ private:
+ mutex _m;
+ map<void*, MongoMMF*> _views;
+ };
+
+ // allows a pointer into any private view of a MongoMMF to be resolved to the MongoMMF object
+ extern PointerToMMF privateViews;
+}
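
fileSuffixNo()/relativePath() reflect how setPath() splits a data file name on its last dot. A hedged sketch of that split in plain C++ (splitDbFileName is illustrative; the real code goes through str::rSplitOn and dur::JEntry::DotNsSuffix):

    #include <cstdlib>
    #include <string>

    // "a/b/c.3" -> path "a/b/c", suffix 3 ; "a/b/c.ns" -> suffix -1 (mirrors fileSuffixNo())
    inline bool splitDbFileName(const std::string& f, std::string& path, int& suffixNo) {
        std::string::size_type dot = f.rfind('.');
        if (dot == std::string::npos || dot + 1 == f.size())
            return false;                          // not in the expected name.N / name.ns format
        path = f.substr(0, dot);
        std::string suffix = f.substr(dot + 1);
        suffixNo = (suffix == "ns") ? -1 : std::atoi(suffix.c_str());
        return true;
    }
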
diff --git a/db/mongomutex.h b/db/mongomutex.h
new file mode 100644
index 0000000..fac4113
--- /dev/null
+++ b/db/mongomutex.h
@@ -0,0 +1,239 @@
+// @file mongomutex.h
+
+/*
+ * Copyright (C) 2010 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+// note: include concurrency.h, not this.
+
+namespace mongo {
+
+ /** the 'big lock' we use for most operations. a read/write lock.
+ there is one of these, dbMutex.
+
+ generally if you need to declare a mutex use the right primitive class, not this.
+
+ use readlock and writelock classes for scoped locks on this rather than direct
+ manipulation.
+ */
+ class MongoMutex {
+ public:
+ MongoMutex(const char * name);
+
+ /** @return
+ * > 0 write lock
+ * = 0 no lock
+ * < 0 read lock
+ */
+ int getState() const { return _state.get(); }
+
+ bool atLeastReadLocked() const { return _state.get() != 0; }
+ void assertAtLeastReadLocked() const { assert(atLeastReadLocked()); }
+ bool isWriteLocked() const { return getState() > 0; }
+ void assertWriteLocked() const {
+ assert( getState() > 0 );
+ DEV assert( !_releasedEarly.get() );
+ }
+
+ // write lock. use the writelock scoped lock class, not this directly.
+ void lock() {
+ if ( _writeLockedAlready() )
+ return;
+
+ _state.set(1);
+
+ Client *c = curopWaitingForLock( 1 ); // stats
+ _m.lock();
+ curopGotLock(c);
+
+ _minfo.entered();
+
+ MongoFile::markAllWritable(); // for _DEBUG validation -- a no op for release build
+
+ _acquiredWriteLock();
+ }
+
+ // try write lock
+ bool lock_try( int millis ) {
+ if ( _writeLockedAlready() )
+ return true;
+
+ Client *c = curopWaitingForLock( 1 );
+ bool got = _m.lock_try( millis );
+
+ if ( got ) {
+ curopGotLock(c);
+ _minfo.entered();
+ _state.set(1);
+ MongoFile::markAllWritable(); // for _DEBUG validation -- a no op for release build
+ _acquiredWriteLock();
+ }
+
+ return got;
+ }
+
+ // un write lock
+ void unlock() {
+ int s = _state.get();
+ if( s > 1 ) {
+ _state.set(s-1); // recursive lock case
+ return;
+ }
+ if( s != 1 ) {
+ if( _releasedEarly.get() ) {
+ _releasedEarly.set(false);
+ return;
+ }
+ massert( 12599, "internal error: attempt to unlock when wasn't in a write lock", false);
+ }
+ _releasingWriteLock();
+ MongoFile::unmarkAllWritable(); // _DEBUG validation
+ _state.set(0);
+ _minfo.leaving();
+ _m.unlock();
+ }
+
+ /* unlock (write lock), and when unlock() is called later,
+ be smart then and don't unlock it again.
+ */
+ void releaseEarly() {
+ assert( getState() == 1 ); // must not be recursive
+ assert( !_releasedEarly.get() );
+ _releasedEarly.set(true);
+ unlock();
+ }
+
+ // read lock. don't call directly, use readlock.
+ void lock_shared() {
+ int s = _state.get();
+ if( s ) {
+ if( s > 0 ) {
+ // already in write lock - just be recursive and stay write locked
+ _state.set(s+1);
+ }
+ else {
+ // already in read lock - recurse
+ _state.set(s-1);
+ }
+ }
+ else {
+ _state.set(-1);
+ Client *c = curopWaitingForLock( -1 );
+ _m.lock_shared();
+ curopGotLock(c);
+ }
+ }
+
+ // try read lock
+ bool lock_shared_try( int millis ) {
+ int s = _state.get();
+ if ( s ) {
+ // we already have a lock, so no need to try
+ lock_shared();
+ return true;
+ }
+
+ /* [dm] should there be
+ Client *c = curopWaitingForLock( 1 );
+ here? i think so. seems to be missing.
+ */
+ bool got = _m.lock_shared_try( millis );
+ if ( got )
+ _state.set(-1);
+ return got;
+ }
+
+ void unlock_shared() {
+ int s = _state.get();
+ if( s > 0 ) {
+ assert( s > 1 ); /* we must have done a lock write first to have s > 1 */
+ _state.set(s-1);
+ return;
+ }
+ if( s < -1 ) {
+ _state.set(s+1);
+ return;
+ }
+ assert( s == -1 );
+ _state.set(0);
+ _m.unlock_shared();
+ }
+
+ MutexInfo& info() { return _minfo; }
+
+ private:
+ void _acquiredWriteLock();
+ void _releasingWriteLock();
+
+ /* @return true if was already write locked. increments recursive lock count. */
+ bool _writeLockedAlready();
+
+ RWLock _m;
+
+ /* > 0 write lock with recurse count
+ < 0 read lock
+ */
+ ThreadLocalValue<int> _state;
+
+ MutexInfo _minfo;
+
+ public:
+ // indicates we need to call dur::REMAPPRIVATEVIEW on the next write lock
+ bool _remapPrivateViewRequested;
+
+ private:
+ /* See the releaseEarly() method.
+ we use a separate TLS value for releasedEarly - that is ok as
+ our normal/common code path, we never even touch it */
+ ThreadLocalValue<bool> _releasedEarly;
+
+ /* this is for fsyncAndLock command. otherwise write lock's greediness will
+           make us block on any attempted write lock while the fsync's lock is held.
+ */
+ //volatile bool _blockWrites;
+ };
+
+ extern MongoMutex &dbMutex;
+
+ namespace dur {
+ void REMAPPRIVATEVIEW();
+ void releasingWriteLock(); // because it's hard to include dur.h here
+ }
+
+ inline void MongoMutex::_releasingWriteLock() {
+ dur::releasingWriteLock();
+ }
+
+ inline void MongoMutex::_acquiredWriteLock() {
+ if( _remapPrivateViewRequested ) {
+ dur::REMAPPRIVATEVIEW();
+ dassert( !_remapPrivateViewRequested );
+ }
+ }
+
+ /* @return true if was already write locked. increments recursive lock count. */
+ inline bool MongoMutex::_writeLockedAlready() {
+ int s = _state.get();
+ if( s > 0 ) {
+ _state.set(s+1);
+ return true;
+ }
+ massert( 10293 , string("internal error: locks are not upgradeable: ") + sayClientState() , s == 0 );
+ return false;
+ }
+
+}
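
The thread-local _state integer folds lock mode and recursion depth into one value: positive values are the write-lock depth, negative values the read-lock depth, zero is unlocked. A sketch of just that bookkeeping, with the actual RWLock acquisition noted only in comments:

    #include <cassert>

    // Illustrative only: the real class also takes/releases an RWLock and updates curop stats.
    struct LockStateSketch {
        int state;                                // >0 write depth, <0 read depth, 0 unlocked
        LockStateSketch() : state(0) {}

        void lockWrite() {
            if (state > 0) { ++state; return; }   // recursive write lock
            assert(state == 0);                   // upgrading a read lock is not allowed
            state = 1;                            // _m.lock() would happen here
        }
        void lockRead() {
            if (state > 0)      ++state;          // already write locked: recurse as write
            else if (state < 0) --state;          // nested read lock
            else                state = -1;       // _m.lock_shared() would happen here
        }
        void unlockWrite() {
            assert(state >= 1);
            --state;                              // _m.unlock() once the depth reaches 0
        }
        void unlockRead() {
            if (state > 0) { assert(state > 1); --state; }  // read taken while write locked
            else           { assert(state < 0); ++state; }  // _m.unlock_shared() at -1 -> 0
        }
    };
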
diff --git a/db/mr.cpp b/db/mr.cpp
deleted file mode 100644
index 7786c85..0000000
--- a/db/mr.cpp
+++ /dev/null
@@ -1,721 +0,0 @@
-// mr.cpp
-
-/**
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the GNU Affero General Public License, version 3,
- * as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU Affero General Public License for more details.
- *
- * You should have received a copy of the GNU Affero General Public License
- * along with this program. If not, see <http://www.gnu.org/licenses/>.
- */
-
-#include "pch.h"
-#include "db.h"
-#include "instance.h"
-#include "commands.h"
-#include "../scripting/engine.h"
-#include "../client/dbclient.h"
-#include "../client/connpool.h"
-#include "../client/parallel.h"
-#include "queryoptimizer.h"
-#include "matcher.h"
-#include "clientcursor.h"
-
-namespace mongo {
-
- namespace mr {
-
- typedef vector<BSONObj> BSONList;
-
- class MyCmp {
- public:
- MyCmp(){}
- bool operator()( const BSONObj &l, const BSONObj &r ) const {
- return l.firstElement().woCompare( r.firstElement() ) < 0;
- }
- };
-
- typedef pair<BSONObj,BSONObj> Data;
- //typedef list< Data > InMemory;
- typedef map< BSONObj,BSONList,MyCmp > InMemory;
-
- BSONObj reduceValues( BSONList& values , Scope * s , ScriptingFunction reduce , bool final , ScriptingFunction finalize ){
- uassert( 10074 , "need values" , values.size() );
-
- int sizeEstimate = ( values.size() * values.begin()->getField( "value" ).size() ) + 128;
- BSONObj key;
-
- BSONObjBuilder reduceArgs( sizeEstimate );
- boost::scoped_ptr<BSONArrayBuilder> valueBuilder;
-
- int sizeSoFar = 0;
- unsigned n = 0;
- for ( ; n<values.size(); n++ ){
- BSONObjIterator j(values[n]);
- BSONElement keyE = j.next();
- if ( n == 0 ){
- reduceArgs.append( keyE );
- key = keyE.wrap();
- sizeSoFar = 5 + keyE.size();
- valueBuilder.reset(new BSONArrayBuilder( reduceArgs.subarrayStart( "values" ) ));
- }
-
- BSONElement ee = j.next();
-
- uassert( 13070 , "value to large to reduce" , ee.size() < ( 2 * 1024 * 1024 ) );
-
- if ( sizeSoFar + ee.size() > ( 4 * 1024 * 1024 ) ){
- assert( n > 1 ); // if not, inf. loop
- break;
- }
-
- valueBuilder->append( ee );
- sizeSoFar += ee.size();
- }
- assert(valueBuilder);
- valueBuilder->done();
- BSONObj args = reduceArgs.obj();
-
- s->invokeSafe( reduce , args );
- if ( s->type( "return" ) == Array ){
- uassert( 10075 , "reduce -> multiple not supported yet",0);
- return BSONObj();
- }
-
- int endSizeEstimate = key.objsize() + ( args.objsize() / values.size() );
-
- if ( n < values.size() ){
- BSONList x;
- for ( ; n < values.size(); n++ ){
- x.push_back( values[n] );
- }
- BSONObjBuilder temp( endSizeEstimate );
- temp.append( key.firstElement() );
- s->append( temp , "1" , "return" );
- x.push_back( temp.obj() );
- return reduceValues( x , s , reduce , final , finalize );
- }
-
-
-
- if ( finalize ){
- BSONObjBuilder b(endSizeEstimate);
- b.appendAs( key.firstElement() , "_id" );
- s->append( b , "value" , "return" );
- s->invokeSafe( finalize , b.obj() );
- }
-
- BSONObjBuilder b(endSizeEstimate);
- b.appendAs( key.firstElement() , final ? "_id" : "0" );
- s->append( b , final ? "value" : "1" , "return" );
- return b.obj();
- }
-
- class MRSetup {
- public:
- MRSetup( const string& _dbname , const BSONObj& cmdObj , bool markAsTemp = true ){
- static int jobNumber = 1;
-
- dbname = _dbname;
- ns = dbname + "." + cmdObj.firstElement().valuestr();
-
- verbose = cmdObj["verbose"].trueValue();
- keeptemp = cmdObj["keeptemp"].trueValue();
-
- { // setup names
- stringstream ss;
- if ( ! keeptemp )
- ss << "tmp.";
- ss << "mr." << cmdObj.firstElement().fieldName() << "_" << time(0) << "_" << jobNumber++;
- tempShort = ss.str();
- tempLong = dbname + "." + tempShort;
- incLong = tempLong + "_inc";
-
- if ( ! keeptemp && markAsTemp )
- cc().addTempCollection( tempLong );
-
- replicate = keeptemp;
-
- if ( cmdObj["out"].type() == String ){
- finalShort = cmdObj["out"].valuestr();
- replicate = true;
- }
- else
- finalShort = tempShort;
-
- finalLong = dbname + "." + finalShort;
-
- }
-
- { // code
- mapCode = cmdObj["map"]._asCode();
- reduceCode = cmdObj["reduce"]._asCode();
- if ( cmdObj["finalize"].type() ){
- finalizeCode = cmdObj["finalize"]._asCode();
- }
- checkCodeWScope( "map" , cmdObj );
- checkCodeWScope( "reduce" , cmdObj );
- checkCodeWScope( "finalize" , cmdObj );
-
- if ( cmdObj["mapparams"].type() == Array ){
- mapparams = cmdObj["mapparams"].embeddedObjectUserCheck();
- }
-
- if ( cmdObj["scope"].type() == Object ){
- scopeSetup = cmdObj["scope"].embeddedObjectUserCheck();
- }
-
- }
-
- { // query options
- if ( cmdObj["query"].type() == Object ){
- filter = cmdObj["query"].embeddedObjectUserCheck();
- }
-
- if ( cmdObj["sort"].type() == Object ){
- sort = cmdObj["sort"].embeddedObjectUserCheck();
- }
-
- if ( cmdObj["limit"].isNumber() )
- limit = cmdObj["limit"].numberLong();
- else
- limit = 0;
- }
- }
-
- void checkCodeWScope( const char * field , const BSONObj& o ){
- BSONElement e = o[field];
- if ( e.type() != CodeWScope )
- return;
- BSONObj x = e.codeWScopeObject();
- uassert( 13035 , (string)"can't use CodeWScope with map/reduce function: " + field , x.isEmpty() );
- }
-
- /**
- @return number objects in collection
- */
- long long renameIfNeeded( DBDirectClient& db ){
- if ( finalLong != tempLong ){
- db.dropCollection( finalLong );
- if ( db.count( tempLong ) ){
- BSONObj info;
- uassert( 10076 , "rename failed" , db.runCommand( "admin" , BSON( "renameCollection" << tempLong << "to" << finalLong ) , info ) );
- }
- }
- return db.count( finalLong );
- }
-
- string dbname;
- string ns;
-
- // options
- bool verbose;
- bool keeptemp;
- bool replicate;
-
- // query options
-
- BSONObj filter;
- BSONObj sort;
- long long limit;
-
- // functions
-
- string mapCode;
- string reduceCode;
- string finalizeCode;
-
- BSONObj mapparams;
- BSONObj scopeSetup;
-
- // output tables
- string incLong;
-
- string tempShort;
- string tempLong;
-
- string finalShort;
- string finalLong;
-
- }; // end MRsetup
-
- class MRState {
- public:
- MRState( MRSetup& s ) : setup(s){
- scope = globalScriptEngine->getPooledScope( setup.dbname );
- scope->localConnect( setup.dbname.c_str() );
-
- map = scope->createFunction( setup.mapCode.c_str() );
- if ( ! map )
- throw UserException( 9012, (string)"map compile failed: " + scope->getError() );
-
- reduce = scope->createFunction( setup.reduceCode.c_str() );
- if ( ! reduce )
- throw UserException( 9013, (string)"reduce compile failed: " + scope->getError() );
-
- if ( setup.finalizeCode.size() )
- finalize = scope->createFunction( setup.finalizeCode.c_str() );
- else
- finalize = 0;
-
- if ( ! setup.scopeSetup.isEmpty() )
- scope->init( &setup.scopeSetup );
-
- db.dropCollection( setup.tempLong );
- db.dropCollection( setup.incLong );
-
- writelock l( setup.incLong );
- Client::Context ctx( setup.incLong );
- string err;
- assert( userCreateNS( setup.incLong.c_str() , BSON( "autoIndexId" << 0 ) , err , false ) );
-
- }
-
- void finalReduce( BSONList& values ){
- if ( values.size() == 0 )
- return;
-
- BSONObj key = values.begin()->firstElement().wrap( "_id" );
- BSONObj res = reduceValues( values , scope.get() , reduce , 1 , finalize );
-
- writelock l( setup.tempLong );
- Client::Context ctx( setup.incLong );
- if ( setup.replicate )
- theDataFileMgr.insertAndLog( setup.tempLong.c_str() , res , false );
- else
- theDataFileMgr.insertWithObjMod( setup.tempLong.c_str() , res , false );
- }
-
-
- MRSetup& setup;
- auto_ptr<Scope> scope;
- DBDirectClient db;
-
- ScriptingFunction map;
- ScriptingFunction reduce;
- ScriptingFunction finalize;
-
- };
-
- class MRTL {
- public:
- MRTL( MRState& state )
- : _state( state )
- , _temp(new InMemory())
- {
- _size = 0;
- numEmits = 0;
- }
-
- void reduceInMemory(){
- boost::shared_ptr<InMemory> old = _temp;
- _temp.reset(new InMemory());
- _size = 0;
-
- for ( InMemory::iterator i=old->begin(); i!=old->end(); i++ ){
- BSONObj key = i->first;
- BSONList& all = i->second;
-
- if ( all.size() == 1 ){
- // this key has low cardinality, so just write to db
- writelock l(_state.setup.incLong);
- Client::Context ctx(_state.setup.incLong.c_str());
- write( *(all.begin()) );
- }
- else if ( all.size() > 1 ){
- BSONObj res = reduceValues( all , _state.scope.get() , _state.reduce , false , 0 );
- insert( res );
- }
- }
- }
-
- void dump(){
- writelock l(_state.setup.incLong);
- Client::Context ctx(_state.setup.incLong);
-
- for ( InMemory::iterator i=_temp->begin(); i!=_temp->end(); i++ ){
- BSONList& all = i->second;
- if ( all.size() < 1 )
- continue;
-
- for ( BSONList::iterator j=all.begin(); j!=all.end(); j++ )
- write( *j );
- }
- _temp->clear();
- _size = 0;
-
- }
-
- void insert( const BSONObj& a ){
- BSONList& all = (*_temp)[a];
- all.push_back( a );
- _size += a.objsize() + 16;
- }
-
- void checkSize(){
- if ( _size < 1024 * 5 )
- return;
-
- long before = _size;
- reduceInMemory();
- log(1) << " mr: did reduceInMemory " << before << " -->> " << _size << endl;
-
- if ( _size < 1024 * 15 )
- return;
-
- dump();
- log(1) << " mr: dumping to db" << endl;
- }
-
- private:
- void write( BSONObj& o ){
- theDataFileMgr.insertWithObjMod( _state.setup.incLong.c_str() , o , true );
- }
-
- MRState& _state;
-
- boost::shared_ptr<InMemory> _temp;
- long _size;
-
- public:
- long long numEmits;
- };
-
- boost::thread_specific_ptr<MRTL> _tlmr;
-
- BSONObj fast_emit( const BSONObj& args ){
- uassert( 10077 , "fast_emit takes 2 args" , args.nFields() == 2 );
- uassert( 13069 , "an emit can't be more than 2mb" , args.objsize() < ( 2 * 1024 * 1024 ) );
- _tlmr->insert( args );
- _tlmr->numEmits++;
- return BSONObj();
- }
-
- class MapReduceCommand : public Command {
- public:
- MapReduceCommand() : Command("mapReduce", false, "mapreduce"){}
- virtual bool slaveOk() const { return true; }
-
- virtual void help( stringstream &help ) const {
- help << "Run a map/reduce operation on the server.\n";
- help << "Note this is used for aggregation, not querying, in MongoDB.\n";
- help << "http://www.mongodb.org/display/DOCS/MapReduce";
- }
- virtual LockType locktype() const { return NONE; }
- bool run(const string& dbname , BSONObj& cmd, string& errmsg, BSONObjBuilder& result, bool fromRepl ){
- Timer t;
- Client::GodScope cg;
- Client& client = cc();
- CurOp * op = client.curop();
-
- MRSetup mr( dbname , cmd );
-
- log(1) << "mr ns: " << mr.ns << endl;
-
- if ( ! db.exists( mr.ns ) ){
- errmsg = "ns doesn't exist";
- return false;
- }
-
- bool shouldHaveData = false;
-
- long long num = 0;
- long long inReduce = 0;
-
- BSONObjBuilder countsBuilder;
- BSONObjBuilder timingBuilder;
- try {
-
- MRState state( mr );
- state.scope->injectNative( "emit" , fast_emit );
-
- MRTL * mrtl = new MRTL( state );
- _tlmr.reset( mrtl );
-
- ProgressMeterHolder pm( op->setMessage( "m/r: (1/3) emit phase" , db.count( mr.ns , mr.filter ) ) );
- long long mapTime = 0;
- {
- readlock lock( mr.ns );
- Client::Context ctx( mr.ns );
-
- shared_ptr<Cursor> temp = bestGuessCursor( mr.ns.c_str(), mr.filter, mr.sort );
- auto_ptr<ClientCursor> cursor( new ClientCursor( QueryOption_NoCursorTimeout , temp , mr.ns.c_str() ) );
-
- Timer mt;
- while ( cursor->ok() ){
-
- if ( ! cursor->currentMatches() ){
- cursor->advance();
- continue;
- }
-
- BSONObj o = cursor->current();
- cursor->advance();
-
- if ( mr.verbose ) mt.reset();
-
- state.scope->setThis( &o );
- if ( state.scope->invoke( state.map , state.setup.mapparams , 0 , true ) )
- throw UserException( 9014, (string)"map invoke failed: " + state.scope->getError() );
-
- if ( mr.verbose ) mapTime += mt.micros();
-
- num++;
- if ( num % 100 == 0 ){
- ClientCursor::YieldLock yield (cursor.get());
- Timer t;
- mrtl->checkSize();
- inReduce += t.micros();
-
- if ( ! yield.stillOk() ){
- cursor.release();
- break;
- }
-
- killCurrentOp.checkForInterrupt();
- }
- pm.hit();
-
- if ( mr.limit && num >= mr.limit )
- break;
- }
- }
- pm.finished();
-
- killCurrentOp.checkForInterrupt();
-
- countsBuilder.appendNumber( "input" , num );
- countsBuilder.appendNumber( "emit" , mrtl->numEmits );
- if ( mrtl->numEmits )
- shouldHaveData = true;
-
- timingBuilder.append( "mapTime" , mapTime / 1000 );
- timingBuilder.append( "emitLoop" , t.millis() );
-
- // final reduce
- op->setMessage( "m/r: (2/3) final reduce in memory" );
- mrtl->reduceInMemory();
- mrtl->dump();
-
- BSONObj sortKey = BSON( "0" << 1 );
- db.ensureIndex( mr.incLong , sortKey );
-
- {
- writelock lock( mr.tempLong.c_str() );
- Client::Context ctx( mr.tempLong.c_str() );
- assert( userCreateNS( mr.tempLong.c_str() , BSONObj() , errmsg , mr.replicate ) );
- }
-
-
- {
- readlock rl(mr.incLong.c_str());
- Client::Context ctx( mr.incLong );
-
- BSONObj prev;
- BSONList all;
-
- assert( pm == op->setMessage( "m/r: (3/3) final reduce to collection" , db.count( mr.incLong ) ) );
-
- shared_ptr<Cursor> temp = bestGuessCursor( mr.incLong.c_str() , BSONObj() , sortKey );
- auto_ptr<ClientCursor> cursor( new ClientCursor( QueryOption_NoCursorTimeout , temp , mr.incLong.c_str() ) );
-
- while ( cursor->ok() ){
- BSONObj o = cursor->current().getOwned();
- cursor->advance();
-
- pm.hit();
-
- if ( o.woSortOrder( prev , sortKey ) == 0 ){
- all.push_back( o );
- if ( pm->hits() % 1000 == 0 ){
- if ( ! cursor->yield() ){
- cursor.release();
- break;
- }
- killCurrentOp.checkForInterrupt();
- }
- continue;
- }
-
- ClientCursor::YieldLock yield (cursor.get());
- state.finalReduce( all );
-
- all.clear();
- prev = o;
- all.push_back( o );
-
- if ( ! yield.stillOk() ){
- cursor.release();
- break;
- }
-
- killCurrentOp.checkForInterrupt();
- }
-
- {
- dbtempreleasecond tl;
- if ( ! tl.unlocked() )
- log( LL_WARNING ) << "map/reduce can't temp release" << endl;
- state.finalReduce( all );
- }
-
- pm.finished();
- }
-
- _tlmr.reset( 0 );
- }
- catch ( ... ){
- log() << "mr failed, removing collection" << endl;
- db.dropCollection( mr.tempLong );
- db.dropCollection( mr.incLong );
- throw;
- }
-
- long long finalCount = 0;
- {
- dblock lock;
- db.dropCollection( mr.incLong );
-
- finalCount = mr.renameIfNeeded( db );
- }
-
- timingBuilder.append( "total" , t.millis() );
-
- result.append( "result" , mr.finalShort );
- result.append( "timeMillis" , t.millis() );
- countsBuilder.appendNumber( "output" , finalCount );
- if ( mr.verbose ) result.append( "timing" , timingBuilder.obj() );
- result.append( "counts" , countsBuilder.obj() );
-
- if ( finalCount == 0 && shouldHaveData ){
- result.append( "cmd" , cmd );
- errmsg = "there were emits but no data!";
- return false;
- }
-
- return true;
- }
-
- private:
- DBDirectClient db;
-
- } mapReduceCommand;
-
- class MapReduceFinishCommand : public Command {
- public:
- MapReduceFinishCommand() : Command( "mapreduce.shardedfinish" ){}
- virtual bool slaveOk() const { return true; }
-
- virtual LockType locktype() const { return NONE; }
- bool run(const string& dbname , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool){
- string shardedOutputCollection = cmdObj["shardedOutputCollection"].valuestrsafe();
-
- MRSetup mr( dbname , cmdObj.firstElement().embeddedObjectUserCheck() , false );
-
- set<ServerAndQuery> servers;
-
- BSONObjBuilder shardCounts;
- map<string,long long> counts;
-
- BSONObj shards = cmdObj["shards"].embeddedObjectUserCheck();
- vector< auto_ptr<DBClientCursor> > shardCursors;
-
- { // parse per shard results
- BSONObjIterator i( shards );
- while ( i.more() ){
- BSONElement e = i.next();
- string shard = e.fieldName();
-
- BSONObj res = e.embeddedObjectUserCheck();
-
- uassert( 10078 , "something bad happened" , shardedOutputCollection == res["result"].valuestrsafe() );
- servers.insert( shard );
- shardCounts.appendAs( res["counts"] , shard.c_str() );
-
- BSONObjIterator j( res["counts"].embeddedObjectUserCheck() );
- while ( j.more() ){
- BSONElement temp = j.next();
- counts[temp.fieldName()] += temp.numberLong();
- }
-
- }
-
- }
-
- DBDirectClient db;
-
- { // reduce from each stream
-
- BSONObj sortKey = BSON( "_id" << 1 );
-
- ParallelSortClusteredCursor cursor( servers , dbname + "." + shardedOutputCollection ,
- Query().sort( sortKey ) );
- cursor.init();
-
- auto_ptr<Scope> s = globalScriptEngine->getPooledScope( dbname );
- s->localConnect( dbname.c_str() );
- ScriptingFunction reduceFunction = s->createFunction( mr.reduceCode.c_str() );
- ScriptingFunction finalizeFunction = 0;
- if ( mr.finalizeCode.size() )
- finalizeFunction = s->createFunction( mr.finalizeCode.c_str() );
-
- BSONList values;
-
- result.append( "result" , mr.finalShort );
-
- while ( cursor.more() ){
- BSONObj t = cursor.next().getOwned();
-
- if ( values.size() == 0 ){
- values.push_back( t );
- continue;
- }
-
- if ( t.woSortOrder( *(values.begin()) , sortKey ) == 0 ){
- values.push_back( t );
- continue;
- }
-
-
- db.insert( mr.tempLong , reduceValues( values , s.get() , reduceFunction , 1 , finalizeFunction ) );
- values.clear();
- values.push_back( t );
- }
-
- if ( values.size() )
- db.insert( mr.tempLong , reduceValues( values , s.get() , reduceFunction , 1 , finalizeFunction ) );
- }
-
- long long finalCount = mr.renameIfNeeded( db );
- log(0) << " mapreducefinishcommand " << mr.finalLong << " " << finalCount << endl;
-
- for ( set<ServerAndQuery>::iterator i=servers.begin(); i!=servers.end(); i++ ){
- ScopedDbConnection conn( i->_server );
- conn->dropCollection( dbname + "." + shardedOutputCollection );
- conn.done();
- }
-
- result.append( "shardCounts" , shardCounts.obj() );
-
- {
- BSONObjBuilder c;
- for ( map<string,long long>::iterator i=counts.begin(); i!=counts.end(); i++ ){
- c.append( i->first , i->second );
- }
- result.append( "counts" , c.obj() );
- }
-
- return 1;
- }
- } mapReduceFinishCommand;
-
- }
-
-}
-
diff --git a/db/namespace-inl.h b/db/namespace-inl.h
new file mode 100644
index 0000000..a777ff8
--- /dev/null
+++ b/db/namespace-inl.h
@@ -0,0 +1,130 @@
+// @file namespace-inl.h
+
+/**
+* Copyright (C) 2009 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+#include "namespace.h"
+
+namespace mongo {
+
+ inline Namespace& Namespace::operator=(const char *ns) {
+ // we fill the remaining space with all zeroes here. as the full Namespace struct is in
+ // the datafiles (the .ns files specifically), that is helpful as then they are deterministic
+ // in the bytes they have for a given sequence of operations. that makes testing and debugging
+ // the data files easier.
+ //
+ // if profiling indicates this method is a significant bottleneck, we could have a version we
+ // use for reads which does not fill with zeroes, and keep the zeroing behavior on writes.
+ //
+ unsigned len = strlen(ns);
+ uassert( 10080 , "ns name too long, max size is 128", len < MaxNsLen);
+ memset(buf, 0, MaxNsLen);
+ memcpy(buf, ns, len);
+ return *this;
+ }
+
+ inline string Namespace::extraName(int i) const {
+ char ex[] = "$extra";
+ ex[5] += i;
+ string s = string(buf) + ex;
+ massert( 10348 , "$extra: ns name too long", s.size() < MaxNsLen);
+ return s;
+ }
+
+ inline bool Namespace::isExtra() const {
+ const char *p = strstr(buf, "$extr");
+ return p && p[5] && p[6] == 0; //==0 important in case an index uses name "$extra_1" for example
+ }
+
+ inline int Namespace::hash() const {
+ unsigned x = 0;
+ const char *p = buf;
+ while ( *p ) {
+ x = x * 131 + *p;
+ p++;
+ }
+ return (x & 0x7fffffff) | 0x8000000; // must be > 0
+ }
+
+ /* future : this doesn't need to be an inline. */
+ inline string Namespace::getSisterNS( const char * local ) const {
+ assert( local && local[0] != '.' );
+ string old(buf);
+ if ( old.find( "." ) != string::npos )
+ old = old.substr( 0 , old.find( "." ) );
+ return old + "." + local;
+ }
+
+ inline IndexDetails& NamespaceDetails::idx(int idxNo, bool missingExpected ) {
+ if( idxNo < NIndexesBase )
+ return _indexes[idxNo];
+ Extra *e = extra();
+ if ( ! e ) {
+ if ( missingExpected )
+ throw MsgAssertionException( 13283 , "Missing Extra" );
+ massert(13282, "missing Extra", e);
+ }
+ int i = idxNo - NIndexesBase;
+ if( i >= NIndexesExtra ) {
+ e = e->next(this);
+ if ( ! e ) {
+ if ( missingExpected )
+ throw MsgAssertionException( 13283 , "missing extra" );
+ massert(13283, "missing Extra", e);
+ }
+ i -= NIndexesExtra;
+ }
+ return e->details[i];
+ }
+
+ inline int NamespaceDetails::idxNo(IndexDetails& idx) {
+ IndexIterator i = ii();
+ while( i.more() ) {
+ if( &i.next() == &idx )
+ return i.pos()-1;
+ }
+ massert( 10349 , "E12000 idxNo fails", false);
+ return -1;
+ }
+
+ inline int NamespaceDetails::findIndexByKeyPattern(const BSONObj& keyPattern) {
+ IndexIterator i = ii();
+ while( i.more() ) {
+ if( i.next().keyPattern() == keyPattern )
+ return i.pos()-1;
+ }
+ return -1;
+ }
+
+ // @return offset in indexes[]
+ inline int NamespaceDetails::findIndexByName(const char *name) {
+ IndexIterator i = ii();
+ while( i.more() ) {
+ if ( strcmp(i.next().info.obj().getStringField("name"),name) == 0 )
+ return i.pos()-1;
+ }
+ return -1;
+ }
+
+ inline NamespaceDetails::IndexIterator::IndexIterator(NamespaceDetails *_d) {
+ d = _d;
+ i = 0;
+ n = d->nIndexes;
+ }
+
+}
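
Two things the new namespace-inl.h relies on: assignment zero-fills the fixed 128-byte buffer so the bytes that land in the .ns file are deterministic, and hash() is a simple multiplicative hash forced positive. A rough standalone illustration follows; FixedName is a hypothetical stand-in, not the server's Namespace class.

    #include <cassert>
    #include <cstring>
    #include <iostream>

    struct FixedName {
        enum { MaxLen = 128 };
        char buf[MaxLen];

        FixedName& operator=(const char* ns) {
            size_t len = std::strlen(ns);
            assert(len < MaxLen);            // the real code uasserts with error 10080
            std::memset(buf, 0, MaxLen);     // zero the tail: deterministic on-disk bytes
            std::memcpy(buf, ns, len);
            return *this;
        }

        // same multiplicative hash as Namespace::hash(), kept strictly positive
        int hash() const {
            unsigned x = 0;
            for (const char* p = buf; *p; ++p)
                x = x * 131 + *p;
            return (x & 0x7fffffff) | 0x8000000;
        }
    };

    int main() {
        FixedName n;
        n = "test.foo";
        std::cout << "hash(test.foo) = " << n.hash() << std::endl;
        return 0;
    }
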
diff --git a/db/namespace.cpp b/db/namespace.cpp
index 8a1ab6f..fcdaee2 100644
--- a/db/namespace.cpp
+++ b/db/namespace.cpp
@@ -19,7 +19,7 @@
#include "pch.h"
#include "pdfile.h"
#include "db.h"
-#include "../util/mmap.h"
+#include "mongommf.h"
#include "../util/hashtab.h"
#include "../scripting/engine.h"
#include "btree.h"
@@ -31,6 +31,8 @@
namespace mongo {
+ BOOST_STATIC_ASSERT( sizeof(Namespace) == 128 );
+
BSONObj idKeyPattern = fromjson("{\"_id\":1}");
/* deleted lists -- linked lists of deleted records -- are placed in 'buckets' of various sizes
@@ -45,7 +47,7 @@ namespace mongo {
NamespaceDetails::NamespaceDetails( const DiskLoc &loc, bool _capped ) {
/* be sure to initialize new fields here -- doesn't default to zeroes the way we use it */
firstExtent = lastExtent = capExtent = loc;
- datasize = nrecords = 0;
+ stats.datasize = stats.nrecords = 0;
lastExtentSize = 0;
nIndexes = 0;
capped = _capped;
@@ -58,20 +60,23 @@ namespace mongo {
// For capped case, signal that we are doing initial extent allocation.
if ( capped )
cappedLastDelRecLastExtent().setInvalid();
- assert( sizeof(dataFileVersion) == 2 );
- dataFileVersion = 0;
- indexFileVersion = 0;
+ assert( sizeof(dataFileVersion) == 2 );
+ dataFileVersion = 0;
+ indexFileVersion = 0;
multiKeyIndexBits = 0;
reservedA = 0;
extraOffset = 0;
- backgroundIndexBuildInProgress = 0;
+ indexBuildInProgress = 0;
+ reservedB = 0;
+ capped2.cc2_ptr = 0;
+ capped2.fileNumber = 0;
memset(reserved, 0, sizeof(reserved));
}
bool NamespaceIndex::exists() const {
return !MMF::exists(path());
}
-
+
boost::filesystem::path NamespaceIndex::path() const {
boost::filesystem::path ret( dir_ );
if ( directoryperdb )
@@ -88,23 +93,56 @@ namespace mongo {
if ( !boost::filesystem::exists( dir ) )
BOOST_CHECK_EXCEPTION( boost::filesystem::create_directory( dir ) );
}
-
- int lenForNewNsFiles = 16 * 1024 * 1024;
-
- void NamespaceDetails::onLoad(const Namespace& k) {
- if( k.isExtra() ) {
+
+ unsigned lenForNewNsFiles = 16 * 1024 * 1024;
+
+#if defined(_DEBUG)
+ void NamespaceDetails::dump(const Namespace& k) {
+ if( !cmdLine.dur )
+ cout << "ns offsets which follow will not display correctly with --dur disabled" << endl;
+
+ size_t ofs = 1; // 1 is sentinel that the find call below failed
+ privateViews.find(this, /*out*/ofs);
+
+ cout << "ns" << hex << setw(8) << ofs << ' ';
+ cout << k.toString() << '\n';
+
+ if( k.isExtra() ) {
+ cout << "ns\t extra" << endl;
+ return;
+ }
+
+ cout << "ns " << firstExtent.toString() << ' ' << lastExtent.toString() << " nidx:" << nIndexes << '\n';
+ cout << "ns " << stats.datasize << ' ' << stats.nrecords << ' ' << nIndexes << '\n';
+ cout << "ns " << capped << ' ' << paddingFactor << ' ' << flags << ' ' << dataFileVersion << '\n';
+ cout << "ns " << multiKeyIndexBits << ' ' << indexBuildInProgress << '\n';
+ cout << "ns " << (int) reserved[0] << ' ' << (int) reserved[59];
+ cout << endl;
+ }
+#endif
+
+ void NamespaceDetails::onLoad(const Namespace& k) {
+ //dump(k);
+
+ if( k.isExtra() ) {
/* overflow storage for indexes - so don't treat as a NamespaceDetails object. */
return;
}
- assertInWriteLock();
- if( backgroundIndexBuildInProgress ) {
- log() << "backgroundIndexBuildInProgress was " << backgroundIndexBuildInProgress << " for " << k << ", indicating an abnormal db shutdown" << endl;
- backgroundIndexBuildInProgress = 0;
+ DEV assertInWriteLock();
+
+ if( indexBuildInProgress || capped2.cc2_ptr ) {
+ assertInWriteLock();
+ if( indexBuildInProgress ) {
+ log() << "indexBuildInProgress was " << indexBuildInProgress << " for " << k << ", indicating an abnormal db shutdown" << endl;
+ getDur().writingInt( indexBuildInProgress ) = 0;
+ }
+ if( capped2.cc2_ptr )
+ *getDur().writing(&capped2.cc2_ptr) = 0;
}
}
- static void namespaceOnLoadCallback(const Namespace& k, NamespaceDetails& v) {
+ static void namespaceOnLoadCallback(const Namespace& k, NamespaceDetails& v) {
v.onLoad(k);
}
@@ -117,105 +155,113 @@ namespace mongo {
we need to be sure to clear any cached info for the database in
local.*.
*/
- /*
+ /*
if ( "local" != database_ ) {
DBInfo i(database_.c_str());
i.dbDropped();
}
- */
- int len = -1;
+ */
+
+ unsigned long long len = 0;
boost::filesystem::path nsPath = path();
string pathString = nsPath.string();
- MMF::Pointer p;
- if( MMF::exists(nsPath) ) {
- p = f.map(pathString.c_str());
- if( !p.isNull() ) {
+ void *p = 0;
+ if( MMF::exists(nsPath) ) {
+ if( f.open(pathString, true) ) {
len = f.length();
- if ( len % (1024*1024) != 0 ){
+ if ( len % (1024*1024) != 0 ) {
log() << "bad .ns file: " << pathString << endl;
uassert( 10079 , "bad .ns file length, cannot open database", len % (1024*1024) == 0 );
}
+ p = f.getView();
}
- }
- else {
- // use lenForNewNsFiles, we are making a new database
- massert( 10343 , "bad lenForNewNsFiles", lenForNewNsFiles >= 1024*1024 );
+ }
+ else {
+ // use lenForNewNsFiles, we are making a new database
+ massert( 10343, "bad lenForNewNsFiles", lenForNewNsFiles >= 1024*1024 );
maybeMkdir();
- long l = lenForNewNsFiles;
- p = f.map(pathString.c_str(), l);
- if( !p.isNull() ) {
- len = (int) l;
+ unsigned long long l = lenForNewNsFiles;
+ if( f.create(pathString, l, true) ) {
+ getDur().createdFile(pathString, l); // always a new file
+ len = l;
assert( len == lenForNewNsFiles );
+ p = f.getView();
}
- }
+ }
- if ( p.isNull() ) {
- problem() << "couldn't open file " << pathString << " terminating" << endl;
+ if ( p == 0 ) {
+ /** TODO: this shouldn't terminate? */
+ log() << "error couldn't open file " << pathString << " terminating" << endl;
dbexit( EXIT_FS );
}
- ht = new HashTable<Namespace,NamespaceDetails,MMF::Pointer>(p, len, "namespace index");
+
+ assert( len <= 0x7fffffff );
+ ht = new HashTable<Namespace,NamespaceDetails>(p, (int) len, "namespace index");
if( checkNsFilesOnLoad )
ht->iterAll(namespaceOnLoadCallback);
}
-
+
static void namespaceGetNamespacesCallback( const Namespace& k , NamespaceDetails& v , void * extra ) {
list<string> * l = (list<string>*)extra;
if ( ! k.hasDollarSign() )
l->push_back( (string)k );
}
-
void NamespaceIndex::getNamespaces( list<string>& tofill , bool onlyCollections ) const {
assert( onlyCollections ); // TODO: need to implement this
// need boost::bind or something to make this less ugly
-
+
if ( ht )
ht->iterAll( namespaceGetNamespacesCallback , (void*)&tofill );
}
void NamespaceDetails::addDeletedRec(DeletedRecord *d, DiskLoc dloc) {
- BOOST_STATIC_ASSERT( sizeof(NamespaceDetails::Extra) <= sizeof(NamespaceDetails) );
+ BOOST_STATIC_ASSERT( sizeof(NamespaceDetails::Extra) <= sizeof(NamespaceDetails) );
+
{
+ Record *r = (Record *) getDur().writingPtr(d, sizeof(Record));
+ d = &r->asDeleted();
// defensive code: try to make us notice if we reference a deleted record
- (unsigned&) (((Record *) d)->data) = 0xeeeeeeee;
+ (unsigned&) (r->data) = 0xeeeeeeee;
}
- dassert( dloc.drec() == d );
- DEBUGGING out() << "TEMP: add deleted rec " << dloc.toString() << ' ' << hex << d->extentOfs << endl;
+ DEBUGGING log() << "TEMP: add deleted rec " << dloc.toString() << ' ' << hex << d->extentOfs << endl;
if ( capped ) {
if ( !cappedLastDelRecLastExtent().isValid() ) {
// Initial extent allocation. Insert at end.
d->nextDeleted = DiskLoc();
if ( cappedListOfAllDeletedRecords().isNull() )
- cappedListOfAllDeletedRecords() = dloc;
+ getDur().writingDiskLoc( cappedListOfAllDeletedRecords() ) = dloc;
else {
DiskLoc i = cappedListOfAllDeletedRecords();
- for (; !i.drec()->nextDeleted.isNull(); i = i.drec()->nextDeleted );
- i.drec()->nextDeleted = dloc;
+ for (; !i.drec()->nextDeleted.isNull(); i = i.drec()->nextDeleted )
+ ;
+ i.drec()->nextDeleted.writing() = dloc;
}
- } else {
+ }
+ else {
d->nextDeleted = cappedFirstDeletedInCurExtent();
- cappedFirstDeletedInCurExtent() = dloc;
+ getDur().writingDiskLoc( cappedFirstDeletedInCurExtent() ) = dloc;
// always compact() after this so order doesn't matter
}
- } else {
+ }
+ else {
int b = bucket(d->lengthWithHeaders);
DiskLoc& list = deletedList[b];
DiskLoc oldHead = list;
- list = dloc;
+ getDur().writingDiskLoc(list) = dloc;
d->nextDeleted = oldHead;
}
}
- /*
- lenToAlloc is WITH header
- */
+ // lenToAlloc is WITH header
DiskLoc NamespaceDetails::alloc(const char *ns, int lenToAlloc, DiskLoc& extentLoc) {
lenToAlloc = (lenToAlloc + 3) & 0xfffffffc;
DiskLoc loc = _alloc(ns, lenToAlloc);
if ( loc.isNull() )
return loc;
- DeletedRecord *r = loc.drec();
+ const DeletedRecord *r = loc.drec();
+ //r = getDur().writing(r);
/* note we want to grab from the front so our next pointers on disk tend
to go in a forward direction which is important for performance. */
@@ -229,20 +275,21 @@ namespace mongo {
if ( capped == 0 ) {
if ( left < 24 || left < (lenToAlloc >> 3) ) {
// you get the whole thing.
- DataFileMgr::grow(loc, regionlen);
+ //DataFileMgr::grow(loc, regionlen);
return loc;
}
}
/* split off some for further use. */
- r->lengthWithHeaders = lenToAlloc;
- DataFileMgr::grow(loc, lenToAlloc);
+ getDur().writingInt(r->lengthWithHeaders) = lenToAlloc;
+ //DataFileMgr::grow(loc, lenToAlloc);
DiskLoc newDelLoc = loc;
newDelLoc.inc(lenToAlloc);
DeletedRecord *newDel = DataFileMgr::makeDeletedRecord(newDelLoc, left);
- newDel->extentOfs = r->extentOfs;
- newDel->lengthWithHeaders = left;
- newDel->nextDeleted.Null();
+ DeletedRecord *newDelW = getDur().writing(newDel);
+ newDelW->extentOfs = r->extentOfs;
+ newDelW->lengthWithHeaders = left;
+ newDelW->nextDeleted.Null();
addDeletedRec(newDel, newDelLoc);
@@ -267,7 +314,7 @@ namespace mongo {
int a = cur.a();
if ( a < -1 || a >= 100000 ) {
problem() << "~~ Assertion - cur out of range in _alloc() " << cur.toString() <<
- " a:" << a << " b:" << b << " chain:" << chain << '\n';
+ " a:" << a << " b:" << b << " chain:" << chain << '\n';
sayDbContext();
if ( cur == *prev )
prev->Null();
@@ -303,7 +350,7 @@ namespace mongo {
cur.Null();
}
else {
- /*this defensive check only made sense for the mmap storage engine:
+ /*this defensive check only made sense for the mmap storage engine:
if ( r->nextDeleted.getOfs() == 0 ) {
problem() << "~~ Assertion - bad nextDeleted " << r->nextDeleted.toString() <<
" b:" << b << " chain:" << chain << ", fixing.\n";
@@ -316,9 +363,9 @@ namespace mongo {
/* unlink ourself from the deleted list */
{
- DeletedRecord *bmr = bestmatch.drec();
- *bestprev = bmr->nextDeleted;
- bmr->nextDeleted.setInvalid(); // defensive.
+ const DeletedRecord *bmr = bestmatch.drec();
+ *getDur().writing(bestprev) = bmr->nextDeleted;
+ bmr->nextDeleted.writing().setInvalid(); // defensive.
assert(bmr->extentOfs < bestmatch.getOfs());
}
@@ -371,9 +418,9 @@ namespace mongo {
if ( e == capExtent )
out() << " (capExtent)";
out() << '\n';
- out() << " magic: " << hex << e.ext()->magic << dec << " extent->ns: " << e.ext()->nsDiagnostic.buf << '\n';
+ out() << " magic: " << hex << e.ext()->magic << dec << " extent->ns: " << e.ext()->nsDiagnostic.toString() << '\n';
out() << " fr: " << e.ext()->firstRecord.toString() <<
- " lr: " << e.ext()->lastRecord.toString() << " extent->len: " << e.ext()->length << '\n';
+ " lr: " << e.ext()->lastRecord.toString() << " extent->len: " << e.ext()->length << '\n';
}
assert( len * 5 > lastExtentSize ); // assume it is unusually large record; if not, something is broken
}
@@ -387,12 +434,27 @@ namespace mongo {
return cappedAlloc(ns,len);
}
+ void NamespaceIndex::kill_ns(const char *ns) {
+ if ( !ht )
+ return;
+ Namespace n(ns);
+ ht->kill(n);
+
+ for( int i = 0; i<=1; i++ ) {
+ try {
+ Namespace extra(n.extraName(i).c_str());
+ ht->kill(extra);
+ }
+ catch(DBException&) { }
+ }
+ }
+
/* extra space for indexes when more than 10 */
NamespaceDetails::Extra* NamespaceIndex::newExtra(const char *ns, int i, NamespaceDetails *d) {
assert( i >= 0 && i <= 1 );
Namespace n(ns);
Namespace extra(n.extraName(i).c_str()); // throws userexception if ns name too long
-
+
massert( 10350 , "allocExtra: base ns missing?", d );
massert( 10351 , "allocExtra: extra already exists", ht->get(extra) == 0 );
@@ -409,10 +471,10 @@ namespace mongo {
long ofs = e->ofsFrom(this);
if( i == 0 ) {
assert( extraOffset == 0 );
- extraOffset = ofs;
+ *getDur().writing(&extraOffset) = ofs;
assert( extra() == e );
}
- else {
+ else {
Extra *hd = extra();
assert( hd->next(this) == 0 );
hd->setNext(ofs);
@@ -422,25 +484,23 @@ namespace mongo {
/* you MUST call when adding an index. see pdfile.cpp */
IndexDetails& NamespaceDetails::addIndex(const char *thisns, bool resetTransient) {
- assert( nsdetails(thisns) == this );
-
IndexDetails *id;
try {
id = &idx(nIndexes,true);
}
- catch(DBException&) {
+ catch(DBException&) {
allocExtra(thisns, nIndexes);
id = &idx(nIndexes,false);
}
- nIndexes++;
+ (*getDur().writing(&nIndexes))++;
if ( resetTransient )
NamespaceDetailsTransient::get_w(thisns).addedIndex();
return *id;
}
// must be called when renaming a NS to fix up extra
- void NamespaceDetails::copyingFrom(const char *thisns, NamespaceDetails *src) {
+ void NamespaceDetails::copyingFrom(const char *thisns, NamespaceDetails *src) {
extraOffset = 0; // we are a copy -- the old value is wrong. fixing it up below.
Extra *se = src->extra();
int n = NIndexesBase;
@@ -454,7 +514,7 @@ namespace mongo {
Extra *nxt = allocExtra(thisns, n);
e->setNext( nxt->ofsFrom(this) );
e = nxt;
- }
+ }
assert( extraOffset );
}
}
@@ -473,25 +533,39 @@ namespace mongo {
}*/
return -1;
}
-
- long long NamespaceDetails::storageSize( int * numExtents ){
+
+ long long NamespaceDetails::storageSize( int * numExtents , BSONArrayBuilder * extentInfo ) const {
Extent * e = firstExtent.ext();
assert( e );
-
+
long long total = 0;
int n = 0;
- while ( e ){
+ while ( e ) {
total += e->length;
- e = e->getNextExtent();
n++;
+
+ if ( extentInfo ) {
+ extentInfo->append( BSON( "len" << e->length << "loc" << e->myLoc.toBSONObj() ) );
+ }
+
+ e = e->getNextExtent();
}
-
+
if ( numExtents )
*numExtents = n;
-
+
return total;
}
-
+
+ NamespaceDetails *NamespaceDetails::writingWithExtra() {
+ vector< pair< long long, unsigned > > writeRanges;
+ writeRanges.push_back( make_pair( 0, sizeof( NamespaceDetails ) ) );
+ for( Extra *e = extra(); e; e = e->next( this ) ) {
+ writeRanges.push_back( make_pair( (char*)e - (char*)this, sizeof( Extra ) ) );
+ }
+ return reinterpret_cast< NamespaceDetails* >( getDur().writingRangesAtOffsets( this, writeRanges ) );
+ }
+
/* ------------------------------------------------------------------------- */
mongo::mutex NamespaceDetailsTransient::_qcMutex("qc");
@@ -505,14 +579,14 @@ namespace mongo {
_keysComputed = false;
_indexSpecs.clear();
}
-
-/* NamespaceDetailsTransient& NamespaceDetailsTransient::get(const char *ns) {
- shared_ptr< NamespaceDetailsTransient > &t = map_[ ns ];
- if ( t.get() == 0 )
- t.reset( new NamespaceDetailsTransient(ns) );
- return *t;
- }
-*/
+
+ /* NamespaceDetailsTransient& NamespaceDetailsTransient::get(const char *ns) {
+ shared_ptr< NamespaceDetailsTransient > &t = map_[ ns ];
+ if ( t.get() == 0 )
+ t.reset( new NamespaceDetailsTransient(ns) );
+ return *t;
+ }
+ */
void NamespaceDetailsTransient::clearForPrefix(const char *prefix) {
assertInWriteLock();
vector< string > found;
@@ -523,7 +597,7 @@ namespace mongo {
_map[ *i ].reset();
}
}
-
+
void NamespaceDetailsTransient::computeIndexKeys() {
_keysComputed = true;
_indexKeys.clear();
@@ -565,92 +639,92 @@ namespace mongo {
void renameNamespace( const char *from, const char *to ) {
NamespaceIndex *ni = nsindex( from );
- assert( ni );
+ assert( ni );
assert( ni->details( from ) );
assert( ! ni->details( to ) );
-
- // Our namespace and index details will move to a different
- // memory location. The only references to namespace and
- // index details across commands are in cursors and nsd
- // transient (including query cache) so clear these.
- ClientCursor::invalidate( from );
- NamespaceDetailsTransient::clearForPrefix( from );
-
- NamespaceDetails *details = ni->details( from );
- ni->add_ns( to, *details );
+
+ // Our namespace and index details will move to a different
+ // memory location. The only references to namespace and
+ // index details across commands are in cursors and nsd
+ // transient (including query cache) so clear these.
+ ClientCursor::invalidate( from );
+ NamespaceDetailsTransient::clearForPrefix( from );
+
+ NamespaceDetails *details = ni->details( from );
+ ni->add_ns( to, *details );
NamespaceDetails *todetails = ni->details( to );
- try {
+ try {
todetails->copyingFrom(to, details); // fixes extraOffset
}
- catch( DBException& ) {
+ catch( DBException& ) {
// could end up here if .ns is full - if so try to clean up / roll back a little
ni->kill_ns(to);
throw;
}
- ni->kill_ns( from );
- details = todetails;
-
- BSONObj oldSpec;
- char database[MaxDatabaseLen];
- nsToDatabase(from, database);
- string s = database;
- s += ".system.namespaces";
- assert( Helpers::findOne( s.c_str(), BSON( "name" << from ), oldSpec ) );
-
- BSONObjBuilder newSpecB;
- BSONObjIterator i( oldSpec.getObjectField( "options" ) );
- while( i.more() ) {
- BSONElement e = i.next();
- if ( strcmp( e.fieldName(), "create" ) != 0 )
- newSpecB.append( e );
- else
- newSpecB << "create" << to;
- }
- BSONObj newSpec = newSpecB.done();
- addNewNamespaceToCatalog( to, newSpec.isEmpty() ? 0 : &newSpec );
-
- deleteObjects( s.c_str(), BSON( "name" << from ), false, false, true );
- // oldSpec variable no longer valid memory
-
- BSONObj oldIndexSpec;
- s = database;
- s += ".system.indexes";
- while( Helpers::findOne( s.c_str(), BSON( "ns" << from ), oldIndexSpec ) ) {
- BSONObjBuilder newIndexSpecB;
- BSONObjIterator i( oldIndexSpec );
- while( i.more() ) {
- BSONElement e = i.next();
- if ( strcmp( e.fieldName(), "ns" ) != 0 )
- newIndexSpecB.append( e );
- else
- newIndexSpecB << "ns" << to;
- }
- BSONObj newIndexSpec = newIndexSpecB.done();
- DiskLoc newIndexSpecLoc = theDataFileMgr.insert( s.c_str(), newIndexSpec.objdata(), newIndexSpec.objsize(), true, BSONElement(), false );
- int indexI = details->findIndexByName( oldIndexSpec.getStringField( "name" ) );
- IndexDetails &indexDetails = details->idx(indexI);
- string oldIndexNs = indexDetails.indexNamespace();
- indexDetails.info = newIndexSpecLoc;
- string newIndexNs = indexDetails.indexNamespace();
-
- BtreeBucket::renameIndexNamespace( oldIndexNs.c_str(), newIndexNs.c_str() );
- deleteObjects( s.c_str(), oldIndexSpec.getOwned(), true, false, true );
- }
- }
-
- bool legalClientSystemNS( const string& ns , bool write ){
+ ni->kill_ns( from );
+ details = todetails;
+
+ BSONObj oldSpec;
+ char database[MaxDatabaseNameLen];
+ nsToDatabase(from, database);
+ string s = database;
+ s += ".system.namespaces";
+ assert( Helpers::findOne( s.c_str(), BSON( "name" << from ), oldSpec ) );
+
+ BSONObjBuilder newSpecB;
+ BSONObjIterator i( oldSpec.getObjectField( "options" ) );
+ while( i.more() ) {
+ BSONElement e = i.next();
+ if ( strcmp( e.fieldName(), "create" ) != 0 )
+ newSpecB.append( e );
+ else
+ newSpecB << "create" << to;
+ }
+ BSONObj newSpec = newSpecB.done();
+ addNewNamespaceToCatalog( to, newSpec.isEmpty() ? 0 : &newSpec );
+
+ deleteObjects( s.c_str(), BSON( "name" << from ), false, false, true );
+ // oldSpec variable no longer valid memory
+
+ BSONObj oldIndexSpec;
+ s = database;
+ s += ".system.indexes";
+ while( Helpers::findOne( s.c_str(), BSON( "ns" << from ), oldIndexSpec ) ) {
+ BSONObjBuilder newIndexSpecB;
+ BSONObjIterator i( oldIndexSpec );
+ while( i.more() ) {
+ BSONElement e = i.next();
+ if ( strcmp( e.fieldName(), "ns" ) != 0 )
+ newIndexSpecB.append( e );
+ else
+ newIndexSpecB << "ns" << to;
+ }
+ BSONObj newIndexSpec = newIndexSpecB.done();
+ DiskLoc newIndexSpecLoc = theDataFileMgr.insert( s.c_str(), newIndexSpec.objdata(), newIndexSpec.objsize(), true, BSONElement(), false );
+ int indexI = details->findIndexByName( oldIndexSpec.getStringField( "name" ) );
+ IndexDetails &indexDetails = details->idx(indexI);
+ string oldIndexNs = indexDetails.indexNamespace();
+ indexDetails.info = newIndexSpecLoc;
+ string newIndexNs = indexDetails.indexNamespace();
+
+ BtreeBucket::renameIndexNamespace( oldIndexNs.c_str(), newIndexNs.c_str() );
+ deleteObjects( s.c_str(), oldIndexSpec.getOwned(), true, false, true );
+ }
+ }
+
+ bool legalClientSystemNS( const string& ns , bool write ) {
if( ns == "local.system.replset" ) return true;
if ( ns.find( ".system.users" ) != string::npos )
return true;
- if ( ns.find( ".system.js" ) != string::npos ){
+ if ( ns.find( ".system.js" ) != string::npos ) {
if ( write )
Scope::storedFuncMod();
return true;
}
-
+
return false;
}
-
+
} // namespace mongo
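
The non-capped branch of addDeletedRec above pushes the freed record onto the head of the size bucket's singly linked free list (the capped branch instead appends at the tail or at the current extent's head). A simplified in-memory sketch of the head-insert path, with hypothetical types in place of DiskLoc/DeletedRecord and no durability layer:

    #include <iostream>

    struct FreeRec {
        int length;        // lengthWithHeaders in the real struct
        FreeRec* next;     // nextDeleted in the real struct
    };

    const int NumBuckets = 4;
    FreeRec* buckets[NumBuckets] = { 0, 0, 0, 0 };

    // crude size-to-bucket mapping; the real bucket() walks a table of sizes
    int bucketFor(int len) {
        if (len < 64)   return 0;
        if (len < 256)  return 1;
        if (len < 1024) return 2;
        return 3;
    }

    // push onto the head of the bucket's list, as the non-capped path does
    void addDeleted(FreeRec* r) {
        int b = bucketFor(r->length);
        r->next = buckets[b];
        buckets[b] = r;
    }

    int main() {
        FreeRec a = { 100, 0 }, b = { 120, 0 }, c = { 2000, 0 };
        addDeleted(&a);
        addDeleted(&b);    // new head of bucket 1, pointing at a
        addDeleted(&c);    // lands in bucket 3
        std::cout << buckets[1]->length << " then "
                  << buckets[1]->next->length << std::endl;   // 120 then 100
        return 0;
    }
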
diff --git a/db/namespace.h b/db/namespace.h
index abc35bb..4ec1edd 100644
--- a/db/namespace.h
+++ b/db/namespace.h
@@ -23,130 +23,66 @@
#include "queryutil.h"
#include "diskloc.h"
#include "../util/hashtab.h"
-#include "../util/mmap.h"
+#include "mongommf.h"
namespace mongo {
- /* in the mongo source code, "client" means "database". */
+ /* in the mongo source code, "client" means "database". */
- const int MaxDatabaseLen = 256; // max str len for the db name, including null char
+ const int MaxDatabaseNameLen = 256; // max str len for the db name, including null char
- // "database.a.b.c" -> "database"
- inline void nsToDatabase(const char *ns, char *database) {
- const char *p = ns;
- char *q = database;
- while ( *p != '.' ) {
- if ( *p == 0 )
- break;
- *q++ = *p++;
- }
- *q = 0;
- if (q-database>=MaxDatabaseLen) {
- log() << "nsToDatabase: ns too long. terminating, buf overrun condition" << endl;
- dbexit( EXIT_POSSIBLE_CORRUPTION );
- }
- }
- inline string nsToDatabase(const char *ns) {
- char buf[MaxDatabaseLen];
- nsToDatabase(ns, buf);
- return buf;
- }
- inline string nsToDatabase(const string& ns) {
- size_t i = ns.find( '.' );
- if ( i == string::npos )
- return ns;
- return ns.substr( 0 , i );
- }
-
- /* e.g.
- NamespaceString ns("acme.orders");
- cout << ns.coll; // "orders"
- */
+ /* e.g.
+ NamespaceString ns("acme.orders");
+ cout << ns.coll; // "orders"
+ */
class NamespaceString {
public:
string db;
string coll; // note collection names can have periods in them for organizing purposes (e.g. "system.indexes")
+
+ NamespaceString( const char * ns ) { init(ns); }
+ NamespaceString( const string& ns ) { init(ns.c_str()); }
+ string ns() const { return db + '.' + coll; }
+ bool isSystem() const { return strncmp(coll.c_str(), "system.", 7) == 0; }
private:
- void init(const char *ns) {
+ void init(const char *ns) {
const char *p = strchr(ns, '.');
if( p == 0 ) return;
db = string(ns, p - ns);
coll = p + 1;
}
- public:
- NamespaceString( const char * ns ) { init(ns); }
- NamespaceString( const string& ns ) { init(ns.c_str()); }
-
- string ns() const {
- return db + '.' + coll;
- }
-
- bool isSystem() {
- return strncmp(coll.c_str(), "system.", 7) == 0;
- }
};
#pragma pack(1)
- /* This helper class is used to make the HashMap below in NamespaceDetails */
+ /* This helper class is used to make the HashMap below in NamespaceDetails e.g. see line:
+ HashTable<Namespace,NamespaceDetails> *ht;
+ */
class Namespace {
public:
- enum MaxNsLenValue { MaxNsLen = 128 };
- Namespace(const char *ns) {
- *this = ns;
- }
- Namespace& operator=(const char *ns) {
- uassert( 10080 , "ns name too long, max size is 128", strlen(ns) < MaxNsLen);
- //memset(buf, 0, MaxNsLen); /* this is just to keep stuff clean in the files for easy dumping and reading */
- strcpy_s(buf, MaxNsLen, ns);
- return *this;
- }
+ explicit Namespace(const char *ns) { *this = ns; }
+ Namespace& operator=(const char *ns);
- /* for more than 10 indexes -- see NamespaceDetails::Extra */
- string extraName(int i) {
- char ex[] = "$extra";
- ex[5] += i;
- string s = string(buf) + ex;
- massert( 10348 , "$extra: ns name too long", s.size() < MaxNsLen);
- return s;
- }
- bool isExtra() const {
- const char *p = strstr(buf, "$extr");
- return p && p[5] && p[6] == 0; //==0 important in case an index uses name "$extra_1" for example
- }
bool hasDollarSign() const { return strchr( buf , '$' ) > 0; }
void kill() { buf[0] = 0x7f; }
bool operator==(const char *r) const { return strcmp(buf, r) == 0; }
bool operator==(const Namespace& r) const { return strcmp(buf, r.buf) == 0; }
- int hash() const {
- unsigned x = 0;
- const char *p = buf;
- while ( *p ) {
- x = x * 131 + *p;
- p++;
- }
- return (x & 0x7fffffff) | 0x8000000; // must be > 0
- }
-
- /**
- ( foo.bar ).getSisterNS( "blah" ) == foo.blah
- perhaps this should move to the NamespaceString helper?
+ int hash() const; // value returned is always > 0
+ string toString() const { return (string) buf; }
+ operator string() const { return (string) buf; }
+
+ /* NamespaceDetails::Extra was added after the fact to allow chaining of data blocks to support more than 10 indexes
+ (more than 10 IndexDetails). It's a bit hacky because of this late addition while keeping backward
+ file compatibility. */
+ string extraName(int i) const;
+ bool isExtra() const; /* ends with $extr... -- when true this is an extra block, not a normal NamespaceDetails block */
+
+ /** ( foo.bar ).getSisterNS( "blah" ) == foo.blah
+ perhaps this should move to the NamespaceString helper?
*/
- string getSisterNS( const char * local ) {
- assert( local && local[0] != '.' );
- string old(buf);
- if ( old.find( "." ) != string::npos )
- old = old.substr( 0 , old.find( "." ) );
- return old + "." + local;
- }
-
- string toString() const {
- return (string)buf;
- }
-
- operator string() const {
- return (string)buf;
- }
+ string getSisterNS( const char * local ) const;
+ enum MaxNsLenValue { MaxNsLen = 128 };
+ private:
char buf[MaxNsLen];
};
#pragma pack()
@@ -158,7 +94,9 @@ namespace mongo {
namespace mongo {
/** @return true if a client can modify this namespace
- things like *.system.users */
+ things like *.system.users
+ @param write true if this is a write access; only consulted for .system.js
+ */
bool legalClientSystemNS( const string& ns , bool write );
/* deleted lists -- linked lists of deleted records -- are placed in 'buckets' of various sizes
@@ -170,92 +108,106 @@ namespace mongo {
extern int bucketSizes[];
#pragma pack(1)
- /* this is the "header" for a collection that has all its details. in the .ns file.
+ /* NamespaceDetails : this is the "header" for a collection that has all its details.
+ It's in the .ns file and this is a memory mapped region (thus the pack pragma above).
*/
class NamespaceDetails {
- friend class NamespaceIndex;
- enum { NIndexesExtra = 30,
- NIndexesBase = 10
- };
public:
- struct ExtraOld {
- // note we could use this field for more chaining later, so don't waste it:
- unsigned long long reserved1;
- IndexDetails details[NIndexesExtra];
- unsigned reserved2;
- unsigned reserved3;
- };
- class Extra {
+ enum { NIndexesMax = 64, NIndexesExtra = 30, NIndexesBase = 10 };
+
+ /*-------- data fields, as present on disk : */
+ DiskLoc firstExtent;
+ DiskLoc lastExtent;
+ /* NOTE: capped collections v1 override the meaning of deletedList.
+ deletedList[0] points to a list of free records (DeletedRecord's) for all extents in
+ the capped namespace.
+ deletedList[1] points to the last record in the prev extent. When the "current extent"
+ changes, this value is updated. !deletedList[1].isValid() when this value is not
+ yet computed.
+ */
+ DiskLoc deletedList[Buckets];
+ // ofs 168 (8 byte aligned)
+ struct Stats {
+ // datasize and nrecords MUST be adjacent; code assumes this!
+ long long datasize; // this includes padding, but not record headers
+ long long nrecords;
+ } stats;
+ int lastExtentSize;
+ int nIndexes;
+ private:
+ // ofs 192
+ IndexDetails _indexes[NIndexesBase];
+ public:
+ // ofs 352 (16 byte aligned)
+ int capped;
+ int max; // max # of objects for a capped table. TODO: should this be 64 bit?
+ double paddingFactor; // 1.0 = no padding.
+ // ofs 386 (16)
+ int flags;
+ DiskLoc capExtent;
+ DiskLoc capFirstNewRecord;
+ unsigned short dataFileVersion; // NamespaceDetails version. So we can do backward compatibility in the future. See filever.h
+ unsigned short indexFileVersion;
+ unsigned long long multiKeyIndexBits;
+ private:
+ // ofs 400 (16)
+ unsigned long long reservedA;
+ long long extraOffset; // where the $extra info is located (bytes relative to this)
+ public:
+ int indexBuildInProgress; // 1 if in prog
+ unsigned reservedB;
+ // ofs 424 (8)
+ struct Capped2 {
+ unsigned long long cc2_ptr; // see capped.cpp
+ unsigned fileNumber;
+ } capped2;
+ char reserved[60];
+ /*-------- end data 496 bytes */
+
+ explicit NamespaceDetails( const DiskLoc &loc, bool _capped );
+
+ class Extra {
long long _next;
- public:
+ public:
IndexDetails details[NIndexesExtra];
- private:
+ private:
unsigned reserved2;
unsigned reserved3;
- Extra(const Extra&) { assert(false); }
- Extra& operator=(const Extra& r) { assert(false); return *this; }
+ Extra(const Extra&) { assert(false); }
+ Extra& operator=(const Extra& r) { assert(false); return *this; }
public:
Extra() { }
- long ofsFrom(NamespaceDetails *d) {
+ long ofsFrom(NamespaceDetails *d) {
return ((char *) this) - ((char *) d);
}
void init() { memset(this, 0, sizeof(Extra)); }
- Extra* next(NamespaceDetails *d) {
+ Extra* next(NamespaceDetails *d) {
if( _next == 0 ) return 0;
return (Extra*) (((char *) d) + _next);
}
- void setNext(long ofs) { _next = ofs; }
- void copy(NamespaceDetails *d, const Extra& e) {
+ void setNext(long ofs) { *getDur().writing(&_next) = ofs; }
+ void copy(NamespaceDetails *d, const Extra& e) {
memcpy(this, &e, sizeof(Extra));
_next = 0;
}
- }; // Extra
-
- Extra* extra() {
+ };
+ Extra* extra() {
if( extraOffset == 0 ) return 0;
return (Extra *) (((char *) this) + extraOffset);
}
-
- public:
/* add extra space for indexes when more than 10 */
Extra* allocExtra(const char *ns, int nindexessofar);
-
void copyingFrom(const char *thisns, NamespaceDetails *src); // must be called when renaming a NS to fix up extra
- enum { NIndexesMax = 64 };
-
- BOOST_STATIC_ASSERT( NIndexesMax <= NIndexesBase + NIndexesExtra*2 );
- BOOST_STATIC_ASSERT( NIndexesMax <= 64 ); // multiKey bits
- BOOST_STATIC_ASSERT( sizeof(NamespaceDetails::ExtraOld) == 496 );
- BOOST_STATIC_ASSERT( sizeof(NamespaceDetails::Extra) == 496 );
-
/* called when loaded from disk */
void onLoad(const Namespace& k);
- NamespaceDetails( const DiskLoc &loc, bool _capped );
-
- DiskLoc firstExtent;
- DiskLoc lastExtent;
-
- /* NOTE: capped collections override the meaning of deleted list.
- deletedList[0] points to a list of free records (DeletedRecord's) for all extents in
- the capped namespace.
- deletedList[1] points to the last record in the prev extent. When the "current extent"
- changes, this value is updated. !deletedList[1].isValid() when this value is not
- yet computed.
- */
- DiskLoc deletedList[Buckets];
+ /* dump info on this namespace. for debugging. */
+ void dump(const Namespace& k);
+ /* dump info on all extents for this namespace. for debugging. */
void dumpExtents();
- long long datasize;
- long long nrecords;
- int lastExtentSize;
- int nIndexes;
-
- private:
- IndexDetails _indexes[NIndexesBase];
-
private:
Extent *theCapExtent() const { return capExtent.ext(); }
void advanceCapExtent( const char *ns );
@@ -263,6 +215,7 @@ namespace mongo {
DiskLoc cappedAlloc(const char *ns, int len);
DiskLoc &cappedFirstDeletedInCurExtent();
bool nextIsInCapExtent( const DiskLoc &dl ) const;
+
public:
DiskLoc& cappedListOfAllDeletedRecords() { return deletedList[0]; }
DiskLoc& cappedLastDelRecLastExtent() { return deletedList[1]; }
@@ -270,122 +223,79 @@ namespace mongo {
bool capLooped() const { return capped && capFirstNewRecord.isValid(); }
bool inCapExtent( const DiskLoc &dl ) const;
void cappedCheckMigrate();
- void cappedTruncateAfter(const char *ns, DiskLoc after, bool inclusive); /** remove rest of the capped collection from this point onward */
+ /**
+ * Truncate documents newer than the document at 'end' from the capped
+ * collection. The collection cannot be completely emptied using this
+ * function. An assertion will be thrown if that is attempted.
+ * @param inclusive - Truncate 'end' as well iff true
+ */
+ void cappedTruncateAfter(const char *ns, DiskLoc end, bool inclusive);
+ /** Remove all documents from the capped collection */
void emptyCappedCollection(const char *ns);
-
- int capped;
-
- int max; // max # of objects for a capped table. TODO: should this be 64 bit?
- double paddingFactor; // 1.0 = no padding.
- int flags;
-
- DiskLoc capExtent;
- DiskLoc capFirstNewRecord;
-
- /* NamespaceDetails version. So we can do backward compatibility in the future.
- See filever.h
- */
- unsigned short dataFileVersion;
- unsigned short indexFileVersion;
- unsigned long long multiKeyIndexBits;
- private:
- unsigned long long reservedA;
- long long extraOffset; // where the $extra info is located (bytes relative to this)
- public:
- int backgroundIndexBuildInProgress; // 1 if in prog
- char reserved[76];
-
- /* when a background index build is in progress, we don't count the index in nIndexes until
+ /* when a background index build is in progress, we don't count the index in nIndexes until
complete, yet need to still use it in _indexRecord() - thus we use this function for that.
*/
- int nIndexesBeingBuilt() const { return nIndexes + backgroundIndexBuildInProgress; }
+ int nIndexesBeingBuilt() const { return nIndexes + indexBuildInProgress; }
- /* NOTE: be careful with flags. are we manipulating them in read locks? if so,
+ /* NOTE: be careful with flags. are we manipulating them in read locks? if so,
this isn't thread safe. TODO
*/
enum NamespaceFlags {
Flag_HaveIdIndex = 1 << 0 // set when we have _id index (ONLY if ensureIdIndex was called -- 0 if that has never been called)
};
- IndexDetails& idx(int idxNo, bool missingExpected = false ) {
- if( idxNo < NIndexesBase )
- return _indexes[idxNo];
- Extra *e = extra();
- if ( ! e ){
- if ( missingExpected )
- throw MsgAssertionException( 13283 , "Missing Extra" );
- massert(13282, "missing Extra", e);
- }
- int i = idxNo - NIndexesBase;
- if( i >= NIndexesExtra ) {
- e = e->next(this);
- if ( ! e ){
- if ( missingExpected )
- throw MsgAssertionException( 13283 , "missing extra" );
- massert(13283, "missing Extra", e);
- }
- i -= NIndexesExtra;
- }
- return e->details[i];
- }
- IndexDetails& backgroundIdx() {
- DEV assert(backgroundIndexBuildInProgress);
+ IndexDetails& idx(int idxNo, bool missingExpected = false );
+
+ /** get the IndexDetails for the index currently being built in the background. (there is at most one) */
+ IndexDetails& inProgIdx() {
+ DEV assert(indexBuildInProgress);
return idx(nIndexes);
}
- class IndexIterator {
- friend class NamespaceDetails;
- int i;
- int n;
- NamespaceDetails *d;
- IndexIterator(NamespaceDetails *_d) {
- d = _d;
- i = 0;
- n = d->nIndexes;
- }
+ class IndexIterator {
public:
int pos() { return i; } // note this is the next one to come
bool more() { return i < n; }
IndexDetails& next() { return d->idx(i++); }
- }; // IndexIterator
+ private:
+ friend class NamespaceDetails;
+ int i, n;
+ NamespaceDetails *d;
+ IndexIterator(NamespaceDetails *_d);
+ };
IndexIterator ii() { return IndexIterator(this); }
- /* hackish - find our index # in the indexes array
- */
- int idxNo(IndexDetails& idx) {
- IndexIterator i = ii();
- while( i.more() ) {
- if( &i.next() == &idx )
- return i.pos()-1;
- }
- massert( 10349 , "E12000 idxNo fails", false);
- return -1;
- }
+ /* hackish - find our index # in the indexes array */
+ int idxNo(IndexDetails& idx);
/* multikey indexes are indexes where there are more than one key in the index
for a single document. see multikey in wiki.
for these, we have to do some dedup work on queries.
*/
- bool isMultikey(int i) {
- return (multiKeyIndexBits & (((unsigned long long) 1) << i)) != 0;
- }
- void setIndexIsMultikey(int i) {
+ bool isMultikey(int i) const { return (multiKeyIndexBits & (((unsigned long long) 1) << i)) != 0; }
+ void setIndexIsMultikey(int i) {
dassert( i < NIndexesMax );
- multiKeyIndexBits |= (((unsigned long long) 1) << i);
+ unsigned long long x = ((unsigned long long) 1) << i;
+ if( multiKeyIndexBits & x ) return;
+ *getDur().writing(&multiKeyIndexBits) |= x;
}
- void clearIndexIsMultikey(int i) {
+ void clearIndexIsMultikey(int i) {
dassert( i < NIndexesMax );
- multiKeyIndexBits &= ~(((unsigned long long) 1) << i);
+ unsigned long long x = ((unsigned long long) 1) << i;
+ if( (multiKeyIndexBits & x) == 0 ) return;
+ *getDur().writing(&multiKeyIndexBits) &= ~x;
}
/* add a new index. does not add to system.indexes etc. - just to NamespaceDetails.
- caller must populate returned object.
+ caller must populate returned object.
*/
IndexDetails& addIndex(const char *thisns, bool resetTransient=true);
- void aboutToDeleteAnIndex() { flags &= ~Flag_HaveIdIndex; }
+ void aboutToDeleteAnIndex() {
+ *getDur().writing(&flags) = flags & ~Flag_HaveIdIndex;
+ }
/* returns index of the first index in which the field is present. -1 if not present. */
int fieldIsIndexed(const char *fieldName);
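
The rewritten setIndexIsMultikey/clearIndexIsMultikey above test the bit before declaring a write, so the durability layer only journals multiKeyIndexBits when the mask actually changes. A standalone sketch of that check-before-write pattern; journalWrite is a hypothetical stand-in for getDur().writing():

    #include <iostream>

    static int journalWrites = 0;

    // hypothetical stand-in for getDur().writing(&mask): counts declared writes
    unsigned long long* journalWrite(unsigned long long* p) { ++journalWrites; return p; }

    void setBit(unsigned long long& mask, int i) {
        unsigned long long x = ((unsigned long long) 1) << i;
        if (mask & x) return;              // already set: skip the journaled write
        *journalWrite(&mask) |= x;
    }

    void clearBit(unsigned long long& mask, int i) {
        unsigned long long x = ((unsigned long long) 1) << i;
        if ((mask & x) == 0) return;       // already clear: skip the journaled write
        *journalWrite(&mask) &= ~x;
    }

    int main() {
        unsigned long long multiKeyIndexBits = 0;
        setBit(multiKeyIndexBits, 3);
        setBit(multiKeyIndexBits, 3);      // no-op, no journal entry
        clearBit(multiKeyIndexBits, 3);
        std::cout << "journaled writes: " << journalWrites << std::endl;   // prints 2
        return 0;
    }
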
@@ -393,49 +303,35 @@ namespace mongo {
void paddingFits() {
double x = paddingFactor - 0.01;
if ( x >= 1.0 )
- paddingFactor = x;
+ getDur().setNoJournal(&paddingFactor, &x, sizeof(x));
}
void paddingTooSmall() {
double x = paddingFactor + 0.6;
if ( x <= 2.0 )
- paddingFactor = x;
+ getDur().setNoJournal(&paddingFactor, &x, sizeof(x));
}
- //returns offset in indexes[]
- int findIndexByName(const char *name) {
- IndexIterator i = ii();
- while( i.more() ) {
- if ( strcmp(i.next().info.obj().getStringField("name"),name) == 0 )
- return i.pos()-1;
- }
- return -1;
- }
+ // @return offset in indexes[]
+ int findIndexByName(const char *name);
+
+ // @return offset in indexes[]
+ int findIndexByKeyPattern(const BSONObj& keyPattern);
- //returns offset in indexes[]
- int findIndexByKeyPattern(const BSONObj& keyPattern) {
- IndexIterator i = ii();
- while( i.more() ) {
- if( i.next().keyPattern() == keyPattern )
- return i.pos()-1;
- }
- return -1;
- }
-
void findIndexByType( const string& name , vector<int>& matches ) {
IndexIterator i = ii();
- while ( i.more() ){
+ while ( i.more() ) {
if ( i.next().getSpec().getTypeName() == name )
matches.push_back( i.pos() - 1 );
}
}
- /* @return -1 = not found
+ /* @return -1 = not found
generally id is first index, so not that expensive an operation (assuming present).
*/
int findIdIndex() {
IndexIterator i = ii();
while( i.more() ) {
- if( i.next().isIdIndex() )
+ if( i.next().isIdIndex() )
return i.pos()-1;
}
return -1;
@@ -451,25 +347,46 @@ namespace mongo {
/* allocate a new record. lenToAlloc includes headers. */
DiskLoc alloc(const char *ns, int lenToAlloc, DiskLoc& extentLoc);
-
/* add a given record to the deleted chains for this NS */
void addDeletedRec(DeletedRecord *d, DiskLoc dloc);
-
void dumpDeleted(set<DiskLoc> *extents = 0);
-
// Start from firstExtent by default.
DiskLoc firstRecord( const DiskLoc &startExtent = DiskLoc() ) const;
-
// Start from lastExtent by default.
DiskLoc lastRecord( const DiskLoc &startExtent = DiskLoc() ) const;
+ long long storageSize( int * numExtents = 0 , BSONArrayBuilder * extentInfo = 0 ) const;
+
+ int averageObjectSize() {
+ if ( stats.nrecords == 0 )
+ return 5;
+ return (int) (stats.datasize / stats.nrecords);
+ }
+
+ NamespaceDetails *writingWithoutExtra() {
+ return ( NamespaceDetails* ) getDur().writingPtr( this, sizeof( NamespaceDetails ) );
+ }
+ /** Make all linked Extra objects writeable as well */
+ NamespaceDetails *writingWithExtra();
- long long storageSize( int * numExtents = 0 );
-
private:
DiskLoc _alloc(const char *ns, int len);
void maybeComplain( const char *ns, int len ) const;
DiskLoc __stdAlloc(int len);
void compact(); // combine adjacent deleted records
+ friend class NamespaceIndex;
+ struct ExtraOld {
+ // note we could use this field for more chaining later, so don't waste it:
+ unsigned long long reserved1;
+ IndexDetails details[NIndexesExtra];
+ unsigned reserved2;
+ unsigned reserved3;
+ };
+ /** Update cappedLastDelRecLastExtent() after capExtent changed in cappedTruncateAfter() */
+ void cappedTruncateLastDelUpdate();
+ BOOST_STATIC_ASSERT( NIndexesMax <= NIndexesBase + NIndexesExtra*2 );
+ BOOST_STATIC_ASSERT( NIndexesMax <= 64 ); // multiKey bits
+ BOOST_STATIC_ASSERT( sizeof(NamespaceDetails::ExtraOld) == 496 );
+ BOOST_STATIC_ASSERT( sizeof(NamespaceDetails::Extra) == 496 );
}; // NamespaceDetails
#pragma pack()
@@ -486,7 +403,7 @@ namespace mongo {
todo: cleanup code, need abstractions and separation
*/
class NamespaceDetailsTransient : boost::noncopyable {
- BOOST_STATIC_ASSERT( sizeof(NamespaceDetails) == 496 );
+ BOOST_STATIC_ASSERT( sizeof(NamespaceDetails) == 496 );
/* general ------------------------------------------------------------- */
private:
@@ -494,18 +411,18 @@ namespace mongo {
void reset();
static std::map< string, shared_ptr< NamespaceDetailsTransient > > _map;
public:
- NamespaceDetailsTransient(const char *ns) : _ns(ns), _keysComputed(false), _qcWriteCount(){ }
+ NamespaceDetailsTransient(const char *ns) : _ns(ns), _keysComputed(false), _qcWriteCount() { }
/* _get() is not threadsafe -- see get_inlock() comments */
static NamespaceDetailsTransient& _get(const char *ns);
/* use get_w() when doing write operations */
- static NamespaceDetailsTransient& get_w(const char *ns) {
+ static NamespaceDetailsTransient& get_w(const char *ns) {
DEV assertInWriteLock();
return _get(ns);
}
void addedIndex() { reset(); }
void deletedIndex() { reset(); }
/* Drop cached information on all namespaces beginning with the specified prefix.
- Can be useful as index namespaces share the same start as the regular collection.
+ Can be useful as index namespaces share the same start as the regular collection.
SLOW - sequential scan of all NamespaceDetailsTransient objects */
static void clearForPrefix(const char *prefix);
@@ -531,11 +448,11 @@ namespace mongo {
map<const IndexDetails*,IndexSpec> _indexSpecs;
static mongo::mutex _isMutex;
public:
- const IndexSpec& getIndexSpec( const IndexDetails * details ){
+ const IndexSpec& getIndexSpec( const IndexDetails * details ) {
IndexSpec& spec = _indexSpecs[details];
- if ( ! spec._finishedInit ){
+ if ( ! spec._finishedInit ) {
scoped_lock lk(_isMutex);
- if ( ! spec._finishedInit ){
+ if ( ! spec._finishedInit ) {
spec.reset( details );
assert( spec._finishedInit );
}
@@ -591,7 +508,7 @@ namespace mongo {
public:
NamespaceIndex(const string &dir, const string &database) :
- ht( 0 ), dir_( dir ), database_( database ) {}
+ ht( 0 ), dir_( dir ), database_( database ) {}
/* returns true if new db will be created if we init lazily */
bool exists() const;
@@ -600,13 +517,13 @@ namespace mongo {
void add_ns(const char *ns, DiskLoc& loc, bool capped) {
NamespaceDetails details( loc, capped );
- add_ns( ns, details );
+ add_ns( ns, details );
}
- void add_ns( const char *ns, const NamespaceDetails &details ) {
+ void add_ns( const char *ns, const NamespaceDetails &details ) {
init();
Namespace n(ns);
uassert( 10081 , "too many namespaces/collections", ht->put(n, details));
- }
+ }
/* just for diagnostics */
/*size_t detailsOffset(NamespaceDetails *d) {
@@ -625,20 +542,7 @@ namespace mongo {
return d;
}
- void kill_ns(const char *ns) {
- if ( !ht )
- return;
- Namespace n(ns);
- ht->kill(n);
-
- for( int i = 0; i<=1; i++ ) {
- try {
- Namespace extra(n.extraName(i).c_str());
- ht->kill(extra);
- }
- catch(DBException&) { }
- }
- }
+ void kill_ns(const char *ns);
bool find(const char *ns, DiskLoc& loc) {
NamespaceDetails *l = details(ns);
@@ -658,12 +562,12 @@ namespace mongo {
NamespaceDetails::Extra* newExtra(const char *ns, int n, NamespaceDetails *d);
boost::filesystem::path path() const;
- private:
+ private:
void maybeMkdir() const;
-
- MMF f;
- HashTable<Namespace,NamespaceDetails,MMF::Pointer> *ht;
+
+ MongoMMF f;
+ HashTable<Namespace,NamespaceDetails> *ht;
string dir_;
string database_;
};
@@ -675,4 +579,31 @@ namespace mongo {
// (Arguments should include db name)
void renameNamespace( const char *from, const char *to );
+ // "database.a.b.c" -> "database"
+ inline void nsToDatabase(const char *ns, char *database) {
+ const char *p = ns;
+ char *q = database;
+ while ( *p != '.' ) {
+ if ( *p == 0 )
+ break;
+ *q++ = *p++;
+ }
+ *q = 0;
+ if (q-database>=MaxDatabaseNameLen) {
+ log() << "nsToDatabase: ns too long. terminating, buf overrun condition" << endl;
+ dbexit( EXIT_POSSIBLE_CORRUPTION );
+ }
+ }
+ inline string nsToDatabase(const char *ns) {
+ char buf[MaxDatabaseNameLen];
+ nsToDatabase(ns, buf);
+ return buf;
+ }
+ inline string nsToDatabase(const string& ns) {
+ size_t i = ns.find( '.' );
+ if ( i == string::npos )
+ return ns;
+ return ns.substr( 0 , i );
+ }
+
} // namespace mongo
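
The nsToDatabase helpers that moved to the bottom of namespace.h split a full namespace at its first dot. A minimal standalone equivalent of the string overload (nsToDb here is a renamed sketch, not the server function):

    #include <iostream>
    #include <string>

    // everything before the first '.', or the whole string if there is no dot
    std::string nsToDb(const std::string& ns) {
        std::string::size_type i = ns.find('.');
        return i == std::string::npos ? ns : ns.substr(0, i);
    }

    int main() {
        std::cout << nsToDb("acme.orders") << std::endl;          // acme
        std::cout << nsToDb("acme.system.indexes") << std::endl;  // acme
        std::cout << nsToDb("admin") << std::endl;                // admin
        return 0;
    }
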
diff --git a/db/nonce.cpp b/db/nonce.cpp
index 519cfaa..6f35c79 100644
--- a/db/nonce.cpp
+++ b/db/nonce.cpp
@@ -17,22 +17,25 @@
#include "pch.h"
#include "nonce.h"
+#include "../util/time_support.h"
extern int do_md5_test(void);
namespace mongo {
-
- Security::Security() {
- static int n;
- massert( 10352 , "Security is a singleton class", ++n == 1);
- init();
- }
- void Security::init(){
- if( _initialized ) return;
- _initialized = true;
+ BOOST_STATIC_ASSERT( sizeof(nonce) == 8 );
-#if defined(__linux__) || defined(__sunos__)
+ Security::Security() {
+ static int n;
+ massert( 10352 , "Security is a singleton class", ++n == 1);
+ init();
+ }
+
+ void Security::init() {
+ if( _initialized ) return;
+ _initialized = true;
+
+#if defined(__linux__) || defined(__sunos__) || defined(__APPLE__)
_devrandom = new ifstream("/dev/urandom", ios::binary|ios::in);
massert( 10353 , "can't open dev/urandom", _devrandom->is_open() );
#elif defined(_WIN32)
@@ -40,36 +43,41 @@ namespace mongo {
#else
srandomdev();
#endif
- assert( sizeof(nonce) == 8 );
-
+
#ifndef NDEBUG
if ( do_md5_test() )
- massert( 10354 , "md5 unit test fails", false);
+ massert( 10354 , "md5 unit test fails", false);
#endif
}
-
- nonce Security::getNonce(){
+
+ nonce Security::getNonce() {
static mongo::mutex m("getNonce");
scoped_lock lk(m);
+
+ if ( ! _initialized )
+ init();
- /* question/todo: /dev/random works on OS X. is it better
- to use that than random() / srandom()?
- */
+ /* question/todo: /dev/random works on OS X. is it better
+ to use that than random() / srandom()?
+ */
nonce n;
-#if defined(__linux__) || defined(__sunos__)
+#if defined(__linux__) || defined(__sunos__) || defined(__APPLE__)
_devrandom->read((char*)&n, sizeof(n));
massert( 10355 , "devrandom failed", !_devrandom->fail());
#elif defined(_WIN32)
- n = (((unsigned long long)rand())<<32) | rand();
+ unsigned a=0, b=0;
+ assert( rand_s(&a) == 0 );
+ assert( rand_s(&b) == 0 );
+ n = (((unsigned long long)a)<<32) | b;
#else
n = (((unsigned long long)random())<<32) | random();
#endif
return n;
}
unsigned getRandomNumber() { return (unsigned) security.getNonce(); }
-
- bool Security::_initialized;
+
+ bool Security::_initialized;
Security security;
-
+
} // namespace mongo
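
On Windows the patched getNonce now draws two 32-bit values from rand_s() and packs them into the 64-bit nonce; the other platforms do the same with random() or /dev/urandom. A sketch of the packing step only, with fixed inputs standing in for a real entropy source:

    #include <iostream>

    typedef unsigned long long nonce_t;   // hypothetical local alias for the nonce type

    // pack two 32-bit words into one 64-bit nonce, as the Windows branch above does
    nonce_t makeNonce(unsigned hi, unsigned lo) {
        return (((nonce_t) hi) << 32) | lo;
    }

    int main() {
        unsigned a = 0x12345678u, b = 0x9abcdef0u;   // stand-ins for rand_s() output
        std::cout << std::hex << makeNonce(a, b) << std::endl;   // 123456789abcdef0
        return 0;
    }
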
diff --git a/db/nonce.h b/db/nonce.h
index 593931f..21592ab 100644
--- a/db/nonce.h
+++ b/db/nonce.h
@@ -20,23 +20,23 @@
namespace mongo {
typedef unsigned long long nonce;
-
+
struct Security {
Security();
nonce getNonce();
- /** safe during global var initialization */
- nonce getNonceInitSafe() {
- init();
- return getNonce();
- }
- private:
+ /** safe during global var initialization */
+ nonce getNonceInitSafe() {
+ init();
+ return getNonce();
+ }
+ private:
ifstream *_devrandom;
- static bool _initialized;
- void init(); // can call more than once
+ static bool _initialized;
+ void init(); // can call more than once
};
-
+
extern Security security;
-
+
} // namespace mongo
diff --git a/db/oplog.cpp b/db/oplog.cpp
index 93800c7..1557cbd 100644
--- a/db/oplog.cpp
+++ b/db/oplog.cpp
@@ -22,18 +22,19 @@
#include "repl.h"
#include "commands.h"
#include "repl/rs.h"
+#include "stats/counters.h"
namespace mongo {
void logOpForSharding( const char * opstr , const char * ns , const BSONObj& obj , BSONObj * patt );
- int __findingStartInitialTimeout = 5; // configurable for testing
+ int __findingStartInitialTimeout = 5; // configurable for testing
// cached copies of these...so don't rename them, drop them, etc.!!!
static NamespaceDetails *localOplogMainDetails = 0;
static Database *localDB = 0;
static NamespaceDetails *rsOplogDetails = 0;
- void oplogCheckCloseDatabase( Database * db ){
+ void oplogCheckCloseDatabase( Database * db ) {
localDB = 0;
localOplogMainDetails = 0;
rsOplogDetails = 0;
@@ -44,10 +45,10 @@ namespace mongo {
uassert(13288, "replSet error write op to db before replSet initialized", str::startsWith(ns, "local.") || *opstr == 'n');
}
- /** write an op to the oplog that is already built.
+ /** write an op to the oplog that is already built.
todo : make _logOpRS() call this so we don't repeat ourself?
*/
- void _logOpObjRS(const BSONObj& op) {
+ void _logOpObjRS(const BSONObj& op) {
DEV assertInWriteLock();
const OpTime ts = op["ts"]._opTime();
@@ -62,11 +63,11 @@ namespace mongo {
rsOplogDetails = nsdetails(logns);
massert(13389, "local.oplog.rs missing. did you drop it? if so restart server", rsOplogDetails);
}
- Client::Context ctx( "" , localDB, false );
+ Client::Context ctx( logns , localDB, false );
{
int len = op.objsize();
Record *r = theDataFileMgr.fast_oplog_insert(rsOplogDetails, logns, len);
- memcpy(r->data, op.objdata(), len);
+ memcpy(getDur().writingPtr(r->data, len), op.objdata(), len);
}
/* todo: now() has code to handle clock skew. but if the skew server to server is large it will get unhappy.
this code (or code in now() maybe) should be improved.
@@ -82,11 +83,42 @@ namespace mongo {
}
}
+ /** given a BSON object, create a new one at dst which is the existing (partial) object
+ with a new object element appended at the end with fieldname "o".
+
+ @param partial already built object with everything except the o member. e.g. something like:
+ { ts:..., ns:..., os2:... }
+ @param o a bson object to be added with fieldname "o"
+ @param dst where to put the newly built combined object. e.g. ends up as something like:
+ { ts:..., ns:..., os2:..., o:... }
+ */
+ void append_O_Obj(char *dst, const BSONObj& partial, const BSONObj& o) {
+ const int size1 = partial.objsize() - 1; // less the EOO char
+ const int oOfs = size1+3; // 3 = byte BSONOBJTYPE + byte 'o' + byte \0
+
+ void *p = getDur().writingPtr(dst, oOfs+o.objsize()+1);
+
+ memcpy(p, partial.objdata(), size1);
+
+ // adjust overall bson object size for the o: field
+ *(static_cast<unsigned*>(p)) += o.objsize() + 1/*fieldtype byte*/ + 2/*"o" fieldname*/;
+
+ char *b = static_cast<char *>(p);
+ b += size1;
+ *b++ = (char) Object;
+ *b++ = 'o'; // { o : ... }
+ *b++ = 0; // null terminate "o" fieldname
+ memcpy(b, o.objdata(), o.objsize());
+ b += o.objsize();
+ *b = EOO;
+ }
+
static void _logOpRS(const char *opstr, const char *ns, const char *logNS, const BSONObj& obj, BSONObj *o2, bool *bb ) {
DEV assertInWriteLock();
+ // ^- static is safe as we are in write lock
static BufBuilder bufbuilder(8*1024);
-
- if ( strncmp(ns, "local.", 6) == 0 ){
+
+ if ( strncmp(ns, "local.", 6) == 0 ) {
if ( strncmp(ns, "local.slaves", 12) == 0 )
resetSlaveCache();
return;
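
A worked size check for the new append_O_Obj helper above: it copies the partial object minus its trailing EOO byte, writes the Object type byte, the "o" field name and its NUL, the payload, and a fresh EOO, and bumps the leading int32 length by objsize(o) + 3. The arithmetic sketch below uses made-up sizes, no real BSON:

    #include <iostream>

    int main() {
        int partialSize = 45;   // objsize() of { ts:..., ns:..., o2:... }, incl. its EOO
        int oSize       = 20;   // objsize() of the object that becomes the "o" field

        int size1 = partialSize - 1;   // copy partial without its trailing EOO
        int oOfs  = size1 + 3;         // + type byte + 'o' + NUL of the field name
        int total = oOfs + oSize + 1;  // + the new trailing EOO byte

        // the declared int32 length at the front must grow by the same 3 + oSize
        int newDeclaredLen = partialSize + oSize + 1 /*type byte*/ + 2 /*"o" + NUL*/;

        std::cout << "bytes written:   " << total << std::endl;            // 68
        std::cout << "declared length: " << newDeclaredLen << std::endl;   // 68, matches
        return 0;
    }
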
@@ -94,15 +126,15 @@ namespace mongo {
const OpTime ts = OpTime::now();
- long long hNew;
- if( theReplSet ) {
+ long long hashNew;
+ if( theReplSet ) {
massert(13312, "replSet error : logOp() but not primary?", theReplSet->box.getState().primary());
- hNew = (theReplSet->lastH * 131 + ts.asLL()) * 17 + theReplSet->selfId();
+ hashNew = (theReplSet->lastH * 131 + ts.asLL()) * 17 + theReplSet->selfId();
}
else {
// must be initiation
assert( *ns == 0 );
- hNew = 0;
+ hashNew = 0;
}
/* we jump through a bunch of hoops here to avoid copying the obj buffer twice --
@@ -113,7 +145,7 @@ namespace mongo {
BSONObjBuilder b(bufbuilder);
b.appendTimestamp("ts", ts.asDate());
- b.append("h", hNew);
+ b.append("h", hashNew);
b.append("op", opstr);
b.append("ns", ns);
@@ -136,7 +168,7 @@ namespace mongo {
rsOplogDetails = nsdetails(logns);
massert(13347, "local.oplog.rs missing. did you drop it? if so restart server", rsOplogDetails);
}
- Client::Context ctx( "" , localDB, false );
+ Client::Context ctx( logns , localDB, false );
r = theDataFileMgr.fast_oplog_insert(rsOplogDetails, logns, len);
/* todo: now() has code to handle clock skew. but if the skew server to server is large it will get unhappy.
this code (or code in now() maybe) should be improved.
@@ -147,22 +179,13 @@ namespace mongo {
log() << "replSet " << theReplSet->isPrimary() << rsLog;
}
theReplSet->lastOpTimeWritten = ts;
- theReplSet->lastH = hNew;
+ theReplSet->lastH = hashNew;
ctx.getClient()->setLastOp( ts.asDate() );
}
}
- char *p = r->data;
- memcpy(p, partial.objdata(), posz);
- *((unsigned *)p) += obj.objsize() + 1 + 2;
- p += posz - 1;
- *p++ = (char) Object;
- *p++ = 'o';
- *p++ = 0;
- memcpy(p, obj.objdata(), obj.objsize());
- p += obj.objsize();
- *p = EOO;
-
+ append_O_Obj(r->data, partial, obj);
+
if ( logLevel >= 6 ) {
BSONObj temp(r);
log( 6 ) << "logOp:" << temp << endl;
@@ -192,9 +215,9 @@ namespace mongo {
static void _logOpOld(const char *opstr, const char *ns, const char *logNS, const BSONObj& obj, BSONObj *o2, bool *bb ) {
DEV assertInWriteLock();
static BufBuilder bufbuilder(8*1024);
-
- if ( strncmp(ns, "local.", 6) == 0 ){
- if ( strncmp(ns, "local.slaves", 12) == 0 ){
+
+ if ( strncmp(ns, "local.", 6) == 0 ) {
+ if ( strncmp(ns, "local.slaves", 12) == 0 ) {
resetSlaveCache();
}
return;
@@ -202,7 +225,7 @@ namespace mongo {
const OpTime ts = OpTime::now();
Client::Context context;
-
+
/* we jump through a bunch of hoops here to avoid copying the obj buffer twice --
instead we do a single copy to the destination position in the memory mapped file.
*/
@@ -216,9 +239,10 @@ namespace mongo {
b.appendBool("b", *bb);
if ( o2 )
b.append("o2", *o2);
- BSONObj partial = b.done();
- int posz = partial.objsize();
- int len = posz + obj.objsize() + 1 + 2 /*o:*/;
+ BSONObj partial = b.done(); // partial is everything except the o:... part.
+
+ int po_sz = partial.objsize();
+ int len = po_sz + obj.objsize() + 1 + 2 /*o:*/;
Record *r;
if( logNS == 0 ) {
@@ -230,25 +254,18 @@ namespace mongo {
localOplogMainDetails = nsdetails(logNS);
assert( localOplogMainDetails );
}
- Client::Context ctx( "" , localDB, false );
+ Client::Context ctx( logNS , localDB, false );
r = theDataFileMgr.fast_oplog_insert(localOplogMainDetails, logNS, len);
- } else {
+ }
+ else {
Client::Context ctx( logNS, dbpath, 0, false );
assert( nsdetails( logNS ) );
+ // first we allocate the space, then we fill it below.
r = theDataFileMgr.fast_oplog_insert( nsdetails( logNS ), logNS, len);
}
- char *p = r->data;
- memcpy(p, partial.objdata(), posz);
- *((unsigned *)p) += obj.objsize() + 1 + 2;
- p += posz - 1;
- *p++ = (char) Object;
- *p++ = 'o';
- *p++ = 0;
- memcpy(p, obj.objdata(), obj.objsize());
- p += obj.objsize();
- *p = EOO;
-
+ append_O_Obj(r->data, partial, obj);
+
context.getClient()->setLastOp( ts.asDate() );
if ( logLevel >= 6 ) {
@@ -259,17 +276,17 @@ namespace mongo {
}
static void (*_logOp)(const char *opstr, const char *ns, const char *logNS, const BSONObj& obj, BSONObj *o2, bool *bb ) = _logOpOld;
- void newReplUp() {
+ void newReplUp() {
replSettings.master = true;
- _logOp = _logOpRS;
+ _logOp = _logOpRS;
}
- void newRepl() {
+ void newRepl() {
replSettings.master = true;
- _logOp = _logOpUninitialized;
+ _logOp = _logOpUninitialized;
}
void oldRepl() { _logOp = _logOpOld; }
- void logKeepalive() {
+ void logKeepalive() {
_logOp("n", "", 0, BSONObj(), 0, 0);
}
void logOpComment(const BSONObj& obj) {
@@ -289,13 +306,10 @@ namespace mongo {
void logOp(const char *opstr, const char *ns, const BSONObj& obj, BSONObj *patt, bool *b) {
if ( replSettings.master ) {
_logOp(opstr, ns, 0, obj, patt, b);
- // why? :
- //char cl[ 256 ];
- //nsToDatabase( ns, cl );
}
-
+
logOpForSharding( opstr , ns , obj , patt );
- }
+ }
void createOplog() {
dblock lk;
@@ -307,15 +321,15 @@ namespace mongo {
ns = rsoplog;
Client::Context ctx(ns);
-
+
NamespaceDetails * nsd = nsdetails( ns );
if ( nsd ) {
-
- if ( cmdLine.oplogSize != 0 ){
+
+ if ( cmdLine.oplogSize != 0 ) {
int o = (int)(nsd->storageSize() / ( 1024 * 1024 ) );
int n = (int)(cmdLine.oplogSize / ( 1024 * 1024 ) );
- if ( n != o ){
+ if ( n != o ) {
stringstream ss;
ss << "cmdline oplogsize (" << n << ") different than existing (" << o << ") see: http://dochub.mongodb.org/core/increase-oplog";
log() << ss.str() << endl;
@@ -332,19 +346,19 @@ namespace mongo {
}
return;
}
-
+
/* create an oplog collection, if it doesn't yet exist. */
BSONObjBuilder b;
double sz;
if ( cmdLine.oplogSize != 0 )
sz = (double)cmdLine.oplogSize;
else {
- /* not specified. pick a default size */
+ /* not specified. pick a default size */
sz = 50.0 * 1000 * 1000;
if ( sizeof(int *) >= 8 ) {
#if defined(__APPLE__)
- // typically these are desktops (dev machines), so keep it smallish
- sz = (256-64) * 1000 * 1000;
+ // typically these are desktops (dev machines), so keep it smallish
+ sz = (256-64) * 1000 * 1000;
#else
sz = 990.0 * 1000 * 1000;
boost::intmax_t free = freeSpace(); //-1 if call not supported.
@@ -356,7 +370,7 @@ namespace mongo {
}
log() << "******" << endl;
- log() << "creating replication oplog of size: " << (int)( sz / ( 1024 * 1024 ) ) << "MB... (use --oplogSize to change)" << endl;
+ log() << "creating replication oplog of size: " << (int)( sz / ( 1024 * 1024 ) ) << "MB..." << endl;
b.append("size", sz);
b.appendBool("capped", 1);
@@ -366,7 +380,7 @@ namespace mongo {
BSONObj o = b.done();
userCreateNS(ns, o, err, false);
if( !rs )
- logOp( "n", "dummy", BSONObj() );
+ logOp( "n", "", BSONObj() );
/* sync here so we don't get any surprising lag later when we try to sync */
MemoryMappedFile::flushAll(true);
@@ -394,8 +408,8 @@ namespace mongo {
void pretouchN(vector<BSONObj>& v, unsigned a, unsigned b) {
DEV assert( !dbMutex.isWriteLocked() );
- Client *c = &cc();
- if( c == 0 ) {
+ Client *c = currentClient.get();
+ if( c == 0 ) {
Client::initThread("pretouchN");
c = &cc();
}
@@ -413,7 +427,7 @@ namespace mongo {
continue;
/* todo : other operations */
- try {
+ try {
BSONObj o = op.getObjectField(which);
BSONElement _id;
if( o.getObjectID(_id) ) {
@@ -426,7 +440,7 @@ namespace mongo {
_dummy_z += result.objsize(); // touch
}
}
- catch( DBException& e ) {
+ catch( DBException& e ) {
log() << "ignoring assertion in pretouchN() " << a << ' ' << b << ' ' << i << ' ' << e.toString() << endl;
}
}
@@ -447,7 +461,7 @@ namespace mongo {
return;
/* todo : other operations */
- try {
+ try {
BSONObj o = op.getObjectField(which);
BSONElement _id;
if( o.getObjectID(_id) ) {
@@ -461,15 +475,17 @@ namespace mongo {
_dummy_z += result.objsize(); // touch
}
}
- catch( DBException& ) {
+ catch( DBException& ) {
log() << "ignoring assertion in pretouchOperation()" << endl;
}
}
- void applyOperation_inlock(const BSONObj& op){
- if( logLevel >= 6 )
+ void applyOperation_inlock(const BSONObj& op , bool fromRepl ) {
+ OpCounters * opCounters = fromRepl ? &replOpCounters : &globalOpCounters;
+
+ if( logLevel >= 6 )
log() << "applying op: " << op << endl;
-
+
assertInWriteLock();
OpDebug debug;
@@ -479,6 +495,8 @@ namespace mongo {
const char *opType = op.getStringField("op");
if ( *opType == 'i' ) {
+ opCounters->gotInsert();
+
const char *p = strchr(ns, '.');
if ( p && strcmp(p, ".system.indexes") == 0 ) {
// updates aren't allowed for indexes -- so we will do a regular insert. if index already
@@ -499,11 +517,11 @@ namespace mongo {
else {
BSONObjBuilder b;
b.append(_id);
-
+
/* erh 10/16/2009 - this is probably not relevant any more since its auto-created, but not worth removing */
- RARELY ensureHaveIdIndex(ns); // otherwise updates will be slow
+ RARELY ensureHaveIdIndex(ns); // otherwise updates will be slow
- /* todo : it may be better to do an insert here, and then catch the dup key exception and do update
+ /* todo : it may be better to do an insert here, and then catch the dup key exception and do update
then. very few upserts will not be inserts...
*/
updateObjects(ns, o, b.done(), true, false, false , debug );
@@ -511,10 +529,14 @@ namespace mongo {
}
}
else if ( *opType == 'u' ) {
+ opCounters->gotUpdate();
+
RARELY ensureHaveIdIndex(ns); // otherwise updates will be super slow
updateObjects(ns, o, op.getObjectField("o2"), /*upsert*/ op.getBoolField("b"), /*multi*/ false, /*logop*/ false , debug );
}
else if ( *opType == 'd' ) {
+ opCounters->gotDelete();
+
if ( opType[1] == 0 )
deleteObjects(ns, o, op.getBoolField("b"));
else
@@ -523,7 +545,9 @@ namespace mongo {
else if ( *opType == 'n' ) {
// no op
}
- else if ( *opType == 'c' ){
+ else if ( *opType == 'c' ) {
+ opCounters->gotCommand();
+
BufBuilder bb;
BSONObjBuilder ob;
_runCommands(ns, o, bb, ob, true, 0);
@@ -533,9 +557,9 @@ namespace mongo {
ss << "unknown opType [" << opType << "]";
throw MsgAssertionException( 13141 , ss.str() );
}
-
+
}
-
+
class ApplyOpsCmd : public Command {
public:
virtual bool slaveOk() const { return false; }
@@ -545,17 +569,18 @@ namespace mongo {
help << "examples: { applyOps : [ ] , preCondition : [ { ns : ... , q : ... , res : ... } ] }";
}
virtual bool run(const string& dbname, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
-
- if ( cmdObj.firstElement().type() != Array ){
+
+ if ( cmdObj.firstElement().type() != Array ) {
errmsg = "ops has to be an array";
return false;
}
-
+
BSONObj ops = cmdObj.firstElement().Obj();
-
- { // check input
+
+ {
+ // check input
BSONObjIterator i( ops );
- while ( i.more() ){
+ while ( i.more() ) {
BSONElement e = i.next();
if ( e.type() == Object )
continue;
@@ -564,16 +589,16 @@ namespace mongo {
return false;
}
}
-
- if ( cmdObj["preCondition"].type() == Array ){
+
+ if ( cmdObj["preCondition"].type() == Array ) {
BSONObjIterator i( cmdObj["preCondition"].Obj() );
- while ( i.more() ){
+ while ( i.more() ) {
BSONObj f = i.next().Obj();
-
+
BSONObj realres = db.findOne( f["ns"].String() , f["q"].Obj() );
-
+
Matcher m( f["res"].Obj() );
- if ( ! m.matches( realres ) ){
+ if ( ! m.matches( realres ) ) {
result.append( "got" , realres );
result.append( "whatFailed" , f );
errmsg = "pre-condition failed";
@@ -581,23 +606,32 @@ namespace mongo {
}
}
}
-
+
// apply
int num = 0;
BSONObjIterator i( ops );
- while ( i.more() ){
+ while ( i.more() ) {
BSONElement e = i.next();
- applyOperation_inlock( e.Obj() );
+ applyOperation_inlock( e.Obj() , false );
num++;
}
result.append( "applied" , num );
+ if ( ! fromRepl ) {
+ // We want this applied atomically on slaves
+ // so we re-wrap without the pre-condition for speed
+
+ string tempNS = str::stream() << dbname << ".$cmd";
+
+ logOp( "c" , tempNS.c_str() , cmdObj.firstElement().wrap() );
+ }
+
return true;
}
DBDirectClient db;
-
+
} applyOpsCmd;
}
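
For reference, the command document ApplyOpsCmd::run() expects is an array of oplog-format entries plus an optional preCondition array, as the help text above describes. A purely illustrative way to build one with the usual BSON()/BSON_ARRAY builders (collection name and values are made up):

    // Illustrative applyOps command: one update op, gated on a precondition.
    BSONObj oneOp = BSON( "op" << "u" << "ns" << "test.foo"
                          << "o2" << BSON( "_id" << 1 )                  // query part of the update
                          << "o"  << BSON( "$set" << BSON( "x" << 42 ) ) );
    BSONObj cmd = BSON( "applyOps" << BSON_ARRAY( oneOp )
                        << "preCondition"
                        << BSON_ARRAY( BSON( "ns"  << "test.foo"
                                             << "q"   << BSON( "_id" << 1 )
                                             << "res" << BSON( "x" << 41 ) ) ) );
    // e.g. BSONObj result; conn.runCommand( dbname, cmd, result );  // any DBClientWithCommands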
diff --git a/db/oplog.h b/db/oplog.h
index 34c345f..d9073ab 100644
--- a/db/oplog.h
+++ b/db/oplog.h
@@ -16,7 +16,7 @@
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
-/*
+/*
local.oplog.$main is the default
*/
@@ -30,6 +30,7 @@
#include "queryoptimizer.h"
#include "../client/dbclient.h"
#include "../util/optime.h"
+#include "../util/timer.h"
namespace mongo {
@@ -38,7 +39,7 @@ namespace mongo {
void _logOpObjRS(const BSONObj& op);
/** Write operation to the log (local.oplog.$main)
-
+
@param opstr
"i" insert
"u" update
@@ -47,89 +48,88 @@ namespace mongo {
"n" no-op
"db" declares presence of a database (ns is set to the db name + '.')
- See _logOp() in oplog.cpp for more details.
+ See _logOp() in oplog.cpp for more details.
*/
void logOp(const char *opstr, const char *ns, const BSONObj& obj, BSONObj *patt = 0, bool *b = 0);
void logKeepalive();
- /** puts obj in the oplog as a comment (a no-op). Just for diags.
- convention is
+ /** puts obj in the oplog as a comment (a no-op). Just for diags.
+ convention is
{ msg : "text", ... }
*/
void logOpComment(const BSONObj& obj);
void oplogCheckCloseDatabase( Database * db );
-
- extern int __findingStartInitialTimeout; // configurable for testing
+
+ extern int __findingStartInitialTimeout; // configurable for testing
class FindingStartCursor {
public:
- FindingStartCursor( const QueryPlan & qp ) :
- _qp( qp ),
- _findingStart( true ),
- _findingStartMode(),
- _findingStartTimer( 0 ),
- _findingStartCursor( 0 )
+ FindingStartCursor( const QueryPlan & qp ) :
+ _qp( qp ),
+ _findingStart( true ),
+ _findingStartMode(),
+ _findingStartTimer( 0 )
{ init(); }
bool done() const { return !_findingStart; }
shared_ptr<Cursor> cRelease() { return _c; }
void next() {
- if ( !_findingStartCursor || !_findingStartCursor->c->ok() ) {
+ if ( !_findingStartCursor || !_findingStartCursor->ok() ) {
_findingStart = false;
_c = _qp.newCursor(); // on error, start from beginning
destroyClientCursor();
return;
}
switch( _findingStartMode ) {
- case Initial: {
- if ( !_matcher->matches( _findingStartCursor->c->currKey(), _findingStartCursor->c->currLoc() ) ) {
- _findingStart = false; // found first record out of query range, so scan normally
- _c = _qp.newCursor( _findingStartCursor->c->currLoc() );
- destroyClientCursor();
- return;
- }
- _findingStartCursor->c->advance();
- RARELY {
- if ( _findingStartTimer.seconds() >= __findingStartInitialTimeout ) {
- createClientCursor( startLoc( _findingStartCursor->c->currLoc() ) );
- _findingStartMode = FindExtent;
- return;
- }
- }
+ case Initial: {
+ if ( !_matcher->matches( _findingStartCursor->currKey(), _findingStartCursor->currLoc() ) ) {
+ _findingStart = false; // found first record out of query range, so scan normally
+ _c = _qp.newCursor( _findingStartCursor->currLoc() );
+ destroyClientCursor();
return;
}
- case FindExtent: {
- if ( !_matcher->matches( _findingStartCursor->c->currKey(), _findingStartCursor->c->currLoc() ) ) {
- _findingStartMode = InExtent;
- return;
- }
- DiskLoc prev = prevLoc( _findingStartCursor->c->currLoc() );
- if ( prev.isNull() ) { // hit beginning, so start scanning from here
- createClientCursor();
- _findingStartMode = InExtent;
+ _findingStartCursor->advance();
+ RARELY {
+ if ( _findingStartTimer.seconds() >= __findingStartInitialTimeout ) {
+ createClientCursor( startLoc( _findingStartCursor->currLoc() ) );
+ _findingStartMode = FindExtent;
return;
}
- // There might be a more efficient implementation than creating new cursor & client cursor each time,
- // not worrying about that for now
- createClientCursor( prev );
+ }
+ return;
+ }
+ case FindExtent: {
+ if ( !_matcher->matches( _findingStartCursor->currKey(), _findingStartCursor->currLoc() ) ) {
+ _findingStartMode = InExtent;
return;
}
- case InExtent: {
- if ( _matcher->matches( _findingStartCursor->c->currKey(), _findingStartCursor->c->currLoc() ) ) {
- _findingStart = false; // found first record in query range, so scan normally
- _c = _qp.newCursor( _findingStartCursor->c->currLoc() );
- destroyClientCursor();
- return;
- }
- _findingStartCursor->c->advance();
+ DiskLoc prev = prevLoc( _findingStartCursor->currLoc() );
+ if ( prev.isNull() ) { // hit beginning, so start scanning from here
+ createClientCursor();
+ _findingStartMode = InExtent;
return;
}
- default: {
- massert( 12600, "invalid _findingStartMode", false );
+ // There might be a more efficient implementation than creating new cursor & client cursor each time,
+ // not worrying about that for now
+ createClientCursor( prev );
+ return;
+ }
+ case InExtent: {
+ if ( _matcher->matches( _findingStartCursor->currKey(), _findingStartCursor->currLoc() ) ) {
+ _findingStart = false; // found first record in query range, so scan normally
+ _c = _qp.newCursor( _findingStartCursor->currLoc() );
+ destroyClientCursor();
+ return;
}
- }
- }
+ _findingStartCursor->advance();
+ return;
+ }
+ default: {
+ massert( 12600, "invalid _findingStartMode", false );
+ }
+ }
+ }
bool prepareToYield() {
if ( _findingStartCursor ) {
return _findingStartCursor->prepareToYield( _yieldData );
@@ -139,10 +139,10 @@ namespace mongo {
void recoverFromYield() {
if ( _findingStartCursor ) {
if ( !ClientCursor::recoverFromYield( _yieldData ) ) {
- _findingStartCursor = 0;
+ _findingStartCursor.reset( 0 );
}
}
- }
+ }
private:
enum FindingStartMode { Initial, FindExtent, InExtent };
const QueryPlan &_qp;
@@ -150,7 +150,7 @@ namespace mongo {
FindingStartMode _findingStartMode;
auto_ptr< CoveredIndexMatcher > _matcher;
Timer _findingStartTimer;
- ClientCursor * _findingStartCursor;
+ ClientCursor::CleanupPointer _findingStartCursor;
shared_ptr<Cursor> _c;
ClientCursor::YieldData _yieldData;
DiskLoc startLoc( const DiskLoc &rec ) {
@@ -162,7 +162,7 @@ namespace mongo {
// doesn't matter if we start the extent scan with capFirstNewRecord.
return _qp.nsd()->capFirstNewRecord;
}
-
+
// should never have an empty extent in the oplog, so don't worry about that case
DiskLoc prevLoc( const DiskLoc &rec ) {
Extent *e = rec.rec()->myExtent( rec );
@@ -173,7 +173,8 @@ namespace mongo {
e = e->xprev.ext();
if ( e->myLoc != _qp.nsd()->capExtent )
return e->firstRecord;
- } else {
+ }
+ else {
if ( !e->xprev.isNull() ) {
e = e->xprev.ext();
return e->firstRecord;
@@ -183,19 +184,16 @@ namespace mongo {
}
void createClientCursor( const DiskLoc &startLoc = DiskLoc() ) {
shared_ptr<Cursor> c = _qp.newCursor( startLoc );
- _findingStartCursor = new ClientCursor(QueryOption_NoCursorTimeout, c, _qp.ns());
+ _findingStartCursor.reset( new ClientCursor(QueryOption_NoCursorTimeout, c, _qp.ns()) );
}
void destroyClientCursor() {
- if ( _findingStartCursor ) {
- ClientCursor::erase( _findingStartCursor->cursorid );
- _findingStartCursor = 0;
- }
+ _findingStartCursor.reset( 0 );
}
void init() {
// Use a ClientCursor here so we can release db mutex while scanning
// oplog (can take quite a while with large oplogs).
shared_ptr<Cursor> c = _qp.newReverseCursor();
- _findingStartCursor = new ClientCursor(QueryOption_NoCursorTimeout, c, _qp.ns(), BSONObj());
+ _findingStartCursor.reset( new ClientCursor(QueryOption_NoCursorTimeout, c, _qp.ns(), BSONObj()) );
_findingStartTimer.reset();
_findingStartMode = Initial;
BSONElement tsElt = _qp.originalQuery()[ "ts" ];
@@ -210,5 +208,10 @@ namespace mongo {
void pretouchOperation(const BSONObj& op);
void pretouchN(vector<BSONObj>&, unsigned a, unsigned b);
- void applyOperation_inlock(const BSONObj& op);
+ /**
+ * take an op and apply locally
+ * used for applying from an oplog
+ * @param fromRepl really from replication or for testing/internal/command/etc...
+ */
+ void applyOperation_inlock(const BSONObj& op , bool fromRepl = true );
}
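
The entries applyOperation_inlock() consumes use the same field layout the loggers in oplog.cpp write: ts, h (replica sets), op, ns, o, plus o2/b for updates. A minimal, illustrative insert entry and its application (collection name and values are made up; the caller must already hold the write lock):

    BSONObjBuilder b;
    b.appendTimestamp( "ts", OpTime::now().asDate() );   // same form the loggers use
    b.append( "op", "i" );                               // insert
    b.append( "ns", "test.foo" );
    b.append( "o", BSON( "_id" << 1 << "x" << 42 ) );    // the document being inserted
    applyOperation_inlock( b.obj(), false );             // fromRepl=false -> counted in globalOpCounters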
diff --git a/db/oplogreader.h b/db/oplogreader.h
index 5c2881b..54c90d9 100644
--- a/db/oplogreader.h
+++ b/db/oplogreader.h
@@ -8,7 +8,7 @@
namespace mongo {
- /* started abstracting out the querying of the primary/master's oplog
+ /* started abstracting out the querying of the primary/master's oplog
still fairly awkward but a start.
*/
class OplogReader {
@@ -16,28 +16,24 @@ namespace mongo {
auto_ptr<DBClientCursor> cursor;
public:
- OplogReader() {
- DEV log() << "TEMP *** OplogReader()" << endl;
+ OplogReader() {
}
- ~OplogReader() {
- DEV log() << "TEMP *** ~OplogReader()" << endl;
+ ~OplogReader() {
}
void resetCursor() {
- DEV log() << "TEMP *** OplogReader::resetCursor" << endl;
cursor.reset();
}
void resetConnection() {
- DEV log() << "TEMP *** OplogReader::resetConnection" << endl;
cursor.reset();
_conn.reset();
}
DBClientConnection* conn() { return _conn.get(); }
- BSONObj findOne(const char *ns, const Query& q) {
- return conn()->findOne(ns, q);
+ BSONObj findOne(const char *ns, const Query& q) {
+ return conn()->findOne(ns, q, 0, QueryOption_SlaveOk);
}
- BSONObj getLastOp(const char *ns) {
+ BSONObj getLastOp(const char *ns) {
return findOne(ns, Query().sort(reverseNaturalObj));
}
@@ -45,7 +41,7 @@ namespace mongo {
bool connect(string hostname);
void tailCheck() {
- if( cursor.get() && cursor->isDead() ) {
+ if( cursor.get() && cursor->isDead() ) {
log() << "repl: old cursor isDead, will initiate a new one" << endl;
resetCursor();
}
@@ -53,19 +49,19 @@ namespace mongo {
bool haveCursor() { return cursor.get() != 0; }
- void query(const char *ns, const BSONObj& query) {
+ void query(const char *ns, const BSONObj& query) {
assert( !haveCursor() );
cursor = _conn->query(ns, query, 0, 0, 0, QueryOption_SlaveOk);
}
- void tailingQuery(const char *ns, const BSONObj& query) {
+ void tailingQuery(const char *ns, const BSONObj& query) {
assert( !haveCursor() );
log(2) << "repl: " << ns << ".find(" << query.toString() << ')' << endl;
- cursor = _conn->query( ns, query, 0, 0, 0,
- QueryOption_CursorTailable | QueryOption_SlaveOk | QueryOption_OplogReplay |
- /* TODO: slaveok maybe shouldn't use? */
- QueryOption_AwaitData
- );
+ cursor = _conn->query( ns, query, 0, 0, 0,
+ QueryOption_CursorTailable | QueryOption_SlaveOk | QueryOption_OplogReplay |
+ /* TODO: slaveok maybe shouldn't use? */
+ QueryOption_AwaitData
+ );
}
void tailingQueryGTE(const char *ns, OpTime t) {
@@ -76,34 +72,34 @@ namespace mongo {
tailingQuery(ns, query.done());
}
- bool more() {
+ bool more() {
assert( cursor.get() );
return cursor->more();
}
- bool moreInCurrentBatch() {
+ bool moreInCurrentBatch() {
assert( cursor.get() );
return cursor->moreInCurrentBatch();
}
/* old mongod's can't do the await flag... */
- bool awaitCapable() {
+ bool awaitCapable() {
return cursor->hasResultFlag(ResultFlag_AwaitCapable);
}
- void peek(vector<BSONObj>& v, int n) {
+ void peek(vector<BSONObj>& v, int n) {
if( cursor.get() )
cursor->peek(v,n);
}
BSONObj nextSafe() { return cursor->nextSafe(); }
- BSONObj next() {
+ BSONObj next() {
return cursor->next();
}
- void putBack(BSONObj op) {
+ void putBack(BSONObj op) {
cursor->putBack(op);
}
};
-
+
}
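
A typical consumer drives OplogReader roughly as follows; this is only a sketch built from the methods declared above (hostname and start point are illustrative), not code from this patch:

    OplogReader reader;
    if ( reader.connect( "master.example.net:27017" ) ) {
        OpTime lastApplied;                                          // last optime applied locally
        reader.tailingQueryGTE( "local.oplog.$main", lastApplied );  // tailable + slaveOk + oplogReplay
        while ( reader.more() ) {
            BSONObj op = reader.nextSafe();                          // throws on $err results
            // apply op locally, then update lastApplied from op["ts"]
        }
        reader.tailCheck();                                          // drops a dead cursor so the next pass re-queries
    }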
diff --git a/db/pdfile.cpp b/db/pdfile.cpp
index 216f21a..20a7423 100644
--- a/db/pdfile.cpp
+++ b/db/pdfile.cpp
@@ -20,7 +20,6 @@
todo:
_ table scans must be sequential, not next/prev pointers
_ coalesce deleted
-
_ disallow system* manipulations from the database.
*/
@@ -37,21 +36,21 @@ _ disallow system* manipulations from the database.
#include "query.h"
#include "repl.h"
#include "dbhelpers.h"
-#include "namespace.h"
+#include "namespace-inl.h"
#include "queryutil.h"
#include "extsort.h"
-#include "curop.h"
+#include "curop-inl.h"
#include "background.h"
namespace mongo {
bool inDBRepair = false;
struct doingRepair {
- doingRepair(){
+ doingRepair() {
assert( ! inDBRepair );
inDBRepair = true;
}
- ~doingRepair(){
+ ~doingRepair() {
inDBRepair = false;
}
};
@@ -64,42 +63,42 @@ namespace mongo {
return dbsInProg[db] != 0;
}
- bool BackgroundOperation::inProgForNs(const char *ns) {
+ bool BackgroundOperation::inProgForNs(const char *ns) {
assertInWriteLock();
return nsInProg.count(ns) != 0;
}
- void BackgroundOperation::assertNoBgOpInProgForDb(const char *db) {
+ void BackgroundOperation::assertNoBgOpInProgForDb(const char *db) {
uassert(12586, "cannot perform operation: a background operation is currently running for this database",
- !inProgForDb(db));
+ !inProgForDb(db));
}
- void BackgroundOperation::assertNoBgOpInProgForNs(const char *ns) {
+ void BackgroundOperation::assertNoBgOpInProgForNs(const char *ns) {
uassert(12587, "cannot perform operation: a background operation is currently running for this collection",
- !inProgForNs(ns));
- }
+ !inProgForNs(ns));
+ }
- BackgroundOperation::BackgroundOperation(const char *ns) : _ns(ns) {
+ BackgroundOperation::BackgroundOperation(const char *ns) : _ns(ns) {
assertInWriteLock();
dbsInProg[_ns.db]++;
assert( nsInProg.count(_ns.ns()) == 0 );
nsInProg.insert(_ns.ns());
}
- BackgroundOperation::~BackgroundOperation() {
+ BackgroundOperation::~BackgroundOperation() {
assertInWriteLock();
dbsInProg[_ns.db]--;
nsInProg.erase(_ns.ns());
}
void BackgroundOperation::dump(stringstream& ss) {
- if( nsInProg.size() ) {
+ if( nsInProg.size() ) {
ss << "\n<b>Background Jobs in Progress</b>\n";
for( set<string>::iterator i = nsInProg.begin(); i != nsInProg.end(); i++ )
ss << " " << *i << '\n';
}
- for( map<string,unsigned>::iterator i = dbsInProg.begin(); i != dbsInProg.end(); i++ ) {
- if( i->second )
+ for( map<string,unsigned>::iterator i = dbsInProg.begin(); i != dbsInProg.end(); i++ ) {
+ if( i->second )
ss << "database " << i->first << ": " << i->second << '\n';
}
}
@@ -114,24 +113,23 @@ namespace mongo {
DataFileMgr theDataFileMgr;
DatabaseHolder dbHolder;
int MAGIC = 0x1000;
-// int curOp = -2;
extern int otherTraceLevel;
void addNewNamespaceToCatalog(const char *ns, const BSONObj *options = 0);
void ensureIdIndexForNewNs(const char *ns) {
if ( ( strstr( ns, ".system." ) == 0 || legalClientSystemNS( ns , false ) ) &&
- strstr( ns, ".$freelist" ) == 0 ){
+ strstr( ns, ".$freelist" ) == 0 ) {
log( 1 ) << "adding _id index for collection " << ns << endl;
ensureHaveIdIndex( ns );
- }
+ }
}
string getDbContext() {
stringstream ss;
Client * c = currentClient.get();
- if ( c ){
+ if ( c ) {
Client::Context * cx = c->getContext();
- if ( cx ){
+ if ( cx ) {
Database *database = cx->db();
if ( database ) {
ss << database->name << ' ';
@@ -142,20 +140,44 @@ namespace mongo {
return ss.str();
}
- BSONObj::BSONObj(const Record *r) {
- init(r->data, false);
- }
-
/*---------------------------------------------------------------------*/
- int initialExtentSize(int len) {
+ // inheritable class to implement an operation that may be applied to all
+ // files in a database using _applyOpToDataFiles()
+ class FileOp {
+ public:
+ virtual ~FileOp() {}
+ // Return true if file exists and operation successful
+ virtual bool apply( const boost::filesystem::path &p ) = 0;
+ virtual const char * op() const = 0;
+ };
+
+ void _applyOpToDataFiles( const char *database, FileOp &fo, bool afterAllocator = false, const string& path = dbpath );
+
+ void _deleteDataFiles(const char *database) {
+ if ( directoryperdb ) {
+ FileAllocator::get()->waitUntilFinished();
+ BOOST_CHECK_EXCEPTION( boost::filesystem::remove_all( boost::filesystem::path( dbpath ) / database ) );
+ return;
+ }
+ class : public FileOp {
+ virtual bool apply( const boost::filesystem::path &p ) {
+ return boost::filesystem::remove( p );
+ }
+ virtual const char * op() const {
+ return "remove";
+ }
+ } deleter;
+ _applyOpToDataFiles( database, deleter, true );
+ }
+
+ int Extent::initialSize(int len) {
long long sz = len * 16;
if ( len < 1000 ) sz = len * 64;
if ( sz > 1000000000 )
sz = 1000000000;
int z = ((int)sz) & 0xffffff00;
assert( z > len );
- //DEV tlog() << "initialExtentSize(" << len << ") returns " << z << endl;
return z;
}
@@ -165,7 +187,7 @@ namespace mongo {
return false;
}
- log(1) << "create collection " << ns << ' ' << options << '\n';
+ log(1) << "create collection " << ns << ' ' << options << endl;
/* todo: do this only when we have allocated space successfully? or we could insert with a { ok: 0 } field
and then go back and set to ok : 1 after we are done.
@@ -174,33 +196,48 @@ namespace mongo {
if( !isFreeList )
addNewNamespaceToCatalog(ns, options.isEmpty() ? 0 : &options);
- long long size = initialExtentSize(128);
- BSONElement e = options.getField("size");
- if ( e.isNumber() ) {
- size = e.numberLong();
- size += 256;
- size &= 0xffffffffffffff00LL;
+ long long size = Extent::initialSize(128);
+ {
+ BSONElement e = options.getField("size");
+ if ( e.isNumber() ) {
+ size = e.numberLong();
+ size += 256;
+ size &= 0xffffffffffffff00LL;
+ }
}
-
+
uassert( 10083 , "invalid size spec", size > 0 );
bool newCapped = false;
int mx = 0;
- e = options.getField("capped");
- if ( e.type() == Bool && e.boolean() ) {
+ if( options.getBoolField("capped") ) {
newCapped = true;
- e = options.getField("max");
+ BSONElement e = options.getField("max");
if ( e.isNumber() ) {
mx = e.numberInt();
}
}
- // $nExtents just for debug/testing. We create '$nExtents' extents,
- // each of size 'size'.
- e = options.getField( "$nExtents" );
- int nExtents = int( e.number() );
+ // $nExtents just for debug/testing.
+ BSONElement e = options.getField( "$nExtents" );
Database *database = cc().database();
- if ( nExtents > 0 ) {
+ if ( e.type() == Array ) {
+ // We create one extent per array entry, with size specified
+ // by the array value.
+ BSONObjIterator i( e.embeddedObject() );
+ while( i.more() ) {
+ BSONElement e = i.next();
+ int size = int( e.number() );
+ assert( size <= 0x7fffffff );
+ // $nExtents is just for testing - always allocate new extents
+ // rather than reuse existing extents so we have some predictability
+ // in the extent size used by our tests
+ database->suitableFile( (int) size, false )->createExtent( ns, (int) size, newCapped );
+ }
+ }
+ else if ( int( e.number() ) > 0 ) {
+ // We create '$nExtents' extents, each of size 'size'.
+ int nExtents = int( e.number() );
assert( size <= 0x7fffffff );
for ( int i = 0; i < nExtents; ++i ) {
assert( size <= 0x7fffffff );
@@ -209,10 +246,16 @@ namespace mongo {
// in the extent size used by our tests
database->suitableFile( (int) size, false )->createExtent( ns, (int) size, newCapped );
}
- } else {
+ }
+ else {
+ // This is the non test case, where we don't have a $nExtents spec.
while ( size > 0 ) {
int max = MongoDataFile::maxSize() - DataFileHeader::HeaderSize;
int desiredExtentSize = (int) (size > max ? max : size);
+ if ( desiredExtentSize < Extent::minSize() ) {
+ desiredExtentSize = Extent::minSize();
+ }
+ desiredExtentSize &= 0xffffff00;
Extent *e = database->allocExtent( ns, desiredExtentSize, newCapped );
size -= e->length;
}
@@ -223,15 +266,16 @@ namespace mongo {
bool ensure = false;
if ( options.getField( "autoIndexId" ).type() ) {
- if ( options["autoIndexId"].trueValue() ){
+ if ( options["autoIndexId"].trueValue() ) {
ensure = true;
}
- } else {
+ }
+ else {
if ( !newCapped ) {
ensure=true;
}
}
- if( ensure ) {
+ if( ensure ) {
if( deferIdIndex )
*deferIdIndex = true;
else
@@ -239,7 +283,7 @@ namespace mongo {
}
if ( mx > 0 )
- d->max = mx;
+ getDur().writingInt( d->max ) = mx;
return true;
}
@@ -250,7 +294,7 @@ namespace mongo {
*/
bool userCreateNS(const char *ns, BSONObj options, string& err, bool logForReplication, bool *deferIdIndex) {
const char *coll = strchr( ns, '.' ) + 1;
- massert( 10356 , "invalid ns", coll && *coll );
+ massert( 10356 , str::stream() << "invalid ns: " << ns , coll && *coll );
char cl[ 256 ];
nsToDatabase( ns, cl );
bool ok = _userCreateNS(ns, options, err, deferIdIndex);
@@ -272,14 +316,22 @@ namespace mongo {
int MongoDataFile::maxSize() {
if ( sizeof( int* ) == 4 ) {
return 512 * 1024 * 1024;
- } else if ( cmdLine.smallfiles ) {
+ }
+ else if ( cmdLine.smallfiles ) {
return 0x7ff00000 >> 2;
- } else {
+ }
+ else {
return 0x7ff00000;
}
}
- void MongoDataFile::badOfs(int ofs) const {
+ void MongoDataFile::badOfs2(int ofs) const {
+ stringstream ss;
+ ss << "bad offset:" << ofs << " accessing file: " << mmf.filename() << " - consider repairing database";
+ uasserted(13441, ss.str());
+ }
+
+ void MongoDataFile::badOfs(int ofs) const {
stringstream ss;
ss << "bad offset:" << ofs << " accessing file: " << mmf.filename() << " - consider repairing database";
uasserted(13440, ss.str());
@@ -293,26 +345,18 @@ namespace mongo {
else
size = 0x7ff00000;
- if ( strstr(filename, "_hudsonSmall") ) {
- int mult = 1;
- if ( fileNo > 1 && fileNo < 1000 )
- mult = fileNo;
- size = 1024 * 512 * mult;
- log() << "Warning : using small files for _hudsonSmall" << endl;
- }
- else if ( cmdLine.smallfiles ){
+ if ( cmdLine.smallfiles ) {
size = size >> 2;
}
-
-
+
+
return size;
}
void MongoDataFile::open( const char *filename, int minSize, bool preallocateOnly ) {
{
/* check quotas
- very simple temporary implementation - we will in future look up
- the quota from the grid database
+ very simple temporary implementation for now
*/
if ( cmdLine.quota && fileNo > cmdLine.quotaFiles && !MMF::exists(filename) ) {
/* todo: if we were adding / changing keys in an index did we do some
@@ -340,58 +384,66 @@ namespace mongo {
if ( size > maxSize() )
size = maxSize();
- assert( ( size >= 64*1024*1024 ) || cmdLine.smallfiles || ( strstr( filename, "_hudsonSmall" ) ) );
+ assert( size >= 64*1024*1024 || cmdLine.smallfiles );
assert( size % 4096 == 0 );
if ( preallocateOnly ) {
if ( cmdLine.prealloc ) {
- theFileAllocator().requestAllocation( filename, size );
+ FileAllocator::get()->requestAllocation( filename, size );
}
return;
}
-
- _p = mmf.map(filename, size);
- header = (DataFileHeader *) _p.at(0, DataFileHeader::HeaderSize);
- if( sizeof(char *) == 4 )
- uassert( 10084 , "can't map file memory - mongo requires 64 bit build for larger datasets", header);
+
+ {
+ assert( _mb == 0 );
+ unsigned long long sz = size;
+ if( mmf.create(filename, sz, false) )
+ _mb = mmf.getView();
+ assert( sz <= 0x7fffffff );
+ size = (int) sz;
+ }
+ //header = (DataFileHeader *) _p;
+ if( sizeof(char *) == 4 )
+ uassert( 10084 , "can't map file memory - mongo requires 64 bit build for larger datasets", _mb != 0);
else
- uassert( 10085 , "can't map file memory", header);
- header->init(fileNo, size);
+ uassert( 10085 , "can't map file memory", _mb != 0);
+ header()->init(fileNo, size, filename);
}
- void MongoDataFile::flush( bool sync ){
+ void MongoDataFile::flush( bool sync ) {
mmf.flush( sync );
}
- void addNewExtentToNamespace(const char *ns, Extent *e, DiskLoc eloc, DiskLoc emptyLoc, bool capped) {
- DiskLoc oldExtentLoc;
+ void addNewExtentToNamespace(const char *ns, Extent *e, DiskLoc eloc, DiskLoc emptyLoc, bool capped) {
NamespaceIndex *ni = nsindex(ns);
NamespaceDetails *details = ni->details(ns);
if ( details ) {
assert( !details->lastExtent.isNull() );
assert( !details->firstExtent.isNull() );
- e->xprev = details->lastExtent;
- details->lastExtent.ext()->xnext = eloc;
+ getDur().writingDiskLoc(e->xprev) = details->lastExtent;
+ getDur().writingDiskLoc(details->lastExtent.ext()->xnext) = eloc;
assert( !eloc.isNull() );
- details->lastExtent = eloc;
+ getDur().writingDiskLoc(details->lastExtent) = eloc;
}
else {
ni->add_ns(ns, eloc, capped);
details = ni->details(ns);
}
- details->lastExtentSize = e->length;
- DEBUGGING out() << "temp: newextent adddelrec " << ns << endl;
+ {
+ NamespaceDetails *dw = details->writingWithoutExtra();
+ dw->lastExtentSize = e->length;
+ }
details->addDeletedRec(emptyLoc.drec(), emptyLoc);
}
Extent* MongoDataFile::createExtent(const char *ns, int approxSize, bool newCapped, int loops) {
- massert( 10357 , "shutdown in progress", !goingAway );
- massert( 10358 , "bad new extent size", approxSize >= 0 && approxSize <= Extent::maxSize() );
- massert( 10359 , "header==0 on new extent: 32 bit mmap space exceeded?", header ); // null if file open failed
- int ExtentSize = approxSize <= header->unusedLength ? approxSize : header->unusedLength;
+ massert( 10357 , "shutdown in progress", ! inShutdown() );
+ massert( 10358 , "bad new extent size", approxSize >= Extent::minSize() && approxSize <= Extent::maxSize() );
+ massert( 10359 , "header==0 on new extent: 32 bit mmap space exceeded?", header() ); // null if file open failed
+ int ExtentSize = approxSize <= header()->unusedLength ? approxSize : header()->unusedLength;
DiskLoc loc;
- if ( ExtentSize <= 0 ) {
+ if ( ExtentSize < Extent::minSize() ) {
/* note there could be a lot of looping here if db just started and
no files are open yet. we might want to do something about that. */
if ( loops > 8 ) {
@@ -401,12 +453,14 @@ namespace mongo {
log() << "newExtent: " << ns << " file " << fileNo << " full, adding a new file\n";
return cc().database()->addAFile( 0, true )->createExtent(ns, approxSize, newCapped, loops+1);
}
- int offset = header->unused.getOfs();
- header->unused.setOfs( fileNo, offset + ExtentSize );
- header->unusedLength -= ExtentSize;
- loc.setOfs(fileNo, offset);
+ int offset = header()->unused.getOfs();
+
+ DataFileHeader *h = getDur().writing(header());
+ h->unused.set( fileNo, offset + ExtentSize );
+ h->unusedLength -= ExtentSize;
+ loc.set(fileNo, offset);
Extent *e = _getExtent(loc);
- DiskLoc emptyLoc = e->init(ns, ExtentSize, fileNo, offset);
+ DiskLoc emptyLoc = getDur().writing(e)->init(ns, ExtentSize, fileNo, offset);
addNewExtentToNamespace(ns, e, loc, emptyLoc, newCapped);
@@ -415,7 +469,7 @@ namespace mongo {
return e;
}
- Extent* DataFileMgr::allocFromFreeList(const char *ns, int approxSize, bool capped) {
+ Extent* DataFileMgr::allocFromFreeList(const char *ns, int approxSize, bool capped) {
string s = cc().database()->name + ".$freelist";
NamespaceDetails *f = nsdetails(s.c_str());
if( f ) {
@@ -426,7 +480,7 @@ namespace mongo {
if( low > 2048 ) low -= 256;
high = (int) (approxSize * 1.05) + 256;
}
- else {
+ else {
low = (int) (approxSize * 0.8);
high = (int) (approxSize * 1.4);
}
@@ -436,20 +490,20 @@ namespace mongo {
int bestDiff = 0x7fffffff;
{
DiskLoc L = f->firstExtent;
- while( !L.isNull() ) {
+ while( !L.isNull() ) {
Extent * e = L.ext();
- if( e->length >= low && e->length <= high ) {
+ if( e->length >= low && e->length <= high ) {
int diff = abs(e->length - approxSize);
- if( diff < bestDiff ) {
+ if( diff < bestDiff ) {
bestDiff = diff;
best = e;
- if( diff == 0 )
+ if( diff == 0 )
break;
}
}
L = e->xnext;
++n;
-
+
}
}
OCCASIONALLY if( n > 512 ) log() << "warning: newExtent " << n << " scanned\n";
@@ -457,13 +511,13 @@ namespace mongo {
Extent *e = best;
// remove from the free list
if( !e->xprev.isNull() )
- e->xprev.ext()->xnext = e->xnext;
+ e->xprev.ext()->xnext.writing() = e->xnext;
if( !e->xnext.isNull() )
- e->xnext.ext()->xprev = e->xprev;
+ e->xnext.ext()->xprev.writing() = e->xprev;
if( f->firstExtent == e->myLoc )
- f->firstExtent = e->xnext;
+ f->firstExtent.writing() = e->xnext;
if( f->lastExtent == e->myLoc )
- f->lastExtent = e->xprev;
+ f->lastExtent.writing() = e->xprev;
// use it
OCCASIONALLY if( n > 512 ) log() << "warning: newExtent " << n << " scanned\n";
@@ -479,9 +533,11 @@ namespace mongo {
/*---------------------------------------------------------------------*/
- DiskLoc Extent::reuse(const char *nsname) {
- /*TODOMMF - work to do when extent is freed. */
- log(3) << "reset extent was:" << nsDiagnostic.buf << " now:" << nsname << '\n';
+ DiskLoc Extent::reuse(const char *nsname) {
+ return getDur().writing(this)->_reuse(nsname);
+ }
+ DiskLoc Extent::_reuse(const char *nsname) {
+ log(3) << "reset extent was:" << nsDiagnostic.toString() << " now:" << nsname << '\n';
massert( 10360 , "Extent::reset bad magic value", magic == 0x41424344 );
xnext.Null();
xprev.Null();
@@ -493,12 +549,9 @@ namespace mongo {
emptyLoc.inc( (int) (_extentData-(char*)this) );
int delRecLength = length - (_extentData - (char *) this);
- //DeletedRecord *empty1 = (DeletedRecord *) extentData;
- DeletedRecord *empty = DataFileMgr::makeDeletedRecord(emptyLoc, delRecLength);//(DeletedRecord *) getRecord(emptyLoc);
- //assert( empty == empty1 );
-
- // do we want to zero the record? memset(empty, ...)
+ DeletedRecord *empty = DataFileMgr::makeDeletedRecord(emptyLoc, delRecLength);//(DeletedRecord *) getRecord(emptyLoc);
+ empty = getDur().writing(empty);
empty->lengthWithHeaders = delRecLength;
empty->extentOfs = myLoc.getOfs();
empty->nextDeleted.Null();
@@ -509,7 +562,7 @@ namespace mongo {
/* assumes already zeroed -- insufficient for block 'reuse' perhaps */
DiskLoc Extent::init(const char *nsname, int _length, int _fileNo, int _offset) {
magic = 0x41424344;
- myLoc.setOfs(_fileNo, _offset);
+ myLoc.set(_fileNo, _offset);
xnext.Null();
xprev.Null();
nsDiagnostic = nsname;
@@ -521,9 +574,7 @@ namespace mongo {
emptyLoc.inc( (int) (_extentData-(char*)this) );
int l = _length - (_extentData - (char *) this);
- //DeletedRecord *empty1 = (DeletedRecord *) extentData;
- DeletedRecord *empty = DataFileMgr::makeDeletedRecord(emptyLoc, l);
- //assert( empty == empty1 );
+ DeletedRecord *empty = getDur().writing( DataFileMgr::makeDeletedRecord(emptyLoc, l) );
empty->lengthWithHeaders = l;
empty->extentOfs = myLoc.getOfs();
return emptyLoc;
@@ -582,7 +633,7 @@ namespace mongo {
}
return maxExtentSize;
}
-
+
/*---------------------------------------------------------------------*/
shared_ptr<Cursor> DataFileMgr::findAll(const char *ns, const DiskLoc &startLoc) {
@@ -612,12 +663,12 @@ namespace mongo {
d->dumpDeleted(&extents);
}
- if ( d->capped )
+ if ( d->capped )
return shared_ptr<Cursor>( new ForwardCappedCursor( d , startLoc ) );
-
+
if ( !startLoc.isNull() )
- return shared_ptr<Cursor>(new BasicCursor( startLoc ));
-
+ return shared_ptr<Cursor>(new BasicCursor( startLoc ));
+
while ( e->firstRecord.isNull() && !e->xnext.isNull() ) {
/* todo: if extent is empty, free it for reuse elsewhere.
that is a bit complicated have to clean up the freelists.
@@ -638,37 +689,38 @@ namespace mongo {
if ( el.number() >= 0 )
return DataFileMgr::findAll(ns, startLoc);
-
+
// "reverse natural order"
NamespaceDetails *d = nsdetails(ns);
-
+
if ( !d )
return shared_ptr<Cursor>(new BasicCursor(DiskLoc()));
-
+
if ( !d->capped ) {
if ( !startLoc.isNull() )
- return shared_ptr<Cursor>(new ReverseCursor( startLoc ));
+ return shared_ptr<Cursor>(new ReverseCursor( startLoc ));
Extent *e = d->lastExtent.ext();
while ( e->lastRecord.isNull() && !e->xprev.isNull() ) {
OCCASIONALLY out() << " findTableScan: extent empty, skipping ahead" << endl;
e = e->getPrevExtent();
}
return shared_ptr<Cursor>(new ReverseCursor( e->lastRecord ));
- } else {
+ }
+ else {
return shared_ptr<Cursor>( new ReverseCappedCursor( d, startLoc ) );
}
}
- void printFreeList() {
+ void printFreeList() {
string s = cc().database()->name + ".$freelist";
log() << "dump freelist " << s << '\n';
NamespaceDetails *freeExtents = nsdetails(s.c_str());
- if( freeExtents == 0 ) {
+ if( freeExtents == 0 ) {
log() << " freeExtents==0" << endl;
return;
}
DiskLoc a = freeExtents->firstExtent;
- while( !a.isNull() ) {
+ while( !a.isNull() ) {
Extent *e = a.ext();
log() << " " << a.toString() << " len:" << e->length << " prev:" << e->xprev.toString() << '\n';
a = e->xnext;
@@ -687,7 +739,7 @@ namespace mongo {
NamespaceString s(nsToDrop);
assert( s.db == cc().database()->name );
if( s.isSystem() ) {
- if( s.coll == "system.profile" )
+ if( s.coll == "system.profile" )
uassert( 10087 , "turn off profiling before dropping system.profile collection", cc().database()->profile == 0 );
else
uasserted( 12502, "can't drop system ns" );
@@ -698,32 +750,31 @@ namespace mongo {
BSONObj cond = BSON( "name" << nsToDrop ); // { name: "colltodropname" }
string system_namespaces = cc().database()->name + ".system.namespaces";
/*int n = */ deleteObjects(system_namespaces.c_str(), cond, false, false, true);
- // no check of return code as this ns won't exist for some of the new storage engines
+ // no check of return code as this ns won't exist for some of the new storage engines
}
// free extents
if( !d->firstExtent.isNull() ) {
string s = cc().database()->name + ".$freelist";
NamespaceDetails *freeExtents = nsdetails(s.c_str());
- if( freeExtents == 0 ) {
+ if( freeExtents == 0 ) {
string err;
_userCreateNS(s.c_str(), BSONObj(), err, 0);
freeExtents = nsdetails(s.c_str());
massert( 10361 , "can't create .$freelist", freeExtents);
}
- if( freeExtents->firstExtent.isNull() ) {
- freeExtents->firstExtent = d->firstExtent;
- freeExtents->lastExtent = d->lastExtent;
+ if( freeExtents->firstExtent.isNull() ) {
+ freeExtents->firstExtent.writing() = d->firstExtent;
+ freeExtents->lastExtent.writing() = d->lastExtent;
}
- else {
+ else {
DiskLoc a = freeExtents->firstExtent;
assert( a.ext()->xprev.isNull() );
- a.ext()->xprev = d->lastExtent;
- d->lastExtent.ext()->xnext = a;
- freeExtents->firstExtent = d->firstExtent;
-
- d->firstExtent.setInvalid();
- d->lastExtent.setInvalid();
+ getDur().writingDiskLoc( a.ext()->xprev ) = d->lastExtent;
+ getDur().writingDiskLoc( d->lastExtent.ext()->xnext ) = a;
+ getDur().writingDiskLoc( freeExtents->firstExtent ) = d->firstExtent;
+ getDur().writingDiskLoc( d->firstExtent ).setInvalid();
+ getDur().writingDiskLoc( d->lastExtent ).setInvalid();
}
}
@@ -740,7 +791,7 @@ namespace mongo {
BackgroundOperation::assertNoBgOpInProgForNs(name.c_str());
if ( d->nIndexes != 0 ) {
- try {
+ try {
assert( dropIndexes(d, name.c_str(), "*", errmsg, result, true) );
}
catch( DBException& e ) {
@@ -754,11 +805,10 @@ namespace mongo {
log(1) << "\t dropIndexes done" << endl;
result.append("ns", name.c_str());
ClientCursor::invalidate(name.c_str());
- Client::invalidateNS( name );
Top::global.collectionDropped( name );
- dropNS(name);
+ dropNS(name);
}
-
+
int nUnindexes = 0;
/* unindex all keys in index for this record. */
@@ -797,63 +847,69 @@ namespace mongo {
int n = d->nIndexes;
for ( int i = 0; i < n; i++ )
_unindexRecord(d->idx(i), obj, dl, !noWarn);
- if( d->backgroundIndexBuildInProgress ) {
+ if( d->indexBuildInProgress ) { // background index
// always pass nowarn here, as this one may be missing for valid reasons as we are concurrently building it
- _unindexRecord(d->idx(n), obj, dl, false);
+ _unindexRecord(d->idx(n), obj, dl, false);
}
}
- /* deletes a record, just the pdfile portion -- no index cleanup, no cursor cleanup, etc.
+ /* deletes a record, just the pdfile portion -- no index cleanup, no cursor cleanup, etc.
caller must check if capped
*/
- void DataFileMgr::_deleteRecord(NamespaceDetails *d, const char *ns, Record *todelete, const DiskLoc& dl)
- {
+ void DataFileMgr::_deleteRecord(NamespaceDetails *d, const char *ns, Record *todelete, const DiskLoc& dl) {
/* remove ourself from the record next/prev chain */
{
if ( todelete->prevOfs != DiskLoc::NullOfs )
- todelete->getPrev(dl).rec()->nextOfs = todelete->nextOfs;
+ getDur().writingInt( todelete->getPrev(dl).rec()->nextOfs ) = todelete->nextOfs;
if ( todelete->nextOfs != DiskLoc::NullOfs )
- todelete->getNext(dl).rec()->prevOfs = todelete->prevOfs;
+ getDur().writingInt( todelete->getNext(dl).rec()->prevOfs ) = todelete->prevOfs;
}
/* remove ourself from extent pointers */
{
- Extent *e = todelete->myExtent(dl);
+ Extent *e = getDur().writing( todelete->myExtent(dl) );
if ( e->firstRecord == dl ) {
if ( todelete->nextOfs == DiskLoc::NullOfs )
e->firstRecord.Null();
else
- e->firstRecord.setOfs(dl.a(), todelete->nextOfs);
+ e->firstRecord.set(dl.a(), todelete->nextOfs);
}
if ( e->lastRecord == dl ) {
if ( todelete->prevOfs == DiskLoc::NullOfs )
e->lastRecord.Null();
else
- e->lastRecord.setOfs(dl.a(), todelete->prevOfs);
+ e->lastRecord.set(dl.a(), todelete->prevOfs);
}
}
/* add to the free list */
{
- d->nrecords--;
- d->datasize -= todelete->netLength();
- /* temp: if in system.indexes, don't reuse, and zero out: we want to be
- careful until validated more, as IndexDetails has pointers
- to this disk location. so an incorrectly done remove would cause
- a lot of problems.
- */
+ {
+ NamespaceDetails::Stats *s = getDur().writing(&d->stats);
+ s->datasize -= todelete->netLength();
+ s->nrecords--;
+ }
+
if ( strstr(ns, ".system.indexes") ) {
- memset(todelete, 0, todelete->lengthWithHeaders);
+ /* temp: if in system.indexes, don't reuse, and zero out: we want to be
+ careful until validated more, as IndexDetails has pointers
+ to this disk location. so an incorrectly done remove would cause
+ a lot of problems.
+ */
+ memset(getDur().writingPtr(todelete, todelete->lengthWithHeaders), 0, todelete->lengthWithHeaders);
}
else {
- DEV memset(todelete->data, 0, todelete->netLength()); // attempt to notice invalid reuse.
+ DEV {
+ unsigned long long *p = (unsigned long long *) todelete->data;
+ *getDur().writing(p) = 0;
+ //DEV memset(todelete->data, 0, todelete->netLength()); // attempt to notice invalid reuse.
+ }
d->addDeletedRec((DeletedRecord*)todelete, dl);
}
}
}
- void DataFileMgr::deleteRecord(const char *ns, Record *todelete, const DiskLoc& dl, bool cappedOK, bool noWarn)
- {
+ void DataFileMgr::deleteRecord(const char *ns, Record *todelete, const DiskLoc& dl, bool cappedOK, bool noWarn) {
dassert( todelete == dl.rec() );
NamespaceDetails* d = nsdetails(ns);
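
The edits in this hunk all follow the same durability pattern: declare write intent on a mapped location through getDur(), then assign through what it returns. The forms used above, shown in isolation (fragments lifted from the lines in this patch; see dur.h for the actual interface):

    getDur().writingInt( todelete->getPrev(dl).rec()->nextOfs ) = todelete->nextOfs;  // one int field
    getDur().writingDiskLoc( details->lastExtent ) = eloc;                            // one DiskLoc field
    Extent *e = getDur().writing( todelete->myExtent(dl) );                           // whole object -> writable pointer
    memset( getDur().writingPtr( todelete, todelete->lengthWithHeaders ), 0,
            todelete->lengthWithHeaders );                                            // arbitrary byte range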
@@ -880,8 +936,7 @@ namespace mongo {
NamespaceDetails *d,
NamespaceDetailsTransient *nsdt,
Record *toupdate, const DiskLoc& dl,
- const char *_buf, int _len, OpDebug& debug, bool &changedId, bool god)
- {
+ const char *_buf, int _len, OpDebug& debug, bool god) {
StringBuilder& ss = debug.str;
dassert( toupdate == dl.rec() );
@@ -891,7 +946,7 @@ namespace mongo {
DEV assert( objNew.objdata() == _buf );
if( !objNew.hasElement("_id") && objOld.hasElement("_id") ) {
- /* add back the old _id value if the update removes it. Note this implementation is slow
+ /* add back the old _id value if the update removes it. Note this implementation is slow
(copies entire object multiple times), but this shouldn't happen often, so going for simple
code, not speed.
*/
@@ -903,11 +958,13 @@ namespace mongo {
objNew = b.obj();
}
- /* duplicate key check. we descend the btree twice - once for this check, and once for the actual inserts, further
+ /* duplicate key check. we descend the btree twice - once for this check, and once for the actual inserts, further
below. that is suboptimal, but it's pretty complicated to do it the other way without rollbacks...
*/
vector<IndexChanges> changes;
+ bool changedId = false;
getIndexChanges(changes, *d, objNew, objOld, changedId);
+ uassert( 13596 , str::stream() << "cannot change _id of a document old:" << objOld << " new:" << objNew , ! changedId );
dupCheck(changes, *d, dl);
if ( toupdate->netLength() < objNew.objsize() ) {
@@ -946,8 +1003,8 @@ namespace mongo {
try {
/* we did the dupCheck() above. so we don't have to worry about it here. */
idx.head.btree()->bt_insert(
- idx.head,
- dl, *changes[x].added[i], ordering, /*dupsAllowed*/true, idx);
+ idx.head,
+ dl, *changes[x].added[i], ordering, /*dupsAllowed*/true, idx);
}
catch (AssertionException& e) {
ss << " exception update index ";
@@ -959,25 +1016,30 @@ namespace mongo {
ss << '\n' << keyUpdates << " key updates ";
}
- // update in place
- memcpy(toupdate->data, objNew.objdata(), objNew.objsize());
+ // update in place
+ int sz = objNew.objsize();
+ memcpy(getDur().writingPtr(toupdate->data, sz), objNew.objdata(), sz);
return dl;
}
- int followupExtentSize(int len, int lastExtentLen) {
+ int Extent::followupSize(int len, int lastExtentLen) {
assert( len < Extent::maxSize() );
- int x = initialExtentSize(len);
+ int x = initialSize(len);
int y = (int) (lastExtentLen < 4000000 ? lastExtentLen * 4.0 : lastExtentLen * 1.2);
int sz = y > x ? y : x;
- if ( sz < lastExtentLen )
- sz = lastExtentLen;
- else if ( sz > Extent::maxSize() )
+ if ( sz < lastExtentLen ) {
+ // this means there was an int overflow
+ // so we should turn it into maxSize
+ sz = Extent::maxSize();
+ }
+ else if ( sz > Extent::maxSize() ) {
sz = Extent::maxSize();
-
+ }
+
sz = ((int)sz) & 0xffffff00;
assert( sz > len );
-
+
return sz;
}
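
As a quick check of the sizing arithmetic above (illustrative numbers): Extent::initialSize(200) computes 200*64 = 12800, already a multiple of 256, so a new collection of ~200-byte records starts with a ~12.5 KB extent; Extent::followupSize(200, 1048576) then takes max(12800, 4 * 1048576) = 4194304 and, assuming Extent::maxSize() is larger, returns a 4 MB follow-up extent after the overflow check and low-byte mask.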
@@ -990,7 +1052,7 @@ namespace mongo {
Ordering ordering = Ordering::make(order);
int n = 0;
for ( BSONObjSetDefaultOrder::iterator i=keys.begin(); i != keys.end(); i++ ) {
- if( ++n == 2 ) {
+ if( ++n == 2 ) {
d->setIndexIsMultikey(idxNo);
}
assert( !recordLoc.isNull() );
@@ -999,7 +1061,7 @@ namespace mongo {
*i, ordering, dupsAllowed, idx);
}
catch (AssertionException& e) {
- if( e.getCode() == 10287 && idxNo == d->nIndexes ) {
+ if( e.getCode() == 10287 && idxNo == d->nIndexes ) {
DEV log() << "info: caught key already in index on bg indexing (ok)" << endl;
continue;
}
@@ -1012,8 +1074,7 @@ namespace mongo {
}
}
- void testSorting()
- {
+ void testSorting() {
BSONObjBuilder b;
b.appendNull("");
BSONObj x = b.obj();
@@ -1027,9 +1088,9 @@ namespace mongo {
sorter.add(x, DiskLoc(3,77));
sorter.sort();
-
+
auto_ptr<BSONObjExternalSorter::Iterator> i = sorter.iterator();
- while( i->more() ) {
+ while( i->more() ) {
BSONObjExternalSorter::Data d = i->next();
/*cout << d.second.toString() << endl;
cout << d.first.objsize() << endl;
@@ -1039,7 +1100,6 @@ namespace mongo {
// throws DBException
unsigned long long fastBuildIndex(const char *ns, NamespaceDetails *d, IndexDetails& idx, int idxNo) {
- assert( d->backgroundIndexBuildInProgress == 0 );
CurOp * op = cc().curop();
Timer t;
@@ -1050,17 +1110,17 @@ namespace mongo {
bool dropDups = idx.dropDups() || inDBRepair;
BSONObj order = idx.keyPattern();
- idx.head.Null();
-
+ getDur().writingDiskLoc(idx.head).Null();
+
if ( logLevel > 1 ) printMemInfo( "before index start" );
/* get and sort all the keys ----- */
unsigned long long n = 0;
shared_ptr<Cursor> c = theDataFileMgr.findAll(ns);
BSONObjExternalSorter sorter(order);
- sorter.hintNumObjects( d->nrecords );
+ sorter.hintNumObjects( d->stats.nrecords );
unsigned long long nkeys = 0;
- ProgressMeterHolder pm( op->setMessage( "index: (1/3) external sort" , d->nrecords , 10 ) );
+ ProgressMeterHolder pm( op->setMessage( "index: (1/3) external sort" , d->stats.nrecords , 10 ) );
while ( c->ok() ) {
BSONObj o = c->current();
DiskLoc loc = c->currLoc();
@@ -1069,17 +1129,17 @@ namespace mongo {
idx.getKeysFromObject(o, keys);
int k = 0;
for ( BSONObjSetDefaultOrder::iterator i=keys.begin(); i != keys.end(); i++ ) {
- if( ++k == 2 )
+ if( ++k == 2 ) {
d->setIndexIsMultikey(idxNo);
- //cout<<"SORTER ADD " << i->toString() << ' ' << loc.toString() << endl;
+ }
sorter.add(*i, loc);
nkeys++;
}
-
+
c->advance();
n++;
pm.hit();
- if ( logLevel > 1 && n % 10000 == 0 ){
+ if ( logLevel > 1 && n % 10000 == 0 ) {
printMemInfo( "\t iterating objects" );
}
@@ -1089,37 +1149,37 @@ namespace mongo {
if ( logLevel > 1 ) printMemInfo( "before final sort" );
sorter.sort();
if ( logLevel > 1 ) printMemInfo( "after final sort" );
-
+
log(t.seconds() > 5 ? 0 : 1) << "\t external sort used : " << sorter.numFiles() << " files " << " in " << t.seconds() << " secs" << endl;
list<DiskLoc> dupsToDrop;
- /* build index --- */
+ /* build index --- */
{
BtreeBuilder btBuilder(dupsAllowed, idx);
BSONObj keyLast;
auto_ptr<BSONObjExternalSorter::Iterator> i = sorter.iterator();
assert( pm == op->setMessage( "index: (2/3) btree bottom up" , nkeys , 10 ) );
- while( i->more() ) {
+ while( i->more() ) {
RARELY killCurrentOp.checkForInterrupt();
BSONObjExternalSorter::Data d = i->next();
- try {
+ try {
btBuilder.addKey(d.first, d.second);
}
- catch( AssertionException& e ) {
- if ( dupsAllowed ){
+ catch( AssertionException& e ) {
+ if ( dupsAllowed ) {
// unknown exception??
throw;
}
-
+
if( e.interrupted() )
throw;
if ( ! dropDups )
throw;
- /* we could queue these on disk, but normally there are very few dups, so instead we
+ /* we could queue these on disk, but normally there are very few dups, so instead we
keep in ram and have a limit.
*/
dupsToDrop.push_back(d.second);
@@ -1131,9 +1191,11 @@ namespace mongo {
op->setMessage( "index: (3/3) btree-middle" );
log(t.seconds() > 10 ? 0 : 1 ) << "\t done building bottom layer, going to commit" << endl;
btBuilder.commit();
- wassert( btBuilder.getn() == nkeys || dropDups );
+ if ( btBuilder.getn() != nkeys && ! dropDups ) {
+ warning() << "not all entries were added to the index, probably some keys were too large" << endl;
+ }
}
-
+
log(1) << "\t fastBuildIndex dupsToDrop:" << dupsToDrop.size() << endl;
for( list<DiskLoc>::iterator i = dupsToDrop.begin(); i != dupsToDrop.end(); i++ )
@@ -1142,13 +1204,13 @@ namespace mongo {
return n;
}
- class BackgroundIndexBuildJob : public BackgroundOperation {
+ class BackgroundIndexBuildJob : public BackgroundOperation {
unsigned long long addExistingToIndex(const char *ns, NamespaceDetails *d, IndexDetails& idx, int idxNo) {
bool dupsAllowed = !idx.unique();
bool dropDups = idx.dropDups();
- ProgressMeter& progress = cc().curop()->setMessage( "bg index build" , d->nrecords );
+ ProgressMeter& progress = cc().curop()->setMessage( "bg index build" , d->stats.nrecords );
unsigned long long n = 0;
auto_ptr<ClientCursor> cc;
@@ -1156,25 +1218,26 @@ namespace mongo {
shared_ptr<Cursor> c = theDataFileMgr.findAll(ns);
cc.reset( new ClientCursor(QueryOption_NoCursorTimeout, c, ns) );
}
- CursorId id = cc->cursorid;
+ CursorId id = cc->cursorid();
- while ( cc->c->ok() ) {
- BSONObj js = cc->c->current();
- try {
- _indexRecord(d, idxNo, js, cc->c->currLoc(), dupsAllowed);
- cc->c->advance();
- } catch( AssertionException& e ) {
+ while ( cc->ok() ) {
+ BSONObj js = cc->current();
+ try {
+ _indexRecord(d, idxNo, js, cc->currLoc(), dupsAllowed);
+ cc->advance();
+ }
+ catch( AssertionException& e ) {
if( e.interrupted() )
throw;
if ( dropDups ) {
- DiskLoc toDelete = cc->c->currLoc();
- bool ok = cc->c->advance();
+ DiskLoc toDelete = cc->currLoc();
+ bool ok = cc->advance();
cc->updateLocation();
theDataFileMgr.deleteRecord( ns, toDelete.rec(), toDelete, false, true );
if( ClientCursor::find(id, false) == 0 ) {
cc.release();
- if( !ok ) {
+ if( !ok ) {
/* we were already at the end. normal. */
}
else {
@@ -1182,7 +1245,8 @@ namespace mongo {
}
break;
}
- } else {
+ }
+ else {
log() << "background addExistingToIndex exception " << e.what() << endl;
throw;
}
@@ -1200,7 +1264,7 @@ namespace mongo {
return n;
}
- /* we do set a flag in the namespace for quick checking, but this is our authoritative info -
+ /* we do set a flag in the namespace for quick checking, but this is our authoritative info -
that way on a crash/restart, we don't think we are still building one. */
set<NamespaceDetails*> bgJobsInProgress;
@@ -1208,12 +1272,8 @@ namespace mongo {
assertInWriteLock();
uassert( 13130 , "can't start bg index b/c in recursive lock (db.eval?)" , dbMutex.getState() == 1 );
bgJobsInProgress.insert(d);
- d->backgroundIndexBuildInProgress = 1;
- d->nIndexes--;
}
void done(const char *ns, NamespaceDetails *d) {
- d->nIndexes++;
- d->backgroundIndexBuildInProgress = 0;
NamespaceDetailsTransient::get_w(ns).addedIndex(); // clear query optimizer cache
assertInWriteLock();
}
@@ -1221,16 +1281,16 @@ namespace mongo {
public:
BackgroundIndexBuildJob(const char *ns) : BackgroundOperation(ns) { }
- unsigned long long go(string ns, NamespaceDetails *d, IndexDetails& idx, int idxNo) {
+ unsigned long long go(string ns, NamespaceDetails *d, IndexDetails& idx, int idxNo) {
unsigned long long n = 0;
prep(ns.c_str(), d);
assert( idxNo == d->nIndexes );
- try {
+ try {
idx.head = BtreeBucket::addBucket(idx);
n = addExistingToIndex(ns.c_str(), d, idx, idxNo);
}
- catch(...) {
+ catch(...) {
if( cc().database() && nsdetails(ns.c_str()) == d ) {
assert( idxNo == d->nIndexes );
done(ns.c_str(), d);
@@ -1246,25 +1306,51 @@ namespace mongo {
}
};
+ /**
+ * For the lifetime of this object, an index build is indicated on the specified
+ * namespace and the newest index is marked as absent. This simplifies
+ * the cleanup required on recovery.
+ */
+ class RecoverableIndexState {
+ public:
+ RecoverableIndexState( NamespaceDetails *d ) : _d( d ) {
+ indexBuildInProgress() = 1;
+ nIndexes()--;
+ }
+ ~RecoverableIndexState() {
+ DESTRUCTOR_GUARD (
+ nIndexes()++;
+ indexBuildInProgress() = 0;
+ )
+ }
+ private:
+ int &nIndexes() { return getDur().writingInt( _d->nIndexes ); }
+ int &indexBuildInProgress() { return getDur().writingInt( _d->indexBuildInProgress ); }
+ NamespaceDetails *_d;
+ };
+
// throws DBException
- static void buildAnIndex(string ns, NamespaceDetails *d, IndexDetails& idx, int idxNo, bool background) {
+ static void buildAnIndex(string ns, NamespaceDetails *d, IndexDetails& idx, int idxNo, bool background) {
tlog() << "building new index on " << idx.keyPattern() << " for " << ns << ( background ? " background" : "" ) << endl;
Timer t;
- unsigned long long n;
+ unsigned long long n;
if( background ) {
log(2) << "buildAnIndex: background=true\n";
}
assert( !BackgroundOperation::inProgForNs(ns.c_str()) ); // should have been checked earlier, better not be...
+ assert( d->indexBuildInProgress == 0 );
+ assertInWriteLock();
+ RecoverableIndexState recoverable( d );
if( inDBRepair || !background ) {
- n = fastBuildIndex(ns.c_str(), d, idx, idxNo);
- assert( !idx.head.isNull() );
- }
- else {
+ n = fastBuildIndex(ns.c_str(), d, idx, idxNo);
+ assert( !idx.head.isNull() );
+ }
+ else {
BackgroundIndexBuildJob j(ns.c_str());
n = j.go(ns, d, idx, idxNo);
- }
+ }
tlog() << "done for " << n << " records " << t.millis() / 1000.0 << "secs" << endl;
}
@@ -1272,20 +1358,20 @@ namespace mongo {
static void indexRecord(NamespaceDetails *d, BSONObj obj, DiskLoc loc) {
int n = d->nIndexesBeingBuilt();
for ( int i = 0; i < n; i++ ) {
- try {
+ try {
bool unique = d->idx(i).unique();
_indexRecord(d, i, obj, loc, /*dupsAllowed*/!unique);
}
- catch( DBException& ) {
+ catch( DBException& ) {
/* try to roll back previously added index entries
note <= i (not < i) is important here as the index we just attempted to add
may be multikey and require some cleanup.
*/
- for( int j = 0; j <= i; j++ ) {
+ for( int j = 0; j <= i; j++ ) {
try {
_unindexRecord(d->idx(j), obj, loc, false);
}
- catch(...) {
+ catch(...) {
log(3) << "unindex fails on rollback after unique failure\n";
}
}
@@ -1301,7 +1387,7 @@ namespace mongo {
if ( d == 0 || (d->flags & NamespaceDetails::Flag_HaveIdIndex) )
return;
- d->flags |= NamespaceDetails::Flag_HaveIdIndex;
+ *getDur().writing(&d->flags) |= NamespaceDetails::Flag_HaveIdIndex;
{
NamespaceDetails::IndexIterator i = d->ii();
@@ -1324,7 +1410,7 @@ namespace mongo {
}
#pragma pack(1)
- struct IDToInsert_ {
+ struct IDToInsert_ {
char type;
char _id[4];
OID oid;
@@ -1338,13 +1424,13 @@ namespace mongo {
IDToInsert() : BSONElement( ( char * )( &idToInsert_ ) ) {}
} idToInsert;
#pragma pack()
-
+
void DataFileMgr::insertAndLog( const char *ns, const BSONObj &o, bool god ) {
BSONObj tmp = o;
insertWithObjMod( ns, tmp, god );
logOp( "i", ns, tmp );
}
-
+
DiskLoc DataFileMgr::insertWithObjMod(const char *ns, BSONObj &o, bool god) {
DiskLoc loc = insert( ns, o.objdata(), o.objsize(), god );
if ( !loc.isNull() )
@@ -1356,12 +1442,12 @@ namespace mongo {
insert( ns, o.objdata(), o.objsize(), god );
}
- bool prepareToBuildIndex(const BSONObj& io, bool god, string& sourceNS, NamespaceDetails *&sourceCollection);
+ bool prepareToBuildIndex(const BSONObj& io, bool god, string& sourceNS, NamespaceDetails *&sourceCollection, BSONObj& fixedIndexObject );
// We are now doing two btree scans for all unique indexes (one here, and one when we've
// written the record to the collection). This could be made more efficient by inserting
// dummy data here, keeping pointers to the btree nodes holding the dummy data and then
- // updating the dummy data with the DiskLoc of the real record.
+ // updating the dummy data with the DiskLoc of the real record.
void checkNoIndexConflicts( NamespaceDetails *d, const BSONObj &obj ) {
for ( int idxNo = 0; idxNo < d->nIndexes; idxNo++ ) {
if( d->idx(idxNo).unique() ) {
@@ -1371,19 +1457,19 @@ namespace mongo {
BSONObj order = idx.keyPattern();
for ( BSONObjSetDefaultOrder::iterator i=keys.begin(); i != keys.end(); i++ ) {
uassert( 12582, "duplicate key insert for unique index of capped collection",
- idx.head.btree()->findSingle(idx, idx.head, *i ).isNull() );
+ idx.head.btree()->findSingle(idx, idx.head, *i ).isNull() );
}
}
- }
+ }
}
- /* note: if god==true, you may pass in obuf of NULL and then populate the returned DiskLoc
+ /* note: if god==true, you may pass in obuf of NULL and then populate the returned DiskLoc
after the call -- that will prevent a double buffer copy in some cases (btree.cpp).
*/
DiskLoc DataFileMgr::insert(const char *ns, const void *obuf, int len, bool god, const BSONElement &writeId, bool mayAddIndex) {
bool wouldAddIndex = false;
- massert( 10093 , "cannot insert into reserved $ collection", god || nsDollarCheck( ns ) );
- uassert( 10094 , "invalid ns", strchr( ns , '.' ) > 0 );
+ massert( 10093 , "cannot insert into reserved $ collection", god || isANormalNSName( ns ) );
+ uassert( 10094 , str::stream() << "invalid ns: " << ns , isValidNS( ns ) );
const char *sys = strstr(ns, "system.");
if ( sys ) {
uassert( 10095 , "attempt to insert in reserved database name 'system'", sys != ns);
@@ -1411,7 +1497,7 @@ namespace mongo {
also if this is an addIndex, those checks should happen before this!
*/
// This may create first file in the database.
- cc().database()->allocExtent(ns, initialExtentSize(len), false);
+ cc().database()->allocExtent(ns, Extent::initialSize(len), false);
d = nsdetails(ns);
if ( !god )
ensureIdIndexForNewNs(ns);
@@ -1421,17 +1507,24 @@ namespace mongo {
NamespaceDetails *tableToIndex = 0;
string tabletoidxns;
+ BSONObj fixedIndexObject;
if ( addIndex ) {
assert( obuf );
BSONObj io((const char *) obuf);
- if( !prepareToBuildIndex(io, god, tabletoidxns, tableToIndex) )
+ if( !prepareToBuildIndex(io, god, tabletoidxns, tableToIndex, fixedIndexObject ) )
return DiskLoc();
+
+ if ( ! fixedIndexObject.isEmpty() ) {
+ obuf = fixedIndexObject.objdata();
+ len = fixedIndexObject.objsize();
+ }
+
}
const BSONElement *newId = &writeId;
int addID = 0;
if( !god ) {
- /* Check if we have an _id field. If we don't, we'll add it.
+ /* Check if we have an _id field. If we don't, we'll add it.
Note that btree buckets which we insert aren't BSONObj's, but in that case god==true.
*/
BSONObj io((const char *) obuf);
@@ -1446,7 +1539,7 @@ namespace mongo {
}
len += newId->size();
}
-
+
BSONElementManipulator::lookForTimestamps( io );
}
@@ -1456,28 +1549,28 @@ namespace mongo {
if ( lenWHdr == 0 ) {
// old datafiles, backward compatible here.
assert( d->paddingFactor == 0 );
- d->paddingFactor = 1.0;
+ *getDur().writing(&d->paddingFactor) = 1.0;
lenWHdr = len + Record::HeaderSize;
}
-
+
// If the collection is capped, check if the new object will violate a unique index
// constraint before allocating space.
if ( d->nIndexes && d->capped && !god ) {
checkNoIndexConflicts( d, BSONObj( reinterpret_cast<const char *>( obuf ) ) );
}
-
+
DiskLoc loc = d->alloc(ns, lenWHdr, extentLoc);
if ( loc.isNull() ) {
// out of space
if ( d->capped == 0 ) { // size capped doesn't grow
log(1) << "allocating new extent for " << ns << " padding:" << d->paddingFactor << " lenWHdr: " << lenWHdr << endl;
- cc().database()->allocExtent(ns, followupExtentSize(lenWHdr, d->lastExtentSize), false);
+ cc().database()->allocExtent(ns, Extent::followupSize(lenWHdr, d->lastExtentSize), false);
loc = d->alloc(ns, lenWHdr, extentLoc);
- if ( loc.isNull() ){
+ if ( loc.isNull() ) {
log() << "WARNING: alloc() failed after allocating new extent. lenWHdr: " << lenWHdr << " last extent size:" << d->lastExtentSize << "; trying again\n";
- for ( int zzz=0; zzz<10 && lenWHdr > d->lastExtentSize; zzz++ ){
+ for ( int zzz=0; zzz<10 && lenWHdr > d->lastExtentSize; zzz++ ) {
log() << "try #" << zzz << endl;
- cc().database()->allocExtent(ns, followupExtentSize(len, d->lastExtentSize), false);
+ cc().database()->allocExtent(ns, Extent::followupSize(len, d->lastExtentSize), false);
loc = d->alloc(ns, lenWHdr, extentLoc);
if ( ! loc.isNull() )
break;
@@ -1492,45 +1585,55 @@ namespace mongo {
}
Record *r = loc.rec();
- assert( r->lengthWithHeaders >= lenWHdr );
- if( addID ) {
- /* a little effort was made here to avoid a double copy when we add an ID */
- ((int&)*r->data) = *((int*) obuf) + newId->size();
- memcpy(r->data+4, newId->rawdata(), newId->size());
- memcpy(r->data+4+newId->size(), ((char *)obuf)+4, addID-4);
- }
- else {
- if( obuf )
- memcpy(r->data, obuf, len);
- }
- Extent *e = r->myExtent(loc);
- if ( e->lastRecord.isNull() ) {
- e->firstRecord = e->lastRecord = loc;
- r->prevOfs = r->nextOfs = DiskLoc::NullOfs;
+ {
+ assert( r->lengthWithHeaders >= lenWHdr );
+ r = (Record*) getDur().writingPtr(r, lenWHdr);
+ if( addID ) {
+ /* a little effort was made here to avoid a double copy when we add an ID */
+ ((int&)*r->data) = *((int*) obuf) + newId->size();
+ memcpy(r->data+4, newId->rawdata(), newId->size());
+ memcpy(r->data+4+newId->size(), ((char *)obuf)+4, addID-4);
+ }
+ else {
+ if( obuf )
+ memcpy(r->data, obuf, len);
+ }
}
- else {
- Record *oldlast = e->lastRecord.rec();
- r->prevOfs = e->lastRecord.getOfs();
- r->nextOfs = DiskLoc::NullOfs;
- oldlast->nextOfs = loc.getOfs();
- e->lastRecord = loc;
+ {
+ Extent *e = r->myExtent(loc);
+ if ( e->lastRecord.isNull() ) {
+ Extent::FL *fl = getDur().writing(e->fl());
+ fl->firstRecord = fl->lastRecord = loc;
+ r->prevOfs = r->nextOfs = DiskLoc::NullOfs;
+ }
+ else {
+ Record *oldlast = e->lastRecord.rec();
+ r->prevOfs = e->lastRecord.getOfs();
+ r->nextOfs = DiskLoc::NullOfs;
+ getDur().writingInt(oldlast->nextOfs) = loc.getOfs();
+ getDur().writingDiskLoc(e->lastRecord) = loc;
+ }
}
- d->nrecords++;
- d->datasize += r->netLength();
+ /* durability todo : this could be a bit annoying / slow to record constantly */
+ {
+ NamespaceDetails::Stats *s = getDur().writing(&d->stats);
+ s->datasize += r->netLength();
+ s->nrecords++;
+ }
// we don't bother clearing those stats for the god tables - also god is true when adding a btree bucket
if ( !god )
NamespaceDetailsTransient::get_w( ns ).notifyOfWriteOp();
-
+
if ( tableToIndex ) {
uassert( 13143 , "can't create index on system.indexes" , tabletoidxns.find( ".system.indexes" ) == string::npos );
BSONObj info = loc.obj();
bool background = info["background"].trueValue();
- if( background && cc().isSyncThread() ) {
- /* don't do background indexing on slaves. there are nuances. this could be added later
+ if( background && cc().isSyncThread() ) {
+ /* don't do background indexing on slaves. there are nuances. this could be added later
but requires more code.
*/
log() << "info: indexing in foreground on this replica; was a background index build on the primary" << endl;
@@ -1539,10 +1642,11 @@ namespace mongo {
int idxNo = tableToIndex->nIndexes;
IndexDetails& idx = tableToIndex->addIndex(tabletoidxns.c_str(), !background); // clear transient info caches so they refresh; increments nIndexes
- idx.info = loc;
+ getDur().writingDiskLoc(idx.info) = loc;
try {
buildAnIndex(tabletoidxns, tableToIndex, idx, idxNo, background);
- } catch( DBException& e ) {
+ }
+ catch( DBException& e ) {
// save our error msg string as an exception or dropIndexes will overwrite our message
LastError *le = lastError.get();
int savecode = 0;
@@ -1564,7 +1668,7 @@ namespace mongo {
if( !ok ) {
log() << "failed to drop index after a unique key error building it: " << errmsg << ' ' << tabletoidxns << ' ' << name << endl;
}
-
+
assert( le && !saveerrmsg.empty() );
raiseError(savecode,saveerrmsg.c_str());
throw;
@@ -1573,20 +1677,20 @@ namespace mongo {
/* add this record to our indexes */
if ( d->nIndexes ) {
- try {
+ try {
BSONObj obj(r->data);
indexRecord(d, obj, loc);
- }
- catch( AssertionException& e ) {
+ }
+ catch( AssertionException& e ) {
// should be a dup key error on _id index
if( tableToIndex || d->capped ) {
massert( 12583, "unexpected index insertion failure on capped collection", !d->capped );
string s = e.toString();
s += " : on addIndex/capped - collection and its index will not match";
uassert_nothrow(s.c_str());
- log() << s << '\n';
+ error() << s << endl;
}
- else {
+ else {
// normal case -- we can roll back
_deleteRecord(d, ns, r, loc);
throw;
@@ -1594,7 +1698,7 @@ namespace mongo {
}
}
- // out() << " inserted at loc:" << hex << loc.getOfs() << " lenwhdr:" << hex << lenWHdr << dec << ' ' << ns << endl;
+ // out() << " inserted at loc:" << hex << loc.getOfs() << " lenwhdr:" << hex << lenWHdr << dec << ' ' << ns << endl;
return loc;
}
@@ -1619,18 +1723,27 @@ namespace mongo {
Extent *e = r->myExtent(loc);
if ( e->lastRecord.isNull() ) {
- e->firstRecord = e->lastRecord = loc;
- r->prevOfs = r->nextOfs = DiskLoc::NullOfs;
+ Extent::FL *fl = getDur().writing( e->fl() );
+ fl->firstRecord = fl->lastRecord = loc;
+
+ Record::NP *np = getDur().writing(r->np());
+ np->nextOfs = np->prevOfs = DiskLoc::NullOfs;
}
else {
Record *oldlast = e->lastRecord.rec();
- r->prevOfs = e->lastRecord.getOfs();
- r->nextOfs = DiskLoc::NullOfs;
- oldlast->nextOfs = loc.getOfs();
- e->lastRecord = loc;
+ Record::NP *np = getDur().writing(r->np());
+ np->prevOfs = e->lastRecord.getOfs();
+ np->nextOfs = DiskLoc::NullOfs;
+ getDur().writingInt( oldlast->nextOfs ) = loc.getOfs();
+ e->lastRecord.writing() = loc;
}
- d->nrecords++;
+ /* todo: don't update for oplog? seems wasteful. */
+ {
+ NamespaceDetails::Stats *s = getDur().writing(&d->stats);
+ s->datasize += r->netLength();
+ s->nrecords++;
+ }
return r;
}
@@ -1641,7 +1754,7 @@ namespace mongo {
namespace mongo {
- void dropAllDatabasesExceptLocal() {
+ void dropAllDatabasesExceptLocal() {
writelock lk("");
vector<string> n;
@@ -1658,14 +1771,17 @@ namespace mongo {
void dropDatabase(string db) {
log(1) << "dropDatabase " << db << endl;
- assert( cc().database() );
- assert( cc().database()->name == db );
+ Database *d = cc().database();
+ assert( d );
+ assert( d->name == db );
- BackgroundOperation::assertNoBgOpInProgForDb(db.c_str());
+ BackgroundOperation::assertNoBgOpInProgForDb(d->name.c_str());
- Client::invalidateDB( db );
+ getDur().syncDataAndTruncateJournal();
+
+ Database::closeDatabase( d->name.c_str(), d->path );
+ d = 0; // d is now deleted
- closeDatabase( db.c_str() );
_deleteDataFiles( db.c_str() );
}
@@ -1674,13 +1790,14 @@ namespace mongo {
void boostRenameWrapper( const Path &from, const Path &to ) {
try {
boost::filesystem::rename( from, to );
- } catch ( const boost::filesystem::filesystem_error & ) {
+ }
+ catch ( const boost::filesystem::filesystem_error & ) {
// boost rename doesn't work across partitions
boost::filesystem::copy_file( from, to);
boost::filesystem::remove( from );
}
}
-
+
// back up original database files to 'temp' dir
void _renameForBackup( const char *database, const Path &reservedPath ) {
Path newPath( reservedPath );
@@ -1738,7 +1855,8 @@ namespace mongo {
ss << prefix << "_repairDatabase_" << i++;
reservedPath = repairPath / ss.str();
BOOST_CHECK_EXCEPTION( exists = boost::filesystem::exists( reservedPath ) );
- } while ( exists );
+ }
+ while ( exists );
return reservedPath;
}
@@ -1790,12 +1908,15 @@ namespace mongo {
stringstream ss;
ss << "localhost:" << cmdLine.port;
string localhost = ss.str();
-
+
problem() << "repairDatabase " << dbName << endl;
assert( cc().database()->name == dbName );
+ assert( cc().database()->path == dbpath );
BackgroundOperation::assertNoBgOpInProgForDb(dbName);
+ getDur().syncDataAndTruncateJournal(); // Must be done before and after repair
+
boost::intmax_t totalSize = dbSize( dbName );
boost::intmax_t freeSize = freeSpace( repairpath );
if ( freeSize > -1 && freeSize < totalSize ) {
@@ -1812,30 +1933,37 @@ namespace mongo {
"backup" : "$tmp" );
BOOST_CHECK_EXCEPTION( boost::filesystem::create_directory( reservedPath ) );
string reservedPathString = reservedPath.native_directory_string();
-
+
bool res;
- { // clone to temp location, which effectively does repair
+ {
+ // clone to temp location, which effectively does repair
Client::Context ctx( dbName, reservedPathString );
assert( ctx.justCreated() );
-
- res = cloneFrom(localhost.c_str(), errmsg, dbName,
- /*logForReplication=*/false, /*slaveok*/false, /*replauth*/false, /*snapshot*/false);
- closeDatabase( dbName, reservedPathString.c_str() );
+
+ res = cloneFrom(localhost.c_str(), errmsg, dbName,
+ /*logForReplication=*/false, /*slaveok*/false, /*replauth*/false, /*snapshot*/false);
+ Database::closeDatabase( dbName, reservedPathString.c_str() );
}
if ( !res ) {
problem() << "clone failed for " << dbName << " with error: " << errmsg << endl;
if ( !preserveClonedFilesOnFailure )
BOOST_CHECK_EXCEPTION( boost::filesystem::remove_all( reservedPath ) );
+
+ getDur().syncDataAndTruncateJournal(); // Must be done before and after repair
+
return false;
}
+ MongoFile::flushAll(true);
+
Client::Context ctx( dbName );
- closeDatabase( dbName );
+ Database::closeDatabase( dbName, dbpath );
if ( backupOriginalFiles ) {
_renameForBackup( dbName, reservedPath );
- } else {
+ }
+ else {
_deleteDataFiles( dbName );
BOOST_CHECK_EXCEPTION( boost::filesystem::create_directory( Path( dbpath ) / dbName ) );
}
@@ -1845,12 +1973,14 @@ namespace mongo {
if ( !backupOriginalFiles )
BOOST_CHECK_EXCEPTION( boost::filesystem::remove_all( reservedPath ) );
+ getDur().syncDataAndTruncateJournal(); // Must be done before and after repair
+
return true;
}
void _applyOpToDataFiles( const char *database, FileOp &fo, bool afterAllocator, const string& path ) {
if ( afterAllocator )
- theFileAllocator().waitUntilFinished();
+ FileAllocator::get()->waitUntilFinished();
string c = database;
c += '.';
boost::filesystem::path p(path);
@@ -1871,8 +2001,8 @@ namespace mongo {
q = p / ss.str();
BOOST_CHECK_EXCEPTION( ok = fo.apply(q) );
if ( ok ) {
- if ( extra != 10 ){
- log(1) << fo.op() << " file " << q.string() << '\n';
+ if ( extra != 10 ) {
+ log(1) << fo.op() << " file " << q.string() << endl;
log() << " _applyOpToDataFiles() warning: extra == " << extra << endl;
}
}
@@ -1883,19 +2013,20 @@ namespace mongo {
}
NamespaceDetails* nsdetails_notinline(const char *ns) { return nsdetails(ns); }
-
- bool DatabaseHolder::closeAll( const string& path , BSONObjBuilder& result , bool force ){
+
+ bool DatabaseHolder::closeAll( const string& path , BSONObjBuilder& result , bool force ) {
log() << "DatabaseHolder::closeAll path:" << path << endl;
dbMutex.assertWriteLocked();
-
+
map<string,Database*>& m = _paths[path];
_size -= m.size();
-
+
set< string > dbs;
for ( map<string,Database*>::iterator i = m.begin(); i != m.end(); i++ ) {
+ wassert( i->second->path == path );
dbs.insert( i->first );
}
-
+
currentClient.get()->getContext()->clear();
BSONObjBuilder bb( result.subarrayStart( "dbs" ) );
@@ -1910,7 +2041,7 @@ namespace mongo {
nNotClosed++;
}
else {
- closeDatabase( name.c_str() , path );
+ Database::closeDatabase( name.c_str() , path );
bb.append( bb.numStr( n++ ) , name );
}
}
@@ -1923,6 +2054,17 @@ namespace mongo {
return true;
}
-
+
+ bool isValidNS( const StringData& ns ) {
+ // TODO: should check for invalid characters
+
+ const char * x = strchr( ns.data() , '.' );
+ if ( ! x )
+ return false;
+
+ x++;
+ return *x > 0;
+ }
+
} // namespace mongo
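
A quick behavioural sketch of the two namespace checks this file now relies on (isANormalNSName replacing nsDollarCheck, plus the new isValidNS), assuming nothing beyond a plain assert; illustrative only, not part of the patch:

    // isValidNS wants a '.' with at least one character after it;
    // isANormalNSName rejects '$' except for the legacy local.oplog.$main name.
    assert(  isValidNS( "test.foo" ) );
    assert( !isValidNS( "test" ) );                       // no '.'
    assert( !isValidNS( "test." ) );                      // nothing after the '.'
    assert(  isANormalNSName( "test.foo" ) );
    assert( !isANormalNSName( "test.foo.$extra" ) );      // '$' is reserved for index data
    assert(  isANormalNSName( "local.oplog.$main" ) );    // special-cased legacy namespace
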
diff --git a/db/pdfile.h b/db/pdfile.h
index d268aac..91f4877 100644
--- a/db/pdfile.h
+++ b/db/pdfile.h
@@ -29,8 +29,9 @@
#include "../util/mmap.h"
#include "diskloc.h"
#include "jsobjmanipulator.h"
-#include "namespace.h"
+#include "namespace-inl.h"
#include "client.h"
+#include "mongommf.h"
namespace mongo {
@@ -45,53 +46,60 @@ namespace mongo {
/* low level - only drops this ns */
void dropNS(const string& dropNs);
-
+
/* deletes this ns, indexes and cursors */
- void dropCollection( const string &name, string &errmsg, BSONObjBuilder &result );
+ void dropCollection( const string &name, string &errmsg, BSONObjBuilder &result );
bool userCreateNS(const char *ns, BSONObj j, string& err, bool logForReplication, bool *deferIdIndex = 0);
shared_ptr<Cursor> findTableScan(const char *ns, const BSONObj& order, const DiskLoc &startLoc=DiskLoc());
-// -1 if library unavailable.
+ // -1 if library unavailable.
boost::intmax_t freeSpace( const string &path = dbpath );
+ bool isValidNS( const StringData& ns );
+
/*---------------------------------------------------------------------*/
class MongoDataFile {
friend class DataFileMgr;
friend class BasicCursor;
public:
- MongoDataFile(int fn) : fileNo(fn) { }
+ MongoDataFile(int fn) : _mb(0), fileNo(fn) { }
void open(const char *filename, int requestedDataSize = 0, bool preallocateOnly = false);
- /* allocate a new extent from this datafile.
+ /* allocate a new extent from this datafile.
@param capped - true if capped collection
@param loops is our recursion check variable - you want to pass in zero
*/
Extent* createExtent(const char *ns, int approxSize, bool capped = false, int loops = 0);
- DataFileHeader *getHeader() {
- return header;
- }
+ DataFileHeader *getHeader() { return header(); }
+
+ unsigned long long length() const { return mmf.length(); }
/* return max size an extent may be */
static int maxSize();
-
+
+ /** fsync */
void flush( bool sync );
-
+
+ /** only use for debugging */
+ Extent* debug_getExtent(DiskLoc loc) { return _getExtent( loc ); }
private:
void badOfs(int) const;
-
+ void badOfs2(int) const;
int defaultSize( const char *filename ) const;
- Extent* getExtent(DiskLoc loc);
- Extent* _getExtent(DiskLoc loc);
+ Extent* getExtent(DiskLoc loc) const;
+ Extent* _getExtent(DiskLoc loc) const;
Record* recordAt(DiskLoc dl);
Record* makeRecord(DiskLoc dl, int size);
- void grow(DiskLoc dl, int size);
+ void grow(DiskLoc dl, int size);
- MMF mmf;
- MMF::Pointer _p;
- DataFileHeader *header;
+ char* p() const { return (char *) _mb; }
+ DataFileHeader* header() { return (DataFileHeader*) _mb; }
+
+ MongoMMF mmf;
+ void *_mb; // the memory mapped view
int fileNo;
};
@@ -110,9 +118,9 @@ namespace mongo {
NamespaceDetails *d,
NamespaceDetailsTransient *nsdt,
Record *toupdate, const DiskLoc& dl,
- const char *buf, int len, OpDebug& debug, bool &changedId, bool god=false);
+ const char *buf, int len, OpDebug& debug, bool god=false);
- // The object o may be updated if modified on insert.
+ // The object o may be updated if modified on insert.
void insertAndLog( const char *ns, const BSONObj &o, bool god = false );
/** @param obj both an in and out param -- insert can sometimes modify an object (such as add _id). */
@@ -122,7 +130,6 @@ namespace mongo {
void insertNoReturnVal(const char *ns, BSONObj o, bool god = false);
DiskLoc insert(const char *ns, const void *buf, int len, bool god = false, const BSONElement &writeId = BSONElement(), bool mayAddIndex = true);
- void deleteRecord(const char *ns, Record *todelete, const DiskLoc& dl, bool cappedOK = false, bool noWarn = false);
static shared_ptr<Cursor> findAll(const char *ns, const DiskLoc &startLoc = DiskLoc());
/* special version of insert for transaction logging -- streamlined a bit.
@@ -134,9 +141,10 @@ namespace mongo {
static Extent* getExtent(const DiskLoc& dl);
static Record* getRecord(const DiskLoc& dl);
static DeletedRecord* makeDeletedRecord(const DiskLoc& dl, int len);
- static void grow(const DiskLoc& dl, int len);
- /* does not clean up indexes, etc. : just deletes the record in the pdfile. */
+ void deleteRecord(const char *ns, Record *todelete, const DiskLoc& dl, bool cappedOK = false, bool noWarn = false);
+
+ /* does not clean up indexes, etc. : just deletes the record in the pdfile. use deleteRecord() to unindex */
void _deleteRecord(NamespaceDetails *d, const char *ns, Record *todelete, const DiskLoc& dl);
private:
@@ -175,7 +183,10 @@ namespace mongo {
int extentOfs;
int nextOfs;
int prevOfs;
+
+ /** be careful when referencing this that your write intent was correct */
char data[4];
+
int netLength() {
return lengthWithHeaders - HeaderSize;
}
@@ -192,6 +203,12 @@ namespace mongo {
/* get the next record in the namespace, traversing extents as necessary */
DiskLoc getNext(const DiskLoc& myLoc);
DiskLoc getPrev(const DiskLoc& myLoc);
+
+ struct NP {
+ int nextOfs;
+ int prevOfs;
+ };
+ NP* np() { return (NP*) &nextOfs; }
};
/* extents are datafile regions where all the records within the region
@@ -206,13 +223,14 @@ namespace mongo {
DiskLoc myLoc;
DiskLoc xnext, xprev; /* next/prev extent for this namespace */
- /* which namespace this extent is for. this is just for troubleshooting really
+ /* which namespace this extent is for. this is just for troubleshooting really
and won't even be correct if the collection were renamed!
*/
- Namespace nsDiagnostic;
+ Namespace nsDiagnostic;
int length; /* size of the extent, including these fields */
- DiskLoc firstRecord, lastRecord;
+ DiskLoc firstRecord;
+ DiskLoc lastRecord;
char _extentData[4];
static int HeaderSize() { return sizeof(Extent)-4; }
@@ -224,7 +242,7 @@ namespace mongo {
void dump(iostream& s) {
s << " loc:" << myLoc.toString() << " xnext:" << xnext.toString() << " xprev:" << xprev.toString() << '\n';
- s << " nsdiag:" << nsDiagnostic.buf << '\n';
+ s << " nsdiag:" << nsDiagnostic.toString() << '\n';
s << " size:" << length << " firstRecord:" << firstRecord.toString() << " lastRecord:" << lastRecord.toString() << '\n';
}
@@ -237,9 +255,8 @@ namespace mongo {
/* like init(), but for a reuse case */
DiskLoc reuse(const char *nsname);
- void assertOk() {
- assert(magic == 0x41424344);
- }
+ bool isOk() const { return magic == 0x41424344; }
+ void assertOk() const { assert(isOk()); }
Record* newRecord(int len);
@@ -251,19 +268,38 @@ namespace mongo {
return (Record *) (((char *) this) + x);
}
- Extent* getNextExtent() {
- return xnext.isNull() ? 0 : DataFileMgr::getExtent(xnext);
- }
- Extent* getPrevExtent() {
- return xprev.isNull() ? 0 : DataFileMgr::getExtent(xprev);
- }
-
+ Extent* getNextExtent() { return xnext.isNull() ? 0 : DataFileMgr::getExtent(xnext); }
+ Extent* getPrevExtent() { return xprev.isNull() ? 0 : DataFileMgr::getExtent(xprev); }
+
static int maxSize();
+ static int minSize() { return 0x100; }
+ /**
+ * @param len length of record we need
+ * @param lastExtentLen size of the last extent, which is a factor in the next extent's size
+ */
+ static int followupSize(int len, int lastExtentLen);
+
+ /**
+ * @param len length of record we need
+ */
+ static int initialSize(int len);
+
+ struct FL {
+ DiskLoc firstRecord;
+ DiskLoc lastRecord;
+ };
+ /** often we want to update just the firstRecord and lastRecord fields.
+ this helper is for that -- for use with the getDur().writing() method
+ */
+ FL* fl() { return (FL*) &firstRecord; }
+ private:
+ DiskLoc _reuse(const char *nsname);
};
- /*
+ /* a datafile - i.e. the "dbname.<#>" files :
+
----------------------
- Header
+ DataFileHeader
----------------------
Extent (for a particular namespace)
Record
@@ -273,7 +309,6 @@ namespace mongo {
more Extents...
----------------------
*/
-
class DataFileHeader {
public:
int version;
@@ -287,35 +322,27 @@ namespace mongo {
enum { HeaderSize = 8192 };
- bool currentVersion() const {
- return ( version == VERSION ) && ( versionMinor == VERSION_MINOR );
- }
-
- bool uninitialized() const {
- if ( version == 0 ) return true;
- return false;
- }
+ bool isCurrentVersion() const { return ( version == VERSION ) && ( versionMinor == VERSION_MINOR ); }
- /*Record* __getRecord(DiskLoc dl) {
- int ofs = dl.getOfs();
- assert( ofs >= HeaderSize );
- return (Record*) (((char *) this) + ofs);
- }*/
+ bool uninitialized() const { return version == 0; }
- void init(int fileno, int filelength) {
+ void init(int fileno, int filelength, const char* filename) {
if ( uninitialized() ) {
- assert(filelength > 32768 );
+ if( !(filelength > 32768 ) ) {
+ massert(13640, str::stream() << "DataFileHeader looks corrupt at file open filelength:" << filelength << " fileno:" << fileno, false);
+ }
+ getDur().createdFile(filename, filelength);
assert( HeaderSize == 8192 );
- fileLength = filelength;
- version = VERSION;
- versionMinor = VERSION_MINOR;
- unused.setOfs( fileno, HeaderSize );
+ DataFileHeader *h = getDur().writing(this);
+ h->fileLength = filelength;
+ h->version = VERSION;
+ h->versionMinor = VERSION_MINOR;
+ h->unused.set( fileno, HeaderSize );
assert( (data-(char*)this) == HeaderSize );
- unusedLength = fileLength - HeaderSize - 16;
- //memcpy(data+unusedLength, " \nthe end\n", 16);
+ h->unusedLength = fileLength - HeaderSize - 16;
}
}
-
+
bool isEmpty() const {
return uninitialized() || ( unusedLength == fileLength - HeaderSize - 16 );
}
@@ -323,13 +350,13 @@ namespace mongo {
#pragma pack()
- inline Extent* MongoDataFile::_getExtent(DiskLoc loc) {
+ inline Extent* MongoDataFile::_getExtent(DiskLoc loc) const {
loc.assertOk();
- Extent *e = (Extent *) _p.at(loc.getOfs(), Extent::HeaderSize());
+ Extent *e = (Extent *) (p()+loc.getOfs());
return e;
}
- inline Extent* MongoDataFile::getExtent(DiskLoc loc) {
+ inline Extent* MongoDataFile::getExtent(DiskLoc loc) const {
Extent *e = _getExtent(loc);
e->assertOk();
return e;
@@ -344,18 +371,13 @@ namespace mongo {
inline Record* MongoDataFile::recordAt(DiskLoc dl) {
int ofs = dl.getOfs();
if( ofs < DataFileHeader::HeaderSize ) badOfs(ofs); // will uassert - external call to keep out of the normal code path
- return (Record*) _p.at(ofs, -1);
+ return (Record*) (p()+ofs);
}
- inline void MongoDataFile::grow(DiskLoc dl, int size) {
- int ofs = dl.getOfs();
- _p.grow(ofs, size);
- }
-
- inline Record* MongoDataFile::makeRecord(DiskLoc dl, int size) {
+ inline Record* MongoDataFile::makeRecord(DiskLoc dl, int size) {
int ofs = dl.getOfs();
- assert( ofs >= DataFileHeader::HeaderSize );
- return (Record*) _p.at(ofs, size);
+ if( ofs < DataFileHeader::HeaderSize ) badOfs(ofs); // will uassert - external call to keep out of the normal code path
+ return (Record*) (p()+ofs);
}
inline DiskLoc Record::getNext(const DiskLoc& myLoc) {
@@ -395,50 +417,23 @@ namespace mongo {
return BSONObj(rec());
}
inline DeletedRecord* DiskLoc::drec() const {
- assert( fileNo != -1 );
+ assert( _a != -1 );
return (DeletedRecord*) rec();
}
inline Extent* DiskLoc::ext() const {
return DataFileMgr::getExtent(*this);
}
-
- /*---------------------------------------------------------------------*/
+ inline const BtreeBucket* DiskLoc::btree() const {
+ assert( _a != -1 );
+ return (const BtreeBucket *) rec()->data;
+ }
} // namespace mongo
-#include "rec.h"
#include "database.h"
namespace mongo {
- // Heritable class to implement an operation that may be applied to all
- // files in a database using _applyOpToDataFiles()
- class FileOp {
- public:
- virtual ~FileOp() {}
- // Return true if file exists and operation successful
- virtual bool apply( const boost::filesystem::path &p ) = 0;
- virtual const char * op() const = 0;
- };
-
- void _applyOpToDataFiles( const char *database, FileOp &fo, bool afterAllocator = false, const string& path = dbpath );
-
- inline void _deleteDataFiles(const char *database) {
- if ( directoryperdb ) {
- BOOST_CHECK_EXCEPTION( boost::filesystem::remove_all( boost::filesystem::path( dbpath ) / database ) );
- return;
- }
- class : public FileOp {
- virtual bool apply( const boost::filesystem::path &p ) {
- return boost::filesystem::remove( p );
- }
- virtual const char * op() const {
- return "remove";
- }
- } deleter;
- _applyOpToDataFiles( database, deleter, true );
- }
-
boost::intmax_t dbSize( const char *database );
inline NamespaceIndex* nsindex(const char *ns) {
@@ -462,11 +457,6 @@ namespace mongo {
return nsindex(ns)->details(ns);
}
- inline MongoDataFile& DiskLoc::pdf() const {
- assert( fileNo != -1 );
- return *cc().database()->getFile(fileNo);
- }
-
inline Extent* DataFileMgr::getExtent(const DiskLoc& dl) {
assert( dl.a() != -1 );
return cc().database()->getFile(dl.a())->getExtent(dl);
@@ -477,30 +467,30 @@ namespace mongo {
return cc().database()->getFile(dl.a())->recordAt(dl);
}
- BOOST_STATIC_ASSERT( 16 == sizeof(DeletedRecord) );
-
- inline void DataFileMgr::grow(const DiskLoc& dl, int len) {
- assert( dl.a() != -1 );
- cc().database()->getFile(dl.a())->grow(dl, len);
- }
+ BOOST_STATIC_ASSERT( 16 == sizeof(DeletedRecord) );
- inline DeletedRecord* DataFileMgr::makeDeletedRecord(const DiskLoc& dl, int len) {
+ inline DeletedRecord* DataFileMgr::makeDeletedRecord(const DiskLoc& dl, int len) {
assert( dl.a() != -1 );
return (DeletedRecord*) cc().database()->getFile(dl.a())->makeRecord(dl, sizeof(DeletedRecord));
}
-
+
void ensureHaveIdIndex(const char *ns);
-
+
bool dropIndexes( NamespaceDetails *d, const char *ns, const char *name, string &errmsg, BSONObjBuilder &anObjBuilder, bool maydeleteIdIndex );
/**
- * @return true if ns is ok
+ * @return true if ns is 'normal'. a '$' is used for collections holding index data, which do not contain BSON objects in their records.
+ * special case for the local.oplog.$main ns -- naming it as such was a mistake.
*/
- inline bool nsDollarCheck( const char* ns ){
+ inline bool isANormalNSName( const char* ns ) {
if ( strchr( ns , '$' ) == 0 )
return true;
-
return strcmp( ns, "local.oplog.$main" ) == 0;
}
+
+ inline BSONObj::BSONObj(const Record *r) {
+ init(r->data, false);
+ }
+
} // namespace mongo
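
The write-intent pattern that recurs throughout this patch is easiest to see in one place: a field living in a memory-mapped file is never assigned directly, its address is first declared to the durability layer through getDur(), which hands back a writable alias. A condensed sketch of the insert path above, illustrative only:

    Extent *e = r->myExtent(loc);
    if ( e->lastRecord.isNull() ) {
        // Extent::FL covers firstRecord and lastRecord with one intent declaration
        Extent::FL *fl = getDur().writing( e->fl() );
        fl->firstRecord = fl->lastRecord = loc;
    }
    else {
        Record *oldlast = e->lastRecord.rec();
        getDur().writingInt( oldlast->nextOfs ) = loc.getOfs();   // one int field
        getDur().writingDiskLoc( e->lastRecord ) = loc;           // one DiskLoc field
    }
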
diff --git a/db/projection.cpp b/db/projection.cpp
new file mode 100644
index 0000000..3dcfef7
--- /dev/null
+++ b/db/projection.cpp
@@ -0,0 +1,301 @@
+// projection.cpp
+
+/* Copyright 2009 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "pch.h"
+#include "projection.h"
+#include "../util/mongoutils/str.h"
+
+namespace mongo {
+
+ void Projection::init( const BSONObj& o ) {
+ massert( 10371 , "can only add to Projection once", _source.isEmpty());
+ _source = o;
+
+ BSONObjIterator i( o );
+ int true_false = -1;
+ while ( i.more() ) {
+ BSONElement e = i.next();
+
+ if ( ! e.isNumber() )
+ _hasNonSimple = true;
+
+ if (e.type() == Object) {
+ BSONObj obj = e.embeddedObject();
+ BSONElement e2 = obj.firstElement();
+ if ( strcmp(e2.fieldName(), "$slice") == 0 ) {
+ if (e2.isNumber()) {
+ int i = e2.numberInt();
+ if (i < 0)
+ add(e.fieldName(), i, -i); // limit is now positive
+ else
+ add(e.fieldName(), 0, i);
+
+ }
+ else if (e2.type() == Array) {
+ BSONObj arr = e2.embeddedObject();
+ uassert(13099, "$slice array wrong size", arr.nFields() == 2 );
+
+ BSONObjIterator it(arr);
+ int skip = it.next().numberInt();
+ int limit = it.next().numberInt();
+ uassert(13100, "$slice limit must be positive", limit > 0 );
+ add(e.fieldName(), skip, limit);
+
+ }
+ else {
+ uassert(13098, "$slice only supports numbers and [skip, limit] arrays", false);
+ }
+ }
+ else {
+ uassert(13097, string("Unsupported projection option: ") + obj.firstElement().fieldName(), false);
+ }
+
+ }
+ else if (!strcmp(e.fieldName(), "_id") && !e.trueValue()) {
+ _includeID = false;
+
+ }
+ else {
+
+ add (e.fieldName(), e.trueValue());
+
+ // validate input
+ if (true_false == -1) {
+ true_false = e.trueValue();
+ _include = !e.trueValue();
+ }
+ else {
+ uassert( 10053 , "You cannot currently mix including and excluding fields. Contact us if this is an issue." ,
+ (bool)true_false == e.trueValue() );
+ }
+ }
+ }
+ }
+
+ void Projection::add(const string& field, bool include) {
+ if (field.empty()) { // this is the field the user referred to
+ _include = include;
+ }
+ else {
+ _include = !include;
+
+ const size_t dot = field.find('.');
+ const string subfield = field.substr(0,dot);
+ const string rest = (dot == string::npos ? "" : field.substr(dot+1,string::npos));
+
+ boost::shared_ptr<Projection>& fm = _fields[subfield];
+ if (!fm)
+ fm.reset(new Projection());
+
+ fm->add(rest, include);
+ }
+ }
+
+ void Projection::add(const string& field, int skip, int limit) {
+ _special = true; // can't include or exclude whole object
+
+ if (field.empty()) { // this is the field the user referred to
+ _skip = skip;
+ _limit = limit;
+ }
+ else {
+ const size_t dot = field.find('.');
+ const string subfield = field.substr(0,dot);
+ const string rest = (dot == string::npos ? "" : field.substr(dot+1,string::npos));
+
+ boost::shared_ptr<Projection>& fm = _fields[subfield];
+ if (!fm)
+ fm.reset(new Projection());
+
+ fm->add(rest, skip, limit);
+ }
+ }
+
+ void Projection::transform( const BSONObj& in , BSONObjBuilder& b ) const {
+ BSONObjIterator i(in);
+ while ( i.more() ) {
+ BSONElement e = i.next();
+ if ( mongoutils::str::equals( "_id" , e.fieldName() ) ) {
+ if ( _includeID )
+ b.append( e );
+ }
+ else {
+ append( b , e );
+ }
+ }
+ }
+
+ BSONObj Projection::transform( const BSONObj& in ) const {
+ BSONObjBuilder b;
+ transform( in , b );
+ return b.obj();
+ }
+
+
+ //b will be the value part of an array-typed BSONElement
+ void Projection::appendArray( BSONObjBuilder& b , const BSONObj& a , bool nested) const {
+ int skip = nested ? 0 : _skip;
+ int limit = nested ? -1 : _limit;
+
+ if (skip < 0) {
+ skip = max(0, skip + a.nFields());
+ }
+
+ int i=0;
+ BSONObjIterator it(a);
+ while (it.more()) {
+ BSONElement e = it.next();
+
+ if (skip) {
+ skip--;
+ continue;
+ }
+
+ if (limit != -1 && (limit-- == 0)) {
+ break;
+ }
+
+ switch(e.type()) {
+ case Array: {
+ BSONObjBuilder subb;
+ appendArray(subb , e.embeddedObject(), true);
+ b.appendArray(b.numStr(i++), subb.obj());
+ break;
+ }
+ case Object: {
+ BSONObjBuilder subb;
+ BSONObjIterator jt(e.embeddedObject());
+ while (jt.more()) {
+ append(subb , jt.next());
+ }
+ b.append(b.numStr(i++), subb.obj());
+ break;
+ }
+ default:
+ if (_include)
+ b.appendAs(e, b.numStr(i++));
+ }
+ }
+ }
+
+ void Projection::append( BSONObjBuilder& b , const BSONElement& e ) const {
+ FieldMap::const_iterator field = _fields.find( e.fieldName() );
+
+ if (field == _fields.end()) {
+ if (_include)
+ b.append(e);
+ }
+ else {
+ Projection& subfm = *field->second;
+
+ if ((subfm._fields.empty() && !subfm._special) || !(e.type()==Object || e.type()==Array) ) {
+ if (subfm._include)
+ b.append(e);
+ }
+ else if (e.type() == Object) {
+ BSONObjBuilder subb;
+ BSONObjIterator it(e.embeddedObject());
+ while (it.more()) {
+ subfm.append(subb, it.next());
+ }
+ b.append(e.fieldName(), subb.obj());
+
+ }
+ else { //Array
+ BSONObjBuilder subb;
+ subfm.appendArray(subb, e.embeddedObject());
+ b.appendArray(e.fieldName(), subb.obj());
+ }
+ }
+ }
+
+ Projection::KeyOnly* Projection::checkKey( const BSONObj& keyPattern ) const {
+ if ( _include ) {
+ // if we default to including then we can't
+ // use an index because we don't know what we're missing
+ return 0;
+ }
+
+ if ( _hasNonSimple )
+ return 0;
+
+ if ( _includeID && keyPattern["_id"].eoo() )
+ return 0;
+
+ // at this point we know its all { x : 1 } style
+
+ auto_ptr<KeyOnly> p( new KeyOnly() );
+
+ int got = 0;
+ BSONObjIterator i( keyPattern );
+ while ( i.more() ) {
+ BSONElement k = i.next();
+
+ if ( _source[k.fieldName()].type() ) {
+
+ if ( strchr( k.fieldName() , '.' ) ) {
+ // TODO we currently don't support dotted fields
+ // SERVER-2104
+ return 0;
+ }
+
+ if ( ! _includeID && mongoutils::str::equals( k.fieldName() , "_id" ) ) {
+ p->addNo();
+ }
+ else {
+ p->addYes( k.fieldName() );
+ got++;
+ }
+ }
+ else if ( mongoutils::str::equals( "_id" , k.fieldName() ) && _includeID ) {
+ p->addYes( "_id" );
+ }
+ else {
+ p->addNo();
+ }
+
+ }
+
+ int need = _source.nFields();
+ if ( ! _includeID )
+ need--;
+
+ if ( got == need )
+ return p.release();
+
+ return 0;
+ }
+
+ BSONObj Projection::KeyOnly::hydrate( const BSONObj& key ) const {
+ assert( _include.size() == _names.size() );
+
+ BSONObjBuilder b( key.objsize() + _stringSize + 16 );
+
+ BSONObjIterator i(key);
+ unsigned n=0;
+ while ( i.more() ) {
+ assert( n < _include.size() );
+ BSONElement e = i.next();
+ if ( _include[n] ) {
+ b.appendAs( e , _names[n] );
+ }
+ n++;
+ }
+
+ return b.obj();
+ }
+}
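
A usage sketch for the new Projection class, using the BSON()/fromjson helpers already in the tree; the spec and document here are made up for illustration and are not taken from the patch:

    Projection p;
    // keep x, trim comments to its last three elements; everything else is dropped
    p.init( BSON( "x" << 1 << "comments" << BSON( "$slice" << -3 ) ) );

    BSONObj in  = fromjson( "{\"_id\":1,\"x\":5,\"y\":6,\"comments\":[1,2,3,4,5]}" );
    BSONObj out = p.transform( in );   // { _id: 1, x: 5, comments: [3,4,5] }
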
diff --git a/db/projection.h b/db/projection.h
new file mode 100644
index 0000000..fd3b856
--- /dev/null
+++ b/db/projection.h
@@ -0,0 +1,127 @@
+// projection.h
+
+/* Copyright 2009 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "pch.h"
+#include "jsobj.h"
+
+namespace mongo {
+
+ /**
+ * given a document and a projection specification,
+ * can transform the document accordingly;
+ * currently supports selecting fields and $slice
+ */
+ class Projection {
+ public:
+
+ class KeyOnly {
+ public:
+
+ KeyOnly() : _stringSize(0) {}
+
+ BSONObj hydrate( const BSONObj& key ) const;
+
+ void addNo() { _add( false , "" ); }
+ void addYes( const string& name ) { _add( true , name ); }
+
+ private:
+
+ void _add( bool b , const string& name ) {
+ _include.push_back( b );
+ _names.push_back( name );
+ _stringSize += name.size();
+ }
+
+ vector<bool> _include; // one entry per field in key. true iff should be in output
+ vector<string> _names; // name of field since key doesn't have names
+
+ int _stringSize;
+ };
+
+ Projection() :
+ _include(true) ,
+ _special(false) ,
+ _includeID(true) ,
+ _skip(0) ,
+ _limit(-1) ,
+ _hasNonSimple(false) {
+ }
+
+ /**
+ * called once per lifetime
+ * e.g. { "x" : 1 , "a.y" : 1 }
+ */
+ void init( const BSONObj& spec );
+
+ /**
+ * @return the spec init was called with
+ */
+ BSONObj getSpec() const { return _source; }
+
+ /**
+ * transforms in according to spec
+ */
+ BSONObj transform( const BSONObj& in ) const;
+
+
+ /**
+ * transforms in according to spec
+ */
+ void transform( const BSONObj& in , BSONObjBuilder& b ) const;
+
+
+ /**
+ * @return a new KeyOnly if the keyPattern carries all the information needed
+ *         to satisfy the projection, otherwise null
+ * NOTE: a key may hold modified data (arrays, geo), which has to be
+ *       handled above this layer
+ */
+ KeyOnly* checkKey( const BSONObj& keyPattern ) const;
+
+ private:
+
+ /**
+ * appends e to b if user wants it
+ * will descend into e if needed
+ */
+ void append( BSONObjBuilder& b , const BSONElement& e ) const;
+
+
+ void add( const string& field, bool include );
+ void add( const string& field, int skip, int limit );
+ void appendArray( BSONObjBuilder& b , const BSONObj& a , bool nested=false) const;
+
+ bool _include; // true if default at this level is to include
+ bool _special; // true if this level can't be skipped or included without recursing
+
+ //TODO: benchmark vector<pair> vs map
+ typedef map<string, boost::shared_ptr<Projection> > FieldMap;
+ FieldMap _fields;
+ BSONObj _source;
+ bool _includeID;
+
+ // used for $slice operator
+ int _skip;
+ int _limit;
+
+ bool _hasNonSimple;
+ };
+
+
+}
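
checkKey() and KeyOnly are what let a simple inclusion projection be answered straight from index keys (the keyFieldsOnly logic in the query.cpp changes below uses exactly this). A rough sketch of that fast path; btreeCursor is a hypothetical cursor variable and the key pattern is invented for illustration:

    Projection p;
    p.init( BSON( "a" << 1 << "_id" << 0 ) );          // pure inclusion, _id excluded

    // non-null only when every requested field appears in the index key pattern
    scoped_ptr<Projection::KeyOnly> keyOnly( p.checkKey( BSON( "a" << 1 << "b" << 1 ) ) );
    if ( keyOnly ) {
        // rebuild the projected document from the key alone -- no record fetch needed
        BSONObj doc = keyOnly->hydrate( btreeCursor->currKey() );
    }
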
diff --git a/db/query.cpp b/db/query.cpp
index 154fd15..df09fce 100644
--- a/db/query.cpp
+++ b/db/query.cpp
@@ -30,7 +30,7 @@
#include "replpair.h"
#include "scanandorder.h"
#include "security.h"
-#include "curop.h"
+#include "curop-inl.h"
#include "commands.h"
#include "queryoptimizer.h"
#include "lasterror.h"
@@ -67,7 +67,7 @@ namespace mongo {
_cc.reset( new ClientCursor( QueryOption_NoCursorTimeout , c_ , qp().ns() ) );
}
return _cc->prepareToYield( _yieldData );
- }
+ }
virtual void recoverFromYield() {
if ( !ClientCursor::recoverFromYield( _yieldData ) ) {
_cc.reset();
@@ -75,24 +75,28 @@ namespace mongo {
massert( 13340, "cursor dropped during delete", false );
}
}
+ virtual long long nscanned() {
+ assert( c_.get() );
+ return c_->nscanned();
+ }
virtual void next() {
if ( !c_->ok() ) {
setComplete();
return;
}
-
+
DiskLoc rloc = c_->currLoc();
-
+
if ( matcher()->matches(c_->currKey(), rloc ) ) {
if ( !c_->getsetdup(rloc) )
++count_;
}
c_->advance();
- ++_nscanned;
+ _nscanned = c_->nscanned();
if ( count_ > bestCount_ )
bestCount_ = count_;
-
+
if ( count_ > 0 ) {
if ( justOne_ )
setComplete();
@@ -115,7 +119,7 @@ namespace mongo {
ClientCursor::CleanupPointer _cc;
ClientCursor::YieldData _yieldData;
};
-
+
/* ns: namespace, e.g. <database>.<collection>
pattern: the "where" clause / criteria
justOne: stop after 1 match
@@ -124,13 +128,13 @@ namespace mongo {
long long deleteObjects(const char *ns, BSONObj pattern, bool justOneOrig, bool logop, bool god, RemoveSaver * rs ) {
if( !god ) {
if ( strstr(ns, ".system.") ) {
- /* note a delete from system.indexes would corrupt the db
- if done here, as there are pointers into those objects in
+ /* note a delete from system.indexes would corrupt the db
+ if done here, as there are pointers into those objects in
NamespaceDetails.
*/
uassert(12050, "cannot delete from system namespace", legalClientSystemNS( ns , true ) );
}
- if ( strchr( ns , '$' ) ){
+ if ( strchr( ns , '$' ) ) {
log() << "cannot delete from collection with reserved $ in name: " << ns << endl;
uassert( 10100 , "cannot delete from collection with reserved $ in name", strchr(ns, '$') == 0 );
}
@@ -145,55 +149,56 @@ namespace mongo {
int best = 0;
shared_ptr< MultiCursor::CursorOp > opPtr( new DeleteOp( justOneOrig, best ) );
- shared_ptr< MultiCursor > creal( new MultiCursor( ns, pattern, BSONObj(), opPtr, true ) );
-
+ shared_ptr< MultiCursor > creal( new MultiCursor( ns, pattern, BSONObj(), opPtr, !god ) );
+
if( !creal->ok() )
return nDeleted;
-
+
shared_ptr< Cursor > cPtr = creal;
auto_ptr<ClientCursor> cc( new ClientCursor( QueryOption_NoCursorTimeout, cPtr, ns) );
cc->setDoingDeletes( true );
-
- CursorId id = cc->cursorid;
-
+
+ CursorId id = cc->cursorid();
+
bool justOne = justOneOrig;
bool canYield = !god && !creal->matcher()->docMatcher().atomic();
+
do {
- if ( canYield && ! cc->yieldSometimes() ){
+ if ( canYield && ! cc->yieldSometimes() ) {
cc.release(); // has already been deleted elsewhere
// TODO should we assert or something?
break;
}
- if ( !cc->c->ok() ) {
+ if ( !cc->ok() ) {
break; // if we yielded, could have hit the end
}
-
+
// this way we can avoid calling updateLocation() every time (expensive)
// as well as some other nuances handled
cc->setDoingDeletes( true );
-
- DiskLoc rloc = cc->c->currLoc();
- BSONObj key = cc->c->currKey();
- // NOTE Calling advance() may change the matcher, so it's important
+ DiskLoc rloc = cc->currLoc();
+ BSONObj key = cc->currKey();
+
+ // NOTE Calling advance() may change the matcher, so it's important
// to try to match first.
bool match = creal->matcher()->matches( key , rloc );
-
- if ( ! cc->c->advance() )
+
+ if ( ! cc->advance() )
justOne = true;
-
+
if ( ! match )
continue;
-
- assert( !cc->c->getsetdup(rloc) ); // can't be a dup, we deleted it!
-
+
+ assert( !cc->c()->getsetdup(rloc) ); // can't be a dup, we deleted it!
+
if ( !justOne ) {
/* NOTE: this is SLOW. this is not good, noteLocation() was designed to be called across getMore
blocks. here we might call millions of times which would be bad.
*/
- cc->c->noteLocation();
+ cc->c()->noteLocation();
}
-
+
if ( logop ) {
BSONElement e;
if( BSONObj( rloc.rec() ).getObjectID( e ) ) {
@@ -201,7 +206,8 @@ namespace mongo {
b.append( e );
bool replJustOne = true;
logOp( "d", ns, b.done(), 0, &replJustOne );
- } else {
+ }
+ else {
problem() << "deleted object without id, not logging" << endl;
}
}
@@ -214,14 +220,20 @@ namespace mongo {
if ( justOne ) {
break;
}
- cc->c->checkLocation();
-
- } while ( cc->c->ok() );
+ cc->c()->checkLocation();
+
+ if( !god )
+ getDur().commitIfNeeded();
- if ( cc.get() && ClientCursor::find( id , false ) == 0 ){
+ if( debug && god && nDeleted == 100 )
+ log() << "warning high number of deletes with god=true which could use significant memory" << endl;
+ }
+ while ( cc->ok() );
+
+ if ( cc.get() && ClientCursor::find( id , false ) == 0 ) {
cc.release();
}
-
+
return nDeleted;
}
@@ -246,16 +258,6 @@ namespace mongo {
int nCaught = 0;
- void killCursors(int n, long long *ids) {
- int k = 0;
- for ( int i = 0; i < n; i++ ) {
- if ( ClientCursor::erase(ids[i]) )
- k++;
- }
- if ( logLevel > 0 || k != n ){
- log( k == n ) << "killcursors: found " << k << " of " << n << endl;
- }
- }
BSONObj id_obj = fromjson("{\"_id\":1}");
BSONObj empty_obj = fromjson("{}");
@@ -278,21 +280,20 @@ namespace mongo {
}
QueryResult* processGetMore(const char *ns, int ntoreturn, long long cursorid , CurOp& curop, int pass, bool& exhaust ) {
-// log() << "TEMP GETMORE " << ns << ' ' << cursorid << ' ' << pass << endl;
exhaust = false;
ClientCursor::Pointer p(cursorid);
- ClientCursor *cc = p._c;
-
+ ClientCursor *cc = p.c();
+
int bufSize = 512;
- if ( cc ){
+ if ( cc ) {
bufSize += sizeof( QueryResult );
- bufSize += ( ntoreturn ? 4 : 1 ) * 1024 * 1024;
+ bufSize += MaxBytesToReturnToClientAtOnce;
}
BufBuilder b( bufSize );
b.skip(sizeof(QueryResult));
-
+
int resultFlags = ResultFlag_AwaitCapable;
int start = 0;
int n = 0;
@@ -306,23 +307,27 @@ namespace mongo {
if ( pass == 0 )
cc->updateSlaveLocation( curop );
- int queryOptions = cc->_queryOptions;
+ int queryOptions = cc->queryOptions();
if( pass == 0 ) {
StringBuilder& ss = curop.debug().str;
- ss << " getMore: " << cc->query.toString() << " ";
+ ss << " getMore: " << cc->query().toString() << " ";
}
-
- start = cc->pos;
- Cursor *c = cc->c.get();
+
+ start = cc->pos();
+ Cursor *c = cc->c();
c->checkLocation();
DiskLoc last;
+ scoped_ptr<Projection::KeyOnly> keyFieldsOnly;
+ if ( cc->modifiedKeys() == false && cc->isMultiKey() == false && cc->fields )
+ keyFieldsOnly.reset( cc->fields->checkKey( cc->indexKeyPattern() ) );
+
while ( 1 ) {
if ( !c->ok() ) {
if ( c->tailable() ) {
- /* when a tailable cursor hits "EOF", ok() goes false, and current() is null. however
- advance() can still be retries as a reactivation attempt. when there is new data, it will
+ /* when a tailable cursor hits "EOF", ok() goes false, and current() is null. however
+ advance() can still be retried as a reactivation attempt. when there is new data, it will
return true. that's what we are doing here.
*/
if ( c->advance() )
@@ -356,27 +361,40 @@ namespace mongo {
}
else {
last = c->currLoc();
- BSONObj js = c->current();
-
- // show disk loc should be part of the main query, not in an $or clause, so this should be ok
- fillQueryResultFromObj(b, cc->fields.get(), js, ( cc->pq.get() && cc->pq->showDiskLoc() ? &last : 0));
n++;
- if ( (ntoreturn>0 && (n >= ntoreturn || b.len() > MaxBytesToReturnToClientAtOnce)) ||
- (ntoreturn==0 && b.len()>1*1024*1024) ) {
+
+ if ( keyFieldsOnly ) {
+ fillQueryResultFromObj(b, 0, keyFieldsOnly->hydrate( c->currKey() ) );
+ }
+ else {
+ BSONObj js = c->current();
+ // show disk loc should be part of the main query, not in an $or clause, so this should be ok
+ fillQueryResultFromObj(b, cc->fields.get(), js, ( cc->pq.get() && cc->pq->showDiskLoc() ? &last : 0));
+ }
+
+ if ( ( ntoreturn && n >= ntoreturn ) || b.len() > MaxBytesToReturnToClientAtOnce ) {
c->advance();
- cc->pos += n;
+ cc->incPos( n );
break;
}
}
}
c->advance();
+
+ if ( ! cc->yieldSometimes() ) {
+ ClientCursor::erase(cursorid);
+ cursorid = 0;
+ cc = 0;
+ p.deleted();
+ break;
+ }
}
if ( cc ) {
cc->updateLocation();
cc->mayUpgradeStorage();
cc->storeOpForSlave( last );
- exhaust = cc->_queryOptions & QueryOption_Exhaust;
+ exhaust = cc->queryOptions() & QueryOption_Exhaust;
}
}
@@ -395,104 +413,120 @@ namespace mongo {
class CountOp : public QueryOp {
public:
CountOp( const string& ns , const BSONObj &spec ) :
- _ns(ns), count_(),
- skip_( spec["skip"].numberLong() ),
- limit_( spec["limit"].numberLong() ),
- bc_(){
+ _ns(ns), _capped(false), _count(), _myCount(),
+ _skip( spec["skip"].numberLong() ),
+ _limit( spec["limit"].numberLong() ),
+ _bc() {
}
-
+
virtual void _init() {
- c_ = qp().newCursor();
-
+ _c = qp().newCursor();
+ _capped = _c->capped();
if ( qp().exactKeyMatch() && ! matcher()->needRecord() ) {
- query_ = qp().simplifiedQuery( qp().indexKey() );
- bc_ = dynamic_cast< BtreeCursor* >( c_.get() );
- bc_->forgetEndKey();
+ _query = qp().simplifiedQuery( qp().indexKey() );
+ _bc = dynamic_cast< BtreeCursor* >( _c.get() );
+ _bc->forgetEndKey();
}
}
+ virtual long long nscanned() {
+ assert( _c.get() );
+ return _c->nscanned();
+ }
+
virtual bool prepareToYield() {
if ( ! _cc ) {
- _cc.reset( new ClientCursor( QueryOption_NoCursorTimeout , c_ , _ns.c_str() ) );
+ _cc.reset( new ClientCursor( QueryOption_NoCursorTimeout , _c , _ns.c_str() ) );
}
return _cc->prepareToYield( _yieldData );
}
-
+
virtual void recoverFromYield() {
if ( !ClientCursor::recoverFromYield( _yieldData ) ) {
- c_.reset();
+ _c.reset();
_cc.reset();
- massert( 13337, "cursor dropped during count", false );
- // TODO maybe we want to prevent recording the winning plan as well?
+
+ if ( _capped ) {
+ msgassertedNoTrace( 13337, str::stream() << "capped cursor overrun during count: " << _ns );
+ }
+ else {
+ // we don't fail query since we're fine with returning partial data if collection dropped
+ }
}
}
-
+
virtual void next() {
- if ( !c_->ok() ) {
+ if ( ! _c || !_c->ok() ) {
setComplete();
return;
}
- if ( bc_ ) {
- if ( firstMatch_.isEmpty() ) {
- firstMatch_ = bc_->currKeyNode().key;
+ if ( _bc ) {
+ if ( _firstMatch.isEmpty() ) {
+ _firstMatch = _bc->currKeyNode().key.copy();
// if not match
- if ( query_.woCompare( firstMatch_, BSONObj(), false ) ) {
+ if ( _query.woCompare( _firstMatch, BSONObj(), false ) ) {
setComplete();
return;
}
_gotOne();
- } else {
- if ( !firstMatch_.woEqual( bc_->currKeyNode().key ) ) {
+ }
+ else {
+ if ( ! _firstMatch.woEqual( _bc->currKeyNode().key ) ) {
setComplete();
return;
}
_gotOne();
}
- }
+ }
else {
- if ( !matcher()->matches(c_->currKey(), c_->currLoc() ) ) {
+ if ( !matcher()->matches(_c->currKey(), _c->currLoc() ) ) {
}
- else if( !c_->getsetdup(c_->currLoc()) ) {
+ else if( !_c->getsetdup(_c->currLoc()) ) {
_gotOne();
- }
+ }
}
- c_->advance();
+ _c->advance();
}
virtual QueryOp *_createChild() const {
CountOp *ret = new CountOp( _ns , BSONObj() );
- ret->count_ = count_;
- ret->skip_ = skip_;
- ret->limit_ = limit_;
+ ret->_count = _count;
+ ret->_skip = _skip;
+ ret->_limit = _limit;
return ret;
}
- long long count() const { return count_; }
- virtual bool mayRecordPlan() const { return true; }
+ long long count() const { return _count; }
+ virtual bool mayRecordPlan() const {
+ return ( _myCount > _limit / 2 ) || ( complete() && !stopRequested() );
+ }
private:
-
- void _gotOne(){
- if ( skip_ ){
- skip_--;
+
+ void _gotOne() {
+ if ( _skip ) {
+ _skip--;
return;
}
-
- if ( limit_ > 0 && count_ >= limit_ ){
+
+ if ( _limit > 0 && _count >= _limit ) {
setStop();
return;
}
- count_++;
+ _count++;
+ _myCount++;
}
string _ns;
-
- long long count_;
- long long skip_;
- long long limit_;
- shared_ptr<Cursor> c_;
- BSONObj query_;
- BtreeCursor *bc_;
- BSONObj firstMatch_;
+ bool _capped;
+
+ long long _count;
+ long long _myCount;
+ long long _skip;
+ long long _limit;
+ shared_ptr<Cursor> _c;
+ BSONObj _query;
+ BtreeCursor * _bc;
+ BSONObj _firstMatch;
ClientCursor::CleanupPointer _cc;
ClientCursor::YieldData _yieldData;
@@ -500,7 +534,7 @@ namespace mongo {
/* { count: "collectionname"[, query: <query>] }
returns -1 on ns does not exist error.
- */
+ */
long long runCount( const char *ns, const BSONObj &cmd, string &err ) {
Client::Context cx(ns);
NamespaceDetails *d = nsdetails( ns );
@@ -509,10 +543,10 @@ namespace mongo {
return -1;
}
BSONObj query = cmd.getObjectField("query");
-
+
// count of all objects
- if ( query.isEmpty() ){
- return applySkipLimit( d->nrecords , cmd );
+ if ( query.isEmpty() ) {
+ return applySkipLimit( d->stats.nrecords , cmd );
}
MultiPlanScanner mps( ns, query, BSONObj(), 0, true, BSONObj(), BSONObj(), false, true );
CountOp original( ns , cmd );
@@ -525,8 +559,11 @@ namespace mongo {
}
return res->count();
}
-
+
class ExplainBuilder {
+ // Note: by default we filter out allPlans and oldPlan in the shell's
+ // explain() function. If you add any recursive structures, make sure to
+ // edit the JS to make sure everything gets filtered.
public:
ExplainBuilder() : _i() {}
void ensureStartScan() {
@@ -539,14 +576,16 @@ namespace mongo {
b << "cursor" << c->toString() << "indexBounds" << c->prettyIndexBounds();
b.done();
}
- void noteScan( Cursor *c, long long nscanned, long long nscannedObjects, int n, bool scanAndOrder, int millis, bool hint ) {
+ void noteScan( Cursor *c, long long nscanned, long long nscannedObjects, int n, bool scanAndOrder,
+ int millis, bool hint, int nYields , int nChunkSkips , bool indexOnly ) {
if ( _i == 1 ) {
_c.reset( new BSONArrayBuilder() );
*_c << _b->obj();
}
if ( _i == 0 ) {
_b.reset( new BSONObjBuilder() );
- } else {
+ }
+ else {
_b.reset( new BSONObjBuilder( _c->subobjStart() ) );
}
*_b << "cursor" << c->toString();
@@ -559,6 +598,11 @@ namespace mongo {
*_b << "millis" << millis;
+ *_b << "nYields" << nYields;
+ *_b << "nChunkSkips" << nChunkSkips;
+ *_b << "isMultiKey" << c->isMultiKey();
+ *_b << "indexOnly" << indexOnly;
+
*_b << "indexBounds" << c->prettyIndexBounds();
if ( !hint ) {
@@ -570,19 +614,20 @@ namespace mongo {
_a.reset( 0 );
++_i;
}
- BSONObj finishWithSuffix( long long nscanned, long long nscannedObjects, int n, int millis, const BSONObj &suffix ) {
+ BSONObj finishWithSuffix( long long nscanned, long long nscannedObjects, int n, int millis, const BSONObj &suffix ) {
if ( _i > 1 ) {
BSONObjBuilder b;
b << "clauses" << _c->arr();
b.appendNumber( "nscanned", nscanned );
- b.appendNumber( "nscanneObjects", nscannedObjects );
+ b.appendNumber( "nscannedObjects", nscannedObjects );
b << "n" << n;
b << "millis" << millis;
b.appendElements( suffix );
return b.obj();
- } else {
+ }
+ else {
_b->appendElements( suffix );
- return _b->obj();
+ return _b->obj();
}
}
private:
@@ -591,11 +636,11 @@ namespace mongo {
auto_ptr< BSONArrayBuilder > _c;
int _i;
};
-
+
// Implements database 'query' requests using the query optimizer's QueryOp interface
class UserQueryOp : public QueryOp {
public:
-
+
UserQueryOp( const ParsedQuery& pq, Message &response, ExplainBuilder &eb, CurOp &curop ) :
_buf( 32768 ) , // TODO be smarter here
_pq( pq ) ,
@@ -603,8 +648,12 @@ namespace mongo {
_nscanned(0), _oldNscanned(0), _nscannedObjects(0), _oldNscannedObjects(0),
_n(0),
_oldN(0),
- _chunkMatcher(shardingState.getChunkMatcher(pq.ns())),
+ _nYields(),
+ _nChunkSkips(),
+ _chunkManager( shardingState.needShardChunkManager(pq.ns()) ?
+ shardingState.getShardChunkManager(pq.ns()) : ShardChunkManagerPtr() ),
_inMemSort(false),
+ _capped(false),
_saveClientCursor(false),
_wouldSaveClientCursor(false),
_oplogReplay( pq.hasOption( QueryOption_OplogReplay) ),
@@ -612,82 +661,111 @@ namespace mongo {
_eb( eb ),
_curop( curop )
{}
-
+
virtual void _init() {
// only need to put the QueryResult fields there if we're building the first buffer in the message.
if ( _response.empty() ) {
_buf.skip( sizeof( QueryResult ) );
}
-
+
if ( _oplogReplay ) {
_findingStartCursor.reset( new FindingStartCursor( qp() ) );
- } else {
+ _capped = true;
+ }
+ else {
_c = qp().newCursor( DiskLoc() , _pq.getNumToReturn() + _pq.getSkip() );
+ _capped = _c->capped();
+
+ // setup check for if we can only use index to extract
+ if ( _c->modifiedKeys() == false && _c->isMultiKey() == false && _pq.getFields() ) {
+ _keyFieldsOnly.reset( _pq.getFields()->checkKey( _c->indexKeyPattern() ) );
+ }
}
if ( qp().scanAndOrderRequired() ) {
_inMemSort = true;
_so.reset( new ScanAndOrder( _pq.getSkip() , _pq.getNumToReturn() , _pq.getOrder() ) );
}
-
+
if ( _pq.isExplain() ) {
_eb.noteCursor( _c.get() );
}
+
}
-
+
virtual bool prepareToYield() {
if ( _findingStartCursor.get() ) {
return _findingStartCursor->prepareToYield();
- } else {
+ }
+ else {
if ( ! _cc ) {
_cc.reset( new ClientCursor( QueryOption_NoCursorTimeout , _c , _pq.ns() ) );
}
return _cc->prepareToYield( _yieldData );
}
}
-
+
virtual void recoverFromYield() {
+ _nYields++;
+
if ( _findingStartCursor.get() ) {
_findingStartCursor->recoverFromYield();
- } else {
- if ( !ClientCursor::recoverFromYield( _yieldData ) ) {
- _c.reset();
- _cc.reset();
- _so.reset();
- massert( 13338, "cursor dropped during query", false );
- // TODO maybe we want to prevent recording the winning plan as well?
- }
+ }
+ else if ( ! ClientCursor::recoverFromYield( _yieldData ) ) {
+ _c.reset();
+ _cc.reset();
+ _so.reset();
+
+ if ( _capped ) {
+ msgassertedNoTrace( 13338, str::stream() << "capped cursor overrun during query: " << _pq.ns() );
+ }
+ else {
+ // we don't fail query since we're fine with returning partial data if collection dropped
+
+ // todo: this is wrong. the cursor could be gone if closeAllDatabases command just ran
+ }
+
}
}
-
+
+ virtual long long nscanned() {
+ if ( _findingStartCursor.get() ) {
+ return 0; // should only be one query plan, so value doesn't really matter.
+ }
+ assert( _c.get() );
+ return _c->nscanned();
+ }
+
virtual void next() {
if ( _findingStartCursor.get() ) {
if ( _findingStartCursor->done() ) {
_c = _findingStartCursor->cRelease();
_findingStartCursor.reset( 0 );
- } else {
+ }
+ else {
_findingStartCursor->next();
}
+ _capped = true;
return;
}
-
- if ( !_c->ok() ) {
+
+ if ( !_c || !_c->ok() ) {
finish( false );
return;
}
bool mayCreateCursor1 = _pq.wantMore() && ! _inMemSort && _pq.getNumToReturn() != 1 && useCursors;
-
- if( 0 ) {
+
+ if( 0 ) {
cout << "SCANNING this: " << this << " key: " << _c->currKey() << " obj: " << _c->current() << endl;
}
-
- if ( _pq.getMaxScan() && _nscanned >= _pq.getMaxScan() ){
+
+ if ( _pq.getMaxScan() && _nscanned >= _pq.getMaxScan() ) {
finish( true ); //?
return;
}
- _nscanned++;
+ _nscanned = _c->nscanned();
if ( !matcher()->matches(_c->currKey(), _c->currLoc() , &_details ) ) {
// not a match, continue onward
if ( _details.loadedObject )
@@ -696,22 +774,23 @@ namespace mongo {
else {
_nscannedObjects++;
DiskLoc cl = _c->currLoc();
- if ( _chunkMatcher && ! _chunkMatcher->belongsToMe( _c->currKey(), _c->currLoc() ) ){
- // cout << "TEMP skipping un-owned chunk: " << _c->current() << endl;
+ if ( _chunkManager && ! _chunkManager->belongsToMe( cl.obj() ) ) {
+ _nChunkSkips++;
+ // log() << "TEMP skipping un-owned chunk: " << _c->current() << endl;
}
- else if( _c->getsetdup(cl) ) {
+ else if( _c->getsetdup(cl) ) {
// dup
}
else {
// got a match.
-
+
if ( _inMemSort ) {
// note: no cursors for non-indexed, ordered results. results must be fairly small.
_so->add( _pq.returnKey() ? _c->currKey() : _c->current(), _pq.showDiskLoc() ? &cl : 0 );
}
else if ( _ntoskip > 0 ) {
_ntoskip--;
- }
+ }
else {
if ( _pq.isExplain() ) {
_n++;
@@ -723,16 +802,19 @@ namespace mongo {
}
else {
- if ( _pq.returnKey() ){
+ if ( _pq.returnKey() ) {
BSONObjBuilder bb( _buf );
bb.appendKeys( _c->indexKeyPattern() , _c->currKey() );
bb.done();
}
+ else if ( _keyFieldsOnly ) {
+ fillQueryResultFromObj( _buf , 0 , _keyFieldsOnly->hydrate( _c->currKey() ) );
+ }
else {
BSONObj js = _c->current();
assert( js.isValid() );
- if ( _oplogReplay ){
+ if ( _oplogReplay ) {
BSONElement e = js["ts"];
if ( e.type() == Date || e.type() == Timestamp )
_slaveReadTill = e._opTime();
@@ -741,13 +823,13 @@ namespace mongo {
fillQueryResultFromObj( _buf , _pq.getFields() , js , (_pq.showDiskLoc() ? &cl : 0));
}
_n++;
- if ( ! _c->supportGetMore() ){
- if ( _pq.enough( n() ) || _buf.len() >= MaxBytesToReturnToClientAtOnce ){
+ if ( ! _c->supportGetMore() ) {
+ if ( _pq.enough( n() ) || _buf.len() >= MaxBytesToReturnToClientAtOnce ) {
finish( true );
return;
}
}
- else if ( _pq.enoughForFirstBatch( n() , _buf.len() ) ){
+ else if ( _pq.enoughForFirstBatch( n() , _buf.len() ) ) {
/* if only 1 requested, no cursor saved for efficiency...we assume it is findOne() */
if ( mayCreateCursor1 ) {
_wouldSaveClientCursor = true;
@@ -763,60 +845,73 @@ namespace mongo {
}
}
}
- _c->advance();
+ _c->advance();
}
// this plan won, so set data for response broadly
void finish( bool stop ) {
+
if ( _pq.isExplain() ) {
_n = _inMemSort ? _so->size() : _n;
- }
+ }
else if ( _inMemSort ) {
if( _so.get() )
_so->fill( _buf, _pq.getFields() , _n );
}
-
- if ( _pq.hasOption( QueryOption_CursorTailable ) && _pq.getNumToReturn() != 1 )
- _c->setTailable();
-
- // If the tailing request succeeded.
- if ( _c->tailable() )
- _saveClientCursor = true;
-
- if ( _pq.isExplain()) {
- _eb.noteScan( _c.get(), _nscanned, _nscannedObjects, _n, scanAndOrderRequired(), _curop.elapsedMillis(), useHints && !_pq.getHint().eoo() );
- } else {
- if (_buf.len()) {
+
+ if ( _c.get() ) {
+ _nscanned = _c->nscanned();
+
+ if ( _pq.hasOption( QueryOption_CursorTailable ) && _pq.getNumToReturn() != 1 )
+ _c->setTailable();
+
+ // If the tailing request succeeded.
+ if ( _c->tailable() )
+ _saveClientCursor = true;
+ }
+
+ if ( _pq.isExplain() ) {
+ massert( 13638, "client cursor dropped during explain query yield", _c.get() );
+ _eb.noteScan( _c.get(), _nscanned, _nscannedObjects, _n, scanAndOrderRequired(),
+ _curop.elapsedMillis(), useHints && !_pq.getHint().eoo(), _nYields ,
+ _nChunkSkips, _keyFieldsOnly.get() > 0 );
+ }
+ else {
+ if ( _buf.len() ) {
_response.appendData( _buf.buf(), _buf.len() );
_buf.decouple();
}
}
+
if ( stop ) {
setStop();
- } else {
+ }
+ else {
setComplete();
}
}
-
+
void finishExplain( const BSONObj &suffix ) {
- BSONObj obj = _eb.finishWithSuffix( nscanned(), nscannedObjects(), n(), _curop.elapsedMillis(), suffix);
+ BSONObj obj = _eb.finishWithSuffix( totalNscanned(), nscannedObjects(), n(), _curop.elapsedMillis(), suffix);
fillQueryResultFromObj(_buf, 0, obj);
_n = 1;
_oldN = 0;
_response.appendData( _buf.buf(), _buf.len() );
_buf.decouple();
}
-
- virtual bool mayRecordPlan() const { return _pq.getNumToReturn() != 1; }
-
+
+ virtual bool mayRecordPlan() const {
+ return ( _pq.getNumToReturn() != 1 ) && ( ( _n > _pq.getNumToReturn() / 2 ) || ( complete() && !stopRequested() ) );
+ }
+
virtual QueryOp *_createChild() const {
if ( _pq.isExplain() ) {
_eb.ensureStartScan();
}
UserQueryOp *ret = new UserQueryOp( _pq, _response, _eb, _curop );
ret->_oldN = n();
- ret->_oldNscanned = nscanned();
+ ret->_oldNscanned = totalNscanned();
ret->_oldNscannedObjects = nscannedObjects();
ret->_ntoskip = _ntoskip;
return ret;
@@ -825,19 +920,20 @@ namespace mongo {
bool scanAndOrderRequired() const { return _inMemSort; }
shared_ptr<Cursor> cursor() { return _c; }
int n() const { return _oldN + _n; }
- long long nscanned() const { return _nscanned + _oldNscanned; }
+ long long totalNscanned() const { return _nscanned + _oldNscanned; }
long long nscannedObjects() const { return _nscannedObjects + _oldNscannedObjects; }
bool saveClientCursor() const { return _saveClientCursor; }
bool wouldSaveClientCursor() const { return _wouldSaveClientCursor; }
-
- void finishForOplogReplay( ClientCursor * cc ){
+
+ void finishForOplogReplay( ClientCursor * cc ) {
if ( _oplogReplay && ! _slaveReadTill.isNull() )
- cc->_slaveReadTill = _slaveReadTill;
+ cc->slaveReadTill( _slaveReadTill );
}
private:
BufBuilder _buf;
const ParsedQuery& _pq;
+ scoped_ptr<Projection::KeyOnly> _keyFieldsOnly;
long long _ntoskip;
long long _nscanned;
@@ -846,30 +942,36 @@ namespace mongo {
long long _oldNscannedObjects;
int _n; // found so far
int _oldN;
-
+
+ int _nYields;
+ int _nChunkSkips;
+
MatchDetails _details;
- ChunkMatcherPtr _chunkMatcher;
-
+ ShardChunkManagerPtr _chunkManager;
+
bool _inMemSort;
auto_ptr< ScanAndOrder > _so;
-
+
shared_ptr<Cursor> _c;
ClientCursor::CleanupPointer _cc;
ClientCursor::YieldData _yieldData;
+ bool _capped;
bool _saveClientCursor;
bool _wouldSaveClientCursor;
bool _oplogReplay;
auto_ptr< FindingStartCursor > _findingStartCursor;
-
+
Message &_response;
ExplainBuilder &_eb;
CurOp &_curop;
OpTime _slaveReadTill;
};
-
- /* run a query -- includes checking for and running a Command */
+
+ /* run a query -- includes checking for and running a Command \
+ @return points to ns if exhaust mode. 0=normal mode
+ */
const char *runQuery(Message& m, QueryMessage& q, CurOp& curop, Message &result) {
StringBuilder& ss = curop.debug().str;
shared_ptr<ParsedQuery> pq_shared( new ParsedQuery(q) );
@@ -878,25 +980,26 @@ namespace mongo {
BSONObj jsobj = q.query;
int queryOptions = q.queryOptions;
const char *ns = q.ns;
-
+
if( logLevel >= 2 )
log() << "query: " << ns << jsobj << endl;
-
+
ss << ns;
{
- // only say ntoreturn if nonzero.
+ // only say ntoreturn if nonzero.
int n = pq.getNumToReturn();
- if( n )
+ if( n )
ss << " ntoreturn:" << n;
}
curop.setQuery(jsobj);
-
+
if ( pq.couldBeCommand() ) {
BufBuilder bb;
bb.skip(sizeof(QueryResult));
BSONObjBuilder cmdResBuf;
if ( runCommands(ns, jsobj, curop, bb, cmdResBuf, false, queryOptions) ) {
- ss << " command: " << jsobj.toString();
+ ss << " command: ";
+ jsobj.toString( ss );
curop.markCommand();
auto_ptr< QueryResult > qr;
qr.reset( (QueryResult *) bb.buf() );
@@ -910,9 +1013,12 @@ namespace mongo {
qr->nReturned = 1;
result.setData( qr.release(), true );
}
- return false;
+ else {
+ uasserted(13530, "bad or malformed command request?");
+ }
+ return 0;
}
-
+
/* --- regular query --- */
int n = 0;
@@ -932,7 +1038,7 @@ namespace mongo {
out() << query.toString() << endl;
uassert( 10110 , "bad query object", false);
}
-
+
/* --- read lock --- */
mongolock lk(false);
@@ -947,17 +1053,18 @@ namespace mongo {
const BSONObj nat1 = BSON( "$natural" << 1 );
if ( order.isEmpty() ) {
order = nat1;
- } else {
+ }
+ else {
uassert( 13052, "only {$natural:1} order allowed for tailable cursor", order == nat1 );
}
}
-
+
BSONObj snapshotHint; // put here to keep the data in scope
- if( snapshot ) {
+ if( snapshot ) {
NamespaceDetails *d = nsdetails(ns);
- if ( d ){
+ if ( d ) {
int i = d->findIdIndex();
- if( i < 0 ) {
+ if( i < 0 ) {
if ( strstr( ns , ".system." ) == 0 )
log() << "warning: no _id index on $snapshot query, ns:" << ns << endl;
}
@@ -973,7 +1080,7 @@ namespace mongo {
}
}
}
-
+
if ( ! (explain || pq.showDiskLoc()) && isSimpleIdQuery( query ) && !pq.hasOption( QueryOption_CursorTailable ) ) {
bool nsFound = false;
bool indexFound = false;
@@ -981,12 +1088,12 @@ namespace mongo {
BSONObj resObject;
Client& c = cc();
bool found = Helpers::findById( c, ns , query , resObject , &nsFound , &indexFound );
- if ( nsFound == false || indexFound == true ){
+ if ( nsFound == false || indexFound == true ) {
BufBuilder bb(sizeof(QueryResult)+resObject.objsize()+32);
bb.skip(sizeof(QueryResult));
-
+
ss << " idhack ";
- if ( found ){
+ if ( found ) {
n = 1;
fillQueryResultFromObj( bb , pq.getFields() , resObject );
}
@@ -999,16 +1106,16 @@ namespace mongo {
qr->setOperation(opReply);
qr->cursorId = 0;
qr->startingFrom = 0;
- qr->nReturned = n;
+ qr->nReturned = n;
result.setData( qr.release(), true );
return false;
- }
+ }
}
-
+
// regular, not QO bypass query
-
+
BSONObj oldPlan;
- if ( explain && ! pq.hasIndexSpecifier() ){
+ if ( explain && ! pq.hasIndexSpecifier() ) {
MultiPlanScanner mps( ns, query, order );
if ( mps.usingPrerecordedPlan() )
oldPlan = mps.oldExplain();
@@ -1031,7 +1138,7 @@ namespace mongo {
dqo.finishExplain( explainSuffix );
}
n = dqo.n();
- long long nscanned = dqo.nscanned();
+ long long nscanned = dqo.totalNscanned();
if ( dqo.scanAndOrderRequired() )
ss << " scanAndOrder ";
shared_ptr<Cursor> cursor = dqo.cursor();
@@ -1046,18 +1153,19 @@ namespace mongo {
// this MultiCursor will use a dumb NoOp to advance(), so no need to specify mayYield
shared_ptr< Cursor > multi( new MultiCursor( mps, cursor, dqo.matcher(), dqo ) );
cc = new ClientCursor(queryOptions, multi, ns, jsobj.getOwned());
- } else {
+ }
+ else {
cursor->setMatcher( dqo.matcher() );
cc = new ClientCursor( queryOptions, cursor, ns, jsobj.getOwned() );
}
- cursorid = cc->cursorid;
+ cursorid = cc->cursorid();
DEV tlog(2) << "query has more, cursorid: " << cursorid << endl;
- cc->pos = n;
+ cc->setPos( n );
cc->pq = pq_shared;
cc->fields = pq.getFieldPtr();
cc->originalMessage = m;
cc->updateLocation();
- if ( !cc->c->ok() && cc->c->tailable() )
+ if ( !cc->ok() && cc->c()->tailable() )
DEV tlog() << "query has no more but tailable, cursorid: " << cursorid << endl;
if( queryOptions & QueryOption_Exhaust ) {
exhaust = ns;
@@ -1087,6 +1195,6 @@ namespace mongo {
}
ss << " nreturned:" << n;
return exhaust;
- }
-
+ }
+
} // namespace mongo
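
A minimal sketch of how runCount() above is typically driven by a count command object. Only the "count" and "query" fields are documented in the hunks shown; "skip"/"limit" are handled inside CountOp's constructor, which is not part of this diff, so treat those field names as assumptions.

    // Sketch only -- the "limit" field name is assumed, not confirmed by the hunks above.
    string err;
    BSONObj cmd = BSON( "count" << "foo" << "query" << BSON( "x" << 1 ) << "limit" << 10 );
    long long n = runCount( "test.foo", cmd, err );
    if ( n == -1 ) {
        // per the comment above runCount(): the namespace does not exist
    }
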
diff --git a/db/query.h b/db/query.h
index cc88e5c..5de7ced 100644
--- a/db/query.h
+++ b/db/query.h
@@ -23,6 +23,7 @@
#include "dbmessage.h"
#include "jsobj.h"
#include "diskloc.h"
+#include "projection.h"
/* db request message format
@@ -37,29 +38,29 @@
a series of JSObjects
dbDelete:
string collection;
- int flags=0; // 1=DeleteSingle
+ int flags=0; // 1=DeleteSingle
JSObject query;
dbUpdate:
string collection;
- int flags; // 1=upsert
+ int flags; // 1=upsert
JSObject query;
- JSObject objectToUpdate;
+ JSObject objectToUpdate;
objectToUpdate may include { $inc: <field> } or { $set: ... }, see struct Mod.
dbQuery:
string collection;
- int nToSkip;
- int nToReturn; // how many you want back as the beginning of the cursor data (0=no limit)
+ int nToSkip;
+ int nToReturn; // how many you want back as the beginning of the cursor data (0=no limit)
// greater than zero is simply a hint on how many objects to send back per "cursor batch".
// a negative number indicates a hard limit.
JSObject query;
- [JSObject fieldsToReturn]
+ [JSObject fieldsToReturn]
dbGetMore:
- string collection; // redundant, might use for security.
+ string collection; // redundant, might use for security.
int nToReturn;
int64 cursorID;
dbKillCursors=2007:
int n;
- int64 cursorIDs[n];
+ int64 cursorIDs[n];
Note that on Update, there is only one object, which is different
from insert where you can pass a list of objects to insert in the db.
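
As a reading aid, the nToReturn convention described above corresponds to the handling in ParsedQuery::init() later in this patch; a sketch of that rule (not a quote of the implementation):

    // negative nToReturn = hard limit, no getMore; positive = per-batch hint; 0 = no limit
    int ntoreturn = -5;          // client wants at most 5 documents, hard limit
    bool wantMore = true;
    if ( ntoreturn < 0 ) {
        wantMore = false;        // findOne()-style: no cursor kept open for getMore
        ntoreturn = -ntoreturn;  // use the magnitude as the limit
    }
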
@@ -77,7 +78,7 @@ namespace mongo {
struct GetMoreWaitException { };
QueryResult* processGetMore(const char *ns, int ntoreturn, long long cursorid , CurOp& op, int pass, bool& exhaust);
-
+
struct UpdateResult {
bool existing; // if existing objects were modified
bool mod; // was this a $ mod
@@ -85,25 +86,25 @@ namespace mongo {
OID upserted; // if something was upserted, the new _id of the object
UpdateResult( bool e, bool m, unsigned long long n , const BSONObj& upsertedObject = BSONObj() )
- : existing(e) , mod(m), num(n){
+ : existing(e) , mod(m), num(n) {
upserted.clear();
BSONElement id = upsertedObject["_id"];
- if ( ! e && n == 1 && id.type() == jstOID ){
+ if ( ! e && n == 1 && id.type() == jstOID ) {
upserted = id.OID();
}
}
-
+
};
class RemoveSaver;
-
+
/* returns true if an existing object was updated, false if no existing object was found.
multi - update multiple objects - mostly useful with things like $set
god - allow access to system namespaces
*/
UpdateResult updateObjects(const char *ns, const BSONObj& updateobj, BSONObj pattern, bool upsert, bool multi , bool logop , OpDebug& debug );
- UpdateResult _updateObjects(bool god, const char *ns, const BSONObj& updateobj, BSONObj pattern,
+ UpdateResult _updateObjects(bool god, const char *ns, const BSONObj& updateobj, BSONObj pattern,
bool upsert, bool multi , bool logop , OpDebug& debug , RemoveSaver * rs = 0 );
// If justOne is true, deletedId is set to the id of the deleted object.
@@ -112,7 +113,7 @@ namespace mongo {
long long runCount(const char *ns, const BSONObj& cmd, string& err);
const char * runQuery(Message& m, QueryMessage& q, CurOp& curop, Message &result);
-
+
/* This is for languages whose "objects" are not well ordered (JSON is well ordered).
[ { a : ... } , { b : ... } ] -> { a : ..., b : ... }
*/
@@ -144,24 +145,24 @@ namespace mongo {
class ParsedQuery {
public:
ParsedQuery( QueryMessage& qm )
- : _ns( qm.ns ) , _ntoskip( qm.ntoskip ) , _ntoreturn( qm.ntoreturn ) , _options( qm.queryOptions ){
+ : _ns( qm.ns ) , _ntoskip( qm.ntoskip ) , _ntoreturn( qm.ntoreturn ) , _options( qm.queryOptions ) {
init( qm.query );
initFields( qm.fields );
}
ParsedQuery( const char* ns , int ntoskip , int ntoreturn , int queryoptions , const BSONObj& query , const BSONObj& fields )
- : _ns( ns ) , _ntoskip( ntoskip ) , _ntoreturn( ntoreturn ) , _options( queryoptions ){
+ : _ns( ns ) , _ntoskip( ntoskip ) , _ntoreturn( ntoreturn ) , _options( queryoptions ) {
init( query );
initFields( fields );
}
-
- ~ParsedQuery(){}
+
+ ~ParsedQuery() {}
const char * ns() const { return _ns; }
bool isLocalDB() const { return strncmp(_ns, "local.", 6) == 0; }
const BSONObj& getFilter() const { return _filter; }
- FieldMatcher* getFields() const { return _fields.get(); }
- shared_ptr<FieldMatcher> getFieldPtr() const { return _fields; }
+ Projection* getFields() const { return _fields.get(); }
+ shared_ptr<Projection> getFieldPtr() const { return _fields; }
int getSkip() const { return _ntoskip; }
int getNumToReturn() const { return _ntoreturn; }
@@ -169,7 +170,7 @@ namespace mongo {
int getOptions() const { return _options; }
bool hasOption( int x ) const { return x & _options; }
-
+
bool isExplain() const { return _explain; }
bool isSnapshot() const { return _snapshot; }
bool returnKey() const { return _returnKey; }
@@ -180,7 +181,7 @@ namespace mongo {
const BSONObj& getOrder() const { return _order; }
const BSONElement& getHint() const { return _hint; }
int getMaxScan() const { return _maxScan; }
-
+
bool couldBeCommand() const {
/* we assume you are using findOne() for running a cmd... */
return _ntoreturn == 1 && strstr( _ns , ".$cmd" );
@@ -193,7 +194,7 @@ namespace mongo {
/* if ntoreturn is zero, we return up to 101 objects. on the subsequent getmore, there
is only a size limit. The idea is that on a find() where one doesn't use much results,
we don't return much, but once getmore kicks in, we start pushing significant quantities.
-
+
The n limit (vs. size) is important when someone fetches only one small field from big
objects, which causes massive scanning server-side.
*/
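
A sketch of the first-batch rule described in the comment above; the real enoughForFirstBatch() body lies outside the hunks shown here, and the exact byte budget is an assumption:

    // Not the actual member function -- an illustration of the documented rule.
    bool enoughForFirstBatchSketch( int ntoreturn, int n, int len ) {
        if ( ntoreturn == 0 )                        // no explicit limit requested:
            return n >= 101 || len > 1024 * 1024;    // ~101 objects (1MB budget assumed)
        return n >= ntoreturn || len > MaxBytesToReturnToClientAtOnce;
    }
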
@@ -208,14 +209,14 @@ namespace mongo {
return false;
return n >= _ntoreturn;
}
-
+
private:
- void init( const BSONObj& q ){
+ void init( const BSONObj& q ) {
_reset();
uassert( 10105 , "bad skip value in query", _ntoskip >= 0);
-
- if ( _ntoreturn < 0 ){
- /* _ntoreturn greater than zero is simply a hint on how many objects to send back per
+
+ if ( _ntoreturn < 0 ) {
+ /* _ntoreturn greater than zero is simply a hint on how many objects to send back per
"cursor batch".
A negative number indicates a hard limit.
*/
@@ -223,12 +224,12 @@ namespace mongo {
_ntoreturn = -_ntoreturn;
}
-
+
BSONElement e = q["query"];
if ( ! e.isABSONObj() )
e = q["$query"];
-
- if ( e.isABSONObj() ){
+
+ if ( e.isABSONObj() ) {
_filter = e.embeddedObject();
_initTop( q );
}
@@ -237,7 +238,7 @@ namespace mongo {
}
}
- void _reset(){
+ void _reset() {
_wantMore = true;
_explain = false;
_snapshot = false;
@@ -246,20 +247,23 @@ namespace mongo {
_maxScan = 0;
}
- void _initTop( const BSONObj& top ){
+ void _initTop( const BSONObj& top ) {
BSONObjIterator i( top );
- while ( i.more() ){
+ while ( i.more() ) {
BSONElement e = i.next();
const char * name = e.fieldName();
if ( strcmp( "$orderby" , name ) == 0 ||
- strcmp( "orderby" , name ) == 0 ){
- if ( e.type() == Object )
+ strcmp( "orderby" , name ) == 0 ) {
+ if ( e.type() == Object ) {
_order = e.embeddedObject();
- else if ( e.type() == Array )
+ }
+ else if ( e.type() == Array ) {
_order = transformOrderFromArrayFormat( _order );
- else
- assert( 0 );
+ }
+ else {
+ uassert(13513, "sort must be an object or array", 0);
+ }
}
else if ( strcmp( "$explain" , name ) == 0 )
_explain = e.trueValue();
@@ -277,25 +281,25 @@ namespace mongo {
_maxScan = e.numberInt();
else if ( strcmp( "$showDiskLoc" , name ) == 0 )
_showDiskLoc = e.trueValue();
-
+
}
- if ( _snapshot ){
+ if ( _snapshot ) {
uassert( 12001 , "E12001 can't sort with $snapshot", _order.isEmpty() );
uassert( 12002 , "E12002 can't use hint with $snapshot", _hint.eoo() );
}
-
+
}
- void initFields( const BSONObj& fields ){
+ void initFields( const BSONObj& fields ) {
if ( fields.isEmpty() )
return;
- _fields.reset( new FieldMatcher() );
- _fields->add( fields );
+ _fields.reset( new Projection() );
+ _fields->init( fields );
}
- ParsedQuery( const ParsedQuery& other ){
+ ParsedQuery( const ParsedQuery& other ) {
assert(0);
}
@@ -303,10 +307,10 @@ namespace mongo {
int _ntoskip;
int _ntoreturn;
int _options;
-
+
BSONObj _filter;
- shared_ptr< FieldMatcher > _fields;
-
+ shared_ptr< Projection > _fields;
+
bool _wantMore;
bool _explain;
@@ -319,7 +323,7 @@ namespace mongo {
BSONObj _order;
int _maxScan;
};
-
+
} // namespace mongo
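
For reference, the wrapped query form unpacked by ParsedQuery::_initTop() above looks roughly like this; the sketch uses only modifiers visible in this hunk ($query, $orderby, $explain) and the ParsedQuery constructor shown above:

    // Sketch: a filter combined with top-level modifiers handled by _initTop().
    BSONObj wrapped = BSON( "$query"   << BSON( "x" << GT << 5 )
                         << "$orderby" << BSON( "y" << 1 )
                         << "$explain" << true );
    ParsedQuery pq( "test.foo", 0 /*ntoskip*/, 0 /*ntoreturn*/, 0 /*options*/,
                    wrapped, BSONObj() /*fields*/ );
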
diff --git a/db/queryoptimizer.cpp b/db/queryoptimizer.cpp
index e7068c2..0b9dce7 100644
--- a/db/queryoptimizer.cpp
+++ b/db/queryoptimizer.cpp
@@ -24,24 +24,25 @@
#include "queryoptimizer.h"
#include "cmdline.h"
#include "clientcursor.h"
+#include <queue>
//#define DEBUGQO(x) cout << x << endl;
#define DEBUGQO(x)
namespace mongo {
- void checkTableScanAllowed( const char * ns ){
- if ( ! cmdLine.notablescan )
+ void checkTableScanAllowed( const char * ns ) {
+ if ( ! cmdLine.noTableScan )
return;
-
+
if ( strstr( ns , ".system." ) ||
- strstr( ns , "local." ) )
+ strstr( ns , "local." ) )
return;
-
+
if ( ! nsdetails( ns ) )
return;
- uassert( 10111 , (string)"table scans not allowed:" + ns , ! cmdLine.notablescan );
+ uassert( 10111 , (string)"table scans not allowed:" + ns , ! cmdLine.noTableScan );
}
double elementDirection( const BSONElement &e ) {
@@ -49,58 +50,59 @@ namespace mongo {
return e.number();
return 1;
}
-
- QueryPlan::QueryPlan(
- NamespaceDetails *_d, int _idxNo,
- const FieldRangeSet &fbs, const BSONObj &originalQuery, const BSONObj &order, const BSONObj &startKey, const BSONObj &endKey , string special ) :
- d(_d), idxNo(_idxNo),
- fbs_( fbs ),
- _originalQuery( originalQuery ),
- order_( order ),
- index_( 0 ),
- optimal_( false ),
- scanAndOrderRequired_( true ),
- exactKeyMatch_( false ),
- direction_( 0 ),
- endKeyInclusive_( endKey.isEmpty() ),
- unhelpful_( false ),
- _special( special ),
- _type(0),
- _startOrEndSpec( !startKey.isEmpty() || !endKey.isEmpty() ){
-
- if ( !fbs_.matchPossible() ) {
- unhelpful_ = true;
- scanAndOrderRequired_ = false;
+
+ QueryPlan::QueryPlan(
+ NamespaceDetails *d, int idxNo,
+ const FieldRangeSet &fbs, const FieldRangeSet &originalFrs, const BSONObj &originalQuery, const BSONObj &order, const BSONObj &startKey, const BSONObj &endKey , string special ) :
+ _d(d), _idxNo(idxNo),
+ _fbs( fbs ),
+ _originalQuery( originalQuery ),
+ _order( order ),
+ _index( 0 ),
+ _optimal( false ),
+ _scanAndOrderRequired( true ),
+ _exactKeyMatch( false ),
+ _direction( 0 ),
+ _endKeyInclusive( endKey.isEmpty() ),
+ _unhelpful( false ),
+ _special( special ),
+ _type(0),
+ _startOrEndSpec( !startKey.isEmpty() || !endKey.isEmpty() ) {
+
+ if ( !_fbs.matchPossible() ) {
+ _unhelpful = true;
+ _scanAndOrderRequired = false;
return;
}
- if( idxNo >= 0 ) {
- index_ = &d->idx(idxNo);
- } else {
+ if( _idxNo >= 0 ) {
+ _index = &d->idx(_idxNo);
+ }
+ else {
// full table scan case
- if ( order_.isEmpty() || !strcmp( order_.firstElement().fieldName(), "$natural" ) )
- scanAndOrderRequired_ = false;
+ if ( _order.isEmpty() || !strcmp( _order.firstElement().fieldName(), "$natural" ) )
+ _scanAndOrderRequired = false;
return;
}
- if ( _special.size() ){
- optimal_ = true;
- _type = index_->getSpec().getType();
+ if ( _special.size() ) {
+ _optimal = true;
+ _type = _index->getSpec().getType();
massert( 13040 , (string)"no type for special: " + _special , _type );
// hopefully safe to use original query in these contexts - don't think we can mix special with $or clause separation yet
- scanAndOrderRequired_ = _type->scanAndOrderRequired( _originalQuery , order );
+ _scanAndOrderRequired = _type->scanAndOrderRequired( _originalQuery , order );
return;
}
- BSONObj idxKey = index_->keyPattern();
+ BSONObj idxKey = _index->keyPattern();
BSONObjIterator o( order );
BSONObjIterator k( idxKey );
if ( !o.moreWithEOO() )
- scanAndOrderRequired_ = false;
+ _scanAndOrderRequired = false;
while( o.moreWithEOO() ) {
BSONElement oe = o.next();
if ( oe.eoo() ) {
- scanAndOrderRequired_ = false;
+ _scanAndOrderRequired = false;
break;
}
if ( !k.moreWithEOO() )
@@ -116,14 +118,14 @@ namespace mongo {
goto doneCheckOrder;
}
int d = elementDirection( oe ) == elementDirection( ke ) ? 1 : -1;
- if ( direction_ == 0 )
- direction_ = d;
- else if ( direction_ != d )
+ if ( _direction == 0 )
+ _direction = d;
+ else if ( _direction != d )
break;
}
- doneCheckOrder:
- if ( scanAndOrderRequired_ )
- direction_ = 0;
+doneCheckOrder:
+ if ( _scanAndOrderRequired )
+ _direction = 0;
BSONObjIterator i( idxKey );
int exactIndexedQueryCount = 0;
int optimalIndexedQueryCount = 0;
@@ -140,7 +142,8 @@ namespace mongo {
++optimalIndexedQueryCount;
if ( !fb.equality() )
stillOptimalIndexedQueryCount = false;
- } else {
+ }
+ else {
if ( fb.nontrivial() )
optimalIndexedQueryCount = -1;
}
@@ -151,16 +154,17 @@ namespace mongo {
}
orderFieldsUnindexed.erase( e.fieldName() );
}
- if ( !scanAndOrderRequired_ &&
- ( optimalIndexedQueryCount == fbs.nNontrivialRanges() ) )
- optimal_ = true;
+ if ( !_scanAndOrderRequired &&
+ ( optimalIndexedQueryCount == fbs.nNontrivialRanges() ) )
+ _optimal = true;
if ( exactIndexedQueryCount == fbs.nNontrivialRanges() &&
- orderFieldsUnindexed.size() == 0 &&
- exactIndexedQueryCount == index_->keyPattern().nFields() &&
- exactIndexedQueryCount == _originalQuery.nFields() ) {
- exactKeyMatch_ = true;
+ orderFieldsUnindexed.size() == 0 &&
+ exactIndexedQueryCount == _index->keyPattern().nFields() &&
+ exactIndexedQueryCount == _originalQuery.nFields() ) {
+ _exactKeyMatch = true;
}
- _frv.reset( new FieldRangeVector( fbs, idxKey, direction_ ) );
+ _frv.reset( new FieldRangeVector( fbs, idxKey, _direction ) );
+ _originalFrv.reset( new FieldRangeVector( originalFrs, idxKey, _direction ) );
if ( _startOrEndSpec ) {
BSONObj newStart, newEnd;
if ( !startKey.isEmpty() )
@@ -173,100 +177,124 @@ namespace mongo {
_endKey = _frv->endKey();
}
- if ( ( scanAndOrderRequired_ || order_.isEmpty() ) &&
- !fbs.range( idxKey.firstElement().fieldName() ).nontrivial() ) {
- unhelpful_ = true;
+ if ( ( _scanAndOrderRequired || _order.isEmpty() ) &&
+ !fbs.range( idxKey.firstElement().fieldName() ).nontrivial() ) {
+ _unhelpful = true;
}
}
-
+
shared_ptr<Cursor> QueryPlan::newCursor( const DiskLoc &startLoc , int numWanted ) const {
if ( _type ) {
- // hopefully safe to use original query in these contexts - don't think we can mix type with $or clause separation yet
- return _type->newCursor( _originalQuery , order_ , numWanted );
+ // hopefully safe to use original query in these contexts - don't think we can mix type with $or clause separation yet
+ return _type->newCursor( _originalQuery , _order , numWanted );
}
-
- if ( !fbs_.matchPossible() ){
- if ( fbs_.nNontrivialRanges() )
- checkTableScanAllowed( fbs_.ns() );
+
+ if ( !_fbs.matchPossible() ) {
+ if ( _fbs.nNontrivialRanges() )
+ checkTableScanAllowed( _fbs.ns() );
return shared_ptr<Cursor>( new BasicCursor( DiskLoc() ) );
}
- if ( !index_ ){
- if ( fbs_.nNontrivialRanges() )
- checkTableScanAllowed( fbs_.ns() );
- return findTableScan( fbs_.ns(), order_, startLoc );
+ if ( !_index ) {
+ if ( _fbs.nNontrivialRanges() )
+ checkTableScanAllowed( _fbs.ns() );
+ return findTableScan( _fbs.ns(), _order, startLoc );
}
massert( 10363 , "newCursor() with start location not implemented for indexed plans", startLoc.isNull() );
-
+
if ( _startOrEndSpec ) {
- // we are sure to spec endKeyInclusive_
- return shared_ptr<Cursor>( new BtreeCursor( d, idxNo, *index_, _startKey, _endKey, endKeyInclusive_, direction_ >= 0 ? 1 : -1 ) );
- } else if ( index_->getSpec().getType() ) {
- return shared_ptr<Cursor>( new BtreeCursor( d, idxNo, *index_, _frv->startKey(), _frv->endKey(), true, direction_ >= 0 ? 1 : -1 ) );
- } else {
- return shared_ptr<Cursor>( new BtreeCursor( d, idxNo, *index_, _frv, direction_ >= 0 ? 1 : -1 ) );
+ // we are sure to spec _endKeyInclusive
+ return shared_ptr<Cursor>( new BtreeCursor( _d, _idxNo, *_index, _startKey, _endKey, _endKeyInclusive, _direction >= 0 ? 1 : -1 ) );
+ }
+ else if ( _index->getSpec().getType() ) {
+ return shared_ptr<Cursor>( new BtreeCursor( _d, _idxNo, *_index, _frv->startKey(), _frv->endKey(), true, _direction >= 0 ? 1 : -1 ) );
+ }
+ else {
+ return shared_ptr<Cursor>( new BtreeCursor( _d, _idxNo, *_index, _frv, _direction >= 0 ? 1 : -1 ) );
}
}
-
+
shared_ptr<Cursor> QueryPlan::newReverseCursor() const {
- if ( !fbs_.matchPossible() )
+ if ( !_fbs.matchPossible() )
return shared_ptr<Cursor>( new BasicCursor( DiskLoc() ) );
- if ( !index_ ) {
- int orderSpec = order_.getIntField( "$natural" );
+ if ( !_index ) {
+ int orderSpec = _order.getIntField( "$natural" );
if ( orderSpec == INT_MIN )
orderSpec = 1;
- return findTableScan( fbs_.ns(), BSON( "$natural" << -orderSpec ) );
+ return findTableScan( _fbs.ns(), BSON( "$natural" << -orderSpec ) );
}
massert( 10364 , "newReverseCursor() not implemented for indexed plans", false );
return shared_ptr<Cursor>();
}
-
+
BSONObj QueryPlan::indexKey() const {
- if ( !index_ )
+ if ( !_index )
return BSON( "$natural" << 1 );
- return index_->keyPattern();
+ return _index->keyPattern();
}
-
+
void QueryPlan::registerSelf( long long nScanned ) const {
- if ( fbs_.matchPossible() ) {
+ if ( _fbs.matchPossible() ) {
scoped_lock lk(NamespaceDetailsTransient::_qcMutex);
- NamespaceDetailsTransient::get_inlock( ns() ).registerIndexForPattern( fbs_.pattern( order_ ), indexKey(), nScanned );
- }
- }
-
- QueryPlanSet::QueryPlanSet( const char *_ns, auto_ptr< FieldRangeSet > frs, const BSONObj &originalQuery, const BSONObj &order, const BSONElement *hint, bool honorRecordedPlan, const BSONObj &min, const BSONObj &max, bool bestGuessOnly, bool mayYield ) :
- ns(_ns),
- _originalQuery( originalQuery ),
- fbs_( frs ),
- mayRecordPlan_( true ),
- usingPrerecordedPlan_( false ),
- hint_( BSONObj() ),
- order_( order.getOwned() ),
- oldNScanned_( 0 ),
- honorRecordedPlan_( honorRecordedPlan ),
- min_( min.getOwned() ),
- max_( max.getOwned() ),
- _bestGuessOnly( bestGuessOnly ),
- _mayYield( mayYield ),
- _yieldSometimesTracker( 256, 20 ){
+ NamespaceDetailsTransient::get_inlock( ns() ).registerIndexForPattern( _fbs.pattern( _order ), indexKey(), nScanned );
+ }
+ }
+
+ bool QueryPlan::isMultiKey() const {
+ if ( _idxNo < 0 )
+ return false;
+ return _d->isMultikey( _idxNo );
+ }
+
+ QueryPlanSet::QueryPlanSet( const char *ns, auto_ptr< FieldRangeSet > frs, auto_ptr< FieldRangeSet > originalFrs, const BSONObj &originalQuery, const BSONObj &order, const BSONElement *hint, bool honorRecordedPlan, const BSONObj &min, const BSONObj &max, bool bestGuessOnly, bool mayYield ) :
+ _ns(ns),
+ _originalQuery( originalQuery ),
+ _fbs( frs ),
+ _originalFrs( originalFrs ),
+ _mayRecordPlan( true ),
+ _usingPrerecordedPlan( false ),
+ _hint( BSONObj() ),
+ _order( order.getOwned() ),
+ _oldNScanned( 0 ),
+ _honorRecordedPlan( honorRecordedPlan ),
+ _min( min.getOwned() ),
+ _max( max.getOwned() ),
+ _bestGuessOnly( bestGuessOnly ),
+ _mayYield( mayYield ),
+ _yieldSometimesTracker( 256, 20 ) {
if ( hint && !hint->eoo() ) {
- hint_ = hint->wrap();
+ _hint = hint->wrap();
}
init();
}
-
+
+ bool QueryPlanSet::modifiedKeys() const {
+ for( PlanSet::const_iterator i = _plans.begin(); i != _plans.end(); ++i )
+ if ( (*i)->isMultiKey() )
+ return true;
+ return false;
+ }
+
+ bool QueryPlanSet::hasMultiKey() const {
+ for( PlanSet::const_iterator i = _plans.begin(); i != _plans.end(); ++i )
+ if ( (*i)->isMultiKey() )
+ return true;
+ return false;
+ }
+
+
void QueryPlanSet::addHint( IndexDetails &id ) {
- if ( !min_.isEmpty() || !max_.isEmpty() ) {
+ if ( !_min.isEmpty() || !_max.isEmpty() ) {
string errmsg;
BSONObj keyPattern = id.keyPattern();
- // This reformats min_ and max_ to be used for index lookup.
- massert( 10365 , errmsg, indexDetailsForRange( fbs_->ns(), errmsg, min_, max_, keyPattern ) );
+ // This reformats _min and _max to be used for index lookup.
+ massert( 10365 , errmsg, indexDetailsForRange( _fbs->ns(), errmsg, _min, _max, keyPattern ) );
}
- NamespaceDetails *d = nsdetails(ns);
- plans_.push_back( PlanPtr( new QueryPlan( d, d->idxNo(id), *fbs_, _originalQuery, order_, min_, max_ ) ) );
+ NamespaceDetails *d = nsdetails(_ns);
+ _plans.push_back( QueryPlanPtr( new QueryPlan( d, d->idxNo(id), *_fbs, *_originalFrs, _originalQuery, _order, _min, _max ) ) );
}
-
+
// returns an IndexDetails * for a hint, 0 if hint is $natural.
// hint must not be eoo()
IndexDetails *parseHint( const BSONElement &hint, NamespaceDetails *d ) {
@@ -281,7 +309,7 @@ namespace mongo {
}
}
}
- else if( hint.type() == Object ) {
+ else if( hint.type() == Object ) {
BSONObj hintobj = hint.embeddedObject();
uassert( 10112 , "bad hint", !hintobj.isEmpty() );
if ( !strcmp( hintobj.firstElement().fieldName(), "$natural" ) ) {
@@ -294,92 +322,93 @@ namespace mongo {
return &ii;
}
}
- }
+ }
uassert( 10113 , "bad hint", false );
return 0;
}
-
+
void QueryPlanSet::init() {
DEBUGQO( "QueryPlanSet::init " << ns << "\t" << _originalQuery );
- plans_.clear();
- mayRecordPlan_ = true;
- usingPrerecordedPlan_ = false;
-
- const char *ns = fbs_->ns();
+ _plans.clear();
+ _mayRecordPlan = true;
+ _usingPrerecordedPlan = false;
+
+ const char *ns = _fbs->ns();
NamespaceDetails *d = nsdetails( ns );
- if ( !d || !fbs_->matchPossible() ) {
+ if ( !d || !_fbs->matchPossible() ) {
// Table scan plan, when no matches are possible
- plans_.push_back( PlanPtr( new QueryPlan( d, -1, *fbs_, _originalQuery, order_ ) ) );
+ _plans.push_back( QueryPlanPtr( new QueryPlan( d, -1, *_fbs, *_originalFrs, _originalQuery, _order ) ) );
return;
}
-
- BSONElement hint = hint_.firstElement();
+
+ BSONElement hint = _hint.firstElement();
if ( !hint.eoo() ) {
- mayRecordPlan_ = false;
+ _mayRecordPlan = false;
IndexDetails *id = parseHint( hint, d );
if ( id ) {
addHint( *id );
- } else {
- massert( 10366 , "natural order cannot be specified with $min/$max", min_.isEmpty() && max_.isEmpty() );
+ }
+ else {
+ massert( 10366 , "natural order cannot be specified with $min/$max", _min.isEmpty() && _max.isEmpty() );
// Table scan plan
- plans_.push_back( PlanPtr( new QueryPlan( d, -1, *fbs_, _originalQuery, order_ ) ) );
+ _plans.push_back( QueryPlanPtr( new QueryPlan( d, -1, *_fbs, *_originalFrs, _originalQuery, _order ) ) );
}
return;
}
-
- if ( !min_.isEmpty() || !max_.isEmpty() ) {
+
+ if ( !_min.isEmpty() || !_max.isEmpty() ) {
string errmsg;
BSONObj keyPattern;
- IndexDetails *idx = indexDetailsForRange( ns, errmsg, min_, max_, keyPattern );
+ IndexDetails *idx = indexDetailsForRange( ns, errmsg, _min, _max, keyPattern );
massert( 10367 , errmsg, idx );
- plans_.push_back( PlanPtr( new QueryPlan( d, d->idxNo(*idx), *fbs_, _originalQuery, order_, min_, max_ ) ) );
+ _plans.push_back( QueryPlanPtr( new QueryPlan( d, d->idxNo(*idx), *_fbs, *_originalFrs, _originalQuery, _order, _min, _max ) ) );
return;
}
- if ( isSimpleIdQuery( _originalQuery ) ){
+ if ( isSimpleIdQuery( _originalQuery ) ) {
int idx = d->findIdIndex();
- if ( idx >= 0 ){
- usingPrerecordedPlan_ = true;
- mayRecordPlan_ = false;
- plans_.push_back( PlanPtr( new QueryPlan( d , idx , *fbs_ , _originalQuery, order_ ) ) );
+ if ( idx >= 0 ) {
+ _usingPrerecordedPlan = true;
+ _mayRecordPlan = false;
+ _plans.push_back( QueryPlanPtr( new QueryPlan( d , idx , *_fbs , *_fbs , _originalQuery, _order ) ) );
return;
}
}
- if ( _originalQuery.isEmpty() && order_.isEmpty() ){
- plans_.push_back( PlanPtr( new QueryPlan( d, -1, *fbs_, _originalQuery, order_ ) ) );
+ if ( _originalQuery.isEmpty() && _order.isEmpty() ) {
+ _plans.push_back( QueryPlanPtr( new QueryPlan( d, -1, *_fbs, *_originalFrs, _originalQuery, _order ) ) );
return;
}
- DEBUGQO( "\t special : " << fbs_->getSpecial() );
- if ( fbs_->getSpecial().size() ){
- _special = fbs_->getSpecial();
+ DEBUGQO( "\t special : " << _fbs->getSpecial() );
+ if ( _fbs->getSpecial().size() ) {
+ _special = _fbs->getSpecial();
NamespaceDetails::IndexIterator i = d->ii();
while( i.more() ) {
int j = i.pos();
IndexDetails& ii = i.next();
const IndexSpec& spec = ii.getSpec();
- if ( spec.getTypeName() == _special && spec.suitability( _originalQuery , order_ ) ){
- usingPrerecordedPlan_ = true;
- mayRecordPlan_ = false;
- plans_.push_back( PlanPtr( new QueryPlan( d , j , *fbs_ , _originalQuery, order_ ,
- BSONObj() , BSONObj() , _special ) ) );
+ if ( spec.getTypeName() == _special && spec.suitability( _originalQuery , _order ) ) {
+ _usingPrerecordedPlan = true;
+ _mayRecordPlan = false;
+ _plans.push_back( QueryPlanPtr( new QueryPlan( d , j , *_fbs , *_fbs , _originalQuery, _order ,
+ BSONObj() , BSONObj() , _special ) ) );
return;
}
}
uassert( 13038 , (string)"can't find special index: " + _special + " for: " + _originalQuery.toString() , 0 );
}
- if ( honorRecordedPlan_ ) {
+ if ( _honorRecordedPlan ) {
scoped_lock lk(NamespaceDetailsTransient::_qcMutex);
NamespaceDetailsTransient& nsd = NamespaceDetailsTransient::get_inlock( ns );
- BSONObj bestIndex = nsd.indexForPattern( fbs_->pattern( order_ ) );
+ BSONObj bestIndex = nsd.indexForPattern( _fbs->pattern( _order ) );
if ( !bestIndex.isEmpty() ) {
- PlanPtr p;
- oldNScanned_ = nsd.nScannedForPattern( fbs_->pattern( order_ ) );
+ QueryPlanPtr p;
+ _oldNScanned = nsd.nScannedForPattern( _fbs->pattern( _order ) );
if ( !strcmp( bestIndex.firstElement().fieldName(), "$natural" ) ) {
// Table scan plan
- p.reset( new QueryPlan( d, -1, *fbs_, _originalQuery, order_ ) );
+ p.reset( new QueryPlan( d, -1, *_fbs, *_originalFrs, _originalQuery, _order ) );
}
NamespaceDetails::IndexIterator i = d->ii();
@@ -387,55 +416,56 @@ namespace mongo {
int j = i.pos();
IndexDetails& ii = i.next();
if( ii.keyPattern().woCompare(bestIndex) == 0 ) {
- p.reset( new QueryPlan( d, j, *fbs_, _originalQuery, order_ ) );
+ p.reset( new QueryPlan( d, j, *_fbs, *_originalFrs, _originalQuery, _order ) );
}
}
massert( 10368 , "Unable to locate previously recorded index", p.get() );
if ( !( _bestGuessOnly && p->scanAndOrderRequired() ) ) {
- usingPrerecordedPlan_ = true;
- mayRecordPlan_ = false;
- plans_.push_back( p );
+ _usingPrerecordedPlan = true;
+ _mayRecordPlan = false;
+ _plans.push_back( p );
return;
}
}
}
-
+
addOtherPlans( false );
}
-
+
void QueryPlanSet::addOtherPlans( bool checkFirst ) {
- const char *ns = fbs_->ns();
+ const char *ns = _fbs->ns();
NamespaceDetails *d = nsdetails( ns );
if ( !d )
return;
// If table scan is optimal or natural order requested or tailable cursor requested
- if ( !fbs_->matchPossible() || ( fbs_->nNontrivialRanges() == 0 && order_.isEmpty() ) ||
- ( !order_.isEmpty() && !strcmp( order_.firstElement().fieldName(), "$natural" ) ) ) {
+ if ( !_fbs->matchPossible() || ( _fbs->nNontrivialRanges() == 0 && _order.isEmpty() ) ||
+ ( !_order.isEmpty() && !strcmp( _order.firstElement().fieldName(), "$natural" ) ) ) {
// Table scan plan
- addPlan( PlanPtr( new QueryPlan( d, -1, *fbs_, _originalQuery, order_ ) ), checkFirst );
+ addPlan( QueryPlanPtr( new QueryPlan( d, -1, *_fbs, *_originalFrs, _originalQuery, _order ) ), checkFirst );
return;
}
-
- bool normalQuery = hint_.isEmpty() && min_.isEmpty() && max_.isEmpty();
+
+ bool normalQuery = _hint.isEmpty() && _min.isEmpty() && _max.isEmpty();
PlanSet plans;
for( int i = 0; i < d->nIndexes; ++i ) {
IndexDetails& id = d->idx(i);
const IndexSpec& spec = id.getSpec();
IndexSuitability suitability = HELPFUL;
- if ( normalQuery ){
- suitability = spec.suitability( fbs_->simplifiedQuery() , order_ );
+ if ( normalQuery ) {
+ suitability = spec.suitability( _fbs->simplifiedQuery() , _order );
if ( suitability == USELESS )
continue;
}
- PlanPtr p( new QueryPlan( d, i, *fbs_, _originalQuery, order_ ) );
+ QueryPlanPtr p( new QueryPlan( d, i, *_fbs, *_originalFrs, _originalQuery, _order ) );
if ( p->optimal() ) {
addPlan( p, checkFirst );
return;
- } else if ( !p->unhelpful() ) {
+ }
+ else if ( !p->unhelpful() ) {
plans.push_back( p );
}
}
@@ -443,29 +473,29 @@ namespace mongo {
addPlan( *i, checkFirst );
// Table scan plan
- addPlan( PlanPtr( new QueryPlan( d, -1, *fbs_, _originalQuery, order_ ) ), checkFirst );
+ addPlan( QueryPlanPtr( new QueryPlan( d, -1, *_fbs, *_originalFrs, _originalQuery, _order ) ), checkFirst );
}
-
+
shared_ptr< QueryOp > QueryPlanSet::runOp( QueryOp &op ) {
- if ( usingPrerecordedPlan_ ) {
+ if ( _usingPrerecordedPlan ) {
Runner r( *this, op );
shared_ptr< QueryOp > res = r.run();
- // plans_.size() > 1 if addOtherPlans was called in Runner::run().
- if ( _bestGuessOnly || res->complete() || plans_.size() > 1 )
+ // _plans.size() > 1 if addOtherPlans was called in Runner::run().
+ if ( _bestGuessOnly || res->complete() || _plans.size() > 1 )
return res;
{
scoped_lock lk(NamespaceDetailsTransient::_qcMutex);
- NamespaceDetailsTransient::get_inlock( fbs_->ns() ).registerIndexForPattern( fbs_->pattern( order_ ), BSONObj(), 0 );
+ NamespaceDetailsTransient::get_inlock( _fbs->ns() ).registerIndexForPattern( _fbs->pattern( _order ), BSONObj(), 0 );
}
init();
}
Runner r( *this, op );
return r.run();
}
-
+
BSONObj QueryPlanSet::explain() const {
vector< BSONObj > arr;
- for( PlanSet::const_iterator i = plans_.begin(); i != plans_.end(); ++i ) {
+ for( PlanSet::const_iterator i = _plans.begin(); i != _plans.end(); ++i ) {
shared_ptr<Cursor> c = (*i)->newCursor();
BSONObjBuilder explain;
explain.append( "cursor", c->toString() );
@@ -477,37 +507,37 @@ namespace mongo {
return b.obj();
}
- QueryPlanSet::PlanPtr QueryPlanSet::getBestGuess() const {
- assert( plans_.size() );
- if ( plans_[ 0 ]->scanAndOrderRequired() ){
- for ( unsigned i=1; i<plans_.size(); i++ ){
- if ( ! plans_[i]->scanAndOrderRequired() )
- return plans_[i];
+ QueryPlanSet::QueryPlanPtr QueryPlanSet::getBestGuess() const {
+ assert( _plans.size() );
+ if ( _plans[ 0 ]->scanAndOrderRequired() ) {
+ for ( unsigned i=1; i<_plans.size(); i++ ) {
+ if ( ! _plans[i]->scanAndOrderRequired() )
+ return _plans[i];
}
-
+
stringstream ss;
ss << "best guess plan requested, but scan and order required:";
- ss << " query: " << fbs_->simplifiedQuery();
- ss << " order: " << order_;
+ ss << " query: " << _fbs->simplifiedQuery();
+ ss << " order: " << _order;
ss << " choices: ";
- for ( unsigned i=0; i<plans_.size(); i++ ){
- ss << plans_[i]->indexKey() << " ";
+ for ( unsigned i=0; i<_plans.size(); i++ ) {
+ ss << _plans[i]->indexKey() << " ";
}
string s = ss.str();
msgassertedNoTrace( 13284, s.c_str() );
}
- return plans_[0];
+ return _plans[0];
}
-
+
QueryPlanSet::Runner::Runner( QueryPlanSet &plans, QueryOp &op ) :
- op_( op ),
- plans_( plans ) {
+ _op( op ),
+ _plans( plans ) {
}
-
+
void QueryPlanSet::Runner::mayYield( const vector< shared_ptr< QueryOp > > &ops ) {
- if ( plans_._mayYield ) {
- if ( plans_._yieldSometimesTracker.ping() ) {
+ if ( _plans._mayYield ) {
+ if ( _plans._yieldSometimesTracker.ping() ) {
int micros = ClientCursor::yieldSuggest();
if ( micros > 0 ) {
for( vector< shared_ptr< QueryOp > >::const_iterator i = ops.begin(); i != ops.end(); ++i ) {
@@ -515,28 +545,38 @@ namespace mongo {
return;
}
}
- ClientCursor::staticYield( micros );
+ ClientCursor::staticYield( micros , _plans._ns );
for( vector< shared_ptr< QueryOp > >::const_iterator i = ops.begin(); i != ops.end(); ++i ) {
recoverFromYield( **i );
- }
+ }
}
}
- }
+ }
}
-
+
+ struct OpHolder {
+ OpHolder( const shared_ptr< QueryOp > &op ) : _op( op ), _offset() {}
+ shared_ptr< QueryOp > _op;
+ long long _offset;
+ bool operator<( const OpHolder &other ) const {
+ return _op->nscanned() + _offset > other._op->nscanned() + other._offset;
+ }
+ };
+
shared_ptr< QueryOp > QueryPlanSet::Runner::run() {
- massert( 10369 , "no plans", plans_.plans_.size() > 0 );
-
+ massert( 10369 , "no plans", _plans._plans.size() > 0 );
+
vector< shared_ptr< QueryOp > > ops;
- if ( plans_._bestGuessOnly ) {
- shared_ptr< QueryOp > op( op_.createChild() );
- op->setQueryPlan( plans_.getBestGuess().get() );
- ops.push_back( op );
- } else {
- if ( plans_.plans_.size() > 1 )
- log(1) << " running multiple plans" << endl;
- for( PlanSet::iterator i = plans_.plans_.begin(); i != plans_.plans_.end(); ++i ) {
- shared_ptr< QueryOp > op( op_.createChild() );
+ if ( _plans._bestGuessOnly ) {
+ shared_ptr< QueryOp > op( _op.createChild() );
+ op->setQueryPlan( _plans.getBestGuess().get() );
+ ops.push_back( op );
+ }
+ else {
+ if ( _plans._plans.size() > 1 )
+ log(1) << " running multiple plans" << endl;
+ for( PlanSet::iterator i = _plans._plans.begin(); i != _plans._plans.end(); ++i ) {
+ shared_ptr< QueryOp > op( _op.createChild() );
op->setQueryPlan( i->get() );
ops.push_back( op );
}
@@ -547,53 +587,51 @@ namespace mongo {
if ( (*i)->complete() )
return *i;
}
-
- long long nScanned = 0;
- long long nScannedBackup = 0;
- while( 1 ) {
- ++nScanned;
- unsigned errCount = 0;
- bool first = true;
- for( vector< shared_ptr< QueryOp > >::iterator i = ops.begin(); i != ops.end(); ++i ) {
- mayYield( ops );
- QueryOp &op = **i;
- nextOp( op );
- if ( op.complete() ) {
- if ( first ) {
- nScanned += nScannedBackup;
- }
- if ( plans_.mayRecordPlan_ && op.mayRecordPlan() ) {
- op.qp().registerSelf( nScanned );
- }
- return *i;
+
+ std::priority_queue< OpHolder > queue;
+ for( vector< shared_ptr< QueryOp > >::iterator i = ops.begin(); i != ops.end(); ++i ) {
+ if ( !(*i)->error() ) {
+ queue.push( *i );
+ }
+ }
+
+ while( !queue.empty() ) {
+ mayYield( ops );
+ OpHolder holder = queue.top();
+ queue.pop();
+ QueryOp &op = *holder._op;
+ nextOp( op );
+ if ( op.complete() ) {
+ if ( _plans._mayRecordPlan && op.mayRecordPlan() ) {
+ op.qp().registerSelf( op.nscanned() );
}
- if ( op.error() )
- ++errCount;
- first = false;
+ return holder._op;
}
- if ( errCount == ops.size() )
- break;
- if ( !plans_._bestGuessOnly && plans_.usingPrerecordedPlan_ && nScanned > plans_.oldNScanned_ * 10 && plans_._special.empty() ) {
- plans_.addOtherPlans( true );
- PlanSet::iterator i = plans_.plans_.begin();
+ if ( op.error() ) {
+ continue;
+ }
+ queue.push( holder );
+ if ( !_plans._bestGuessOnly && _plans._usingPrerecordedPlan && op.nscanned() > _plans._oldNScanned * 10 && _plans._special.empty() ) {
+ holder._offset = -op.nscanned();
+ _plans.addOtherPlans( true );
+ PlanSet::iterator i = _plans._plans.begin();
++i;
- for( ; i != plans_.plans_.end(); ++i ) {
- shared_ptr< QueryOp > op( op_.createChild() );
+ for( ; i != _plans._plans.end(); ++i ) {
+ shared_ptr< QueryOp > op( _op.createChild() );
op->setQueryPlan( i->get() );
ops.push_back( op );
initOp( *op );
if ( op->complete() )
return op;
- }
- plans_.mayRecordPlan_ = true;
- plans_.usingPrerecordedPlan_ = false;
- nScannedBackup = nScanned;
- nScanned = 0;
+ queue.push( op );
+ }
+ _plans._mayRecordPlan = true;
+ _plans._usingPrerecordedPlan = false;
}
}
return ops[ 0 ];
}
-
+
#define GUARD_OP_EXCEPTION( op, expression ) \
try { \
expression; \
@@ -607,8 +645,8 @@ namespace mongo {
catch ( ... ) { \
op.setException( ExceptionInfo( "Caught unknown exception" , 0 ) ); \
}
-
-
+
+
void QueryPlanSet::Runner::initOp( QueryOp &op ) {
GUARD_OP_EXCEPTION( op, op.init() );
}
@@ -619,39 +657,39 @@ namespace mongo {
bool QueryPlanSet::Runner::prepareToYield( QueryOp &op ) {
GUARD_OP_EXCEPTION( op,
- if ( op.error() ) {
- return true;
- } else {
- return op.prepareToYield();
- } );
+ if ( op.error() ) {
+ return true;
+ }
+ else {
+ return op.prepareToYield();
+ } );
return true;
}
void QueryPlanSet::Runner::recoverFromYield( QueryOp &op ) {
GUARD_OP_EXCEPTION( op, if ( !op.error() ) { op.recoverFromYield(); } );
}
-
-
+
+
MultiPlanScanner::MultiPlanScanner( const char *ns,
- const BSONObj &query,
- const BSONObj &order,
- const BSONElement *hint,
- bool honorRecordedPlan,
- const BSONObj &min,
- const BSONObj &max,
- bool bestGuessOnly,
- bool mayYield ) :
- _ns( ns ),
- _or( !query.getField( "$or" ).eoo() ),
- _query( query.getOwned() ),
- _fros( ns, _query ),
- _i(),
- _honorRecordedPlan( honorRecordedPlan ),
- _bestGuessOnly( bestGuessOnly ),
- _hint( ( hint && !hint->eoo() ) ? hint->wrap() : BSONObj() ),
- _mayYield( mayYield ),
- _tableScanned()
- {
+ const BSONObj &query,
+ const BSONObj &order,
+ const BSONElement *hint,
+ bool honorRecordedPlan,
+ const BSONObj &min,
+ const BSONObj &max,
+ bool bestGuessOnly,
+ bool mayYield ) :
+ _ns( ns ),
+ _or( !query.getField( "$or" ).eoo() ),
+ _query( query.getOwned() ),
+ _fros( ns, _query ),
+ _i(),
+ _honorRecordedPlan( honorRecordedPlan ),
+ _bestGuessOnly( bestGuessOnly ),
+ _hint( ( hint && !hint->eoo() ) ? hint->wrap() : BSONObj() ),
+ _mayYield( mayYield ),
+ _tableScanned() {
if ( !order.isEmpty() || !min.isEmpty() || !max.isEmpty() || !_fros.getSpecial().empty() ) {
_or = false;
}
@@ -661,8 +699,10 @@ namespace mongo {
// if _or == false, don't use or clauses for index selection
if ( !_or ) {
auto_ptr< FieldRangeSet > frs( new FieldRangeSet( ns, _query ) );
- _currentQps.reset( new QueryPlanSet( ns, frs, _query, order, hint, honorRecordedPlan, min, max, _bestGuessOnly, _mayYield ) );
- } else {
+ auto_ptr< FieldRangeSet > oldFrs( new FieldRangeSet( *frs ) );
+ _currentQps.reset( new QueryPlanSet( ns, frs, oldFrs, _query, order, hint, honorRecordedPlan, min, max, _bestGuessOnly, _mayYield ) );
+ }
+ else {
BSONElement e = _query.getField( "$or" );
massert( 13268, "invalid $or spec", e.type() == Array && e.embeddedObject().nFields() > 0 );
}
@@ -676,16 +716,17 @@ namespace mongo {
}
++_i;
auto_ptr< FieldRangeSet > frs( _fros.topFrs() );
+ auto_ptr< FieldRangeSet > originalFrs( _fros.topFrsOriginal() );
BSONElement hintElt = _hint.firstElement();
- _currentQps.reset( new QueryPlanSet( _ns, frs, _query, BSONObj(), &hintElt, _honorRecordedPlan, BSONObj(), BSONObj(), _bestGuessOnly, _mayYield ) );
+ _currentQps.reset( new QueryPlanSet( _ns, frs, originalFrs, _query, BSONObj(), &hintElt, _honorRecordedPlan, BSONObj(), BSONObj(), _bestGuessOnly, _mayYield ) );
shared_ptr< QueryOp > ret( _currentQps->runOp( op ) );
if ( ret->qp().willScanTable() ) {
_tableScanned = true;
}
- _fros.popOrClause();
+ _fros.popOrClause( ret->qp().indexed() ? ret->qp().indexKey() : BSONObj() );
return ret;
}
-
+
shared_ptr< QueryOp > MultiPlanScanner::runOp( QueryOp &op ) {
shared_ptr< QueryOp > ret = runOpOnce( op );
while( !ret->stopRequested() && mayRunMore() ) {
@@ -693,7 +734,7 @@ namespace mongo {
}
return ret;
}
-
+
bool MultiPlanScanner::uselessOr( const BSONElement &hint ) const {
NamespaceDetails *nsd = nsdetails( _ns );
if ( !nsd ) {
@@ -713,7 +754,8 @@ namespace mongo {
if ( id->getSpec().suitability( *i, BSONObj() ) == USELESS ) {
return true;
}
- } else {
+ }
+ else {
bool useful = false;
NamespaceDetails::IndexIterator j = nsd->ii();
while( j.more() ) {
@@ -725,12 +767,12 @@ namespace mongo {
}
if ( !useful ) {
return true;
- }
+ }
}
}
return false;
}
-
+
bool indexWorks( const BSONObj &idxPattern, const BSONObj &sampleKey, int direction, int firstSignificantField ) {
BSONObjIterator p( idxPattern );
BSONObjIterator k( sampleKey );
@@ -761,19 +803,19 @@ namespace mongo {
int idxDirection = e.number() >= 0 ? 1 : -1;
int direction = idxDirection * baseDirection;
switch( direction ) {
- case 1:
- b.appendMaxKey( e.fieldName() );
- break;
- case -1:
- b.appendMinKey( e.fieldName() );
- break;
- default:
- assert( false );
+ case 1:
+ b.appendMaxKey( e.fieldName() );
+ break;
+ case -1:
+ b.appendMinKey( e.fieldName() );
+ break;
+ default:
+ assert( false );
}
}
- return b.obj();
+ return b.obj();
}
-
+
pair< int, int > keyAudit( const BSONObj &min, const BSONObj &max ) {
int direction = 0;
int firstSignificantField = 0;
@@ -802,18 +844,19 @@ namespace mongo {
pair< int, int > flexibleKeyAudit( const BSONObj &min, const BSONObj &max ) {
if ( min.isEmpty() || max.isEmpty() ) {
return make_pair( 1, -1 );
- } else {
+ }
+ else {
return keyAudit( min, max );
}
}
-
+
// NOTE min, max, and keyPattern will be updated to be consistent with the selected index.
IndexDetails *indexDetailsForRange( const char *ns, string &errmsg, BSONObj &min, BSONObj &max, BSONObj &keyPattern ) {
if ( min.isEmpty() && max.isEmpty() ) {
errmsg = "one of min or max must be specified";
return 0;
}
-
+
Client::Context ctx( ns );
IndexDetails *id = 0;
NamespaceDetails *d = nsdetails( ns );
@@ -821,7 +864,7 @@ namespace mongo {
errmsg = "ns not found";
return 0;
}
-
+
pair< int, int > ret = flexibleKeyAudit( min, max );
if ( ret == make_pair( -1, -1 ) ) {
errmsg = "min and max keys do not share pattern";
@@ -832,15 +875,16 @@ namespace mongo {
while( i.more() ) {
IndexDetails& ii = i.next();
if ( indexWorks( ii.keyPattern(), min.isEmpty() ? max : min, ret.first, ret.second ) ) {
- if ( ii.getSpec().getType() == 0 ){
+ if ( ii.getSpec().getType() == 0 ) {
id = &ii;
keyPattern = ii.keyPattern();
break;
}
}
}
-
- } else {
+
+ }
+ else {
if ( !indexWorks( keyPattern, min.isEmpty() ? max : min, ret.first, ret.second ) ) {
errmsg = "requested keyPattern does not match specified keys";
return 0;
@@ -853,30 +897,31 @@ namespace mongo {
break;
}
if ( keyPattern.nFields() == 1 && ii.keyPattern().nFields() == 1 &&
- IndexDetails::isIdIndexPattern( keyPattern ) &&
- ii.isIdIndex() ){
+ IndexDetails::isIdIndexPattern( keyPattern ) &&
+ ii.isIdIndex() ) {
id = &ii;
break;
}
-
+
}
}
if ( min.isEmpty() ) {
min = extremeKeyForIndex( keyPattern, -1 );
- } else if ( max.isEmpty() ) {
+ }
+ else if ( max.isEmpty() ) {
max = extremeKeyForIndex( keyPattern, 1 );
}
-
+
if ( !id ) {
errmsg = (string)"no index found for specified keyPattern: " + keyPattern.toString();
return 0;
}
-
+
min = min.extractFieldsUnDotted( keyPattern );
max = max.extractFieldsUnDotted( keyPattern );
return id;
}
-
+
} // namespace mongo
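
One note on the new OpHolder scheduling above: operator< is deliberately inverted, so std::priority_queue surfaces the candidate with the smallest nscanned() + _offset, meaning the least-advanced plan is advanced next. A self-contained sketch of the same idea:

    // Standalone illustration of the inverted comparator used by OpHolder.
    #include <queue>

    struct Item {
        long long nscanned;
        long long offset;
        bool operator<( const Item &other ) const {
            // reversed: the item that has done the *least* work becomes top()
            return nscanned + offset > other.nscanned + other.offset;
        }
    };

    int main() {
        std::priority_queue< Item > q;
        Item a = { 100, 0 };
        Item b = { 3, 0 };
        q.push( a );
        q.push( b );
        // q.top() is b (nscanned == 3): the least-advanced plan runs next.
        return 0;
    }
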
diff --git a/db/queryoptimizer.h b/db/queryoptimizer.h
index 8314bfa..cf3180a 100644
--- a/db/queryoptimizer.h
+++ b/db/queryoptimizer.h
@@ -25,15 +25,17 @@
#include "../util/message.h"
namespace mongo {
-
+
class IndexDetails;
class IndexType;
class QueryPlan : boost::noncopyable {
public:
- QueryPlan(NamespaceDetails *_d,
- int _idxNo, // -1 = no index
+
+ QueryPlan(NamespaceDetails *d,
+ int idxNo, // -1 = no index
const FieldRangeSet &fbs,
+ const FieldRangeSet &originalFrs,
const BSONObj &originalQuery,
const BSONObj &order,
const BSONObj &startKey = BSONObj(),
@@ -41,44 +43,50 @@ namespace mongo {
string special="" );
/* If true, no other index can do better. */
- bool optimal() const { return optimal_; }
+ bool optimal() const { return _optimal; }
/* ScanAndOrder processing will be required if true */
- bool scanAndOrderRequired() const { return scanAndOrderRequired_; }
+ bool scanAndOrderRequired() const { return _scanAndOrderRequired; }
/* When true, the index we are using has keys such that it can completely resolve the
query expression to match by itself without ever checking the main object.
*/
- bool exactKeyMatch() const { return exactKeyMatch_; }
- /* If true, the startKey and endKey are unhelpful and the index order doesn't match the
+ bool exactKeyMatch() const { return _exactKeyMatch; }
+ /* If true, the startKey and endKey are unhelpful and the index order doesn't match the
requested sort order */
- bool unhelpful() const { return unhelpful_; }
- int direction() const { return direction_; }
+ bool unhelpful() const { return _unhelpful; }
+ int direction() const { return _direction; }
shared_ptr<Cursor> newCursor( const DiskLoc &startLoc = DiskLoc() , int numWanted=0 ) const;
shared_ptr<Cursor> newReverseCursor() const;
BSONObj indexKey() const;
- bool willScanTable() const { return !index_ && fbs_.matchPossible(); }
- const char *ns() const { return fbs_.ns(); }
- NamespaceDetails *nsd() const { return d; }
+ bool indexed() const { return _index; }
+ bool willScanTable() const { return !_index && _fbs.matchPossible(); }
+ const char *ns() const { return _fbs.ns(); }
+ NamespaceDetails *nsd() const { return _d; }
BSONObj originalQuery() const { return _originalQuery; }
- BSONObj simplifiedQuery( const BSONObj& fields = BSONObj() ) const { return fbs_.simplifiedQuery( fields ); }
- const FieldRange &range( const char *fieldName ) const { return fbs_.range( fieldName ); }
+ BSONObj simplifiedQuery( const BSONObj& fields = BSONObj() ) const { return _fbs.simplifiedQuery( fields ); }
+ const FieldRange &range( const char *fieldName ) const { return _fbs.range( fieldName ); }
void registerSelf( long long nScanned ) const;
+ shared_ptr< FieldRangeVector > originalFrv() const { return _originalFrv; }
+ // just for testing
shared_ptr< FieldRangeVector > frv() const { return _frv; }
+ bool isMultiKey() const;
+
private:
- NamespaceDetails *d;
- int idxNo;
- const FieldRangeSet &fbs_;
+ NamespaceDetails * _d;
+ int _idxNo;
+ const FieldRangeSet &_fbs;
const BSONObj &_originalQuery;
- const BSONObj &order_;
- const IndexDetails *index_;
- bool optimal_;
- bool scanAndOrderRequired_;
- bool exactKeyMatch_;
- int direction_;
+ const BSONObj &_order;
+ const IndexDetails * _index;
+ bool _optimal;
+ bool _scanAndOrderRequired;
+ bool _exactKeyMatch;
+ int _direction;
shared_ptr< FieldRangeVector > _frv;
+ shared_ptr< FieldRangeVector > _originalFrv;
BSONObj _startKey;
BSONObj _endKey;
- bool endKeyInclusive_;
- bool unhelpful_;
+ bool _endKeyInclusive;
+ bool _unhelpful;
string _special;
IndexType * _type;
bool _startOrEndSpec;
@@ -93,16 +101,17 @@ namespace mongo {
// Used when handing off from one QueryOp type to another
QueryOp( const QueryOp &other ) :
- _complete(), _stopRequested(), _qp(), _error(), _matcher( other._matcher ),
- _orConstraint( other._orConstraint ) {}
-
+ _complete(), _stopRequested(), _qp(), _error(), _matcher( other._matcher ),
+ _orConstraint( other._orConstraint ) {}
+
virtual ~QueryOp() {}
-
+
/** these gets called after a query plan is set */
- void init() {
+ void init() {
if ( _oldMatcher.get() ) {
_matcher.reset( _oldMatcher->nextClauseMatcher( qp().indexKey() ) );
- } else {
+ }
+ else {
_matcher.reset( new CoveredIndexMatcher( qp().originalQuery(), qp().indexKey(), alwaysUseRecord() ) );
}
_init();
@@ -110,10 +119,12 @@ namespace mongo {
virtual void next() = 0;
virtual bool mayRecordPlan() const = 0;
-
+
virtual bool prepareToYield() { massert( 13335, "yield not supported", false ); return false; }
virtual void recoverFromYield() { massert( 13336, "yield not supported", false ); }
-
+
+ virtual long long nscanned() = 0;
+
/** @return a copy of the inheriting class, which will be run with its own
query plan. If multiple plan sets are required for an $or query,
the QueryOp of the winning plan from a given set will be cloned
@@ -143,17 +154,17 @@ namespace mongo {
shared_ptr< CoveredIndexMatcher > matcher() const { return _matcher; }
protected:
void setComplete() {
- _orConstraint = qp().frv();
+ _orConstraint = qp().originalFrv();
_complete = true;
}
void setStop() { setComplete(); _stopRequested = true; }
virtual void _init() = 0;
-
+
virtual QueryOp *_createChild() const = 0;
-
+
virtual bool alwaysUseRecord() const { return false; }
-
+
private:
bool _complete;
bool _stopRequested;
@@ -164,42 +175,47 @@ namespace mongo {
shared_ptr< CoveredIndexMatcher > _oldMatcher;
shared_ptr< FieldRangeVector > _orConstraint;
};
-
+
// Set of candidate query plans for a particular query. Used for running
// a QueryOp on these plans.
class QueryPlanSet {
public:
- typedef boost::shared_ptr< QueryPlan > PlanPtr;
- typedef vector< PlanPtr > PlanSet;
+ typedef boost::shared_ptr< QueryPlan > QueryPlanPtr;
+ typedef vector< QueryPlanPtr > PlanSet;
QueryPlanSet( const char *ns,
- auto_ptr< FieldRangeSet > frs,
- const BSONObj &originalQuery,
- const BSONObj &order,
- const BSONElement *hint = 0,
- bool honorRecordedPlan = true,
- const BSONObj &min = BSONObj(),
- const BSONObj &max = BSONObj(),
- bool bestGuessOnly = false,
- bool mayYield = false);
- int nPlans() const { return plans_.size(); }
+ auto_ptr< FieldRangeSet > frs,
+ auto_ptr< FieldRangeSet > originalFrs,
+ const BSONObj &originalQuery,
+ const BSONObj &order,
+ const BSONElement *hint = 0,
+ bool honorRecordedPlan = true,
+ const BSONObj &min = BSONObj(),
+ const BSONObj &max = BSONObj(),
+ bool bestGuessOnly = false,
+ bool mayYield = false);
+ int nPlans() const { return _plans.size(); }
shared_ptr< QueryOp > runOp( QueryOp &op );
template< class T >
shared_ptr< T > runOp( T &op ) {
return dynamic_pointer_cast< T >( runOp( static_cast< QueryOp& >( op ) ) );
}
BSONObj explain() const;
- bool usingPrerecordedPlan() const { return usingPrerecordedPlan_; }
- PlanPtr getBestGuess() const;
+ bool usingPrerecordedPlan() const { return _usingPrerecordedPlan; }
+ QueryPlanPtr getBestGuess() const;
//for testing
- const FieldRangeSet &fbs() const { return *fbs_; }
+ const FieldRangeSet &fbs() const { return *_fbs; }
+ const FieldRangeSet &originalFrs() const { return *_originalFrs; }
+ bool modifiedKeys() const;
+ bool hasMultiKey() const;
+
private:
void addOtherPlans( bool checkFirst );
- void addPlan( PlanPtr plan, bool checkFirst ) {
- if ( checkFirst && plan->indexKey().woCompare( plans_[ 0 ]->indexKey() ) == 0 )
+ void addPlan( QueryPlanPtr plan, bool checkFirst ) {
+ if ( checkFirst && plan->indexKey().woCompare( _plans[ 0 ]->indexKey() ) == 0 )
return;
- plans_.push_back( plan );
+ _plans.push_back( plan );
}
void init();
void addHint( IndexDetails &id );
@@ -207,25 +223,27 @@ namespace mongo {
Runner( QueryPlanSet &plans, QueryOp &op );
shared_ptr< QueryOp > run();
void mayYield( const vector< shared_ptr< QueryOp > > &ops );
- QueryOp &op_;
- QueryPlanSet &plans_;
+ QueryOp &_op;
+ QueryPlanSet &_plans;
static void initOp( QueryOp &op );
static void nextOp( QueryOp &op );
static bool prepareToYield( QueryOp &op );
static void recoverFromYield( QueryOp &op );
};
- const char *ns;
+
+ const char *_ns;
BSONObj _originalQuery;
- auto_ptr< FieldRangeSet > fbs_;
- PlanSet plans_;
- bool mayRecordPlan_;
- bool usingPrerecordedPlan_;
- BSONObj hint_;
- BSONObj order_;
- long long oldNScanned_;
- bool honorRecordedPlan_;
- BSONObj min_;
- BSONObj max_;
+ auto_ptr< FieldRangeSet > _fbs;
+ auto_ptr< FieldRangeSet > _originalFrs;
+ PlanSet _plans;
+ bool _mayRecordPlan;
+ bool _usingPrerecordedPlan;
+ BSONObj _hint;
+ BSONObj _order;
+ long long _oldNScanned;
+ bool _honorRecordedPlan;
+ BSONObj _min;
+ BSONObj _max;
string _special;
bool _bestGuessOnly;
bool _mayYield;
@@ -258,24 +276,24 @@ namespace mongo {
class MultiPlanScanner {
public:
MultiPlanScanner( const char *ns,
- const BSONObj &query,
- const BSONObj &order,
- const BSONElement *hint = 0,
- bool honorRecordedPlan = true,
- const BSONObj &min = BSONObj(),
- const BSONObj &max = BSONObj(),
- bool bestGuessOnly = false,
- bool mayYield = false);
+ const BSONObj &query,
+ const BSONObj &order,
+ const BSONElement *hint = 0,
+ bool honorRecordedPlan = true,
+ const BSONObj &min = BSONObj(),
+ const BSONObj &max = BSONObj(),
+ bool bestGuessOnly = false,
+ bool mayYield = false);
shared_ptr< QueryOp > runOp( QueryOp &op );
template< class T >
shared_ptr< T > runOp( T &op ) {
return dynamic_pointer_cast< T >( runOp( static_cast< QueryOp& >( op ) ) );
- }
+ }
shared_ptr< QueryOp > runOpOnce( QueryOp &op );
template< class T >
shared_ptr< T > runOpOnce( T &op ) {
return dynamic_pointer_cast< T >( runOpOnce( static_cast< QueryOp& >( op ) ) );
- }
+ }
bool mayRunMore() const { return _or ? ( !_tableScanned && !_fros.orFinished() ) : _i == 0; }
BSONObj oldExplain() const { assertNotOr(); return _currentQps->explain(); }
// just report this when only one query op
@@ -284,6 +302,9 @@ namespace mongo {
}
void setBestGuessOnly() { _bestGuessOnly = true; }
void mayYield( bool val ) { _mayYield = val; }
+ bool modifiedKeys() const { return _currentQps->modifiedKeys(); }
+ bool hasMultiKey() const { return _currentQps->hasMultiKey(); }
+
private:
void assertNotOr() const {
massert( 13266, "not implemented for $or query", !_or );
@@ -301,21 +322,22 @@ namespace mongo {
bool _mayYield;
bool _tableScanned;
};
-
+
class MultiCursor : public Cursor {
public:
class CursorOp : public QueryOp {
public:
CursorOp() {}
CursorOp( const QueryOp &other ) : QueryOp( other ) {}
- virtual shared_ptr< Cursor > newCursor() const = 0;
+ virtual shared_ptr< Cursor > newCursor() const = 0;
};
// takes ownership of 'op'
MultiCursor( const char *ns, const BSONObj &pattern, const BSONObj &order, shared_ptr< CursorOp > op = shared_ptr< CursorOp >(), bool mayYield = false )
- : _mps( new MultiPlanScanner( ns, pattern, order, 0, true, BSONObj(), BSONObj(), !op.get(), mayYield ) ) {
+ : _mps( new MultiPlanScanner( ns, pattern, order, 0, true, BSONObj(), BSONObj(), !op.get(), mayYield ) ), _nscanned() {
if ( op.get() ) {
_op = op;
- } else {
+ }
+ else {
_op.reset( new NoOp() );
}
if ( _mps->mayRunMore() ) {
@@ -323,13 +345,14 @@ namespace mongo {
if ( !ok() ) {
advance();
}
- } else {
+ }
+ else {
_c.reset( new BasicCursor( DiskLoc() ) );
}
}
// used to handoff a query to a getMore()
MultiCursor( auto_ptr< MultiPlanScanner > mps, const shared_ptr< Cursor > &c, const shared_ptr< CoveredIndexMatcher > &matcher, const QueryOp &op )
- : _op( new NoOp( op ) ), _c( c ), _mps( mps ), _matcher( matcher ) {
+ : _op( new NoOp( op ) ), _c( c ), _mps( mps ), _matcher( matcher ), _nscanned( -1 ) {
_mps->setBestGuessOnly();
_mps->mayYield( false ); // with a NoOp, there's no need to yield in QueryPlanSet
if ( !ok() ) {
@@ -355,16 +378,24 @@ namespace mongo {
}
virtual void checkLocation() {
_c->checkLocation();
- }
+ }
virtual bool supportGetMore() { return true; }
virtual bool supportYields() { return _c->supportYields(); }
+
// with update we could potentially get the same document on multiple
// indexes, but update appears to already handle this with seenObjects
// so we don't have to do anything special here.
virtual bool getsetdup(DiskLoc loc) {
- return _c->getsetdup( loc );
+ return _c->getsetdup( loc );
}
+
+ virtual bool modifiedKeys() const { return _mps->modifiedKeys(); }
+
+ virtual bool isMultiKey() const { return _mps->hasMultiKey(); }
+
virtual CoveredIndexMatcher *matcher() const { return _matcher.get(); }
+ // return -1 if we're a getmore handoff
+ virtual long long nscanned() { return _nscanned >= 0 ? _nscanned + _c->nscanned() : _nscanned; }
// just for testing
shared_ptr< Cursor > sub_c() const { return _c; }
private:
@@ -377,8 +408,12 @@ namespace mongo {
virtual bool mayRecordPlan() const { return false; }
virtual QueryOp *_createChild() const { return new NoOp(); }
virtual shared_ptr< Cursor > newCursor() const { return qp().newCursor(); }
+ virtual long long nscanned() { assert( false ); return 0; }
};
void nextClause() {
+ if ( _nscanned >= 0 && _c.get() ) {
+ _nscanned += _c->nscanned();
+ }
shared_ptr< CursorOp > best = _mps->runOpOnce( *_op );
if ( ! best->complete() )
throw MsgAssertionException( best->exception() );
@@ -390,12 +425,13 @@ namespace mongo {
shared_ptr< Cursor > _c;
auto_ptr< MultiPlanScanner > _mps;
shared_ptr< CoveredIndexMatcher > _matcher;
+ long long _nscanned;
};
-
+
// NOTE min, max, and keyPattern will be updated to be consistent with the selected index.
IndexDetails *indexDetailsForRange( const char *ns, string &errmsg, BSONObj &min, BSONObj &max, BSONObj &keyPattern );
- inline bool isSimpleIdQuery( const BSONObj& query ){
+ inline bool isSimpleIdQuery( const BSONObj& query ) {
BSONObjIterator i(query);
if( !i.more() ) return false;
BSONElement e = i.next();
@@ -403,14 +439,16 @@ namespace mongo {
if( strcmp("_id", e.fieldName()) != 0 ) return false;
return e.isSimpleType(); // e.g. not something like { _id : { $gt : ...
}
-
+
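To make the predicate above concrete, these are the kinds of queries isSimpleIdQuery() is meant to accept and reject (illustration only; BSON() is the usual builder macro):

    BSONObj byId      = BSON( "_id" << 5 );                    // accepted: a single simple _id equality
    BSONObj byIdRange = BSON( "_id" << BSON( "$gt" << 5 ) );   // rejected: value is not a simple type
    BSONObj twoFields = BSON( "_id" << 5 << "x" << 1 );        // rejected: more than one field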
// matcher() will always work on the returned cursor
inline shared_ptr< Cursor > bestGuessCursor( const char *ns, const BSONObj &query, const BSONObj &sort ) {
if( !query.getField( "$or" ).eoo() ) {
return shared_ptr< Cursor >( new MultiCursor( ns, query, sort ) );
- } else {
+ }
+ else {
auto_ptr< FieldRangeSet > frs( new FieldRangeSet( ns, query ) );
- shared_ptr< Cursor > ret = QueryPlanSet( ns, frs, query, sort ).getBestGuess()->newCursor();
+ auto_ptr< FieldRangeSet > origFrs( new FieldRangeSet( *frs ) );
+ shared_ptr< Cursor > ret = QueryPlanSet( ns, frs, origFrs, query, sort ).getBestGuess()->newCursor();
if ( !query.isEmpty() ) {
shared_ptr< CoveredIndexMatcher > matcher( new CoveredIndexMatcher( query, ret->indexKeyPattern() ) );
ret->setMatcher( matcher );
@@ -418,5 +456,5 @@ namespace mongo {
return ret;
}
}
-
+
} // namespace mongo
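The _nscanned member added to MultiCursor above is a running total: nextClause() folds the finished clause's count into it, nscanned() adds the live cursor's count on top, and -1 marks the getMore-handoff constructor where the earlier counts are unknown. A minimal standalone sketch of that accumulator-plus-sentinel pattern (simplified types, not the mongo classes):

    #include <iostream>

    // Stand-in for a per-clause cursor: it only reports how many keys it has scanned.
    struct ClauseCursor { long long scanned; long long nscanned() const { return scanned; } };

    struct MultiCounter {
        long long total;       // -1 means "handed off, earlier history unknown"
        ClauseCursor current;
        void nextClause( ClauseCursor next ) {
            if ( total >= 0 )
                total += current.nscanned();   // fold the finished clause into the total
            current = next;
        }
        long long nscanned() const {
            return total >= 0 ? total + current.nscanned() : total;
        }
    };

    int main() {
        MultiCounter m = { 0, { 10 } };
        ClauseCursor next = { 7 };
        m.nextClause( next );
        std::cout << m.nscanned() << "\n";          // 17: 10 from the first clause + 7 live
        MultiCounter handoff = { -1, { 7 } };
        std::cout << handoff.nscanned() << "\n";    // -1: unknown, as for a getMore handoff
    }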
diff --git a/db/queryutil.cpp b/db/queryutil.cpp
index 2153046..1cd750b 100644
--- a/db/queryutil.cpp
+++ b/db/queryutil.cpp
@@ -23,111 +23,119 @@
#include "queryoptimizer.h"
#include "../util/unittest.h"
#include "dbmessage.h"
+#include "indexkey.h"
namespace mongo {
extern BSONObj staticNull;
-
+
/** returns a string that when used as a matcher, would match a super set of regex()
returns "" for complex regular expressions
used to optimize queries in some simple regex cases that start with '^'
if purePrefix != NULL, sets it to whether the regex can be converted to a range query
*/
- string simpleRegex(const char* regex, const char* flags, bool* purePrefix){
+ string simpleRegex(const char* regex, const char* flags, bool* purePrefix) {
string r = "";
if (purePrefix) *purePrefix = false;
bool multilineOK;
- if ( regex[0] == '\\' && regex[1] == 'A'){
+ if ( regex[0] == '\\' && regex[1] == 'A') {
multilineOK = true;
regex += 2;
- } else if (regex[0] == '^') {
+ }
+ else if (regex[0] == '^') {
multilineOK = false;
regex += 1;
- } else {
+ }
+ else {
return r;
}
bool extended = false;
- while (*flags){
- switch (*(flags++)){
- case 'm': // multiline
- if (multilineOK)
- continue;
- else
- return r;
- case 'x': // extended
- extended = true;
- break;
- default:
- return r; // cant use index
+ while (*flags) {
+ switch (*(flags++)) {
+ case 'm': // multiline
+ if (multilineOK)
+ continue;
+ else
+ return r;
+ case 'x': // extended
+ extended = true;
+ break;
+ default:
+ return r; // can't use index
}
}
stringstream ss;
- while(*regex){
+ while(*regex) {
char c = *(regex++);
- if ( c == '*' || c == '?' ){
+ if ( c == '*' || c == '?' ) {
// These are the only two symbols that make the last char optional
r = ss.str();
r = r.substr( 0 , r.size() - 1 );
return r; //breaking here fails with /^a?/
- } else if (c == '\\'){
+ }
+ else if (c == '\\') {
// slash followed by non-alphanumeric represents the following char
c = *(regex++);
if ((c >= 'A' && c <= 'Z') ||
- (c >= 'a' && c <= 'z') ||
- (c >= '0' && c <= '0') ||
- (c == '\0'))
- {
+ (c >= 'a' && c <= 'z') ||
+ (c >= '0' && c <= '0') ||
+ (c == '\0')) {
r = ss.str();
break;
- } else {
+ }
+ else {
ss << c;
}
- } else if (strchr("^$.[|()+{", c)){
+ }
+ else if (strchr("^$.[|()+{", c)) {
// list of "metacharacters" from man pcrepattern
r = ss.str();
break;
- } else if (extended && c == '#'){
+ }
+ else if (extended && c == '#') {
// comment
r = ss.str();
break;
- } else if (extended && isspace(c)){
+ }
+ else if (extended && isspace(c)) {
continue;
- } else {
+ }
+ else {
// self-matching char
ss << c;
}
}
- if ( r.empty() && *regex == 0 ){
+ if ( r.empty() && *regex == 0 ) {
r = ss.str();
if (purePrefix) *purePrefix = !r.empty();
}
return r;
}
- inline string simpleRegex(const BSONElement& e){
- switch(e.type()){
- case RegEx:
- return simpleRegex(e.regex(), e.regexFlags());
- case Object:{
- BSONObj o = e.embeddedObject();
- return simpleRegex(o["$regex"].valuestrsafe(), o["$options"].valuestrsafe());
- }
- default: assert(false); return ""; //return squashes compiler warning
+ inline string simpleRegex(const BSONElement& e) {
+ switch(e.type()) {
+ case RegEx:
+ return simpleRegex(e.regex(), e.regexFlags());
+ case Object: {
+ BSONObj o = e.embeddedObject();
+ return simpleRegex(o["$regex"].valuestrsafe(), o["$options"].valuestrsafe());
+ }
+ default: assert(false); return ""; //return squashes compiler warning
}
}
string simpleRegexEnd( string regex ) {
++regex[ regex.length() - 1 ];
return regex;
- }
-
-
+ }
+
+
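The point of simpleRegex() and simpleRegexEnd() above is that an anchored prefix regex can be answered with a closed-open index range: the lower bound is the extracted prefix and the upper bound is that prefix with its last character bumped. A standalone illustration of the bound construction (plain C++, not the mongo types):

    #include <iostream>
    #include <string>

    // Mirrors simpleRegexEnd(): increment the final character of the prefix to
    // obtain an exclusive upper bound, so /^foo/ becomes the range ["foo", "fop").
    static std::string rangeEnd( std::string prefix ) {
        ++prefix[ prefix.size() - 1 ];
        return prefix;
    }

    int main() {
        std::string lower = "foo";             // what simpleRegex("^foo", "") would extract
        std::string upper = rangeEnd( lower );
        std::cout << "[" << lower << ", " << upper << ")\n";   // [foo, fop)
    }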
FieldRange::FieldRange( const BSONElement &e, bool isNot, bool optimize ) {
// NOTE with $not, we could potentially form a complementary set of intervals.
if ( !isNot && !e.eoo() && e.type() != RegEx && e.getGtLtOp() == BSONObj::opIN ) {
@@ -139,7 +147,8 @@ namespace mongo {
BSONElement ie = i.next();
if ( ie.type() == RegEx ) {
regexes.push_back( FieldRange( ie, false, optimize ) );
- } else {
+ }
+ else {
vals.insert( ie );
}
}
@@ -149,22 +158,22 @@ namespace mongo {
for( vector< FieldRange >::const_iterator i = regexes.begin(); i != regexes.end(); ++i )
*this |= *i;
-
+
return;
}
-
- if ( e.type() == Array && e.getGtLtOp() == BSONObj::Equality ){
-
+
+ if ( e.type() == Array && e.getGtLtOp() == BSONObj::Equality ) {
+
_intervals.push_back( FieldInterval(e) );
-
+
const BSONElement& temp = e.embeddedObject().firstElement();
- if ( ! temp.eoo() ){
+ if ( ! temp.eoo() ) {
if ( temp < e )
_intervals.insert( _intervals.begin() , temp );
else
_intervals.push_back( FieldInterval(temp) );
}
-
+
return;
}
@@ -181,17 +190,19 @@ namespace mongo {
if ( e.eoo() )
return;
+ int op = e.getGtLtOp();
if ( e.type() == RegEx
- || (e.type() == Object && !e.embeddedObject()["$regex"].eoo())
- )
- {
+ || (e.type() == Object && !e.embeddedObject()["$regex"].eoo())
+ ) {
+ uassert( 13454, "invalid regular expression operator", op == BSONObj::Equality || op == BSONObj::opREGEX );
if ( !isNot ) { // no optimization for negated regex - we could consider creating 2 intervals comprising all nonmatching prefixes
const string r = simpleRegex(e);
if ( r.size() ) {
lower = addObj( BSON( "" << r ) ).firstElement();
upper = addObj( BSON( "" << simpleRegexEnd( r ) ) ).firstElement();
upperInclusive = false;
- } else {
+ }
+ else {
BSONObjBuilder b1(32), b2(32);
b1.appendMinForType( "" , String );
lower = addObj( b1.obj() ).firstElement();
@@ -202,10 +213,11 @@ namespace mongo {
}
// regex matches self - regex type > string type
- if (e.type() == RegEx){
+ if (e.type() == RegEx) {
BSONElement re = addObj( BSON( "" << e ) ).firstElement();
_intervals.push_back( FieldInterval(re) );
- } else {
+ }
+ else {
BSONObj orig = e.embeddedObject();
BSONObjBuilder b;
b.appendRegex("", orig["$regex"].valuestrsafe(), orig["$options"].valuestrsafe());
@@ -216,38 +228,53 @@ namespace mongo {
}
return;
}
- int op = e.getGtLtOp();
if ( isNot ) {
switch( op ) {
- case BSONObj::Equality:
- case BSONObj::opALL:
- case BSONObj::opMOD: // NOTE for mod and type, we could consider having 1-2 intervals comprising the complementary types (multiple intervals already possible with $in)
- case BSONObj::opTYPE:
- op = BSONObj::NE; // no bound calculation
- break;
- case BSONObj::NE:
- op = BSONObj::Equality;
- break;
- case BSONObj::LT:
- op = BSONObj::GTE;
- break;
- case BSONObj::LTE:
- op = BSONObj::GT;
- break;
- case BSONObj::GT:
- op = BSONObj::LTE;
- break;
- case BSONObj::GTE:
- op = BSONObj::LT;
- break;
- default: // otherwise doesn't matter
- break;
+ case BSONObj::Equality:
+ return;
+// op = BSONObj::NE;
+// break;
+ case BSONObj::opALL:
+ case BSONObj::opMOD: // NOTE for mod and type, we could consider having 1-2 intervals comprising the complementary types (multiple intervals already possible with $in)
+ case BSONObj::opTYPE:
+ // no bound calculation
+ return;
+ case BSONObj::NE:
+ op = BSONObj::Equality;
+ break;
+ case BSONObj::LT:
+ op = BSONObj::GTE;
+ break;
+ case BSONObj::LTE:
+ op = BSONObj::GT;
+ break;
+ case BSONObj::GT:
+ op = BSONObj::LTE;
+ break;
+ case BSONObj::GTE:
+ op = BSONObj::LT;
+ break;
+ default: // otherwise doesn't matter
+ break;
}
}
switch( op ) {
case BSONObj::Equality:
lower = upper = e;
break;
+ case BSONObj::NE: {
+ // this will invalidate the upper/lower references above
+ _intervals.push_back( FieldInterval() );
+ // optimize doesn't make sense for negative ranges
+ _intervals[ 0 ]._upper._bound = e;
+ _intervals[ 0 ]._upper._inclusive = false;
+ _intervals[ 1 ]._lower._bound = e;
+ _intervals[ 1 ]._lower._inclusive = false;
+ _intervals[ 1 ]._upper._bound = maxKey.firstElement();
+ _intervals[ 1 ]._upper._inclusive = true;
+ optimize = false; // don't run optimize code below
+ break;
+ }
case BSONObj::LT:
upperInclusive = false;
case BSONObj::LTE:
@@ -262,9 +289,9 @@ namespace mongo {
massert( 10370 , "$all requires array", e.type() == Array );
BSONObjIterator i( e.embeddedObject() );
bool bound = false;
- while ( i.more() ){
+ while ( i.more() ) {
BSONElement x = i.next();
- if ( x.type() == Object && x.embeddedObject().firstElement().getGtLtOp() == BSONObj::opELEM_MATCH ){
+ if ( x.type() == Object && x.embeddedObject().firstElement().getGtLtOp() == BSONObj::opELEM_MATCH ) {
// taken care of elsewhere
}
else if ( x.type() != RegEx ) {
@@ -299,7 +326,7 @@ namespace mongo {
BSONObjBuilder b;
b.appendMaxForType( "" , NumberDouble );
upper = addObj( b.obj() ).firstElement();
- }
+ }
break;
}
case BSONObj::opTYPE: {
@@ -314,7 +341,7 @@ namespace mongo {
b.appendMaxForType( "" , t );
upper = addObj( b.obj() ).firstElement();
}
-
+
break;
}
case BSONObj::opREGEX:
@@ -332,14 +359,14 @@ namespace mongo {
default:
break;
}
-
- if ( optimize ){
- if ( lower.type() != MinKey && upper.type() == MaxKey && lower.isSimpleType() ){ // TODO: get rid of isSimpleType
+
+ if ( optimize ) {
+ if ( lower.type() != MinKey && upper.type() == MaxKey && lower.isSimpleType() ) { // TODO: get rid of isSimpleType
BSONObjBuilder b;
b.appendMaxForType( lower.fieldName() , lower.type() );
upper = addObj( b.obj() ).firstElement();
}
- else if ( lower.type() == MinKey && upper.type() != MaxKey && upper.isSimpleType() ){ // TODO: get rid of isSimpleType
+ else if ( lower.type() == MinKey && upper.type() != MaxKey && upper.isSimpleType() ) { // TODO: get rid of isSimpleType
BSONObjBuilder b;
b.appendMinForType( upper.fieldName() , upper.type() );
lower = addObj( b.obj() ).firstElement();
@@ -355,7 +382,7 @@ namespace mongo {
if ( _special.size() == 0 && other._special.size() )
_special = other._special;
}
-
+
// as called, these functions find the max/min of a bound in the
// opposite direction, so inclusive bounds are considered less
// superlative
@@ -378,41 +405,46 @@ namespace mongo {
result._upper = minFieldBound( one._upper, two._upper );
return result.strictValid();
}
-
- // NOTE Not yet tested for complex $or bounds, just for simple bounds generated by $in
+
const FieldRange &FieldRange::operator&=( const FieldRange &other ) {
vector< FieldInterval > newIntervals;
vector< FieldInterval >::const_iterator i = _intervals.begin();
vector< FieldInterval >::const_iterator j = other._intervals.begin();
while( i != _intervals.end() && j != other._intervals.end() ) {
FieldInterval overlap;
- if ( fieldIntervalOverlap( *i, *j, overlap ) )
+ if ( fieldIntervalOverlap( *i, *j, overlap ) ) {
newIntervals.push_back( overlap );
- if ( i->_upper == minFieldBound( i->_upper, j->_upper ) )
+ }
+ if ( i->_upper == minFieldBound( i->_upper, j->_upper ) ) {
++i;
- else
- ++j;
+ }
+ else {
+ ++j;
+ }
}
finishOperation( newIntervals, other );
return *this;
}
-
+
void handleInterval( const FieldInterval &lower, FieldBound &low, FieldBound &high, vector< FieldInterval > &newIntervals ) {
if ( low._bound.eoo() ) {
low = lower._lower; high = lower._upper;
- } else {
- if ( high._bound.woCompare( lower._lower._bound, false ) < 0 ) { // when equal but neither inclusive, just assume they overlap, since current btree scanning code just as efficient either way
+ }
+ else {
+ int cmp = high._bound.woCompare( lower._lower._bound, false );
+ if ( ( cmp < 0 ) || ( cmp == 0 && !high._inclusive && !lower._lower._inclusive ) ) {
FieldInterval tmp;
tmp._lower = low;
tmp._upper = high;
newIntervals.push_back( tmp );
- low = lower._lower; high = lower._upper;
- } else {
+ low = lower._lower; high = lower._upper;
+ }
+ else {
high = lower._upper;
}
- }
+ }
}
-
+
const FieldRange &FieldRange::operator|=( const FieldRange &other ) {
vector< FieldInterval > newIntervals;
FieldBound low;
@@ -424,90 +456,107 @@ namespace mongo {
if ( ( cmp == 0 && i->_lower._inclusive ) || cmp < 0 ) {
handleInterval( *i, low, high, newIntervals );
++i;
- } else {
+ }
+ else {
handleInterval( *j, low, high, newIntervals );
++j;
- }
+ }
}
while( i != _intervals.end() ) {
handleInterval( *i, low, high, newIntervals );
- ++i;
+ ++i;
}
while( j != other._intervals.end() ) {
handleInterval( *j, low, high, newIntervals );
- ++j;
+ ++j;
}
FieldInterval tmp;
tmp._lower = low;
tmp._upper = high;
- newIntervals.push_back( tmp );
+ newIntervals.push_back( tmp );
finishOperation( newIntervals, other );
- return *this;
+ return *this;
}
-
+
const FieldRange &FieldRange::operator-=( const FieldRange &other ) {
+ vector< FieldInterval > newIntervals;
vector< FieldInterval >::iterator i = _intervals.begin();
vector< FieldInterval >::const_iterator j = other._intervals.begin();
while( i != _intervals.end() && j != other._intervals.end() ) {
int cmp = i->_lower._bound.woCompare( j->_lower._bound, false );
if ( cmp < 0 ||
- ( cmp == 0 && i->_lower._inclusive && !j->_lower._inclusive ) ) {
+ ( cmp == 0 && i->_lower._inclusive && !j->_lower._inclusive ) ) {
int cmp2 = i->_upper._bound.woCompare( j->_lower._bound, false );
if ( cmp2 < 0 ) {
+ newIntervals.push_back( *i );
++i;
- } else if ( cmp2 == 0 ) {
- if ( i->_upper._inclusive && j->_lower._inclusive ) {
- i->_upper._inclusive = false;
+ }
+ else if ( cmp2 == 0 ) {
+ newIntervals.push_back( *i );
+ if ( newIntervals.back()._upper._inclusive && j->_lower._inclusive ) {
+ newIntervals.back()._upper._inclusive = false;
}
++i;
- } else {
+ }
+ else {
+ newIntervals.push_back( *i );
+ newIntervals.back()._upper = j->_lower;
+ newIntervals.back()._upper.flipInclusive();
int cmp3 = i->_upper._bound.woCompare( j->_upper._bound, false );
if ( cmp3 < 0 ||
- ( cmp3 == 0 && ( !i->_upper._inclusive || j->_upper._inclusive ) ) ) {
- i->_upper = j->_lower;
- i->_upper.flipInclusive();
+ ( cmp3 == 0 && ( !i->_upper._inclusive || j->_upper._inclusive ) ) ) {
++i;
- } else {
+ }
+ else {
+ i->_lower = j->_upper;
+ i->_lower.flipInclusive();
++j;
}
}
- } else {
+ }
+ else {
int cmp2 = i->_lower._bound.woCompare( j->_upper._bound, false );
if ( cmp2 > 0 ||
- ( cmp2 == 0 && ( !i->_lower._inclusive || !j->_lower._inclusive ) ) ) {
+ ( cmp2 == 0 && ( !i->_lower._inclusive || !j->_upper._inclusive ) ) ) {
++j;
- } else {
+ }
+ else {
int cmp3 = i->_upper._bound.woCompare( j->_upper._bound, false );
if ( cmp3 < 0 ||
- ( cmp3 == 0 && ( !i->_upper._inclusive || j->_upper._inclusive ) ) ) {
- i = _intervals.erase( i );
- } else {
+ ( cmp3 == 0 && ( !i->_upper._inclusive || j->_upper._inclusive ) ) ) {
+ ++i;
+ }
+ else {
i->_lower = j->_upper;
- i->_lower.flipInclusive();
+ i->_lower.flipInclusive();
++j;
}
- }
+ }
}
}
- finishOperation( _intervals, other );
- return *this;
+ while( i != _intervals.end() ) {
+ newIntervals.push_back( *i );
+ ++i;
+ }
+ finishOperation( newIntervals, other );
+ return *this;
}
-
+
// TODO write a proper implementation that doesn't do a full copy
bool FieldRange::operator<=( const FieldRange &other ) {
FieldRange temp = *this;
temp -= other;
return temp.empty();
}
-
+
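The reworked operator-= above builds a fresh interval list, so it can now split an interval when the subtracted range sits strictly inside it, and operator<= simply asks whether that difference comes out empty. A standalone illustration of the splitting case, using plain closed intervals rather than FieldInterval:

    #include <iostream>
    #include <vector>

    // One closed interval [a,b] minus another [c,d] leaves at most two half-open pieces,
    // e.g. [1,3] - [2,2] leaves [1,2) and (2,3].
    struct Piece { double lo, hi; bool loInc, hiInc; };

    std::vector<Piece> subtractClosed( double a, double b, double c, double d ) {
        std::vector<Piece> out;
        if ( d < a || c > b ) {                           // no overlap: nothing is removed
            Piece whole = { a, b, true, true };
            out.push_back( whole );
            return out;
        }
        if ( c > a ) { Piece left = { a, c, true, false }; out.push_back( left ); }   // [a, c)
        if ( d < b ) { Piece right = { d, b, false, true }; out.push_back( right ); } // (d, b]
        return out;
    }

    int main() {
        std::vector<Piece> pieces = subtractClosed( 1, 3, 2, 2 );
        for ( size_t i = 0; i < pieces.size(); ++i )
            std::cout << ( pieces[i].loInc ? '[' : '(' ) << pieces[i].lo << ", "
                      << pieces[i].hi << ( pieces[i].hiInc ? ']' : ')' ) << "\n";  // [1, 2) then (2, 3]
    }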
BSONObj FieldRange::addObj( const BSONObj &o ) {
_objData.push_back( o );
return o;
}
-
+
string FieldRangeSet::getSpecial() const {
string s = "";
- for ( map<string,FieldRange>::iterator i=_ranges.begin(); i!=_ranges.end(); i++ ){
+ for ( map<string,FieldRange>::iterator i=_ranges.begin(); i!=_ranges.end(); i++ ) {
if ( i->second.getSpecial().size() == 0 )
continue;
uassert( 13033 , "can't have 2 special fields" , s.size() == 0 );
@@ -533,34 +582,35 @@ namespace mongo {
}
if ( op2 == BSONObj::opELEM_MATCH ) {
BSONObjIterator k( g.embeddedObjectUserCheck() );
- while ( k.more() ){
+ while ( k.more() ) {
BSONElement h = k.next();
StringBuilder buf(32);
buf << fieldName << "." << h.fieldName();
string fullname = buf.str();
-
+
int op3 = getGtLtOp( h );
- if ( op3 == BSONObj::Equality ){
+ if ( op3 == BSONObj::Equality ) {
_ranges[ fullname ] &= FieldRange( h , isNot , optimize );
}
else {
BSONObjIterator l( h.embeddedObject() );
- while ( l.more() ){
+ while ( l.more() ) {
_ranges[ fullname ] &= FieldRange( l.next() , isNot , optimize );
}
}
- }
- } else {
+ }
+ }
+ else {
_ranges[ fieldName ] &= FieldRange( f , isNot , optimize );
- }
+ }
}
-
+
void FieldRangeSet::processQueryField( const BSONElement &e, bool optimize ) {
bool equality = ( getGtLtOp( e ) == BSONObj::Equality );
if ( equality && e.type() == Object ) {
equality = ( strcmp( e.embeddedObject().firstElement().fieldName(), "$not" ) != 0 );
}
-
+
if ( equality || ( e.type() == Object && !e.embeddedObject()[ "$regex" ].eoo() ) ) {
_ranges[ e.fieldName() ] &= FieldRange( e , false , optimize );
}
@@ -570,67 +620,69 @@ namespace mongo {
BSONElement f = j.next();
if ( strcmp( f.fieldName(), "$not" ) == 0 ) {
switch( f.type() ) {
- case Object: {
- BSONObjIterator k( f.embeddedObject() );
- while( k.more() ) {
- BSONElement g = k.next();
- uassert( 13034, "invalid use of $not", g.getGtLtOp() != BSONObj::Equality );
- processOpElement( e.fieldName(), g, true, optimize );
- }
- break;
+ case Object: {
+ BSONObjIterator k( f.embeddedObject() );
+ while( k.more() ) {
+ BSONElement g = k.next();
+ uassert( 13034, "invalid use of $not", g.getGtLtOp() != BSONObj::Equality );
+ processOpElement( e.fieldName(), g, true, optimize );
}
- case RegEx:
- processOpElement( e.fieldName(), f, true, optimize );
- break;
- default:
- uassert( 13041, "invalid use of $not", false );
+ break;
}
- } else {
+ case RegEx:
+ processOpElement( e.fieldName(), f, true, optimize );
+ break;
+ default:
+ uassert( 13041, "invalid use of $not", false );
+ }
+ }
+ else {
processOpElement( e.fieldName(), f, false, optimize );
}
- }
- }
+ }
+ }
}
-
+
FieldRangeSet::FieldRangeSet( const char *ns, const BSONObj &query , bool optimize )
: _ns( ns ), _queries( 1, query.getOwned() ) {
- BSONObjIterator i( _queries[ 0 ] );
-
- while( i.more() ) {
- BSONElement e = i.next();
- // e could be x:1 or x:{$gt:1}
-
- if ( strcmp( e.fieldName(), "$where" ) == 0 ) {
- continue;
- }
-
- if ( strcmp( e.fieldName(), "$or" ) == 0 ) {
- continue;
- }
-
- if ( strcmp( e.fieldName(), "$nor" ) == 0 ) {
- continue;
- }
-
- processQueryField( e, optimize );
- }
+ BSONObjIterator i( _queries[ 0 ] );
+
+ while( i.more() ) {
+ BSONElement e = i.next();
+ // e could be x:1 or x:{$gt:1}
+
+ if ( strcmp( e.fieldName(), "$where" ) == 0 ) {
+ continue;
+ }
+
+ if ( strcmp( e.fieldName(), "$or" ) == 0 ) {
+ continue;
+ }
+
+ if ( strcmp( e.fieldName(), "$nor" ) == 0 ) {
+ continue;
+ }
+
+ processQueryField( e, optimize );
}
+ }
FieldRangeOrSet::FieldRangeOrSet( const char *ns, const BSONObj &query , bool optimize )
: _baseSet( ns, query, optimize ), _orFound() {
BSONObjIterator i( _baseSet._queries[ 0 ] );
-
+
while( i.more() ) {
BSONElement e = i.next();
- if ( strcmp( e.fieldName(), "$or" ) == 0 ) {
- massert( 13262, "$or requires nonempty array", e.type() == Array && e.embeddedObject().nFields() > 0 );
- BSONObjIterator j( e.embeddedObject() );
- while( j.more() ) {
- BSONElement f = j.next();
- massert( 13263, "$or array must contain objects", f.type() == Object );
+ if ( strcmp( e.fieldName(), "$or" ) == 0 ) {
+ massert( 13262, "$or requires nonempty array", e.type() == Array && e.embeddedObject().nFields() > 0 );
+ BSONObjIterator j( e.embeddedObject() );
+ while( j.more() ) {
+ BSONElement f = j.next();
+ massert( 13263, "$or array must contain objects", f.type() == Object );
_orSets.push_back( FieldRangeSet( ns, f.embeddedObject(), optimize ) );
massert( 13291, "$or may not contain 'special' query", _orSets.back().getSpecial().empty() );
+ _originalOrSets.push_back( _orSets.back() );
}
_orFound = true;
continue;
@@ -638,13 +690,41 @@ namespace mongo {
}
}
+ void FieldRangeOrSet::popOrClause( const BSONObj &indexSpec ) {
+ massert( 13274, "no or clause to pop", !orFinished() );
+ auto_ptr< FieldRangeSet > holder;
+ FieldRangeSet *toDiff = &_originalOrSets.front();
+ if ( toDiff->matchPossible() && !indexSpec.isEmpty() ) {
+ holder.reset( toDiff->subset( indexSpec ) );
+ toDiff = holder.get();
+ }
+ list< FieldRangeSet >::iterator i = _orSets.begin();
+ list< FieldRangeSet >::iterator j = _originalOrSets.begin();
+ ++i;
+ ++j;
+ while( i != _orSets.end() ) {
+ *i -= *toDiff;
+ if( !i->matchPossible() ) {
+ i = _orSets.erase( i );
+ j = _originalOrSets.erase( j );
+ }
+ else {
+ ++i;
+ ++j;
+ }
+ }
+ _oldOrSets.push_front( _orSets.front() );
+ _orSets.pop_front();
+ _originalOrSets.pop_front();
+ }
+
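popOrClause() above exists so that documents already produced by one $or branch are not scanned again by the branches that follow: the index-covered ranges of the clause that just finished are subtracted from every remaining clause before that clause is retired. The same idea on plain integer sets, illustration only:

    #include <algorithm>
    #include <iostream>
    #include <iterator>
    #include <set>

    int main() {
        std::set<int> clauseA;   // documents matched by the first $or branch (already returned)
        clauseA.insert( 1 ); clauseA.insert( 2 ); clauseA.insert( 3 );
        std::set<int> clauseB;   // documents matched by the second branch
        clauseB.insert( 3 ); clauseB.insert( 4 );
        std::set<int> remaining; // what the second branch still needs to scan
        std::set_difference( clauseB.begin(), clauseB.end(),
                             clauseA.begin(), clauseA.end(),
                             std::inserter( remaining, remaining.begin() ) );
        for ( std::set<int>::iterator i = remaining.begin(); i != remaining.end(); ++i )
            std::cout << *i << "\n";   // prints 4 only
    }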
FieldRange *FieldRangeSet::trivialRange_ = 0;
FieldRange &FieldRangeSet::trivialRange() {
if ( trivialRange_ == 0 )
trivialRange_ = new FieldRange();
return *trivialRange_;
}
-
+
BSONObj FieldRangeSet::simplifiedQuery( const BSONObj &_fields ) const {
BSONObj fields = _fields;
if ( fields.isEmpty() ) {
@@ -676,14 +756,15 @@ namespace mongo {
}
return b.obj();
}
-
+
QueryPattern FieldRangeSet::pattern( const BSONObj &sort ) const {
QueryPattern qp;
for( map< string, FieldRange >::const_iterator i = _ranges.begin(); i != _ranges.end(); ++i ) {
assert( !i->second.empty() );
if ( i->second.equality() ) {
qp._fieldTypes[ i->first ] = QueryPattern::Equality;
- } else if ( i->second.nontrivial() ) {
+ }
+ else if ( i->second.nontrivial() ) {
bool upper = i->second.max().type() != MaxKey;
bool lower = i->second.min().type() != MinKey;
if ( upper && lower )
@@ -691,18 +772,18 @@ namespace mongo {
else if ( upper )
qp._fieldTypes[ i->first ] = QueryPattern::UpperBound;
else if ( lower )
- qp._fieldTypes[ i->first ] = QueryPattern::LowerBound;
+ qp._fieldTypes[ i->first ] = QueryPattern::LowerBound;
}
}
qp.setSort( sort );
return qp;
}
-
+
// TODO get rid of this
BoundList FieldRangeSet::indexBounds( const BSONObj &keyPattern, int direction ) const {
typedef vector< pair< shared_ptr< BSONObjBuilder >, shared_ptr< BSONObjBuilder > > > BoundBuilders;
BoundBuilders builders;
- builders.push_back( make_pair( shared_ptr< BSONObjBuilder >( new BSONObjBuilder() ), shared_ptr< BSONObjBuilder >( new BSONObjBuilder() ) ) );
+ builders.push_back( make_pair( shared_ptr< BSONObjBuilder >( new BSONObjBuilder() ), shared_ptr< BSONObjBuilder >( new BSONObjBuilder() ) ) );
BSONObjIterator i( keyPattern );
bool ineq = false; // until ineq is true, we are just dealing with equality and $in bounds
while( i.more() ) {
@@ -716,7 +797,8 @@ namespace mongo {
j->first->appendAs( fr.min(), "" );
j->second->appendAs( fr.min(), "" );
}
- } else {
+ }
+ else {
if ( !fr.inQuery() ) {
ineq = true;
}
@@ -725,18 +807,21 @@ namespace mongo {
for( BoundBuilders::const_iterator i = builders.begin(); i != builders.end(); ++i ) {
BSONObj first = i->first->obj();
BSONObj second = i->second->obj();
+
+ const unsigned maxCombinations = 4000000;
if ( forward ) {
for( vector< FieldInterval >::const_iterator j = intervals.begin(); j != intervals.end(); ++j ) {
- uassert( 13303, "combinatorial limit of $in partitioning of result set exceeded", newBuilders.size() < 1000000 );
+ uassert( 13303, "combinatorial limit of $in partitioning of result set exceeded", newBuilders.size() < maxCombinations );
newBuilders.push_back( make_pair( shared_ptr< BSONObjBuilder >( new BSONObjBuilder() ), shared_ptr< BSONObjBuilder >( new BSONObjBuilder() ) ) );
newBuilders.back().first->appendElements( first );
newBuilders.back().second->appendElements( second );
newBuilders.back().first->appendAs( j->_lower._bound, "" );
newBuilders.back().second->appendAs( j->_upper._bound, "" );
}
- } else {
+ }
+ else {
for( vector< FieldInterval >::const_reverse_iterator j = intervals.rbegin(); j != intervals.rend(); ++j ) {
- uassert( 13304, "combinatorial limit of $in partitioning of result set exceeded", newBuilders.size() < 1000000 );
+ uassert( 13304, "combinatorial limit of $in partitioning of result set exceeded", newBuilders.size() < maxCombinations );
newBuilders.push_back( make_pair( shared_ptr< BSONObjBuilder >( new BSONObjBuilder() ), shared_ptr< BSONObjBuilder >( new BSONObjBuilder() ) ) );
newBuilders.back().first->appendElements( first );
newBuilders.back().second->appendElements( second );
@@ -747,7 +832,8 @@ namespace mongo {
}
builders = newBuilders;
}
- } else {
+ }
+ else {
for( BoundBuilders::const_iterator j = builders.begin(); j != builders.end(); ++j ) {
j->first->appendAs( forward ? fr.min() : fr.max(), "" );
j->second->appendAs( forward ? fr.max() : fr.min(), "" );
@@ -758,204 +844,45 @@ namespace mongo {
for( BoundBuilders::const_iterator i = builders.begin(); i != builders.end(); ++i )
ret.push_back( make_pair( i->first->obj(), i->second->obj() ) );
return ret;
- }
-
- ///////////////////
- // FieldMatcher //
- ///////////////////
-
- void FieldMatcher::add( const BSONObj& o ){
- massert( 10371 , "can only add to FieldMatcher once", _source.isEmpty());
- _source = o;
-
- BSONObjIterator i( o );
- int true_false = -1;
- while ( i.more() ){
- BSONElement e = i.next();
-
- if (e.type() == Object){
- BSONObj obj = e.embeddedObject();
- BSONElement e2 = obj.firstElement();
- if ( strcmp(e2.fieldName(), "$slice") == 0 ){
- if (e2.isNumber()){
- int i = e2.numberInt();
- if (i < 0)
- add(e.fieldName(), i, -i); // limit is now positive
- else
- add(e.fieldName(), 0, i);
-
- } else if (e2.type() == Array) {
- BSONObj arr = e2.embeddedObject();
- uassert(13099, "$slice array wrong size", arr.nFields() == 2 );
-
- BSONObjIterator it(arr);
- int skip = it.next().numberInt();
- int limit = it.next().numberInt();
- uassert(13100, "$slice limit must be positive", limit > 0 );
- add(e.fieldName(), skip, limit);
-
- } else {
- uassert(13098, "$slice only supports numbers and [skip, limit] arrays", false);
- }
- } else {
- uassert(13097, string("Unsupported projection option: ") + obj.firstElement().fieldName(), false);
- }
-
- } else if (!strcmp(e.fieldName(), "_id") && !e.trueValue()){
- _includeID = false;
-
- } else {
-
- add (e.fieldName(), e.trueValue());
-
- // validate input
- if (true_false == -1){
- true_false = e.trueValue();
- _include = !e.trueValue();
- }
- else{
- uassert( 10053 , "You cannot currently mix including and excluding fields. Contact us if this is an issue." ,
- (bool)true_false == e.trueValue() );
- }
- }
- }
- }
-
- void FieldMatcher::add(const string& field, bool include){
- if (field.empty()){ // this is the field the user referred to
- _include = include;
- } else {
- _include = !include;
-
- const size_t dot = field.find('.');
- const string subfield = field.substr(0,dot);
- const string rest = (dot == string::npos ? "" : field.substr(dot+1,string::npos));
-
- boost::shared_ptr<FieldMatcher>& fm = _fields[subfield];
- if (!fm)
- fm.reset(new FieldMatcher());
-
- fm->add(rest, include);
- }
- }
-
- void FieldMatcher::add(const string& field, int skip, int limit){
- _special = true; // can't include or exclude whole object
-
- if (field.empty()){ // this is the field the user referred to
- _skip = skip;
- _limit = limit;
- } else {
- const size_t dot = field.find('.');
- const string subfield = field.substr(0,dot);
- const string rest = (dot == string::npos ? "" : field.substr(dot+1,string::npos));
-
- boost::shared_ptr<FieldMatcher>& fm = _fields[subfield];
- if (!fm)
- fm.reset(new FieldMatcher());
-
- fm->add(rest, skip, limit);
- }
}
- BSONObj FieldMatcher::getSpec() const{
- return _source;
- }
-
- //b will be the value part of an array-typed BSONElement
- void FieldMatcher::appendArray( BSONObjBuilder& b , const BSONObj& a , bool nested) const {
- int skip = nested ? 0 : _skip;
- int limit = nested ? -1 : _limit;
-
- if (skip < 0){
- skip = max(0, skip + a.nFields());
- }
-
- int i=0;
- BSONObjIterator it(a);
- while (it.more()){
- BSONElement e = it.next();
-
- if (skip){
- skip--;
- continue;
- }
-
- if (limit != -1 && (limit-- == 0)){
- break;
- }
-
- switch(e.type()){
- case Array:{
- BSONObjBuilder subb;
- appendArray(subb , e.embeddedObject(), true);
- b.appendArray(b.numStr(i++), subb.obj());
- break;
- }
- case Object:{
- BSONObjBuilder subb;
- BSONObjIterator jt(e.embeddedObject());
- while (jt.more()){
- append(subb , jt.next());
- }
- b.append(b.numStr(i++), subb.obj());
- break;
- }
- default:
- if (_include)
- b.appendAs(e, b.numStr(i++));
+ FieldRangeSet *FieldRangeSet::subset( const BSONObj &fields ) const {
+ FieldRangeSet *ret = new FieldRangeSet( _ns, BSONObj() );
+ BSONObjIterator i( fields );
+ while( i.more() ) {
+ BSONElement e = i.next();
+ if ( _ranges[ e.fieldName() ].nontrivial() ) {
+ ret->_ranges[ e.fieldName() ] = _ranges[ e.fieldName() ];
}
}
+ ret->_queries = _queries;
+ return ret;
}
- void FieldMatcher::append( BSONObjBuilder& b , const BSONElement& e ) const {
- FieldMap::const_iterator field = _fields.find( e.fieldName() );
-
- if (field == _fields.end()){
- if (_include)
- b.append(e);
- }
- else {
- FieldMatcher& subfm = *field->second;
-
- if ((subfm._fields.empty() && !subfm._special) || !(e.type()==Object || e.type()==Array) ){
- if (subfm._include)
- b.append(e);
- }
- else if (e.type() == Object){
- BSONObjBuilder subb;
- BSONObjIterator it(e.embeddedObject());
- while (it.more()){
- subfm.append(subb, it.next());
- }
- b.append(e.fieldName(), subb.obj());
-
- }
- else { //Array
- BSONObjBuilder subb;
- subfm.appendArray(subb, e.embeddedObject());
- b.appendArray(e.fieldName(), subb.obj());
- }
- }
- }
-
bool FieldRangeVector::matchesElement( const BSONElement &e, int i, bool forward ) const {
- int l = matchingLowElement( e, i, forward );
- return ( l % 2 == 0 ); // if we're inside an interval
+ bool eq;
+ int l = matchingLowElement( e, i, forward, eq );
+ return ( l % 2 == 0 ); // if we're inside an interval
}
-
+
// binary search for interval containing the specified element
// an even return value indicates that the element is contained within a valid interval
- int FieldRangeVector::matchingLowElement( const BSONElement &e, int i, bool forward ) const {
+ int FieldRangeVector::matchingLowElement( const BSONElement &e, int i, bool forward, bool &lowEquality ) const {
+ lowEquality = false;
int l = -1;
int h = _ranges[ i ].intervals().size() * 2;
while( l + 1 < h ) {
int m = ( l + h ) / 2;
BSONElement toCmp;
+ bool toCmpInclusive;
+ const FieldInterval &interval = _ranges[ i ].intervals()[ m / 2 ];
if ( m % 2 == 0 ) {
- toCmp = _ranges[ i ].intervals()[ m / 2 ]._lower._bound;
- } else {
- toCmp = _ranges[ i ].intervals()[ m / 2 ]._upper._bound;
+ toCmp = interval._lower._bound;
+ toCmpInclusive = interval._lower._inclusive;
+ }
+ else {
+ toCmp = interval._upper._bound;
+ toCmpInclusive = interval._upper._inclusive;
}
int cmp = toCmp.woCompare( e, false );
if ( !forward ) {
@@ -963,41 +890,60 @@ namespace mongo {
}
if ( cmp < 0 ) {
l = m;
- } else if ( cmp > 0 ) {
+ }
+ else if ( cmp > 0 ) {
h = m;
- } else {
- return ( m % 2 == 0 ) ? m : m - 1;
+ }
+ else {
+ if ( m % 2 == 0 ) {
+ lowEquality = true;
+ }
+ int ret = m;
+ // if left match and inclusive, all good
+ // if left match and not inclusive, return right before left bound
+ // if right match and inclusive, return left bound
+ // if right match and not inclusive, return right bound
+ if ( ( m % 2 == 0 && !toCmpInclusive ) || ( m % 2 == 1 && toCmpInclusive ) ) {
+ --ret;
+ }
+ return ret;
}
}
assert( l + 1 == h );
return l;
}
-
+
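matchingLowElement() above binary searches the flattened interval boundaries, with the convention that an even result means the element landed inside one of the intervals and an odd result means it fell into a gap (the new lowEquality flag additionally reports an exact hit on a lower bound). Ignoring the inclusive/exclusive bookkeeping the real code also does, the indexing convention looks like this (standalone, linear search for brevity):

    #include <cassert>
    #include <vector>

    // Boundaries of the sorted intervals [1,2] and [5,7], flattened to {1,2,5,7}.
    // The "low position" of a value is the index of the last boundary <= value
    // (or -1): even positions are inside an interval, odd positions are in a gap.
    int lowPosition( const std::vector<int>& bounds, int v ) {
        int pos = -1;
        while ( pos + 1 < (int)bounds.size() && bounds[ pos + 1 ] <= v )
            ++pos;
        return pos;
    }

    int main() {
        std::vector<int> bounds;
        bounds.push_back( 1 ); bounds.push_back( 2 );
        bounds.push_back( 5 ); bounds.push_back( 7 );
        assert( lowPosition( bounds, 6 ) == 2 );    // even: inside [5,7]
        assert( lowPosition( bounds, 3 ) == 1 );    // odd: in the gap between [1,2] and [5,7]
        assert( lowPosition( bounds, 0 ) == -1 );   // before the first interval
        return 0;
    }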
bool FieldRangeVector::matches( const BSONObj &obj ) const {
- BSONObjIterator k( _keyPattern );
- for( int i = 0; i < (int)_ranges.size(); ++i ) {
- if ( _ranges[ i ].empty() ) {
- return false;
- }
- BSONElement kk = k.next();
- int number = (int) kk.number();
- bool forward = ( number >= 0 ? 1 : -1 ) * ( _direction >= 0 ? 1 : -1 ) > 0;
- BSONElementSet keys;
- obj.getFieldsDotted( kk.fieldName(), keys );
- bool match = false;
- for( BSONElementSet::const_iterator j = keys.begin(); j != keys.end(); ++j ) {
- if ( matchesElement( *j, i, forward ) ) {
- match = true;
+ if ( !_indexSpec.get() ) {
+ _indexSpec.reset( new IndexSpec( _keyPattern ) );
+ }
+ // TODO The representation of matching keys could potentially be optimized
+ // more for the case at hand. (For example, we can potentially consider
+ // fields individually instead of constructing several bson objects using
+ // multikey arrays.) But getKeys() canonically defines the key set for a
+ // given object and for now we are using it as is.
+ BSONObjSetDefaultOrder keys;
+ _indexSpec->getKeys( obj, keys );
+ for( BSONObjSetDefaultOrder::const_iterator i = keys.begin(); i != keys.end(); ++i ) {
+ BSONObjIterator j( *i );
+ BSONObjIterator k( _keyPattern );
+ bool match = true;
+ for( int l = 0; l < (int)_ranges.size(); ++l ) {
+ int number = (int) k.next().number();
+ bool forward = ( number >= 0 ? 1 : -1 ) * ( _direction >= 0 ? 1 : -1 ) > 0;
+ if ( !matchesElement( j.next(), l, forward ) ) {
+ match = false;
break;
}
}
- if ( !match ) {
- return false;
+ if ( match ) {
+ // The *i key matched a valid range for every element.
+ return true;
}
}
- return true;
+ return false;
}
-
+
// TODO optimize more
int FieldRangeVector::Iterator::advance( const BSONObj &curr ) {
BSONObjIterator j( curr );
@@ -1009,7 +955,8 @@ namespace mongo {
for( int i = 0; i < (int)_i.size(); ++i ) {
if ( i > 0 && !_v._ranges[ i - 1 ].intervals()[ _i[ i - 1 ] ].equality() ) {
// if last bound was inequality, we don't know anything about where we are for this field
- // TODO if possible avoid this certain cases when field in prev key is the same
+ // TODO if possible avoid this in certain cases when the value of the previous field in the
+ // previous key is the same as the value of the previous field in the current key
setMinus( i );
}
bool eq = false;
@@ -1017,20 +964,23 @@ namespace mongo {
bool reverse = ( ( oo.number() < 0 ) ^ ( _v._direction < 0 ) );
BSONElement jj = j.next();
if ( _i[ i ] == -1 ) { // unknown position for this field, do binary search
- int l = _v.matchingLowElement( jj, i, !reverse );
+ bool lowEquality;
+ int l = _v.matchingLowElement( jj, i, !reverse, lowEquality );
if ( l % 2 == 0 ) { // we are in a valid range for this field
_i[ i ] = l / 2;
int diff = (int)_v._ranges[ i ].intervals().size() - _i[ i ];
if ( diff > 1 ) {
latestNonEndpoint = i;
- } else if ( diff == 1 ) {
+ }
+ else if ( diff == 1 ) {
int x = _v._ranges[ i ].intervals()[ _i[ i ] ]._upper._bound.woCompare( jj, false );
if ( x != 0 ) {
latestNonEndpoint = i;
}
}
continue;
- } else { // not in a valid range for this field - determine if and how to advance
+ }
+ else { // not in a valid range for this field - determine if and how to advance
// check if we're after the last interval for this field
if ( l == (int)_v._ranges[ i ].intervals().size() * 2 - 1 ) {
if ( latestNonEndpoint == -1 ) {
@@ -1038,18 +988,24 @@ namespace mongo {
}
setZero( latestNonEndpoint + 1 );
// skip to curr / latestNonEndpoint + 1 / superlative
- for( int j = latestNonEndpoint + 1; j < (int)_i.size(); ++j ) {
- _cmp[ j ] = _superlative[ j ];
- }
- return latestNonEndpoint + 1;
+ _after = true;
+ return latestNonEndpoint + 1;
}
_i[ i ] = ( l + 1 ) / 2;
+ if ( lowEquality ) {
+ // skip to curr / i + 1 / superlative
+ _after = true;
+ return i + 1;
+ }
// skip to curr / i / nextbounds
_cmp[ i ] = &_v._ranges[ i ].intervals()[ _i[ i ] ]._lower._bound;
+ _inc[ i ] = _v._ranges[ i ].intervals()[ _i[ i ] ]._lower._inclusive;
for( int j = i + 1; j < (int)_i.size(); ++j ) {
_cmp[ j ] = &_v._ranges[ j ].intervals().front()._lower._bound;
+ _inc[ j ] = _v._ranges[ j ].intervals().front()._lower._inclusive;
}
- return i;
+ _after = false;
+ return i;
}
}
bool first = true;
@@ -1062,7 +1018,7 @@ namespace mongo {
if ( reverse ) {
x = -x;
}
- if ( x == 0 ) {
+ if ( x == 0 && _v._ranges[ i ].intervals()[ _i[ i ] ]._upper._inclusive ) {
eq = true;
break;
}
@@ -1081,16 +1037,27 @@ namespace mongo {
x = -x;
}
}
+ // if we're equal to the lower bound but it is not inclusive, advance
+ if ( ( x == 0 && !_v._ranges[ i ].intervals()[ _i[ i ] ]._lower._inclusive ) ) {
+ setZero( i + 1 );
+ // skip to curr / i + 1 / superlative
+ _after = true;
+ return i + 1;
+ }
// if we're less than the lower bound, advance
if ( x > 0 ) {
setZero( i + 1 );
// skip to curr / i / nextbounds
_cmp[ i ] = &_v._ranges[ i ].intervals()[ _i[ i ] ]._lower._bound;
+ _inc[ i ] = _v._ranges[ i ].intervals()[ _i[ i ] ]._lower._inclusive;
for( int j = i + 1; j < (int)_i.size(); ++j ) {
_cmp[ j ] = &_v._ranges[ j ].intervals().front()._lower._bound;
+ _inc[ j ] = _v._ranges[ j ].intervals().front()._lower._inclusive;
}
+ _after = false;
return i;
- } else {
+ }
+ else {
break;
}
}
@@ -1101,26 +1068,32 @@ namespace mongo {
}
int diff = (int)_v._ranges[ i ].intervals().size() - _i[ i ];
if ( diff > 1 || ( !eq && diff == 1 ) ) {
- // check if we're not at the end of valid values for this field
+ // check if we're not at the end of valid values for this field
latestNonEndpoint = i;
- } else if ( diff == 0 ) { // check if we're past the last interval for this field
+ }
+ else if ( diff == 0 ) { // check if we're past the last interval for this field
if ( latestNonEndpoint == -1 ) {
return -2;
}
// more values possible, skip...
setZero( latestNonEndpoint + 1 );
// skip to curr / latestNonEndpoint + 1 / superlative
- for( int j = latestNonEndpoint + 1; j < (int)_i.size(); ++j ) {
- _cmp[ j ] = _superlative[ j ];
- }
+ _after = true;
return latestNonEndpoint + 1;
}
}
- return -1;
+ return -1;
}
-
+
+ void FieldRangeVector::Iterator::prepDive() {
+ for( int j = 0; j < (int)_i.size(); ++j ) {
+ _cmp[ j ] = &_v._ranges[ j ].intervals().front()._lower._bound;
+ _inc[ j ] = _v._ranges[ j ].intervals().front()._lower._inclusive;
+ }
+ }
+
struct SimpleRegexUnitTest : UnitTest {
- void run(){
+ void run() {
{
BSONObjBuilder b;
b.appendRegex("r", "^foo");
@@ -1179,38 +1152,39 @@ namespace mongo {
} simple_regex_unittest;
- long long applySkipLimit( long long num , const BSONObj& cmd ){
+ long long applySkipLimit( long long num , const BSONObj& cmd ) {
BSONElement s = cmd["skip"];
BSONElement l = cmd["limit"];
-
- if ( s.isNumber() ){
+
+ if ( s.isNumber() ) {
num = num - s.numberLong();
if ( num < 0 ) {
num = 0;
}
}
-
- if ( l.isNumber() ){
+
+ if ( l.isNumber() ) {
long long limit = l.numberLong();
- if ( limit < num ){
+ if ( limit < num ) {
num = limit;
}
}
- return num;
+ return num;
}
- string debugString( Message& m ){
+ string debugString( Message& m ) {
stringstream ss;
ss << "op: " << opToString( m.operation() ) << " len: " << m.size();
- if ( m.operation() >= 2000 && m.operation() < 2100 ){
+ if ( m.operation() >= 2000 && m.operation() < 2100 ) {
DbMessage d(m);
ss << " ns: " << d.getns();
- switch ( m.operation() ){
+ switch ( m.operation() ) {
case dbUpdate: {
int flags = d.pullInt();
BSONObj q = d.nextJsObj();
- ss << " flags: " << flags << " query: " << q;
+ BSONObj o = d.nextJsObj();
+ ss << " flags: " << flags << " query: " << q << " update: " << o;
break;
}
case dbInsert:
@@ -1225,10 +1199,10 @@ namespace mongo {
default:
ss << " CANNOT HANDLE YET";
}
-
-
+
+
}
return ss.str();
- }
+ }
} // namespace mongo
diff --git a/db/queryutil.h b/db/queryutil.h
index 37dfa2a..2746695 100644
--- a/db/queryutil.h
+++ b/db/queryutil.h
@@ -26,7 +26,7 @@ namespace mongo {
bool _inclusive;
bool operator==( const FieldBound &other ) const {
return _bound.woCompare( other._bound ) == 0 &&
- _inclusive == other._inclusive;
+ _inclusive == other._inclusive;
}
void flipInclusive() { _inclusive = !_inclusive; }
};
@@ -59,8 +59,6 @@ namespace mongo {
FieldRange( const BSONElement &e = BSONObj().firstElement() , bool isNot=false , bool optimize=true );
const FieldRange &operator&=( const FieldRange &other );
const FieldRange &operator|=( const FieldRange &other );
- // does not remove fully contained ranges (eg [1,3] - [2,2] doesn't remove anything)
- // in future we can change so that an or on $in:[3] combined with $in:{$gt:2} doesn't scan 3 a second time
const FieldRange &operator-=( const FieldRange &other );
// true iff other includes this
bool operator<=( const FieldRange &other );
@@ -79,7 +77,7 @@ namespace mongo {
if ( equality() ) {
return true;
}
- for( vector< FieldInterval >::const_iterator i = _intervals.begin(); i != _intervals.end(); ++i ) {
+ for( vector< FieldInterval >::const_iterator i = _intervals.begin(); i != _intervals.end(); ++i ) {
if ( !i->equality() ) {
return false;
}
@@ -88,13 +86,14 @@ namespace mongo {
}
bool nontrivial() const {
return
- ! empty() &&
- ( minKey.firstElement().woCompare( min(), false ) != 0 ||
+ ! empty() &&
+ ( _intervals.size() != 1 ||
+ minKey.firstElement().woCompare( min(), false ) != 0 ||
maxKey.firstElement().woCompare( max(), false ) != 0 );
}
bool empty() const { return _intervals.empty(); }
void makeEmpty() { _intervals.clear(); }
- const vector< FieldInterval > &intervals() const { return _intervals; }
+ const vector< FieldInterval > &intervals() const { return _intervals; }
string getSpecial() const { return _special; }
void setExclusiveBounds() {
for( vector< FieldInterval >::iterator i = _intervals.begin(); i != _intervals.end(); ++i ) {
@@ -122,7 +121,7 @@ namespace mongo {
vector< BSONObj > _objData;
string _special;
};
-
+
// implements query pattern matching, used to determine if a query is
// similar to an earlier query and should use the same plan
class QueryPattern {
@@ -193,8 +192,8 @@ namespace mongo {
// the specified direction of traversal. For example, given a simple index {i:1}
// and direction +1, one valid BoundList is: (1, 2); (4, 6). The same BoundList
// would be valid for index {i:-1} with direction -1.
- typedef vector< pair< BSONObj, BSONObj > > BoundList;
-
+ typedef vector< pair< BSONObj, BSONObj > > BoundList;
+
// ranges of fields' value that may be determined from query -- used to
// determine index limits
class FieldRangeSet {
@@ -210,19 +209,20 @@ namespace mongo {
map< string, FieldRange >::const_iterator f = _ranges.find( fieldName );
if ( f == _ranges.end() )
return trivialRange();
- return f->second;
+ return f->second;
}
FieldRange &range( const char *fieldName ) {
map< string, FieldRange >::iterator f = _ranges.find( fieldName );
if ( f == _ranges.end() )
return trivialRange();
- return f->second;
+ return f->second;
}
int nNontrivialRanges() const {
int count = 0;
- for( map< string, FieldRange >::const_iterator i = _ranges.begin(); i != _ranges.end(); ++i )
+ for( map< string, FieldRange >::const_iterator i = _ranges.begin(); i != _ranges.end(); ++i ) {
if ( i->second.nontrivial() )
++count;
+ }
return count;
}
const char *ns() const { return _ns; }
@@ -236,6 +236,18 @@ namespace mongo {
}
QueryPattern pattern( const BSONObj &sort = BSONObj() ) const;
string getSpecial() const;
+ // Btree scanning for a multidimensional key range will yield a
+ // multidimensional box. The idea here is that if an 'other'
+ // multidimensional box contains the current box we don't have to scan
+ // the current box. If the 'other' box contains the current box in
+ // all dimensions but one, we can safely subtract the values of 'other'
+ // along that one dimension from the values for the current box on the
+ // same dimension. In other situations, subtracting the 'other'
+ // box from the current box yields a result that is not a box (but
+ // rather can be expressed as a union of boxes). We don't support
+ // such splitting currently in calculating index ranges. Note that
+ // where I have said 'box' above, I actually mean sets of boxes because
+ // a field range can consist of multiple intervals.
const FieldRangeSet &operator-=( const FieldRangeSet &other ) {
int nUnincluded = 0;
string unincludedKey;
@@ -246,22 +258,25 @@ namespace mongo {
if ( cmp == 0 ) {
if ( i->second <= j->second ) {
// nothing
- } else {
+ }
+ else {
++nUnincluded;
unincludedKey = i->first;
}
++i;
++j;
- } else if ( cmp < 0 ) {
+ }
+ else if ( cmp < 0 ) {
++i;
- } else {
+ }
+ else {
// other has a bound we don't, nothing can be done
return *this;
}
}
if ( j != other._ranges.end() ) {
// other has a bound we don't, nothing can be done
- return *this;
+ return *this;
}
if ( nUnincluded > 1 ) {
return *this;
@@ -284,27 +299,37 @@ namespace mongo {
i->second &= j->second;
++i;
++j;
- } else if ( cmp < 0 ) {
+ }
+ else if ( cmp < 0 ) {
++i;
- } else {
+ }
+ else {
_ranges[ j->first ] = j->second;
++j;
}
}
while( j != other._ranges.end() ) {
_ranges[ j->first ] = j->second;
- ++j;
+ ++j;
}
appendQueries( other );
return *this;
}
// TODO get rid of this
BoundList indexBounds( const BSONObj &keyPattern, int direction ) const;
+
+ /**
+ * @return A new FieldRangeSet based on this FieldRangeSet, but with only
+ * a subset of the fields.
+ * @param fields - Only fields which are represented as field names in this object
+ * will be included in the returned FieldRangeSet.
+ */
+ FieldRangeSet *subset( const BSONObj &fields ) const;
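A brief usage sketch (hypothetical field names, assuming frs is a FieldRangeSet built from a query such as { a: { $gt: 1 }, b: 5, c: /x/ }):

// Only the 'a' and 'c' ranges survive; 'b' is dropped because it is not a
// field name in the argument object (the values are ignored). Caller owns the result.
auto_ptr<FieldRangeSet> sub( frs.subset( BSON( "a" << 1 << "c" << 1 ) ) );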
private:
void appendQueries( const FieldRangeSet &other ) {
for( vector< BSONObj >::const_iterator i = other._queries.begin(); i != other._queries.end(); ++i ) {
- _queries.push_back( *i );
- }
+ _queries.push_back( *i );
+ }
}
void makeEmpty() {
for( map< string, FieldRange >::iterator i = _ranges.begin(); i != _ranges.end(); ++i ) {
@@ -321,11 +346,21 @@ namespace mongo {
vector< BSONObj > _queries;
};
+ class IndexSpec;
+
+ /**
+ * This class manages the ranges of valid element values for each field in
+ * an ordered list of signed fields corresponding to an index specification.
+ */
class FieldRangeVector {
public:
+ /**
+ * @param frs The valid ranges for all fields, as defined by the query spec
+ * @param keyPattern The index key pattern
+ * @param direction The direction of index traversal
+ */
FieldRangeVector( const FieldRangeSet &frs, const BSONObj &keyPattern, int direction )
- :_keyPattern( keyPattern ), _direction( direction >= 0 ? 1 : -1 )
- {
+ :_keyPattern( keyPattern ), _direction( direction >= 0 ? 1 : -1 ) {
_queries = frs._queries;
BSONObjIterator i( _keyPattern );
while( i.more() ) {
@@ -334,7 +369,8 @@ namespace mongo {
bool forward = ( ( number >= 0 ? 1 : -1 ) * ( direction >= 0 ? 1 : -1 ) > 0 );
if ( forward ) {
_ranges.push_back( frs.range( e.fieldName() ) );
- } else {
+ }
+ else {
_ranges.push_back( FieldRange() );
frs.range( e.fieldName() ).reverse( _ranges.back() );
}
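For illustration (hypothetical key pattern and direction, with frs an existing FieldRangeSet), the sign test above works out as:

// keyPattern { a: 1, b: -1 } traversed with direction -1:
//   'a':  1 * -1 < 0  -> not forward, so a reversed copy of the 'a' range is pushed
//   'b': -1 * -1 > 0  -> forward, so the 'b' range is pushed as-is
FieldRangeVector frv( frs, BSON( "a" << 1 << "b" << -1 ), -1 );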
@@ -348,14 +384,14 @@ namespace mongo {
ret *= i->intervals().size();
}
return ret;
- }
+ }
BSONObj startKey() const {
BSONObjBuilder b;
for( vector< FieldRange >::const_iterator i = _ranges.begin(); i != _ranges.end(); ++i ) {
const FieldInterval &fi = i->intervals().front();
b.appendAs( fi._lower._bound, "" );
}
- return b.obj();
+ return b.obj();
}
BSONObj endKey() const {
BSONObjBuilder b;
@@ -363,7 +399,7 @@ namespace mongo {
const FieldInterval &fi = i->intervals().back();
b.appendAs( fi._upper._bound, "" );
}
- return b.obj();
+ return b.obj();
}
BSONObj obj() const {
BSONObjBuilder b;
@@ -371,27 +407,23 @@ namespace mongo {
for( int i = 0; i < (int)_ranges.size(); ++i ) {
BSONArrayBuilder a( b.subarrayStart( k.next().fieldName() ) );
for( vector< FieldInterval >::const_iterator j = _ranges[ i ].intervals().begin();
- j != _ranges[ i ].intervals().end(); ++j ) {
+ j != _ranges[ i ].intervals().end(); ++j ) {
a << BSONArray( BSON_ARRAY( j->_lower._bound << j->_upper._bound ).clientReadable() );
}
a.done();
}
return b.obj();
}
+ /**
+ * @return true iff the provided document matches valid ranges on all
+ * of this FieldRangeVector's fields, which is the case iff this document
+ * would be returned while scanning the index corresponding to this
+ * FieldRangeVector. This function is used for $or clause deduping.
+ */
bool matches( const BSONObj &obj ) const;
class Iterator {
public:
- Iterator( const FieldRangeVector &v ) : _v( v ), _i( _v._ranges.size(), -1 ), _cmp( _v._ranges.size(), 0 ), _superlative( _v._ranges.size(), 0 ) {
- static BSONObj minObj = minObject();
- static BSONElement minElt = minObj.firstElement();
- static BSONObj maxObj = maxObject();
- static BSONElement maxElt = maxObj.firstElement();
- BSONObjIterator i( _v._keyPattern );
- for( int j = 0; j < (int)_superlative.size(); ++j ) {
- int number = (int) i.next().number();
- bool forward = ( ( number >= 0 ? 1 : -1 ) * ( _v._direction >= 0 ? 1 : -1 ) > 0 );
- _superlative[ j ] = forward ? &maxElt : &minElt;
- }
+ Iterator( const FieldRangeVector &v ) : _v( v ), _i( _v._ranges.size(), -1 ), _cmp( _v._ranges.size(), 0 ), _inc( _v._ranges.size(), false ), _after() {
}
static BSONObj minObject() {
BSONObjBuilder b;
@@ -413,7 +445,8 @@ namespace mongo {
for( unsigned j = i + 1; j < _i.size(); ++j ) {
_i[ j ] = 0;
}
- } else {
+ }
+ else {
_i[ 0 ] = _v._ranges[ 0 ].intervals().size();
}
return ok();
@@ -424,6 +457,9 @@ namespace mongo {
// >= 0 skip parameter
int advance( const BSONObj &curr );
const vector< const BSONElement * > &cmp() const { return _cmp; }
+ const vector< bool > &inc() const { return _inc; }
+ bool after() const { return _after; }
+ void prepDive();
void setZero( int i ) {
for( int j = i; j < (int)_i.size(); ++j ) {
_i[ j ] = 0;
@@ -452,55 +488,61 @@ namespace mongo {
const FieldInterval &fi = _v._ranges[ i ].intervals()[ _i[ i ] ];
b.appendAs( fi._upper._bound, "" );
}
- return b.obj();
+ return b.obj();
}
// check
private:
const FieldRangeVector &_v;
vector< int > _i;
vector< const BSONElement* > _cmp;
- vector< const BSONElement* > _superlative;
+ vector< bool > _inc;
+ bool _after;
};
private:
- int matchingLowElement( const BSONElement &e, int i, bool direction ) const;
+ int matchingLowElement( const BSONElement &e, int i, bool direction, bool &lowEquality ) const;
bool matchesElement( const BSONElement &e, int i, bool direction ) const;
vector< FieldRange > _ranges;
BSONObj _keyPattern;
int _direction;
vector< BSONObj > _queries; // make sure mem owned
+ // This IndexSpec is lazily constructed directly from _keyPattern if needed.
+ mutable shared_ptr< IndexSpec > _indexSpec;
};
-
+
// generates FieldRangeSet objects, accounting for or clauses
class FieldRangeOrSet {
public:
FieldRangeOrSet( const char *ns, const BSONObj &query , bool optimize=true );
// if there's a useless or clause, we won't use or ranges to help with scanning
bool orFinished() const { return _orFound && _orSets.empty(); }
- // removes first or clause, and removes the field ranges it covers from all subsequent or clauses
- // this could invalidate the result of the last topFrs()
- void popOrClause() {
- massert( 13274, "no or clause to pop", !orFinished() );
- const FieldRangeSet &toPop = _orSets.front();
- list< FieldRangeSet >::iterator i = _orSets.begin();
- ++i;
- while( i != _orSets.end() ) {
- *i -= toPop;
- if( !i->matchPossible() ) {
- i = _orSets.erase( i );
- } else {
- ++i;
- }
- }
- _oldOrSets.push_front( toPop );
- _orSets.pop_front();
- }
+ /**
+ * Removes the top or clause, which would have been recently scanned, and
+ * removes the field ranges it covers from all subsequent or clauses. As a
+ * side effect, this function may invalidate the return values of topFrs()
+ * calls made before this function was called.
+ * @param indexSpec - Keys of the index that was used to satisfy the last or
+ * clause. Used to determine the range of keys that were scanned. If
+ * empty we do not constrain the previous clause's ranges using index keys,
+ * which may reduce opportunities for range elimination.
+ */
+ void popOrClause( const BSONObj &indexSpec = BSONObj() );
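Loosely, and ignoring the index-key refinement that indexSpec enables, the intended flow is (namespace and query are illustrative):

FieldRangeOrSet frOrSet( "test.foo", fromjson( "{$or:[{a:{$lt:5}},{a:{$lt:10}}]}" ) );
auto_ptr<FieldRangeSet> first( frOrSet.topFrs() );   // first clause: a in (-inf, 5)
// ... scan the first clause, e.g. with an { a: 1 } index ...
frOrSet.popOrClause( BSON( "a" << 1 ) );             // roughly leaves the second clause with a in [5, 10)
auto_ptr<FieldRangeSet> second( frOrSet.topFrs() );  // bounds for what remains of the second clause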
FieldRangeSet *topFrs() const {
FieldRangeSet *ret = new FieldRangeSet( _baseSet );
- if (_orSets.size()){
+ if (_orSets.size()) {
*ret &= _orSets.front();
}
return ret;
}
+ // While the original bounds are looser, they are composed of fewer
+ // ranges, so operations on them are faster; when they can be used
+ // instead of the more precise bounds, they should be.
+ FieldRangeSet *topFrsOriginal() const {
+ FieldRangeSet *ret = new FieldRangeSet( _baseSet );
+ if (_originalOrSets.size()) {
+ *ret &= _originalOrSets.front();
+ }
+ return ret;
+ }
void allClausesSimplified( vector< BSONObj > &ret ) const {
for( list< FieldRangeSet >::const_iterator i = _orSets.begin(); i != _orSets.end(); ++i ) {
if ( i->matchPossible() ) {
@@ -514,47 +556,10 @@ namespace mongo {
private:
FieldRangeSet _baseSet;
list< FieldRangeSet > _orSets;
+ list< FieldRangeSet > _originalOrSets;
list< FieldRangeSet > _oldOrSets; // make sure memory is owned
bool _orFound;
};
-
- /**
- used for doing field limiting
- */
- class FieldMatcher {
- public:
- FieldMatcher()
- : _include(true)
- , _special(false)
- , _includeID(true)
- , _skip(0)
- , _limit(-1)
- {}
-
- void add( const BSONObj& o );
-
- void append( BSONObjBuilder& b , const BSONElement& e ) const;
-
- BSONObj getSpec() const;
- bool includeID() { return _includeID; }
- private:
-
- void add( const string& field, bool include );
- void add( const string& field, int skip, int limit );
- void appendArray( BSONObjBuilder& b , const BSONObj& a , bool nested=false) const;
-
- bool _include; // true if default at this level is to include
- bool _special; // true if this level can't be skipped or included without recursing
- //TODO: benchmark vector<pair> vs map
- typedef map<string, boost::shared_ptr<FieldMatcher> > FieldMap;
- FieldMap _fields;
- BSONObj _source;
- bool _includeID;
-
- // used for $slice operator
- int _skip;
- int _limit;
- };
/** returns a string that when used as a matcher, would match a super set of regex()
returns "" for complex regular expressions
diff --git a/db/rec.h b/db/rec.h
deleted file mode 100644
index 7b79c73..0000000
--- a/db/rec.h
+++ /dev/null
@@ -1,137 +0,0 @@
-// rec.h
-/*
- * Copyright (C) 2010 10gen Inc.
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the GNU Affero General Public License, version 3,
- * as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU Affero General Public License for more details.
- *
- * You should have received a copy of the GNU Affero General Public License
- * along with this program. If not, see <http://www.gnu.org/licenses/>.
- */
-
-
-/* TODO for _RECSTORE
-
- _ support > 2GB data per file
- _ multiple files, not just indexes.dat
- _ lazier writes? (may be done?)
- _ configurable cache size
- _ fix on abnormal terminations to be able to restart some
-*/
-
-#pragma once
-
-#include "reci.h"
-//#include "reccache.h"
-
-namespace mongo {
-
-/* --------------------------------------------------------------------------
- A RecStoreInterface for the normal mongo mem mapped file (MongoDataFile)
- storage
-*/
-
-NamespaceDetails* nsdetails_notinline(const char *ns);
-
-class MongoMemMapped_RecStore : public RecStoreInterface {
-public:
- VIRT char* get(DiskLoc d, unsigned len) { return d.rec()->data; }
-
- VIRT DiskLoc insert(const char *ns, const void *obuf, int len, bool god) {
- return theDataFileMgr.insert(ns, obuf, len, god);
- }
-
- VIRT void deleteRecord(const char *ns, DiskLoc d) {
- theDataFileMgr._deleteRecord(nsdetails_notinline(ns), ns, d.rec(), d);
- }
-
- VIRT void modified(DiskLoc d) { }
-
- VIRT void drop(const char *ns) {
- dropNS(ns);
- }
-
- VIRT void rename(const char *fromNs, const char *toNs) {
- renameNamespace( fromNs, toNs );
- }
-
- /* close datafiles associated with the db specified. */
- VIRT void closeFiles(string dbname, string path) {
- /* as this is only used for indexes so far, and we are in the same
- PDFiles as the nonindex data, we just rely on them having been closed
- at the same time. one day this may need to change.
- */
- }
-
-};
-
-/* An in memory RecStoreInterface implementation ----------------------------
-*/
-
-#if 0
-class InMem_RecStore : public RecStoreInterface {
- enum InmemfileValue { INMEMFILE = 0x70000000 };
-public:
- static char* get(DiskLoc d, unsigned len) {
- assert( d.a() == INMEMFILE );
-#ifdef __LP64__
- massert( 10372 , "64 bit not done", false);
- return 0;
-#else
- return (char *) d.getOfs();
-#endif
- }
-
- static DiskLoc insert(const char *ns, const void *obuf, int len, bool god) {
-#ifdef __LP64__
- assert( 0 );
- throw -1;
-#else
- char *p = (char *) malloc(len);
- assert( p );
- memcpy(p, obuf, len);
- int b = (int) p;
- assert( b > 0 );
- return DiskLoc(INMEMFILE, b);
-#endif
- }
-
- static void modified(DiskLoc d) { }
-
- static void drop(const char *ns) {
- log() << "warning: drop() not yet implemented for InMem_RecStore" << endl;
- }
-
- virtual void rename(const char *fromNs, const char *toNs) {
- massert( 10373 , "rename not yet implemented for InMem_RecStore", false );
- }
-};
-#endif
-
-/* Glue btree to RecStoreInterface: ---------------------------- */
-
-typedef MongoMemMapped_RecStore StoreToUse;
-
-extern StoreToUse *btreeStore;
-
-const int BucketSize = 8192;
-
-inline BtreeBucket* DiskLoc::btree() const {
- assert( fileNo != -1 );
- return (BtreeBucket*) btreeStore->get(*this, BucketSize);
-}
-
-inline BtreeBucket* DiskLoc::btreemod() const {
- assert( fileNo != -1 );
- BtreeBucket *b = (BtreeBucket*) btreeStore->get(*this, BucketSize);
- btreeStore->modified(*this);
- return b;
-}
-
-}
diff --git a/db/reccache.cpp b/db/reccache.cpp
deleted file mode 100644
index eb20728..0000000
--- a/db/reccache.cpp
+++ /dev/null
@@ -1,419 +0,0 @@
-/*
- * Copyright (C) 2010 10gen Inc.
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the GNU Affero General Public License, version 3,
- * as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU Affero General Public License for more details.
- *
- * You should have received a copy of the GNU Affero General Public License
- * along with this program. If not, see <http://www.gnu.org/licenses/>.
- */
-
-// storage.cpp
-
-#include "pch.h"
-#include "pdfile.h"
-//#include "reccache.h"
-#include "rec.h"
-#include "db.h"
-
-#error deprecated - do not include in project
-
-namespace mongo {
-
-//RecCache theRecCache(BucketSize);
-
-// 100k * 8KB = 800MB
-unsigned RecCache::MAXNODES = 50000;
-
-void setRecCacheSize(unsigned mb) {
- unsigned long long MB = mb;
- log(2) << "reccache size: " << MB << "MB\n";
- uassert( 10114 , "bad cache size", MB > 0 && MB < 1000000 );
- RecCache::MAXNODES = (unsigned) MB * 1024 * 1024 / 8192;
- log(3) << "RecCache::MAXNODES=" << RecCache::MAXNODES << '\n';
-}
-
-void writerThread() {
- sleepsecs(10);
- while( 1 ) {
- try {
- theRecCache.writeLazily();
- }
- catch(...) {
- log() << "exception in writerThread()" << endl;
- sleepsecs(3);
- }
- }
-}
-
-// called on program exit.
-void recCacheCloseAll() {
-#if defined(_RECSTORE)
- theRecCache.closing();
-#endif
-}
-
-int ndirtywritten;
-
-inline static string escape(const char *ns) {
- char buf[256];
- char *p = buf;
- while( 1 ) {
- if( *ns == '$' ) *p = '~';
- else
- *p = *ns;
- if( *ns == 0 )
- break;
- p++; ns++;
- }
- assert( p - buf < (int) sizeof(buf) );
- return buf;
-}
-
-inline static string unescape(const char *ns) {
- char buf[256];
- char *p = buf;
- while( 1 ) {
- if( *ns == '~' ) *p = '$';
- else
- *p = *ns;
- if( *ns == 0 )
- break;
- p++; ns++;
- }
- assert( p - buf < (int) sizeof(buf) );
- return buf;
-}
-
-string RecCache::directory() {
- return cc().database()->path;
-}
-
-/* filename format is
-
- <n>-<ns>.idx
-*/
-
-BasicRecStore* RecCache::_initStore(string fname) {
-
- assert( strchr(fname.c_str(), '/') == 0 );
- assert( strchr(fname.c_str(), '\\') == 0 );
-
- stringstream ss(fname);
- int n;
- ss >> n;
- assert( n >= 0 );
- char ch;
- ss >> ch;
- assert( ch == '-' );
- string rest;
- ss >> rest;
- const char *p = rest.c_str();
- const char *q = strstr(p, ".idx");
- assert( q );
- string escaped_ns(p, q-p);
-
- // arbitrary limit. if you are hitting, we should use fewer files and put multiple
- // indexes in a single file (which is easy to do)
- massert( 10374 , "too many index files", n < 10000 );
-
- if( stores.size() < (unsigned)n+1 )
- stores.resize(n+1);
- assert( stores[n] == 0 );
- BasicRecStore *rs = new BasicRecStore(n);
- path pf(directory());
- pf /= fname;
- string full = pf.string();
- rs->init(full.c_str(), recsize);
- stores[n] = rs;
- string ns = unescape(escaped_ns.c_str());
- storesByNsKey[mknskey(ns.c_str())] = rs;
- return rs;
-}
-
-BasicRecStore* RecCache::initStore(int n) {
- string ns;
- {
- stringstream ss;
- ss << '/' << n << '-';
- ns = ss.str();
- }
-
- /* this will be slow if there are thousands of files */
- path dir(directory());
- directory_iterator end;
- try {
- directory_iterator i(dir);
- while ( i != end ) {
- string s = i->string();
- const char *p = strstr(s.c_str(), ns.c_str());
- if( p && strstr(p, ".idx") ) {
- // found it
- path P = *i;
- return _initStore(P.leaf());
- }
- i++;
- }
- }
- catch( DBException & ) {
- throw;
- }
- catch (...) {
- string s = string("i/o error looking for .idx file in ") + directory();
- massert( 10375 , s, false);
- }
- stringstream ss;
- ss << "index datafile missing? n=" << n;
- uasserted(12500,ss.str());
- return 0;
-}
-
-/* find the filename for a given ns.
- format is
- <n>-<escaped_ns>.idx
- returns filename. found is true if found. If false, a proposed name is returned for (optional) creation
- of the file.
-*/
-string RecCache::findStoreFilename(const char *_ns, bool& found) {
- string namefrag;
- {
- stringstream ss;
- ss << '-';
- ss << escape(_ns);
- ss << ".idx";
- namefrag = ss.str();
- }
-
- path dir(directory());
- directory_iterator end;
- int nmax = -1;
- try {
- directory_iterator i(dir);
- while ( i != end ) {
- string s = path(*i).leaf();
- const char *p = strstr(s.c_str(), namefrag.c_str());
- if( p ) {
- found = true;
- return s;
- }
- if( strstr(s.c_str(), ".idx") ) {
- stringstream ss(s);
- int n = -1;
- ss >> n;
- if( n > nmax )
- nmax = n;
- }
- i++;
- }
- }
- catch (...) {
- string s = string("i/o error looking for .idx file in ") + directory();
- massert( 10376 , s, false);
- }
-
- // DNE. return a name that would work.
- stringstream ss;
- ss << nmax+1 << namefrag;
- found = false;
- return ss.str();
-}
-
-void RecCache::initStoreByNs(const char *_ns, const string& nskey) {
- bool found;
- string fn = findStoreFilename(_ns, found);
- _initStore(fn);
-}
-
-inline void RecCache::writeIfDirty(Node *n) {
- if( n->dirty ) {
- ndirtywritten++;
- n->dirty = false;
- store(n->loc).update(fileOfs(n->loc), n->data, recsize);
- }
-}
-
-void RecCache::closeFiles(string dbname, string path) {
- assertInWriteLock();
- scoped_lock lk(rcmutex);
-
- // first we write all dirty pages. it is not easy to check which Nodes are for a particular
- // db, so we just write them all.
- writeDirty( dirtyl.begin(), true );
-
- string key = path + dbname + '.';
- unsigned sz = key.size();
- for( map<string, BasicRecStore*>::iterator i = storesByNsKey.begin(); i != storesByNsKey.end(); i++ ) {
- map<string, BasicRecStore*>::iterator j = i;
- i++;
- if( strncmp(j->first.c_str(), key.c_str(), sz) == 0 ) {
- assert( stores[j->second->fileNumber] != 0 );
- stores[j->second->fileNumber] = 0;
- delete j->second;
- storesByNsKey.erase(j);
- }
- }
-}
-
-void RecCache::closing() {
- scoped_lock lk(rcmutex);
- (cout << "TEMP: recCacheCloseAll() writing dirty pages...\n").flush();
- writeDirty( dirtyl.begin(), true );
- for( unsigned i = 0; i < stores.size(); i++ ) {
- if( stores[i] ) {
- delete stores[i];
- }
- }
- (cout << "TEMP: write dirty done\n").flush();
-}
-
-/* note that this is written in order, as much as possible, given that dirtyl is of type set. */
-void RecCache::writeDirty( set<DiskLoc>::iterator startAt, bool rawLog ) {
- try {
- ndirtywritten=0;
- for( set<DiskLoc>::iterator i = startAt; i != dirtyl.end(); i++ ) {
- map<DiskLoc, Node*>::iterator j = m.find(*i);
- if( j != m.end() )
- writeIfDirty(j->second);
- }
- OCCASIONALLY out() << "TEMP: ndirtywritten: " << ndirtywritten << endl;
- }
- catch(...) {
- const char *message = "Problem: bad() in RecCache::writeDirty, file io error\n";
-
- if ( rawLog )
- rawOut( message );
- else
- ( log() << message ).flush();
- }
- dirtyl.clear();
-}
-
-void RecCache::writeLazily() {
- int sleep = 0;
- int k;
- {
- scoped_lock lk(rcmutex);
- Timer t;
- set<DiskLoc>::iterator i = dirtyl.end();
- for( k = 0; k < 100; k++ ) {
- if( i == dirtyl.begin() ) {
- // we're not very far behind
- sleep = k < 20 ? 2000 : 1000;
- break;
- }
- i--;
- }
- writeDirty(i);
- if( sleep == 0 ) {
- sleep = t.millis() * 4 + 10;
- }
- }
-
- OCCASIONALLY cout << "writeLazily " << k << " sleep:" << sleep << '\n';
- sleepmillis(sleep);
-}
-
-void RecCache::_ejectOld() {
- scoped_lock lk(rcmutex);
- if( nnodes <= MAXNODES )
- return;
- Node *n = oldest;
- while( 1 ) {
- if( nnodes <= MAXNODES - 4 ) {
- n->older = 0;
- oldest = n;
- assert( oldest ) ;
- break;
- }
- nnodes--;
- assert(n);
- Node *nxt = n->newer;
- writeIfDirty(n);
- m.erase(n->loc);
- delete n;
- n = nxt;
- }
-}
-
-void RecCache::dump() {
- Node *n = oldest;
- Node *last = 0;
- while( n ) {
- assert( n->older == last );
- last = n;
-// cout << n << ' ' << n->older << ' ' << n->newer << '\n';
- n=n->newer;
- }
- assert( newest == last );
-// cout << endl;
-}
-
-/* cleans up everything EXCEPT storesByNsKey.
- note this function is slow should not be invoked often
-*/
-void RecCache::closeStore(BasicRecStore *rs) {
- int n = rs->fileNumber + Base;
- for( set<DiskLoc>::iterator i = dirtyl.begin(); i != dirtyl.end(); ) {
- DiskLoc k = *i++;
- if( k.a() == n )
- dirtyl.erase(k);
- }
-
- for( map<DiskLoc,Node*>::iterator i = m.begin(); i != m.end(); ) {
- DiskLoc k = i->first;
- i++;
- if( k.a() == n )
- m.erase(k);
- }
-
- assert( stores[rs->fileNumber] != 0 );
- stores[rs->fileNumber] = 0;
-/*
- for( unsigned i = 0; i < stores.size(); i++ ) {
- if( stores[i] == rs ) {
- stores[i] = 0;
- break;
- }
- }*/
- delete rs; // closes file
-}
-
-void RecCache::drop(const char *_ns) {
- // todo: test with a non clean shutdown file
- scoped_lock lk(rcmutex);
-
- map<string, BasicRecStore*>::iterator it = storesByNsKey.find(mknskey(_ns));
- string fname;
- if( it != storesByNsKey.end() ) {
- fname = it->second->filename;
- closeStore(it->second); // cleans up stores[] etc.
- storesByNsKey.erase(it);
- }
- else {
- bool found;
- fname = findStoreFilename(_ns, found);
- if( !found ) {
- log() << "RecCache::drop: no idx file found for " << _ns << endl;
- return;
- }
- path pf(directory());
- pf /= fname;
- fname = pf.string();
- }
- try {
- if( !boost::filesystem::exists(fname) )
- log() << "RecCache::drop: can't find file to remove " << fname << endl;
- boost::filesystem::remove(fname);
- }
- catch(...) {
- log() << "RecCache::drop: exception removing file " << fname << endl;
- }
-}
-
-}
diff --git a/db/reccache.h b/db/reccache.h
deleted file mode 100644
index d0fd118..0000000
--- a/db/reccache.h
+++ /dev/null
@@ -1,262 +0,0 @@
-// reccache.h
-/*
- * Copyright (C) 2010 10gen Inc.
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the GNU Affero General Public License, version 3,
- * as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU Affero General Public License for more details.
- *
- * You should have received a copy of the GNU Affero General Public License
- * along with this program. If not, see <http://www.gnu.org/licenses/>.
- */
-
-
-/* CachedBasicRecStore
- This is our store which implements a traditional page-cache type of storage
- (not memory mapped files).
-*/
-
-/* LOCK HIERARCHY
-
- dblock
- RecCache::rcmutex
-
- i.e. always lock dblock first if you lock both
-
-*/
-
-#pragma once
-
-#error deprecated
-
-#include "reci.h"
-#include "recstore.h"
-
-namespace mongo {
-
-class RecCache {
- struct Node {
- Node(void* _data) : data((char *) _data) { dirty = false; newer = 0; }
- ~Node() {
- free(data);
- data = 0;
- }
- char *data;
- DiskLoc loc;
- bool dirty;
- Node *older, *newer; // lru
- };
- mongo::mutex rcmutex; // mainly to coordinate with the lazy writer thread
- unsigned recsize;
- map<DiskLoc, Node*> m; // the cache
- Node *newest, *oldest;
- unsigned nnodes;
- set<DiskLoc> dirtyl;
- vector<BasicRecStore*> stores; // DiskLoc::a() indicates the index into this vector
- map<string, BasicRecStore*> storesByNsKey; // nskey -> BasicRecStore*
-public:
- static unsigned MAXNODES;
- enum BaseValue { Base = 10000 };
-private:
- BasicRecStore* _initStore(string fname);
- BasicRecStore* initStore(int n);
- string findStoreFilename(const char *_ns, bool& found);
- void initStoreByNs(const char *ns, const string& nskey);
- void closeStore(BasicRecStore *rs);
-
- static string directory();
- static string mknskey(const char *ns) {
- return directory() + ns;
- }
-
- /* get the right file for a given diskloc */
- BasicRecStore& store(DiskLoc& d) {
- int n = d.a() - Base;
- if( (int) stores.size() > n ) {
- BasicRecStore *rs = stores[n];
- if( rs ) {
- assert( rs->fileNumber == n );
- return *rs;
- }
- }
- return *initStore(n);
- }
- BasicRecStore& store(const char *ns) {
- string nskey = mknskey(ns);
- BasicRecStore *&rs = storesByNsKey[nskey];
- if( rs )
- return *rs;
- initStoreByNs(ns, nskey);
- return *rs;
- }
-
- void writeDirty( set<DiskLoc>::iterator i, bool rawLog = false );
- void writeIfDirty(Node *n);
- void touch(Node* n) {
- if( n == newest )
- return;
- if( n == oldest ) {
- oldest = oldest->newer;
- assert( oldest || nnodes == 1 );
- }
- if( n->older )
- n->older->newer = n->newer;
- if( n->newer )
- n->newer->older = n->older;
- n->newer = 0;
- n->older = newest;
- newest->newer = n;
- newest = n;
- }
- Node* mkNode() {
- Node *n = new Node(calloc(recsize,1)); // calloc is TEMP for testing. change to malloc
- n->older = newest;
- if( newest )
- newest->newer = n;
- else {
- assert( oldest == 0 );
- oldest = n;
- }
- newest = n;
- nnodes++;
- return n;
- }
- fileofs fileOfs(DiskLoc d) {
- return ((fileofs) d.getOfs()) * recsize;
- }
-
- void dump();
- void _ejectOld();
-
-public:
- /* all public functions (except constructor) should use the mutex */
-
- RecCache(unsigned recsz) : recsize(recsz) {
- nnodes = 0;
- newest = oldest = 0;
- }
-
- /* call this after doing some work, after you are sure you are done with modifications.
- we call it from dbunlocking().
- */
- void ejectOld() {
- if( nnodes > MAXNODES ) // just enough here to be inlineable for speed reasons. _ejectOld does the real work
- _ejectOld();
- }
-
- /* bg writer thread invokes this */
- void writeLazily();
-
- /* Note that this may be called BEFORE the actual writing to the node
- takes place. We do flushing later on a dbunlocking() call, which happens
- after the writing.
- */
- void dirty(DiskLoc d) {
- assert( d.a() >= Base );
- scoped_lock lk(rcmutex);
- map<DiskLoc, Node*>::iterator i = m.find(d);
- if( i != m.end() ) {
- Node *n = i->second;
- if( !n->dirty ) {
- n->dirty = true;
- dirtyl.insert(n->loc);
- }
- }
- }
-
- char* get(DiskLoc d, unsigned len) {
- assert( d.a() >= Base );
- assert( len == recsize );
-
- scoped_lock lk(rcmutex);
- map<DiskLoc, Node*>::iterator i = m.find(d);
- if( i != m.end() ) {
- touch(i->second);
- return i->second->data;
- }
-
- Node *n = mkNode();
- n->loc = d;
- store(d).get(fileOfs(d), n->data, recsize); // could throw exception
- m.insert( pair<DiskLoc, Node*>(d, n) );
- return n->data;
- }
-
- void drop(const char *ns);
-
- DiskLoc insert(const char *ns, const void *obuf, int len, bool god) {
- scoped_lock lk(rcmutex);
- BasicRecStore& rs = store(ns);
- fileofs o = rs.insert((const char *) obuf, len);
- assert( o % recsize == 0 );
- fileofs recnum = o / recsize;
- massert( 10377 , "RecCache file too large?", recnum <= 0x7fffffff );
- Node *n = mkNode();
- memcpy(n->data, obuf, len);
- DiskLoc d(rs.fileNumber + Base, (int) recnum);
- n->loc = d;
- m[d] = n;
- return d;
- }
-
- void closeFiles(string dbname, string path);
-
- // at termination: write dirty pages and close all files
- void closing();
-};
-
-extern RecCache theRecCache;
-
-class CachedBasicRecStore : public RecStoreInterface {
-public:
- VIRT char* get(DiskLoc d, unsigned len) {
- return theRecCache.get(d, len);
- }
-
- VIRT DiskLoc insert(const char *ns, const void *obuf, int len, bool god) {
- return theRecCache.insert(ns, obuf, len, god);
- }
-
- VIRT void modified(DiskLoc d) {
- theRecCache.dirty(d);
- }
-
- /* drop collection */
- VIRT void drop(const char *ns) {
- theRecCache.drop(ns);
- }
-
- VIRT void rename(const char *fromNs, const char *toNs) {
- massert( 10378 , "rename not yet implemented for CachedBasicRecStore", false );
- }
-
- /* close datafiles associated with the db specified. */
- VIRT void closeFiles(string dbname, string path) {
- theRecCache.closeFiles(dbname, dbpath);
- }
-};
-
-/* see concurrency.h - note on a lock reset from read->write we don't
- call dbunlocking_read, we just wait for the final dbunlocking_write
- call
-*/
-
-//inline void dbunlocking_read() {
- /*
- Client *c = currentClient.get();
- if ( c )
- c->top.clientStop();
- */
-//}
-
-//inline void dbunlocking_write() {
- //theRecCache.ejectOld();
-// dbunlocking_read();
-//}
-
-} /*namespace*/
diff --git a/db/reci.h b/db/reci.h
deleted file mode 100644
index a22f1f1..0000000
--- a/db/reci.h
+++ /dev/null
@@ -1,64 +0,0 @@
-// reci.h
-/*
- * Copyright (C) 2010 10gen Inc.
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the GNU Affero General Public License, version 3,
- * as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU Affero General Public License for more details.
- *
- * You should have received a copy of the GNU Affero General Public License
- * along with this program. If not, see <http://www.gnu.org/licenses/>.
- */
-
-
-#pragma once
-
-#include "diskloc.h"
-
-namespace mongo {
-
-// #define VIRT virtual
-#define VIRT
-
-/* Subclass this and implement your real storage interface.
-*/
-class RecStoreInterface {
-public:
- //VIRT ~RecStoreInterface() {}
-
- /* Get a pointer to the data at diskloc d. Pointer guaranteed to stay in
- scope through the current database operation's life.
- */
- //VIRT char* get(DiskLoc d, unsigned len) = 0;
-
- /* indicate that the diskloc specified has been updated. note that as-is today, the modification may come AFTER this
- call -- we handle that currently -- until the dblock finishes.
- */
- //VIRT void modified(DiskLoc d) = 0;
-
- /* insert specified data as a record */
- //VIRT DiskLoc insert(const char *ns, const void *obuf, int len, bool god) = 0;
-
- //VIRT void deleteRecord(const char *ns, DiskLoc d) { massert( 10379 , "not implemented RecStoreInterface::deleteRecord", false); }
-
- /* drop the collection */
- //VIRT void drop(const char *ns) = 0;
-
- /* rename collection */
- //VIRT void rename(const char *fromNs, const char *toNs) = 0;
-
- /* close datafiles associated with the db specified. */
- //VIRT void closeFiles(string dbname, string path) = 0;
-
- /* todo add:
- closeFiles(dbname)
- eraseFiles(dbname)
- */
-};
-
-}
diff --git a/db/recstore.h b/db/recstore.h
deleted file mode 100644
index 913070f..0000000
--- a/db/recstore.h
+++ /dev/null
@@ -1,126 +0,0 @@
-// recstore.h
-/*
- * Copyright (C) 2010 10gen Inc.
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the GNU Affero General Public License, version 3,
- * as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU Affero General Public License for more details.
- *
- * You should have received a copy of the GNU Affero General Public License
- * along with this program. If not, see <http://www.gnu.org/licenses/>.
- */
-
-
-#pragma once
-
-#error deprecated
-
-#include "../util/file.h"
-
-namespace mongo {
-
-using boost::uint32_t;
-using boost::uint64_t;
-
-/* Current version supports only consistent record sizes within a store. */
-
-class BasicRecStore {
- struct RecStoreHeader {
- uint32_t version;
- uint32_t recsize;
- uint64_t leof; // logical eof, actual file might be prealloc'd further
- uint64_t firstDeleted; // 0 = no deleted recs
- uint32_t cleanShutdown; // 0 = clean
- char reserved[8192-8-8-4-4-4]; // we want our records page-aligned in the file if they are a multiple of a page's size -- so we make this 8KB with that goal
- RecStoreHeader() {
- version = 65;
- recsize = 0;
- leof = sizeof(RecStoreHeader);
- firstDeleted = 0;
- cleanShutdown = 1;
- memset(reserved, 0, sizeof(reserved));
- }
- };
-
-public:
- BasicRecStore(int _fileNumber) : fileNumber(_fileNumber) { }
- ~BasicRecStore();
- void init(const char *fn, unsigned recsize);
- fileofs insert(const char *buf, unsigned len);
- void update(fileofs o, const char *buf, unsigned len);
- void remove(fileofs o, unsigned len);
- void get(fileofs o, char *buf, unsigned len);
-
- int fileNumber; // this goes in DiskLoc::a
-
- string filename;
-
-private:
-
- void writeHeader();
- File f;
- fileofs len;
- RecStoreHeader h; // h.reserved is wasteful here; fix later.
- void write(fileofs ofs, const char *data, unsigned len) {
- f.write(ofs, data, len);
- massert( 10380 , "basicrecstore write io error", !f.bad());
- }
-};
-
-/* --- implementation --- */
-
-inline BasicRecStore::~BasicRecStore() {
- h.cleanShutdown = 0;
- if( f.is_open() ) {
- writeHeader();
- f.fsync();
- }
-}
-
-inline void BasicRecStore::writeHeader() {
- write(0, (const char *) &h, 28); // update header in file for new leof
- uassert( 10115 , "file io error in BasicRecStore [1]", !f.bad());
-}
-
-inline fileofs BasicRecStore::insert(const char *buf, unsigned reclen) {
- if( h.firstDeleted ) {
- uasserted(11500, "deleted not yet implemented recstoreinsert");
- }
- massert( 10381 , "bad len", reclen == h.recsize);
- fileofs ofs = h.leof;
- h.leof += reclen;
- if( h.leof > len ) {
- // grow the file. we grow quite a bit to avoid excessive file system fragmentations
- len += (len / 8) + h.recsize;
- uassert( 10116 , "recstore file too big for 32 bit", len <= 0x7fffffff || sizeof(std::streamoff) > 4 );
- write(len, "", 0);
- }
- writeHeader();
- write(ofs, buf, reclen);
- uassert( 10117 , "file io error in BasicRecStore [2]", !f.bad());
- return ofs;
-}
-
-/* so far, it's ok to read or update a subset of a record */
-
-inline void BasicRecStore::update(fileofs o, const char *buf, unsigned len) {
- assert(o <= h.leof && o >= sizeof(RecStoreHeader));
- write(o, buf, len);
-}
-
-inline void BasicRecStore::get(fileofs o, char *buf, unsigned len) {
- assert(o <= h.leof && o >= sizeof(RecStoreHeader));
- f.read(o, buf, len);
- massert( 10382 , "basicrestore::get I/O error", !f.bad());
-}
-
-inline void BasicRecStore::remove(fileofs o, unsigned len) {
- uasserted(11501, "not yet implemented recstoreremove");
-}
-
-}
diff --git a/db/repl.cpp b/db/repl.cpp
index ea0eab9..b14034d 100644
--- a/db/repl.cpp
+++ b/db/repl.cpp
@@ -25,7 +25,7 @@
local.sources - indicates what sources we pull from as a "slave", and the last update of each
local.oplog.$main - our op log as "master"
- local.dbinfo.<dbname>
+ local.dbinfo.<dbname> - no longer used???
local.pair.startup - can contain a special value indicating for a pair that we have the master copy.
used when replacing other half of the pair which has permanently failed.
local.pair.sync - { initialsynccomplete: 1 }
@@ -49,13 +49,13 @@
#include "repl/rs.h"
namespace mongo {
-
+
// our config from command line etc.
ReplSettings replSettings;
/* if 1 sync() is running */
volatile int syncing = 0;
- static volatile int relinquishSyncingSome = 0;
+ static volatile int relinquishSyncingSome = 0;
/* if true replace our peer in a replication pair -- don't worry about if his
local.oplog.$main is empty.
@@ -68,9 +68,9 @@ namespace mongo {
const char *replAllDead = 0;
time_t lastForcedResync = 0;
-
+
IdTracker &idTracker = *( new IdTracker() );
-
+
} // namespace mongo
#include "replpair.h"
@@ -159,8 +159,8 @@ namespace mongo {
break;
{
dbtemprelease t;
- relinquishSyncingSome = 1;
- sleepmillis(1);
+ relinquishSyncingSome = 1;
+ sleepmillis(1);
}
}
if ( syncing ) {
@@ -206,7 +206,7 @@ namespace mongo {
return true;
}
} cmdForceDead;
-
+
/* operator requested resynchronization of replication (on the slave). { resync : 1 } */
class CmdResync : public Command {
public:
@@ -221,22 +221,28 @@ namespace mongo {
void help(stringstream&h) const { h << "resync (from scratch) an out of date replica slave.\nhttp://www.mongodb.org/display/DOCS/Master+Slave"; }
CmdResync() : Command("resync") { }
virtual bool run(const string& , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ if( cmdLine.usingReplSets() ) {
+ errmsg = "resync command not currently supported with replica sets. See RS102 info in the mongodb documentations";
+ result.append("info", "http://www.mongodb.org/display/DOCS/Resyncing+a+Very+Stale+Replica+Set+Member");
+ return false;
+ }
+
if ( cmdObj.getBoolField( "force" ) ) {
if ( !waitForSyncToFinish( errmsg ) )
return false;
replAllDead = "resync forced";
- }
+ }
if ( !replAllDead ) {
errmsg = "not dead, no need to resync";
return false;
}
if ( !waitForSyncToFinish( errmsg ) )
return false;
-
+
ReplSource::forceResyncDead( "client" );
result.append( "info", "triggered resync for all sources" );
- return true;
- }
+ return true;
+ }
bool waitForSyncToFinish( string &errmsg ) const {
// Wait for slave thread to finish syncing, so sources will be
// reloaded with new saved state on next pass.
@@ -246,7 +252,7 @@ namespace mongo {
break;
{
dbtemprelease t;
- relinquishSyncingSome = 1;
+ relinquishSyncingSome = 1;
sleepmillis(1);
}
}
@@ -257,16 +263,31 @@ namespace mongo {
return true;
}
} cmdResync;
-
- bool anyReplEnabled(){
- return replPair || replSettings.slave || replSettings.master;
+
+ bool anyReplEnabled() {
+ return replPair || replSettings.slave || replSettings.master || theReplSet;
}
- void appendReplicationInfo( BSONObjBuilder& result , bool authed , int level ){
-
+ bool replAuthenticate(DBClientBase *conn);
+
+ void appendReplicationInfo( BSONObjBuilder& result , bool authed , int level ) {
+
+ if ( replSet ) {
+ if( theReplSet == 0 ) {
+ result.append("ismaster", false);
+ result.append("secondary", false);
+ result.append("info", ReplSet::startupStatusMsg);
+ result.append( "isreplicaset" , true );
+ return;
+ }
+
+ theReplSet->fillIsMaster(result);
+ return;
+ }
+
if ( replAllDead ) {
result.append("ismaster", 0);
- if( authed ) {
+ if( authed ) {
if ( replPair )
result.append("remote", replPair->remote);
}
@@ -285,25 +306,25 @@ namespace mongo {
result.appendBool("ismaster", _isMaster() );
}
- if ( level && replSet ){
+ if ( level && replSet ) {
result.append( "info" , "is replica set" );
}
- else if ( level ){
+ else if ( level ) {
BSONObjBuilder sources( result.subarrayStart( "sources" ) );
-
+
readlock lk( "local.sources" );
- Client::Context ctx( "local.sources" );
+ Client::Context ctx( "local.sources", dbpath, 0, authed );
shared_ptr<Cursor> c = findTableScan("local.sources", BSONObj());
int n = 0;
- while ( c->ok() ){
+ while ( c->ok() ) {
BSONObj s = c->current();
-
+
BSONObjBuilder bb;
bb.append( s["host"] );
string sourcename = s["source"].valuestr();
if ( sourcename != "main" )
bb.append( s["source"] );
-
+
{
BSONElement e = s["syncedTo"];
BSONObjBuilder t( bb.subobjStart( "syncedTo" ) );
@@ -311,23 +332,27 @@ namespace mongo {
t.append( "inc" , e.timestampInc() );
t.done();
}
-
- if ( level > 1 ){
+
+ if ( level > 1 ) {
dbtemprelease unlock;
+ // note: there is no socket ("so") timeout on this connection; perhaps we should have one.
ScopedDbConnection conn( s["host"].valuestr() );
- BSONObj first = conn->findOne( (string)"local.oplog.$" + sourcename , Query().sort( BSON( "$natural" << 1 ) ) );
- BSONObj last = conn->findOne( (string)"local.oplog.$" + sourcename , Query().sort( BSON( "$natural" << -1 ) ) );
- bb.appendDate( "masterFirst" , first["ts"].timestampTime() );
- bb.appendDate( "masterLast" , last["ts"].timestampTime() );
- double lag = (double) (last["ts"].timestampTime() - s["syncedTo"].timestampTime());
- bb.append( "lagSeconds" , lag / 1000 );
+ DBClientConnection *cliConn = dynamic_cast< DBClientConnection* >( &conn.conn() );
+ if ( cliConn && replAuthenticate( cliConn ) ) {
+ BSONObj first = conn->findOne( (string)"local.oplog.$" + sourcename , Query().sort( BSON( "$natural" << 1 ) ) );
+ BSONObj last = conn->findOne( (string)"local.oplog.$" + sourcename , Query().sort( BSON( "$natural" << -1 ) ) );
+ bb.appendDate( "masterFirst" , first["ts"].timestampTime() );
+ bb.appendDate( "masterLast" , last["ts"].timestampTime() );
+ double lag = (double) (last["ts"].timestampTime() - s["syncedTo"].timestampTime());
+ bb.append( "lagSeconds" , lag / 1000 );
+ }
conn.done();
}
sources.append( BSONObjBuilder::numStr( n++ ) , bb.obj() );
c->advance();
}
-
+
sources.done();
}
}
@@ -345,26 +370,15 @@ namespace mongo {
virtual LockType locktype() const { return NONE; }
CmdIsMaster() : Command("isMaster", true, "ismaster") { }
virtual bool run(const string& , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool /*fromRepl*/) {
- /* currently request to arbiter is (somewhat arbitrarily) an ismaster request that is not
- authenticated.
- we allow unauthenticated ismaster but we aren't as verbose informationally if
- one is not authenticated for admin db to be safe.
- */
-
- if( replSet ) {
- if( theReplSet == 0 ) {
- result.append("ismaster", false);
- result.append("secondary", false);
- errmsg = "replSet still trying to initialize";
- result.append("info", ReplSet::startupStatusMsg);
- return true;
- }
- theReplSet->fillIsMaster(result);
- return true;
- }
-
- bool authed = cc().getAuthenticationInfo()->isAuthorizedReads("admin");
+ /* currently request to arbiter is (somewhat arbitrarily) an ismaster request that is not
+ authenticated.
+ we allow unauthenticated ismaster but we aren't as verbose informationally if
+ one is not authenticated for admin db to be safe.
+ */
+ bool authed = cc().getAuthenticationInfo()->isAuthorizedReads("admin");
appendReplicationInfo( result , authed );
+
+ result.appendNumber("maxBsonObjectSize", BSONObjMaxUserSize);
return true;
}
} cmdismaster;
@@ -375,14 +389,14 @@ namespace mongo {
virtual bool slaveOk() const {
return true;
}
- virtual LockType locktype() const { return WRITE; }
+ virtual LockType locktype() const { return NONE; }
CmdIsInitialSyncComplete() : Command( "isinitialsynccomplete" ) {}
virtual bool run(const string&, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool /*fromRepl*/) {
result.appendBool( "initialsynccomplete", getInitialSyncCompleted() );
return true;
}
} cmdisinitialsynccomplete;
-
+
/* negotiate who is master
-1=not set (probably means we just booted)
@@ -482,7 +496,7 @@ namespace mongo {
return true;
}
} cmdnegotiatemaster;
-
+
int ReplPair::negotiate(DBClientConnection *conn, string method) {
BSONObjBuilder b;
b.append("negotiatemaster",1);
@@ -491,7 +505,7 @@ namespace mongo {
b.append("your_port", remotePort);
BSONObj cmd = b.done();
BSONObj res = conn->findOne("admin.$cmd", cmd);
- if ( ! res["ok"].trueValue() ){
+ if ( ! res["ok"].trueValue() ) {
string message = method + " negotiate failed";
problem() << message << ": " << res.toString() << '\n';
setMasterLocked(State_Confused, message.c_str());
@@ -503,7 +517,8 @@ namespace mongo {
// choose who is master.
if ( x != State_Slave && x != State_Master && x != State_Negotiating ) {
problem() << method << " negotiate: bad you_are value " << res.toString() << endl;
- } else if ( x != State_Negotiating ) {
+ }
+ else if ( x != State_Negotiating ) {
string message = method + " negotiation";
setMasterLocked(x, message.c_str());
}
@@ -542,8 +557,8 @@ namespace mongo {
break;
addDbNextPass.insert( e.fieldName() );
}
- }
-
+ }
+
dbsObj = o.getObjectField("incompleteCloneDbs");
if ( !dbsObj.isEmpty() ) {
BSONObjIterator i(dbsObj);
@@ -553,7 +568,7 @@ namespace mongo {
break;
incompleteCloneDbs.insert( e.fieldName() );
}
- }
+ }
_lastSavedLocalTs = OpTime( o.getField( "localLogTs" ).date() );
}
@@ -569,7 +584,7 @@ namespace mongo {
b.appendTimestamp("syncedTo", syncedTo.asDate());
b.appendTimestamp("localLogTs", _lastSavedLocalTs.asDate());
-
+
BSONObjBuilder dbsNextPassBuilder;
int n = 0;
for ( set<string>::iterator i = addDbNextPass.begin(); i != addDbNextPass.end(); i++ ) {
@@ -622,7 +637,7 @@ namespace mongo {
}
}
- static void addSourceToList(ReplSource::SourceVector &v, ReplSource& s, const BSONObj &spec, ReplSource::SourceVector &old) {
+ static void addSourceToList(ReplSource::SourceVector &v, ReplSource& s, ReplSource::SourceVector &old) {
if ( !s.syncedTo.isNull() ) { // Don't reuse old ReplSource if there was a forced resync.
for ( ReplSource::SourceVector::iterator i = old.begin(); i != old.end(); ) {
if ( s == **i ) {
@@ -684,11 +699,12 @@ namespace mongo {
else {
try {
massert( 10384 , "--only requires use of --source", cmdLine.only.empty());
- } catch ( ... ) {
+ }
+ catch ( ... ) {
dbexit( EXIT_BADOPTIONS );
}
}
-
+
if ( replPair ) {
const string &remote = replPair->remote;
// --pairwith host specified.
@@ -730,9 +746,9 @@ namespace mongo {
tmp.syncedTo = OpTime();
tmp.replacing = true;
}
- }
+ }
if ( ( !replPair && tmp.syncedTo.isNull() ) ||
- ( replPair && replSettings.fastsync ) ) {
+ ( replPair && replSettings.fastsync ) ) {
DBDirectClient c;
if ( c.exists( "local.oplog.$main" ) ) {
BSONObj op = c.findOne( "local.oplog.$main", QUERY( "op" << NE << "n" ).sort( BSON( "$natural" << -1 ) ) );
@@ -742,7 +758,7 @@ namespace mongo {
}
}
}
- addSourceToList(v, tmp, c->current(), old);
+ addSourceToList(v, tmp, old);
c->advance();
}
@@ -766,7 +782,7 @@ namespace mongo {
}
return false;
}
-
+
void ReplSource::forceResyncDead( const char *requester ) {
if ( !replAllDead )
return;
@@ -775,9 +791,9 @@ namespace mongo {
for( SourceVector::iterator i = sources.begin(); i != sources.end(); ++i ) {
(*i)->forceResync( requester );
}
- replAllDead = 0;
+ replAllDead = 0;
}
-
+
void ReplSource::forceResync( const char *requester ) {
BSONObj info;
{
@@ -800,7 +816,7 @@ namespace mongo {
}
}
}
- }
+ }
syncedTo = OpTime();
addDbNextPass.clear();
save();
@@ -812,7 +828,7 @@ namespace mongo {
dropDatabase(db);
return db;
}
-
+
/* grab initial copy of a database from the master */
bool ReplSource::resync(string db) {
string dummyNs = resyncDrop( db.c_str(), "internal" );
@@ -841,7 +857,7 @@ namespace mongo {
log() << "sync: caught user assertion " << e << " while applying op: " << op << endl;;
}
catch ( DBException& e ) {
- log() << "sync: caught db exception " << e << " while applying op: " << op << endl;;
+ log() << "sync: caught db exception " << e << " while applying op: " << op << endl;;
}
}
@@ -850,15 +866,17 @@ namespace mongo {
{ ts: ..., op: <optype>, ns: ..., o: <obj> , o2: <extraobj>, b: <boolflag> }
...
see logOp() comments.
+
+ @param alreadyLocked caller already put us in write lock if true
*/
- void ReplSource::sync_pullOpLog_applyOperation(BSONObj& op, OpTime *localLogTail) {
+ void ReplSource::sync_pullOpLog_applyOperation(BSONObj& op, OpTime *localLogTail, bool alreadyLocked) {
if( logLevel >= 6 ) // op.tostring is expensive so doing this check explicitly
log(6) << "processing op: " << op << endl;
if( op.getStringField("op")[0] == 'n' )
return;
- char clientName[MaxDatabaseLen];
+ char clientName[MaxDatabaseNameLen];
const char *ns = op.getStringField("ns");
nsToDatabase(ns, clientName);
@@ -867,22 +885,27 @@ namespace mongo {
return;
}
else if ( *ns == 0 ) {
- problem() << "halting replication, bad op in oplog:\n " << op.toString() << endl;
- replAllDead = "bad object in oplog";
- throw SyncException();
+ /*if( op.getStringField("op")[0] != 'n' )*/ {
+ problem() << "halting replication, bad op in oplog:\n " << op.toString() << endl;
+ replAllDead = "bad object in oplog";
+ throw SyncException();
+ }
+ //ns = "local.system.x";
+ //nsToDatabase(ns, clientName);
}
if ( !only.empty() && only != clientName )
return;
- if( cmdLine.pretouch ) {
+ if( cmdLine.pretouch && !alreadyLocked/*doesn't make sense if in write lock already*/ ) {
if( cmdLine.pretouch > 1 ) {
/* note: this is bad - should be put in ReplSource. but this is first test... */
static int countdown;
+ assert( countdown >= 0 );
if( countdown > 0 ) {
countdown--; // was pretouched on a prev pass
- assert( countdown >= 0 );
- } else {
+ }
+ else {
const int m = 4;
if( tp.get() == 0 ) {
int nthr = min(8, cmdLine.pretouch);
@@ -911,7 +934,7 @@ namespace mongo {
}
}
- dblock lk;
+ scoped_ptr<writelock> lk( alreadyLocked ? 0 : new writelock() );
if ( localLogTail && replPair && replPair->state == ReplPair::State_Master ) {
updateSetsWithLocalOps( *localLogTail, true ); // allow unlocking
@@ -923,7 +946,7 @@ namespace mongo {
log() << "replAllDead, throwing SyncException: " << replAllDead << endl;
throw SyncException();
}
-
+
Client::Context ctx( ns );
ctx.getClient()->curop()->reset();
@@ -932,14 +955,14 @@ namespace mongo {
if( logLevel >= 6 )
log(6) << "ns: " << ns << ", justCreated: " << ctx.justCreated() << ", empty: " << empty << ", incompleteClone: " << incompleteClone << endl;
-
+
// always apply admin commands
// this is a bit hacky -- the semantics of replication/commands aren't well specified
if ( strcmp( clientName, "admin" ) == 0 && *op.getStringField( "op" ) == 'c' ) {
applyOperation( op );
return;
}
-
+
if ( ctx.justCreated() || empty || incompleteClone ) {
// we must add to incomplete list now that setClient has been called
incompleteCloneDbs.insert( clientName );
@@ -950,7 +973,8 @@ namespace mongo {
clone 100 databases in one pass.)
*/
addDbNextPass.insert( clientName );
- } else {
+ }
+ else {
if ( incompleteClone ) {
log() << "An earlier initial clone of '" << clientName << "' did not complete, now resyncing." << endl;
}
@@ -962,21 +986,25 @@ namespace mongo {
incompleteCloneDbs.erase( clientName );
}
save();
- } else {
+ }
+ else {
bool mod;
if ( replPair && replPair->state == ReplPair::State_Master ) {
BSONObj id = idForOp( op, mod );
if ( !idTracker.haveId( ns, id ) ) {
- applyOperation( op );
- } else if ( idTracker.haveModId( ns, id ) ) {
+ applyOperation( op );
+ }
+ else if ( idTracker.haveModId( ns, id ) ) {
log( 6 ) << "skipping operation matching mod id object " << op << endl;
BSONObj existing;
if ( Helpers::findOne( ns, id, existing ) )
logOp( "i", ns, existing );
- } else {
+ }
+ else {
log( 6 ) << "skipping operation matching changed id object " << op << endl;
}
- } else {
+ }
+ else {
applyOperation( op );
}
addDbNextPass.erase( clientName );
@@ -988,33 +1016,33 @@ namespace mongo {
const char *opType = op.getStringField( "op" );
BSONObj o = op.getObjectField( "o" );
switch( opType[ 0 ] ) {
- case 'i': {
- BSONObjBuilder idBuilder;
- BSONElement id;
- if ( !o.getObjectID( id ) )
- return BSONObj();
- idBuilder.append( id );
- return idBuilder.obj();
- }
- case 'u': {
- BSONObj o2 = op.getObjectField( "o2" );
- if ( strcmp( o2.firstElement().fieldName(), "_id" ) != 0 )
- return BSONObj();
- if ( o.firstElement().fieldName()[ 0 ] == '$' )
- mod = true;
- return o2;
- }
- case 'd': {
- if ( opType[ 1 ] != '\0' )
- return BSONObj(); // skip "db" op type
- return o;
- }
- default:
- break;
- }
+ case 'i': {
+ BSONObjBuilder idBuilder;
+ BSONElement id;
+ if ( !o.getObjectID( id ) )
+ return BSONObj();
+ idBuilder.append( id );
+ return idBuilder.obj();
+ }
+ case 'u': {
+ BSONObj o2 = op.getObjectField( "o2" );
+ if ( strcmp( o2.firstElement().fieldName(), "_id" ) != 0 )
+ return BSONObj();
+ if ( o.firstElement().fieldName()[ 0 ] == '$' )
+ mod = true;
+ return o2;
+ }
+ case 'd': {
+ if ( opType[ 1 ] != '\0' )
+ return BSONObj(); // skip "db" op type
+ return o;
+ }
+ default:
+ break;
+ }
return BSONObj();
}
-
+
void ReplSource::updateSetsWithOp( const BSONObj &op, bool mayUnlock ) {
if ( mayUnlock ) {
idTracker.mayUpgradeStorage();
@@ -1029,42 +1057,42 @@ namespace mongo {
if ( mod )
idTracker.haveModId( ns, id, true );
idTracker.haveId( ns, id, true );
- }
+ }
}
-
+
void ReplSource::syncToTailOfRemoteLog() {
string _ns = ns();
BSONObjBuilder b;
if ( !only.empty() ) {
b.appendRegex("ns", string("^") + only);
- }
+ }
BSONObj last = oplogReader.findOne( _ns.c_str(), Query( b.done() ).sort( BSON( "$natural" << -1 ) ) );
if ( !last.isEmpty() ) {
BSONElement ts = last.getField( "ts" );
massert( 10386 , "non Date ts found: " + last.toString(), ts.type() == Date || ts.type() == Timestamp );
syncedTo = OpTime( ts.date() );
- }
+ }
}
-
+
OpTime ReplSource::nextLastSavedLocalTs() const {
Client::Context ctx( "local.oplog.$main" );
shared_ptr<Cursor> c = findTableScan( "local.oplog.$main", BSON( "$natural" << -1 ) );
if ( c->ok() )
- return OpTime( c->current().getField( "ts" ).date() );
+ return OpTime( c->current().getField( "ts" ).date() );
return OpTime();
}
-
+
void ReplSource::setLastSavedLocalTs( const OpTime &nextLocalTs ) {
_lastSavedLocalTs = nextLocalTs;
log( 3 ) << "updated _lastSavedLocalTs to: " << _lastSavedLocalTs << endl;
}
-
+
void ReplSource::resetSlave() {
log() << "**********************************************************\n";
log() << "Sending forcedead command to slave to stop its replication\n";
log() << "Host: " << hostName << " paired: " << paired << endl;
massert( 10387 , "request to kill slave replication failed",
- oplogReader.conn()->simpleCommand( "admin", 0, "forcedead" ) );
+ oplogReader.conn()->simpleCommand( "admin", 0, "forcedead" ) );
syncToTailOfRemoteLog();
{
dblock lk;
@@ -1073,7 +1101,7 @@ namespace mongo {
oplogReader.resetCursor();
}
}
-
+
bool ReplSource::updateSetsWithLocalOps( OpTime &localLogTail, bool mayUnlock ) {
Client::Context ctx( "local.oplog.$main" );
shared_ptr<Cursor> localLog = findTableScan( "local.oplog.$main", BSON( "$natural" << -1 ) );
@@ -1099,14 +1127,16 @@ namespace mongo {
dbtemprelease t;
resetSlave();
massert( 10388 , "local master log filled, forcing slave resync", false );
- }
+ }
if ( !newTail.isNull() )
localLogTail = newTail;
return true;
}
-
+
+ extern unsigned replApplyBatchSize;
+
/* slave: pull some data from the master's oplog
- note: not yet in db mutex at this point.
+ note: not yet in db mutex at this point.
@return -1 error
0 ok, don't sleep
1 ok, sleep
@@ -1126,7 +1156,7 @@ namespace mongo {
OpTime localLogTail = _lastSavedLocalTs;
bool initial = syncedTo.isNull();
-
+
if ( !oplogReader.haveCursor() || initial ) {
if ( initial ) {
// Important to grab last oplog timestamp before listing databases.
@@ -1152,13 +1182,13 @@ namespace mongo {
dblock lk;
save();
}
-
+
BSONObjBuilder q;
q.appendDate("$gte", syncedTo.asDate());
BSONObjBuilder query;
query.append("ts", q.done());
if ( !only.empty() ) {
- // note we may here skip a LOT of data table scanning, a lot of work for the master.
+ // note we may here skip a LOT of data table scanning, a lot of work for the master.
query.appendRegex("ns", string("^") + only); // maybe append "\\." here?
}
BSONObj queryObj = query.done();
@@ -1185,7 +1215,7 @@ namespace mongo {
b.append("ns", *i + '.');
b.append("op", "db");
BSONObj op = b.done();
- sync_pullOpLog_applyOperation(op, 0);
+ sync_pullOpLog_applyOperation(op, 0, false);
}
}
@@ -1195,7 +1225,8 @@ namespace mongo {
if( oplogReader.awaitCapable() )
okResultCode = 0; // don't sleep
- } else {
+ }
+ else {
log() << "repl: " << ns << " oplog is empty\n";
}
{
@@ -1207,11 +1238,11 @@ namespace mongo {
setLastSavedLocalTs( nextLastSaved );
}
}
- save();
+ save();
}
return okResultCode;
}
-
+
OpTime nextOpTime;
{
BSONObj op = oplogReader.next();
@@ -1234,32 +1265,31 @@ namespace mongo {
massert( 10391 , "repl: bad object read from remote oplog", false);
}
}
-
+
if ( replPair && replPair->state == ReplPair::State_Master ) {
-
+
OpTime next( ts.date() );
if ( !tailing && !initial && next != syncedTo ) {
log() << "remote slave log filled, forcing slave resync" << endl;
resetSlave();
return 1;
- }
-
+ }
+
dblock lk;
updateSetsWithLocalOps( localLogTail, true );
}
-
+
nextOpTime = OpTime( ts.date() );
log(2) << "repl: first op time received: " << nextOpTime.toString() << '\n';
- if ( tailing || initial ) {
- if ( initial )
- log(1) << "repl: initial run\n";
- else {
- if( !( syncedTo <= nextOpTime ) ) {
- log() << "repl ASSERTION failed : syncedTo <= nextOpTime" << endl;
- log() << "repl syncTo: " << syncedTo.toStringLong() << endl;
- log() << "repl nextOpTime: " << nextOpTime.toStringLong() << endl;
- assert(false);
- }
+ if ( initial ) {
+ log(1) << "repl: initial run\n";
+ }
+ if( tailing ) {
+ if( !( syncedTo < nextOpTime ) ) {
+ log() << "repl ASSERTION failed : syncedTo < nextOpTime" << endl;
+ log() << "repl syncTo: " << syncedTo.toStringLong() << endl;
+ log() << "repl nextOpTime: " << nextOpTime.toStringLong() << endl;
+ assert(false);
}
oplogReader.putBack( op ); // op will be processed in the loop below
nextOpTime = OpTime(); // will reread the op below
@@ -1281,14 +1311,14 @@ namespace mongo {
throw SyncException();
}
else {
- /* t == syncedTo, so the first op was applied previously. */
+                    /* t == syncedTo, so the first op was applied previously or it is the first op of the initial query and need not be applied. */
}
}
// apply operations
{
int n = 0;
- time_t saveLast = time(0);
+ time_t saveLast = time(0);
while ( 1 ) {
/* from a.s.:
I think the idea here is that we can establish a sync point between the local op log and the remote log with the following steps:
@@ -1316,7 +1346,8 @@ namespace mongo {
if ( getInitialSyncCompleted() ) { // if initial sync hasn't completed, break out of loop so we can set to completed or clone more dbs
continue;
}
- } else {
+ }
+ else {
setLastSavedLocalTs( nextLastSaved );
}
}
@@ -1332,109 +1363,132 @@ namespace mongo {
else {
}
- OCCASIONALLY if( n > 0 && ( n > 100000 || time(0) - saveLast > 60 ) ) {
- // periodically note our progress, in case we are doing a lot of work and crash
- dblock lk;
+ OCCASIONALLY if( n > 0 && ( n > 100000 || time(0) - saveLast > 60 ) ) {
+ // periodically note our progress, in case we are doing a lot of work and crash
+ dblock lk;
syncedTo = nextOpTime;
// can't update local log ts since there are pending operations from our peer
- save();
+ save();
log() << "repl: checkpoint applied " << n << " operations" << endl;
log() << "repl: syncedTo: " << syncedTo.toStringLong() << endl;
- saveLast = time(0);
- n = 0;
- }
+ saveLast = time(0);
+ n = 0;
+ }
BSONObj op = oplogReader.next();
- BSONElement ts = op.getField("ts");
- if( !( ts.type() == Date || ts.type() == Timestamp ) ) {
- log() << "sync error: problem querying remote oplog record\n";
- log() << "op: " << op.toString() << '\n';
- log() << "halting replication" << endl;
- replInfo = replAllDead = "sync error: no ts found querying remote oplog record";
- throw SyncException();
- }
- OpTime last = nextOpTime;
- nextOpTime = OpTime( ts.date() );
- if ( !( last < nextOpTime ) ) {
- log() << "sync error: last applied optime at slave >= nextOpTime from master" << endl;
- log() << " last: " << last.toStringLong() << '\n';
- log() << " nextOpTime: " << nextOpTime.toStringLong() << '\n';
- log() << " halting replication" << endl;
- replInfo = replAllDead = "sync error last >= nextOpTime";
- uassert( 10123 , "replication error last applied optime at slave >= nextOpTime from master", false);
- }
- if ( replSettings.slavedelay && ( unsigned( time( 0 ) ) < nextOpTime.getSecs() + replSettings.slavedelay ) ) {
- oplogReader.putBack( op );
- _sleepAdviceTime = nextOpTime.getSecs() + replSettings.slavedelay + 1;
- dblock lk;
- if ( n > 0 ) {
- syncedTo = last;
- save();
+
+ unsigned b = replApplyBatchSize;
+ bool justOne = b == 1;
+ scoped_ptr<writelock> lk( justOne ? 0 : new writelock() );
+ while( 1 ) {
+
+ BSONElement ts = op.getField("ts");
+ if( !( ts.type() == Date || ts.type() == Timestamp ) ) {
+ log() << "sync error: problem querying remote oplog record" << endl;
+ log() << "op: " << op.toString() << endl;
+ log() << "halting replication" << endl;
+ replInfo = replAllDead = "sync error: no ts found querying remote oplog record";
+ throw SyncException();
+ }
+ OpTime last = nextOpTime;
+ nextOpTime = OpTime( ts.date() );
+ if ( !( last < nextOpTime ) ) {
+ log() << "sync error: last applied optime at slave >= nextOpTime from master" << endl;
+ log() << " last: " << last.toStringLong() << endl;
+ log() << " nextOpTime: " << nextOpTime.toStringLong() << endl;
+ log() << " halting replication" << endl;
+ replInfo = replAllDead = "sync error last >= nextOpTime";
+ uassert( 10123 , "replication error last applied optime at slave >= nextOpTime from master", false);
+ }
+ if ( replSettings.slavedelay && ( unsigned( time( 0 ) ) < nextOpTime.getSecs() + replSettings.slavedelay ) ) {
+ assert( justOne );
+ oplogReader.putBack( op );
+ _sleepAdviceTime = nextOpTime.getSecs() + replSettings.slavedelay + 1;
+ dblock lk;
+ if ( n > 0 ) {
+ syncedTo = last;
+ save();
+ }
+ log() << "repl: applied " << n << " operations" << endl;
+ log() << "repl: syncedTo: " << syncedTo.toStringLong() << endl;
+ log() << "waiting until: " << _sleepAdviceTime << " to continue" << endl;
+ return okResultCode;
}
- log() << "repl: applied " << n << " operations" << endl;
- log() << "repl: syncedTo: " << syncedTo.toStringLong() << endl;
- log() << "waiting until: " << _sleepAdviceTime << " to continue" << endl;
- break;
- }
- sync_pullOpLog_applyOperation(op, &localLogTail);
- n++;
+ sync_pullOpLog_applyOperation(op, &localLogTail, !justOne);
+ n++;
+
+ if( --b == 0 )
+ break;
+                        // if we get here, we are doing multiple applications in a single write lock acquisition
+ if( !oplogReader.moreInCurrentBatch() ) {
+ // break if no more in batch so we release lock while reading from the master
+ break;
+ }
+ op = oplogReader.next();
+
+ getDur().commitIfNeeded();
+ }
}
}
return okResultCode;
}
- BSONObj userReplQuery = fromjson("{\"user\":\"repl\"}");
-
- bool replAuthenticate(DBClientConnection *conn) {
- if( ! cc().isAdmin() ){
- log() << "replauthenticate: requires admin permissions, failing\n";
- return false;
- }
-
- BSONObj user;
- {
- dblock lk;
- Client::Context ctxt("local.");
- if( !Helpers::findOne("local.system.users", userReplQuery, user) ) {
- // try the first user is local
- if( !Helpers::getSingleton("local.system.users", user) ) {
- if( noauth )
- return true; // presumably we are running a --noauth setup all around.
-
- log() << "replauthenticate: no user in local.system.users to use for authentication\n";
- return false;
- }
- }
-
- }
-
- string u = user.getStringField("user");
- string p = user.getStringField("pwd");
- massert( 10392 , "bad user object? [1]", !u.empty());
- massert( 10393 , "bad user object? [2]", !p.empty());
- string err;
- if( !conn->auth("local", u.c_str(), p.c_str(), err, false) ) {
- log() << "replauthenticate: can't authenticate to master server, user:" << u << endl;
- return false;
- }
- return true;
- }
+ BSONObj userReplQuery = fromjson("{\"user\":\"repl\"}");
+
+ bool replAuthenticate(DBClientBase *conn) {
+ if( ! cc().isAdmin() ) {
+ log() << "replauthenticate: requires admin permissions, failing\n";
+ return false;
+ }
+
+ string u;
+ string p;
+ if (internalSecurity.pwd.length() > 0) {
+ u = internalSecurity.user;
+ p = internalSecurity.pwd;
+ }
+ else {
+ BSONObj user;
+ {
+ dblock lk;
+ Client::Context ctxt("local.");
+ if( !Helpers::findOne("local.system.users", userReplQuery, user) ||
+ // try the first user in local
+ !Helpers::getSingleton("local.system.users", user) ) {
+ log() << "replauthenticate: no user in local.system.users to use for authentication\n";
+ return noauth;
+ }
+ }
+ u = user.getStringField("user");
+ p = user.getStringField("pwd");
+ massert( 10392 , "bad user object? [1]", !u.empty());
+ massert( 10393 , "bad user object? [2]", !p.empty());
+ }
+
+ string err;
+ if( !conn->auth("local", u.c_str(), p.c_str(), err, false) ) {
+ log() << "replauthenticate: can't authenticate to master server, user:" << u << endl;
+ return false;
+ }
+ return true;
+ }
bool replHandshake(DBClientConnection *conn) {
-
+
BSONObj me;
{
dblock l;
- if ( ! Helpers::getSingleton( "local.me" , me ) ){
+ // local.me is an identifier for a server for getLastError w:2+
+ if ( ! Helpers::getSingleton( "local.me" , me ) ) {
BSONObjBuilder b;
b.appendOID( "_id" , 0 , true );
me = b.obj();
Helpers::putSingleton( "local.me" , me );
}
}
-
+
BSONObjBuilder cmd;
cmd.appendAs( me["_id"] , "handshake" );
@@ -1450,9 +1504,9 @@ namespace mongo {
_conn = auto_ptr<DBClientConnection>(new DBClientConnection( false, 0, replPair ? 20 : 0 /* tcp timeout */));
string errmsg;
ReplInfo r("trying to connect to sync source");
- if ( !_conn->connect(hostName.c_str(), errmsg) ||
- !replAuthenticate(_conn.get()) ||
- !replHandshake(_conn.get()) ) {
+ if ( !_conn->connect(hostName.c_str(), errmsg) ||
+ (!noauth && !replAuthenticate(_conn.get())) ||
+ !replHandshake(_conn.get()) ) {
resetConnection();
log() << "repl: " << errmsg << endl;
return false;
@@ -1460,7 +1514,7 @@ namespace mongo {
}
return true;
}
-
+
/* note: not yet in mutex at this point.
returns >= 0 if ok. return -1 if you want to reconnect.
return value of zero indicates no sleep necessary before next call
@@ -1486,14 +1540,14 @@ namespace mongo {
}
if ( !oplogReader.connect(hostName) ) {
- log(4) << "repl: can't connect to sync source" << endl;
+ log(4) << "repl: can't connect to sync source" << endl;
if ( replPair && paired ) {
assert( startsWith(hostName.c_str(), replPair->remoteHost.c_str()) );
replPair->arbitrate();
}
return -1;
}
-
+
if ( paired ) {
int remote = replPair->negotiate(oplogReader.conn(), "direct");
int nMasters = ( remote == ReplPair::State_Master ) + ( replPair->state == ReplPair::State_Master );
@@ -1504,17 +1558,17 @@ namespace mongo {
}
/*
- // get current mtime at the server.
- BSONObj o = conn->findOne("admin.$cmd", opTimeQuery);
- BSONElement e = o.getField("optime");
- if( e.eoo() ) {
- log() << "repl: failed to get cur optime from master" << endl;
- log() << " " << o.toString() << endl;
- return false;
- }
- uassert( 10124 , e.type() == Date );
- OpTime serverCurTime;
- serverCurTime.asDate() = e.date();
+ // get current mtime at the server.
+ BSONObj o = conn->findOne("admin.$cmd", opTimeQuery);
+ BSONElement e = o.getField("optime");
+ if( e.eoo() ) {
+ log() << "repl: failed to get cur optime from master" << endl;
+ log() << " " << o.toString() << endl;
+ return false;
+ }
+ uassert( 10124 , e.type() == Date );
+ OpTime serverCurTime;
+ serverCurTime.asDate() = e.date();
*/
return sync_pullOpLog(nApplied);
}
@@ -1527,7 +1581,7 @@ namespace mongo {
_ reuse that cursor when we can
*/
- /* returns: # of seconds to sleep before next pass
+ /* returns: # of seconds to sleep before next pass
0 = no sleep recommended
1 = special sentinel indicating adaptive sleep recommended
*/
@@ -1543,6 +1597,7 @@ namespace mongo {
/* replication is not configured yet (for --slave) in local.sources. Poll for config
every 20 seconds.
*/
+ log() << "no source given, add a master to local.sources to start replication" << endl;
return 20;
}
@@ -1553,7 +1608,7 @@ namespace mongo {
try {
res = s->sync(nApplied);
bool moreToSync = s->haveMoreDbsToSync();
- if( res < 0 ) {
+ if( res < 0 ) {
sleepAdvice = 3;
}
else if( moreToSync ) {
@@ -1562,7 +1617,7 @@ namespace mongo {
else if ( s->sleepAdvice() ) {
sleepAdvice = s->sleepAdvice();
}
- else
+ else
sleepAdvice = res;
if ( res >= 0 && !moreToSync /*&& !s->syncedTo.isNull()*/ ) {
pairSync->setInitialSyncCompletedLocking();
@@ -1588,9 +1643,9 @@ namespace mongo {
}
catch ( const std::exception &e ) {
log() << "repl: std::exception " << e.what() << endl;
- replInfo = "replMain caught std::exception";
+ replInfo = "replMain caught std::exception";
}
- catch ( ... ) {
+ catch ( ... ) {
log() << "unexpected exception during replication. replication will halt" << endl;
replAllDead = "caught unexpected exception during replication";
}
@@ -1616,15 +1671,16 @@ namespace mongo {
try {
int nApplied = 0;
s = _replMain(sources, nApplied);
- if( s == 1 ) {
+ if( s == 1 ) {
if( nApplied == 0 ) s = 2;
- else if( nApplied > 100 ) {
+ else if( nApplied > 100 ) {
// sleep very little - just enough that we aren't truly hammering master
sleepmillis(75);
s = 0;
}
}
- } catch (...) {
+ }
+ catch (...) {
out() << "caught exception in _replMain" << endl;
s = 4;
}
@@ -1634,10 +1690,10 @@ namespace mongo {
syncing--;
}
- if( relinquishSyncingSome ) {
- relinquishSyncingSome = 0;
- s = 1; // sleep before going back in to syncing=1
- }
+ if( relinquishSyncingSome ) {
+ relinquishSyncingSome = 0;
+ s = 1; // sleep before going back in to syncing=1
+ }
if ( s ) {
stringstream ss;
@@ -1660,21 +1716,21 @@ namespace mongo {
while( 1 ) {
sleepsecs( toSleep );
- /* write a keep-alive like entry to the log. this will make things like
+ /* write a keep-alive like entry to the log. this will make things like
printReplicationStatus() and printSlaveReplicationStatus() stay up-to-date
even when things are idle.
*/
{
writelocktry lk("",1);
- if ( lk.got() ){
+ if ( lk.got() ) {
toSleep = 10;
-
- cc().getAuthenticationInfo()->authorize("admin");
-
- try {
+
+ cc().getAuthenticationInfo()->authorize("admin");
+
+ try {
logKeepalive();
}
- catch(...) {
+ catch(...) {
log() << "caught exception in replMasterThread()" << endl;
}
}
@@ -1690,11 +1746,11 @@ namespace mongo {
sleepsecs(1);
Client::initThread("replslave");
cc().iAmSyncThread();
-
+
{
dblock lk;
cc().getAuthenticationInfo()->authorize("admin");
-
+
BSONObj obj;
if ( Helpers::getSingleton("local.pair.startup", obj) ) {
// should be: {replacepeer:1}
@@ -1730,12 +1786,11 @@ namespace mongo {
void startReplication() {
/* if we are going to be a replica set, we aren't doing other forms of replication. */
if( !cmdLine._replSet.empty() ) {
- if( replSettings.slave || replSettings.master || replPair ) {
+ if( replSettings.slave || replSettings.master || replPair ) {
log() << "***" << endl;
log() << "ERROR: can't use --slave or --master replication options with --replSet" << endl;
log() << "***" << endl;
}
- createOplog();
newRepl();
return;
}
@@ -1773,7 +1828,7 @@ namespace mongo {
createOplog();
boost::thread t(replMasterThread);
}
-
+
while( replSettings.fastsync ) // don't allow writes until we've set up from log
sleepmillis( 50 );
}
@@ -1807,5 +1862,29 @@ namespace mongo {
}
tp.join();
}
-
+
+ class ReplApplyBatchSizeValidator : public ParameterValidator {
+ public:
+ ReplApplyBatchSizeValidator() : ParameterValidator( "replApplyBatchSize" ) {}
+
+ virtual bool isValid( BSONElement e , string& errmsg ) {
+ int b = e.numberInt();
+ if( b < 1 || b > 1024 ) {
+                errmsg = "replApplyBatchSize has to be >= 1 and <= 1024";
+ return false;
+ }
+
+ if ( replSettings.slavedelay != 0 && b > 1 ) {
+ errmsg = "can't use a batch size > 1 with slavedelay";
+ return false;
+ }
+ if ( ! replSettings.slave ) {
+ errmsg = "can't set replApplyBatchSize on a non-slave machine";
+ return false;
+ }
+
+ return true;
+ }
+ } replApplyBatchSizeValidator;
+
} // namespace mongo
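
The batching change above applies up to replApplyBatchSize oplog entries per write-lock acquisition, releasing the lock whenever the reader's current batch is exhausted so the slave does not hold the lock while waiting on the master. A minimal standalone sketch of that pattern, using hypothetical stand-ins (Op, OpReader, applyOperation, and std::mutex in place of the db write lock) rather than the actual server types:

#include <iostream>
#include <memory>
#include <mutex>
#include <queue>
#include <string>

struct Op { std::string ns; };                    // hypothetical oplog entry

struct OpReader {                                 // stand-in for OplogReader
    std::queue<Op> buffered;
    bool moreInCurrentBatch() const { return !buffered.empty(); }
    Op next() { Op op = buffered.front(); buffered.pop(); return op; }
};

std::mutex writeLock;                             // stand-in for the db write lock

// applies a single op; when batchSize == 1 this would take the lock itself
void applyOperation(const Op& op, bool alreadyLocked) {
    std::cout << "apply " << op.ns << (alreadyLocked ? " (batched)" : "") << '\n';
}

// apply up to batchSize ops per lock acquisition; caller guarantees one op is ready
void applyBatch(OpReader& reader, unsigned batchSize) {
    Op op = reader.next();
    const bool justOne = (batchSize == 1);
    // hold the write lock across the whole batch only when batching
    std::unique_ptr<std::lock_guard<std::mutex> > lk(
        justOne ? nullptr : new std::lock_guard<std::mutex>(writeLock));
    unsigned b = batchSize;
    while (true) {
        applyOperation(op, !justOne);
        if (--b == 0)
            break;                                // batch quota used up
        if (!reader.moreInCurrentBatch())
            break;                                // release lock while refilling from master
        op = reader.next();
    }
}
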
diff --git a/db/repl.h b/db/repl.h
index f33acad..45036fa 100644
--- a/db/repl.h
+++ b/db/repl.h
@@ -40,16 +40,16 @@
namespace mongo {
- /* replication slave? (possibly with slave or repl pair nonmaster)
+ /* replication slave? (possibly with slave or repl pair nonmaster)
--slave cmd line setting -> SimpleSlave
- */
- typedef enum { NotSlave=0, SimpleSlave, ReplPairSlave } SlaveTypes;
+ */
+ typedef enum { NotSlave=0, SimpleSlave, ReplPairSlave } SlaveTypes;
class ReplSettings {
public:
SlaveTypes slave;
- /* true means we are master and doing replication. if we are not writing to oplog (no --master or repl pairing),
+ /* true means we are master and doing replication. if we are not writing to oplog (no --master or repl pairing),
this won't be true.
*/
bool master;
@@ -57,9 +57,9 @@ namespace mongo {
int opIdMem;
bool fastsync;
-
+
bool autoresync;
-
+
int slavedelay;
ReplSettings()
@@ -69,14 +69,14 @@ namespace mongo {
};
extern ReplSettings replSettings;
-
- bool cloneFrom(const char *masterHost, string& errmsg, const string& fromdb, bool logForReplication,
- bool slaveOk, bool useReplAuth, bool snapshot);
+
+ bool cloneFrom(const char *masterHost, string& errmsg, const string& fromdb, bool logForReplication,
+ bool slaveOk, bool useReplAuth, bool snapshot);
/* A replication exception */
class SyncException : public DBException {
public:
- SyncException() : DBException( "sync exception" , 10001 ){}
+ SyncException() : DBException( "sync exception" , 10001 ) {}
};
/* A Source is a source from which we can pull (replicate) data.
@@ -94,11 +94,14 @@ namespace mongo {
bool resync(string db);
- /* pull some operations from the master's oplog, and apply them. */
+ /** @param alreadyLocked caller already put us in write lock if true */
+ void sync_pullOpLog_applyOperation(BSONObj& op, OpTime *localLogTail, bool alreadyLocked);
+
+ /* pull some operations from the master's oplog, and apply them.
+ calls sync_pullOpLog_applyOperation
+ */
int sync_pullOpLog(int& nApplied);
- void sync_pullOpLog_applyOperation(BSONObj& op, OpTime *localLogTail);
-
/* we only clone one database per pass, even if a lot need done. This helps us
avoid overflowing the master's transaction log by doing too much work before going
back to read more transactions. (Imagine a scenario of slave startup where we try to
@@ -109,7 +112,7 @@ namespace mongo {
set<string> incompleteCloneDbs;
ReplSource();
-
+
// returns the dummy ns used to do the drop
string resyncDrop( const char *db, const char *requester );
// returns possibly unowned id spec for the operation.
@@ -127,7 +130,7 @@ namespace mongo {
bool updateSetsWithLocalOps( OpTime &localLogTail, bool mayUnlock );
string ns() const { return string( "local.oplog.$" ) + sourceName(); }
unsigned _sleepAdviceTime;
-
+
public:
OplogReader oplogReader;
@@ -136,9 +139,7 @@ namespace mongo {
bool paired; // --pair in use
string hostName; // ip addr or hostname plus optionally, ":<port>"
string _sourceName; // a logical source name.
- string sourceName() const {
- return _sourceName.empty() ? "main" : _sourceName;
- }
+ string sourceName() const { return _sourceName.empty() ? "main" : _sourceName; }
string only; // only a certain db. note that in the sources collection, this may not be changed once you start replicating.
/* the last time point we have already synced up to (in the remote/master's oplog). */
@@ -146,8 +147,8 @@ namespace mongo {
/* This is for repl pairs.
_lastSavedLocalTs is the most recent point in the local log that we know is consistent
- with the remote log ( ie say the local op log has entries ABCDE and the remote op log
- has ABCXY, then _lastSavedLocalTs won't be greater than C until we have reconciled
+ with the remote log ( ie say the local op log has entries ABCDE and the remote op log
+ has ABCXY, then _lastSavedLocalTs won't be greater than C until we have reconciled
the DE-XY difference.)
*/
OpTime _lastSavedLocalTs;
@@ -171,15 +172,15 @@ namespace mongo {
return hostName == r.hostName && sourceName() == r.sourceName();
}
string toString() const { return sourceName() + "@" + hostName; }
-
- bool haveMoreDbsToSync() const { return !addDbNextPass.empty(); }
+
+ bool haveMoreDbsToSync() const { return !addDbNextPass.empty(); }
int sleepAdvice() const {
if ( !_sleepAdviceTime )
return 0;
int wait = _sleepAdviceTime - unsigned( time( 0 ) );
return wait > 0 ? wait : 0;
}
-
+
static bool throttledForceResyncDead( const char *requester );
static void forceResyncDead( const char *requester );
void forceResync( const char *requester );
@@ -200,7 +201,8 @@ namespace mongo {
if ( imp_[ ns ].insert( id.getOwned() ).second ) {
size_ += id.objsize() + sizeof( BSONObj );
}
- } else {
+ }
+ else {
if ( imp_[ ns ].erase( id ) == 1 ) {
size_ -= id.objsize() + sizeof( BSONObj );
}
@@ -236,7 +238,7 @@ namespace mongo {
// rename _id to id since there may be duplicates
b.appendAs( id.firstElement(), "id" );
return b.obj();
- }
+ }
DbSet impl_;
};
@@ -244,14 +246,14 @@ namespace mongo {
// All functions must be called with db mutex held
// Kind of sloppy class structure, for now just want to keep the in mem
// version speedy.
- // see http://www.mongodb.org/display/DOCS/Pairing+Internals
+ // see http://www.mongodb.org/display/DOCS/Pairing+Internals
class IdTracker {
public:
IdTracker() :
- dbIds_( "local.temp.replIds" ),
- dbModIds_( "local.temp.replModIds" ),
- inMem_( true ),
- maxMem_( replSettings.opIdMem ) {
+ dbIds_( "local.temp.replIds" ),
+ dbModIds_( "local.temp.replModIds" ),
+ inMem_( true ),
+ maxMem_( replSettings.opIdMem ) {
}
void reset( int maxMem = replSettings.opIdMem ) {
memIds_.reset();
@@ -309,7 +311,7 @@ namespace mongo {
void upgrade( MemIds &a, DbIds &b ) {
for( MemIds::IdSets::const_iterator i = a.imp_.begin(); i != a.imp_.end(); ++i ) {
for( BSONObjSetDefaultOrder::const_iterator j = i->second.begin(); j != i->second.end(); ++j ) {
- set( b, i->first.c_str(), *j, true );
+ set( b, i->first.c_str(), *j, true );
RARELY {
dbtemprelease t;
}
@@ -323,9 +325,9 @@ namespace mongo {
bool inMem_;
int maxMem_;
};
-
+
bool anyReplEnabled();
void appendReplicationInfo( BSONObjBuilder& result , bool authed , int level = 0 );
-
-
+
+
} // namespace mongo
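
The MemIds helper above keeps per-namespace id sets in memory and maintains a running size estimate so IdTracker can spill to a db-backed set once the replSettings.opIdMem budget is exceeded. A standalone sketch of that bookkeeping, with hypothetical types (std::string ids standing in for BSONObj):

#include <cstddef>
#include <map>
#include <set>
#include <string>

// tracks ids per namespace and an approximate memory footprint, so the caller can
// switch to a disk-backed set once a configured budget (e.g. opIdMem) is exceeded
class MemIdTracker {
public:
    MemIdTracker() : size_(0) {}
    void set(const std::string& ns, const std::string& id, bool add) {
        if (add) {
            if (ids_[ns].insert(id).second)
                size_ += id.size() + sizeof(std::string);
        }
        else {
            if (ids_[ns].erase(id) == 1)
                size_ -= id.size() + sizeof(std::string);
        }
    }
    bool get(const std::string& ns, const std::string& id) const {
        std::map<std::string, std::set<std::string> >::const_iterator i = ids_.find(ns);
        return i != ids_.end() && i->second.count(id) > 0;
    }
    bool roomFor(std::size_t maxMem) const { return size_ < maxMem; }
private:
    std::map<std::string, std::set<std::string> > ids_;
    std::size_t size_;
};
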
diff --git a/db/repl/connections.h b/db/repl/connections.h
index cdf2fad..7e7bfe5 100644
--- a/db/repl/connections.h
+++ b/db/repl/connections.h
@@ -1,4 +1,4 @@
-// @file
+// @file
/*
* Copyright (C) 2010 10gen Inc.
@@ -20,11 +20,12 @@
#include <map>
#include "../../client/dbclient.h"
+#include "../security_key.h"
-namespace mongo {
+namespace mongo {
- /** here we keep a single connection (with reconnect) for a set of hosts,
- one each, and allow one user at a time per host. if in use already for that
+ /** here we keep a single connection (with reconnect) for a set of hosts,
+ one each, and allow one user at a time per host. if in use already for that
host, we block. so this is an easy way to keep a 1-deep pool of connections
that many threads can share.
@@ -39,35 +40,37 @@ namespace mongo {
throws exception on connect error (but fine to try again later with a new
scopedconn object for same host).
*/
- class ScopedConn {
+ class ScopedConn {
public:
/** throws assertions if connect failure etc. */
ScopedConn(string hostport);
~ScopedConn();
/* If we were to run a query and not exhaust the cursor, future use of the connection would be problematic.
- So here what we do is wrapper known safe methods and not allow cursor-style queries at all. This makes
+        So here what we do is wrap known safe methods and not allow cursor-style queries at all. This makes
ScopedConn limited in functionality but very safe. More non-cursor wrappers can be added here if needed.
*/
bool runCommand(const string &dbname, const BSONObj& cmd, BSONObj &info, int options=0) {
return conn()->runCommand(dbname, cmd, info, options);
}
- unsigned long long count(const string &ns) {
- return conn()->count(ns);
+ unsigned long long count(const string &ns) {
+ return conn()->count(ns);
}
- BSONObj findOne(const string &ns, const Query& q, const BSONObj *fieldsToReturn = 0, int queryOptions = 0) {
+ BSONObj findOne(const string &ns, const Query& q, const BSONObj *fieldsToReturn = 0, int queryOptions = 0) {
return conn()->findOne(ns, q, fieldsToReturn, queryOptions);
}
+ void setTimeout(double to) {
+ conn()->setSoTimeout(to);
+ }
private:
auto_ptr<scoped_lock> connLock;
- static mutex mapMutex;
- struct X {
- mutex z;
+ static mongo::mutex mapMutex;
+ struct X {
+ mongo::mutex z;
DBClientConnection cc;
- X() : z("X"), cc(/*reconnect*/ true, 0,
- /*timeout*/ theReplSet ? theReplSet->config().ho.heartbeatTimeoutMillis/1000.0 : 10.0) {
+ X() : z("X"), cc(/*reconnect*/ true, 0, /*timeout*/ 10.0) {
cc._logLevel = 2;
}
} *x;
@@ -87,22 +90,30 @@ namespace mongo {
connLock.reset( new scoped_lock(x->z) );
}
}
- if( !first ) {
+ if( !first ) {
connLock.reset( new scoped_lock(x->z) );
return;
}
// we already locked above...
string err;
- x->cc.connect(hostport, err);
+ if (!x->cc.connect(hostport, err)) {
+ log() << "couldn't connect to " << hostport << ": " << err << rsLog;
+ return;
+ }
+
+ if (!noauth && !x->cc.auth("local", internalSecurity.user, internalSecurity.pwd, err, false)) {
+ log() << "could not authenticate against " << conn()->toString() << ", " << err << rsLog;
+ return;
+ }
}
- inline ScopedConn::~ScopedConn() {
+ inline ScopedConn::~ScopedConn() {
// conLock releases...
}
- /*inline DBClientConnection* ScopedConn::operator->() {
- return &x->cc;
+ /*inline DBClientConnection* ScopedConn::operator->() {
+ return &x->cc;
}*/
}
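
ScopedConn's comment describes a 1-deep connection pool: one shared connection per host, guarded by a per-host mutex so only one user at a time may hold it. A standalone sketch of that idea, with a hypothetical Connection type standing in for DBClientConnection and without the reconnect/auth handling:

#include <map>
#include <memory>
#include <mutex>
#include <string>

struct Connection { std::string host; };          // stand-in for DBClientConnection

class ScopedHostConn {
public:
    explicit ScopedHostConn(const std::string& hostport) {
        {
            std::lock_guard<std::mutex> lk(mapMutex());
            slot_ = &slots()[hostport];           // created lazily on first use
            if (!slot_->conn)
                slot_->conn.reset(new Connection{hostport});
        }
        // blocks here if another thread is already using this host's connection
        use_.reset(new std::lock_guard<std::mutex>(slot_->inUse));
    }
    Connection* conn() { return slot_->conn.get(); }
private:
    struct Slot {
        std::mutex inUse;
        std::unique_ptr<Connection> conn;
    };
    static std::mutex& mapMutex() { static std::mutex m; return m; }
    static std::map<std::string, Slot>& slots() {
        static std::map<std::string, Slot> s;     // never torn down, like ScopedConn::_map
        return s;
    }
    Slot* slot_;
    std::unique_ptr<std::lock_guard<std::mutex> > use_;
};
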
diff --git a/db/repl/consensus.cpp b/db/repl/consensus.cpp
index 1519c26..f764abe 100644
--- a/db/repl/consensus.cpp
+++ b/db/repl/consensus.cpp
@@ -19,9 +19,9 @@
#include "rs.h"
#include "multicmd.h"
-namespace mongo {
+namespace mongo {
- class CmdReplSetFresh : public ReplSetCommand {
+ class CmdReplSetFresh : public ReplSetCommand {
public:
CmdReplSetFresh() : ReplSetCommand("replSetFresh") { }
private:
@@ -29,23 +29,23 @@ namespace mongo {
if( !check(errmsg, result) )
return false;
- if( cmdObj["set"].String() != theReplSet->name() ) {
+ if( cmdObj["set"].String() != theReplSet->name() ) {
errmsg = "wrong repl set name";
return false;
}
string who = cmdObj["who"].String();
int cfgver = cmdObj["cfgver"].Int();
- OpTime opTime(cmdObj["opTime"].Date());
+ OpTime opTime(cmdObj["opTime"].Date());
bool weAreFresher = false;
- if( theReplSet->config().version > cfgver ) {
+ if( theReplSet->config().version > cfgver ) {
log() << "replSet member " << who << " is not yet aware its cfg version " << cfgver << " is stale" << rsLog;
- result.append("info", "config version stale");
+ result.append("info", "config version stale");
+ weAreFresher = true;
+ }
+ else if( opTime < theReplSet->lastOpTimeWritten ) {
weAreFresher = true;
}
- else if( opTime < theReplSet->lastOpTimeWritten ) {
- weAreFresher = true;
- }
result.appendDate("opTime", theReplSet->lastOpTimeWritten.asDate());
result.append("fresher", weAreFresher);
return true;
@@ -66,19 +66,19 @@ namespace mongo {
}
} cmdReplSetElect;
- int Consensus::totalVotes() const {
+ int Consensus::totalVotes() const {
static int complain = 0;
int vTot = rs._self->config().votes;
- for( Member *m = rs.head(); m; m=m->next() )
+ for( Member *m = rs.head(); m; m=m->next() )
vTot += m->config().votes;
if( vTot % 2 == 0 && vTot && complain++ == 0 )
- log() << "replSet warning total number of votes is even - considering giving one member an extra vote" << rsLog;
+ log() << "replSet " /*buildbot! warning */ "total number of votes is even - add arbiter or give one member an extra vote" << rsLog;
return vTot;
}
bool Consensus::aMajoritySeemsToBeUp() const {
int vUp = rs._self->config().votes;
- for( Member *m = rs.head(); m; m=m->next() )
+ for( Member *m = rs.head(); m; m=m->next() )
vUp += m->hbinfo().up() ? m->config().votes : 0;
return vUp * 2 > totalVotes();
}
@@ -98,13 +98,13 @@ namespace mongo {
const time_t LeaseTime = 30;
- unsigned Consensus::yea(unsigned memberId) /* throws VoteException */ {
+ unsigned Consensus::yea(unsigned memberId) { /* throws VoteException */
Atomic<LastYea>::tran t(ly);
LastYea &ly = t.ref();
time_t now = time(0);
if( ly.when + LeaseTime >= now && ly.who != memberId ) {
log(1) << "replSet not voting yea for " << memberId <<
- " voted for " << ly.who << ' ' << now-ly.when << " secs ago" << rsLog;
+ " voted for " << ly.who << ' ' << now-ly.when << " secs ago" << rsLog;
throw VoteException();
}
ly.when = now;
@@ -112,7 +112,7 @@ namespace mongo {
return rs._self->config().votes;
}
- /* we vote for ourself at start of election. once it fails, we can cancel the lease we had in
+ /* we vote for ourself at start of election. once it fails, we can cancel the lease we had in
place instead of leaving it for a long time.
*/
void Consensus::electionFailed(unsigned meid) {
@@ -124,7 +124,7 @@ namespace mongo {
}
/* todo: threading **************** !!!!!!!!!!!!!!!! */
- void Consensus::electCmdReceived(BSONObj cmd, BSONObjBuilder* _b) {
+ void Consensus::electCmdReceived(BSONObj cmd, BSONObjBuilder* _b) {
BSONObjBuilder& b = *_b;
DEV log() << "replSet received elect msg " << cmd.toString() << rsLog;
else log(2) << "replSet received elect msg " << cmd.toString() << rsLog;
@@ -138,14 +138,14 @@ namespace mongo {
const Member* hopeful = rs.findById(whoid);
int vote = 0;
- if( set != rs.name() ) {
+ if( set != rs.name() ) {
log() << "replSet error received an elect request for '" << set << "' but our set name is '" << rs.name() << "'" << rsLog;
}
- else if( myver < cfgver ) {
+ else if( myver < cfgver ) {
// we are stale. don't vote
}
- else if( myver > cfgver ) {
+ else if( myver > cfgver ) {
// they are stale!
log() << "replSet info got stale version # during election" << rsLog;
vote = -10000;
@@ -154,10 +154,10 @@ namespace mongo {
log() << "couldn't find member with id " << whoid << rsLog;
vote = -10000;
}
- else if( primary && primary->hbinfo().opTime > hopeful->hbinfo().opTime ) {
+ else if( primary && primary->hbinfo().opTime >= hopeful->hbinfo().opTime ) {
// other members might be aware of more up-to-date nodes
log() << hopeful->fullName() << " is trying to elect itself but " <<
- primary->fullName() << " is already primary and more up-to-date" << rsLog;
+ primary->fullName() << " is already primary and more up-to-date" << rsLog;
vote = -10000;
}
else {
@@ -166,7 +166,7 @@ namespace mongo {
rs.relinquish();
log() << "replSet info voting yea for " << whoid << rsLog;
}
- catch(VoteException&) {
+ catch(VoteException&) {
log() << "replSet voting no already voted for another" << rsLog;
}
}
@@ -182,10 +182,10 @@ namespace mongo {
L.push_back( Target(m->fullName()) );
}
- /* config version is returned as it is ok to use this unlocked. BUT, if unlocked, you would need
+ /* config version is returned as it is ok to use this unlocked. BUT, if unlocked, you would need
to check later that the config didn't change. */
void ReplSetImpl::getTargets(list<Target>& L, int& configVersion) {
- if( lockedByMe() ) {
+ if( lockedByMe() ) {
_getTargets(L, configVersion);
return;
}
@@ -200,15 +200,21 @@ namespace mongo {
bool Consensus::weAreFreshest(bool& allUp, int& nTies) {
const OpTime ord = theReplSet->lastOpTimeWritten;
nTies = 0;
- assert( !ord.isNull() );
+ assert( !ord.isNull() );
BSONObj cmd = BSON(
- "replSetFresh" << 1 <<
- "set" << rs.name() <<
- "opTime" << Date_t(ord.asDate()) <<
- "who" << rs._self->fullName() <<
- "cfgver" << rs._cfg->version );
+ "replSetFresh" << 1 <<
+ "set" << rs.name() <<
+ "opTime" << Date_t(ord.asDate()) <<
+ "who" << rs._self->fullName() <<
+ "cfgver" << rs._cfg->version );
list<Target> L;
int ver;
+ /* the following queries arbiters, even though they are never fresh. wonder if that makes sense.
+           it doesn't, but it could, if they "know" what freshness is one day. so consider removing
+           arbiters from getTargets() here. although getTargets is used elsewhere for elections; there
+           arbiters are certainly targets - so an "includeArbs" bool would be necessary if we want to
+           skip fetching them here.
+ */
rs.getTargets(L, ver);
multiCommand(cmd, L);
int nok = 0;
@@ -228,25 +234,25 @@ namespace mongo {
allUp = false;
}
}
- DEV log() << "replSet dev we are freshest of up nodes, nok:" << nok << " nTies:" << nTies << rsLog;
+ log(1) << "replSet dev we are freshest of up nodes, nok:" << nok << " nTies:" << nTies << rsLog;
assert( ord <= theReplSet->lastOpTimeWritten ); // <= as this may change while we are working...
return true;
}
extern time_t started;
- void Consensus::multiCommand(BSONObj cmd, list<Target>& L) {
+ void Consensus::multiCommand(BSONObj cmd, list<Target>& L) {
assert( !rs.lockedByMe() );
mongo::multiCommand(cmd, L);
}
void Consensus::_electSelf() {
- if( time(0) < steppedDown )
+ if( time(0) < steppedDown )
return;
{
const OpTime ord = theReplSet->lastOpTimeWritten;
- if( ord == 0 ) {
+ if( ord == 0 ) {
log() << "replSet info not trying to elect self, do not yet have a complete set of data from any point in time" << rsLog;
return;
}
@@ -254,16 +260,16 @@ namespace mongo {
bool allUp;
int nTies;
- if( !weAreFreshest(allUp, nTies) ) {
+ if( !weAreFreshest(allUp, nTies) ) {
log() << "replSet info not electing self, we are not freshest" << rsLog;
return;
}
rs.sethbmsg("",9);
- if( !allUp && time(0) - started < 60 * 5 ) {
- /* the idea here is that if a bunch of nodes bounce all at once, we don't want to drop data
- if we don't have to -- we'd rather be offline and wait a little longer instead
+ if( !allUp && time(0) - started < 60 * 5 ) {
+ /* the idea here is that if a bunch of nodes bounce all at once, we don't want to drop data
+ if we don't have to -- we'd rather be offline and wait a little longer instead
todo: make this configurable.
*/
rs.sethbmsg("not electing self, not all members up and we have been up less than 5 minutes");
@@ -276,9 +282,10 @@ namespace mongo {
/* tie? we then randomly sleep to try to not collide on our voting. */
/* todo: smarter. */
if( me.id() == 0 || sleptLast ) {
- // would be fine for one node not to sleep
+ // would be fine for one node not to sleep
// todo: biggest / highest priority nodes should be the ones that get to not sleep
- } else {
+ }
+ else {
assert( !rs.lockedByMe() ); // bad to go to sleep locked
unsigned ms = ((unsigned) rand()) % 1000 + 50;
DEV log() << "replSet tie " << nTies << " sleeping a little " << ms << "ms" << rsLog;
@@ -297,13 +304,13 @@ namespace mongo {
log() << "replSet info electSelf " << meid << rsLog;
BSONObj electCmd = BSON(
- "replSetElect" << 1 <<
- "set" << rs.name() <<
- "who" << me.fullName() <<
- "whoid" << me.hbinfo().id() <<
- "cfgver" << rs._cfg->version <<
- "round" << OID::gen() /* this is just for diagnostics */
- );
+ "replSetElect" << 1 <<
+ "set" << rs.name() <<
+ "who" << me.fullName() <<
+ "whoid" << me.hbinfo().id() <<
+ "cfgver" << rs._cfg->version <<
+ "round" << OID::gen() /* this is just for diagnostics */
+ );
int configVersion;
list<Target> L;
@@ -326,7 +333,7 @@ namespace mongo {
// defensive; should never happen as we have timeouts on connection and operation for our conn
log() << "replSet too much time passed during our election, ignoring result" << rsLog;
}
- else if( configVersion != rs.config().version ) {
+ else if( configVersion != rs.config().version ) {
log() << "replSet config version changed during our election, ignoring result" << rsLog;
}
else {
@@ -334,9 +341,10 @@ namespace mongo {
log(1) << "replSet election succeeded, assuming primary role" << rsLog;
success = true;
rs.assumePrimary();
- }
+ }
}
- } catch( std::exception& ) {
+ }
+ catch( std::exception& ) {
if( !success ) electionFailed(meid);
throw;
}
@@ -347,19 +355,19 @@ namespace mongo {
assert( !rs.lockedByMe() );
assert( !rs.myConfig().arbiterOnly );
assert( rs.myConfig().slaveDelay == 0 );
- try {
- _electSelf();
- }
- catch(RetryAfterSleepException&) {
+ try {
+ _electSelf();
+ }
+ catch(RetryAfterSleepException&) {
throw;
}
- catch(VoteException& ) {
+ catch(VoteException& ) {
log() << "replSet not trying to elect self as responded yea to someone else recently" << rsLog;
}
- catch(DBException& e) {
+ catch(DBException& e) {
log() << "replSet warning caught unexpected exception in electSelf() " << e.toString() << rsLog;
}
- catch(...) {
+ catch(...) {
log() << "replSet warning caught unexpected exception in electSelf()" << rsLog;
}
}
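
The freshness and election code above repeatedly relies on the same vote arithmetic: a member believes a majority is reachable when the votes of the members it can see exceed half of all configured votes. A standalone sketch with a hypothetical MemberInfo type:

#include <vector>

struct MemberInfo { int votes; bool up; };        // hypothetical member summary

int totalVotes(const std::vector<MemberInfo>& members) {
    int total = 0;
    for (const MemberInfo& m : members)
        total += m.votes;
    return total;
}

bool aMajoritySeemsToBeUp(const std::vector<MemberInfo>& members) {
    int up = 0;
    for (const MemberInfo& m : members)
        if (m.up)
            up += m.votes;
    return up * 2 > totalVotes(members);          // strict majority, e.g. 2 of 3, 3 of 4
}
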
diff --git a/db/repl/health.cpp b/db/repl/health.cpp
index c75221c..762ca90 100644
--- a/db/repl/health.cpp
+++ b/db/repl/health.cpp
@@ -32,20 +32,22 @@
#include "../dbhelpers.h"
namespace mongo {
+
/* decls for connections.h */
- ScopedConn::M& ScopedConn::_map = *(new ScopedConn::M());
+ ScopedConn::M& ScopedConn::_map = *(new ScopedConn::M());
mutex ScopedConn::mapMutex("ScopedConn::mapMutex");
}
-namespace mongo {
+namespace mongo {
using namespace mongoutils::html;
using namespace bson;
static RamLog _rsLog;
Tee *rsLog = &_rsLog;
+ extern bool replSetBlind;
- string ago(time_t t) {
+ string ago(time_t t) {
if( t == 0 ) return "";
time_t x = time(0) - t;
@@ -58,14 +60,14 @@ namespace mongo {
s.precision(2);
s << x / 60.0 << " mins";
}
- else {
+ else {
s.precision(2);
s << x / 3600.0 << " hrs";
}
return s.str();
}
- void Member::summarizeMember(stringstream& s) const {
+ void Member::summarizeMember(stringstream& s) const {
s << tr();
{
stringstream u;
@@ -89,27 +91,29 @@ namespace mongo {
s << td(h);
}
s << td(config().votes);
- {
+ s << td(config().priority);
+ {
string stateText = state().toString();
if( _config.hidden )
stateText += " (hidden)";
- if( ok || stateText.empty() )
+ if( ok || stateText.empty() )
s << td(stateText); // text blank if we've never connected
else
s << td( grey(str::stream() << "(was " << state().toString() << ')', true) );
}
s << td( grey(hbinfo().lastHeartbeatMsg,!ok) );
stringstream q;
- q << "/_replSetOplog?" << id();
+ q << "/_replSetOplog?_id=" << id();
s << td( a(q.str(), "", never ? "?" : hbinfo().opTime.toString()) );
if( hbinfo().skew > INT_MIN ) {
s << td( grey(str::stream() << hbinfo().skew,!ok) );
- } else
+ }
+ else
s << td("");
s << _tr();
}
-
- string ReplSetImpl::stateAsHtml(MemberState s) {
+
+ string ReplSetImpl::stateAsHtml(MemberState s) {
if( s.s == MemberState::RS_STARTUP ) return a("", "serving still starting up, or still trying to initiate the set", "STARTUP");
if( s.s == MemberState::RS_PRIMARY ) return a("", "this server thinks it is primary", "PRIMARY");
if( s.s == MemberState::RS_SECONDARY ) return a("", "this server thinks it is a secondary (slave mode)", "SECONDARY");
@@ -122,7 +126,7 @@ namespace mongo {
return "";
}
- string MemberState::toString() const {
+ string MemberState::toString() const {
if( s == MemberState::RS_STARTUP ) return "STARTUP";
if( s == MemberState::RS_PRIMARY ) return "PRIMARY";
if( s == MemberState::RS_SECONDARY ) return "SECONDARY";
@@ -143,9 +147,9 @@ namespace mongo {
set<string> skip;
be e = op["ts"];
- if( e.type() == Date || e.type() == Timestamp ) {
+ if( e.type() == Date || e.type() == Timestamp ) {
OpTime ot = e._opTime();
- ss << td( time_t_to_String_short( ot.getSecs() ) );
+ ss << td( time_t_to_String_short( ot.getSecs() ) );
ss << td( ot.toString() );
skip.insert("ts");
}
@@ -155,7 +159,8 @@ namespace mongo {
if( e.type() == NumberLong ) {
ss << "<td>" << hex << e.Long() << "</td>\n";
skip.insert("h");
- } else
+ }
+ else
ss << td("?");
ss << td(op["op"].valuestrsafe());
@@ -164,20 +169,17 @@ namespace mongo {
skip.insert("ns");
ss << "<td>";
- for( bo::iterator i(op); i.more(); ) {
+ for( bo::iterator i(op); i.more(); ) {
be e = i.next();
if( skip.count(e.fieldName()) ) continue;
ss << e.toString() << ' ';
}
- ss << "</td>";
-
- ss << "</tr>";
- ss << '\n';
+ ss << "</td></tr>\n";
}
- void ReplSetImpl::_getOplogDiagsAsHtml(unsigned server_id, stringstream& ss) const {
+ void ReplSetImpl::_getOplogDiagsAsHtml(unsigned server_id, stringstream& ss) const {
const Member *m = findById(server_id);
- if( m == 0 ) {
+ if( m == 0 ) {
ss << "Error : can't find a member with id: " << server_id << '\n';
return;
}
@@ -187,21 +189,29 @@ namespace mongo {
//const bo fields = BSON( "o" << false << "o2" << false );
const bo fields;
- ScopedDbConnection conn(m->fullName());
+ /** todo fix we might want an so timeout here */
+ DBClientConnection conn(false, 0, /*timeout*/ 20);
+ {
+ string errmsg;
+ if( !conn.connect(m->fullName(), errmsg) ) {
+ ss << "couldn't connect to " << m->fullName() << ' ' << errmsg;
+ return;
+ }
+ }
- auto_ptr<DBClientCursor> c = conn->query(rsoplog, Query().sort("$natural",1), 20, 0, &fields);
- if( c.get() == 0 ) {
+ auto_ptr<DBClientCursor> c = conn.query(rsoplog, Query().sort("$natural",1), 20, 0, &fields);
+ if( c.get() == 0 ) {
ss << "couldn't query " << rsoplog;
return;
}
static const char *h[] = {"ts","optime", "h","op","ns","rest",0};
ss << "<style type=\"text/css\" media=\"screen\">"
- "table { font-size:75% }\n"
+ "table { font-size:75% }\n"
// "th { background-color:#bbb; color:#000 }\n"
// "td,th { padding:.25em }\n"
- "</style>\n";
-
+ "</style>\n";
+
ss << table(h, true);
//ss << "<pre>\n";
int n = 0;
@@ -211,17 +221,17 @@ namespace mongo {
while( c->more() ) {
bo o = c->next();
otLast = o["ts"]._opTime();
- if( otFirst.isNull() )
+ if( otFirst.isNull() )
otFirst = otLast;
say(ss, o);
- n++;
+ n++;
}
if( n == 0 ) {
ss << rsoplog << " is empty\n";
}
- else {
- auto_ptr<DBClientCursor> c = conn->query(rsoplog, Query().sort("$natural",-1), 20, 0, &fields);
- if( c.get() == 0 ) {
+ else {
+ auto_ptr<DBClientCursor> c = conn.query(rsoplog, Query().sort("$natural",-1), 20, 0, &fields);
+ if( c.get() == 0 ) {
ss << "couldn't query [2] " << rsoplog;
return;
}
@@ -230,7 +240,7 @@ namespace mongo {
otEnd = o["ts"]._opTime();
while( 1 ) {
stringstream z;
- if( o["ts"]._opTime() == otLast )
+ if( o["ts"]._opTime() == otLast )
break;
say(z, o);
x = z.str() + x;
@@ -253,32 +263,31 @@ namespace mongo {
ss.precision(3);
if( h < 72 )
ss << h << " hours";
- else
+ else
ss << h / 24.0 << " days";
ss << "</p>\n";
}
-
- conn.done();
}
- void ReplSetImpl::_summarizeAsHtml(stringstream& s) const {
+ void ReplSetImpl::_summarizeAsHtml(stringstream& s) const {
s << table(0, false);
s << tr("Set name:", _name);
s << tr("Majority up:", elect.aMajoritySeemsToBeUp()?"yes":"no" );
s << _table();
- const char *h[] = {"Member",
- "<a title=\"member id in the replset config\">id</a>",
- "Up",
- "<a title=\"length of time we have been continuously connected to the other member with no reconnects (for self, shows uptime)\">cctime</a>",
- "<a title=\"when this server last received a heartbeat response - includes error code responses\">Last heartbeat</a>",
- "Votes", "State", "Status",
- "<a title=\"how up to date this server is. this value polled every few seconds so actually lag is typically much lower than value shown here.\">optime</a>",
- "<a title=\"Clock skew in seconds relative to this server. Informational; server clock variances will make the diagnostics hard to read, but otherwise are benign..\">skew</a>",
- 0};
+ const char *h[] = {"Member",
+ "<a title=\"member id in the replset config\">id</a>",
+ "Up",
+ "<a title=\"length of time we have been continuously connected to the other member with no reconnects (for self, shows uptime)\">cctime</a>",
+ "<a title=\"when this server last received a heartbeat response - includes error code responses\">Last heartbeat</a>",
+ "Votes", "Priority", "State", "Messages",
+ "<a title=\"how up to date this server is. this value polled every few seconds so actually lag is typically much lower than value shown here.\">optime</a>",
+ "<a title=\"Clock skew in seconds relative to this server. Informational; server clock variances will make the diagnostics hard to read, but otherwise are benign..\">skew</a>",
+ 0
+ };
s << table(h);
- /* this is to sort the member rows by their ordinal _id, so they show up in the same
+ /* this is to sort the member rows by their ordinal _id, so they show up in the same
order on all the different web ui's; that is less confusing for the operator. */
map<int,string> mp;
@@ -287,13 +296,13 @@ namespace mongo {
readlocktry lk("local.replset.minvalid", 300);
if( lk.got() ) {
BSONObj mv;
- if( Helpers::getSingleton("local.replset.minvalid", mv) ) {
+ if( Helpers::getSingleton("local.replset.minvalid", mv) ) {
myMinValid = "minvalid:" + mv["ts"]._opTime().toString();
}
}
else myMinValid = ".";
}
- catch(...) {
+ catch(...) {
myMinValid = "exception fetching minvalid";
}
@@ -301,25 +310,26 @@ namespace mongo {
stringstream s;
/* self row */
s << tr() << td(_self->fullName() + " (me)") <<
- td(_self->id()) <<
- td("1") << //up
- td(ago(started)) <<
- td("") << // last heartbeat
- td(ToString(_self->config().votes)) <<
- td( stateAsHtml(box.getState()) + (_self->config().hidden?" (hidden)":"") );
+ td(_self->id()) <<
+ td("1") << //up
+ td(ago(started)) <<
+ td("") << // last heartbeat
+ td(ToString(_self->config().votes)) <<
+ td(ToString(_self->config().priority)) <<
+ td( stateAsHtml(box.getState()) + (_self->config().hidden?" (hidden)":"") );
s << td( _hbmsg );
stringstream q;
- q << "/_replSetOplog?" << _self->id();
+ q << "/_replSetOplog?_id=" << _self->id();
s << td( a(q.str(), myMinValid, theReplSet->lastOpTimeWritten.toString()) );
s << td(""); // skew
s << _tr();
- mp[_self->hbinfo().id()] = s.str();
+ mp[_self->hbinfo().id()] = s.str();
}
Member *m = head();
while( m ) {
- stringstream s;
+ stringstream s;
m->summarizeMember(s);
- mp[m->hbinfo().id()] = s.str();
+ mp[m->hbinfo().id()] = s.str();
m = m->next();
}
@@ -333,26 +343,27 @@ namespace mongo {
_rsLog.toHTML( s );
}
- const Member* ReplSetImpl::findById(unsigned id) const {
+ const Member* ReplSetImpl::findById(unsigned id) const {
if( id == _self->id() ) return _self;
for( Member *m = head(); m; m = m->next() )
- if( m->id() == id )
+ if( m->id() == id )
return m;
return 0;
}
- void ReplSetImpl::_summarizeStatus(BSONObjBuilder& b) const {
+ void ReplSetImpl::_summarizeStatus(BSONObjBuilder& b) const {
vector<BSONObj> v;
// add self
{
- HostAndPort h(getHostName(), cmdLine.port);
-
BSONObjBuilder bb;
bb.append("_id", (int) _self->id());
- bb.append("name", h.toString());
+ bb.append("name", _self->fullName());
bb.append("health", 1.0);
bb.append("state", (int) box.getState().s);
+ bb.append("stateStr", box.getState().toString());
+ bb.appendTimestamp("optime", lastOpTimeWritten.asDate());
+ bb.appendDate("optimeDate", lastOpTimeWritten.getSecs() * 1000LL);
string s = _self->lhb();
if( !s.empty() )
bb.append("errmsg", s);
@@ -365,9 +376,19 @@ namespace mongo {
BSONObjBuilder bb;
bb.append("_id", (int) m->id());
bb.append("name", m->fullName());
- bb.append("health", m->hbinfo().health);
+ double h = m->hbinfo().health;
+ bb.append("health", h);
bb.append("state", (int) m->state().s);
+ if( h == 0 ) {
+ // if we can't connect the state info is from the past and could be confusing to show
+ bb.append("stateStr", "(not reachable/healthy)");
+ }
+ else {
+ bb.append("stateStr", m->state().toString());
+ }
bb.append("uptime", (unsigned) (m->hbinfo().upSince ? (time(0)-m->hbinfo().upSince) : 0));
+ bb.appendTimestamp("optime", m->hbinfo().opTime.asDate());
+ bb.appendDate("optimeDate", m->hbinfo().opTime.getSecs() * 1000LL);
bb.appendTimeT("lastHeartbeat", m->hbinfo().lastHeartbeat);
string s = m->lhb();
if( !s.empty() )
@@ -380,10 +401,12 @@ namespace mongo {
b.appendTimeT("date", time(0));
b.append("myState", box.getState().s);
b.append("members", v);
+ if( replSetBlind )
+ b.append("blind",true); // to avoid confusion if set...normally never set except for testing.
}
- static struct Test : public UnitTest {
- void run() {
+ static struct Test : public UnitTest {
+ void run() {
HealthOptions a,b;
assert( a == b );
assert( a.isDefault() );
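
One behavioral change in _summarizeStatus above is that an unreachable member (health == 0) reports a placeholder state string instead of its stale cached state. A standalone sketch of that reporting rule, with a hypothetical MemberStatus type:

#include <ctime>
#include <string>

struct MemberStatus {                              // hypothetical per-member snapshot
    double health;                                 // 0 = unreachable, 1 = healthy
    std::string lastKnownState;                    // e.g. "SECONDARY"
    std::time_t upSince;                           // 0 if never seen up
};

std::string stateStr(const MemberStatus& m) {
    // a down member's cached state is from the past and could be misleading
    return m.health == 0 ? std::string("(not reachable/healthy)") : m.lastKnownState;
}

unsigned uptimeSeconds(const MemberStatus& m) {
    return m.upSince ? static_cast<unsigned>(std::time(0) - m.upSince) : 0;
}
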
diff --git a/db/repl/health.h b/db/repl/health.h
index 645a3b5..a32db00 100644
--- a/db/repl/health.h
+++ b/db/repl/health.h
@@ -23,8 +23,8 @@ namespace mongo {
/* throws */
bool requestHeartbeat(string setname, string fromHost, string memberFullName, BSONObj& result, int myConfigVersion, int& theirConfigVersion, bool checkEmpty = false);
- struct HealthOptions {
- HealthOptions() {
+ struct HealthOptions {
+ HealthOptions() {
heartbeatSleepMillis = 2000;
heartbeatTimeoutMillis = 10000;
heartbeatConnRetries = 2;
@@ -42,8 +42,8 @@ namespace mongo {
uassert(13113, "bad replset heartbeat option", heartbeatTimeoutMillis >= 10);
}
- bool operator==(const HealthOptions& r) const {
- return heartbeatSleepMillis==r.heartbeatSleepMillis && heartbeatTimeoutMillis==r.heartbeatTimeoutMillis && heartbeatConnRetries==heartbeatConnRetries;
+        bool operator==(const HealthOptions& r) const {
+            return heartbeatSleepMillis==r.heartbeatSleepMillis && heartbeatTimeoutMillis==r.heartbeatTimeoutMillis && heartbeatConnRetries==r.heartbeatConnRetries;
}
};
diff --git a/db/repl/heartbeat.cpp b/db/repl/heartbeat.cpp
index b39fad7..3972466 100644
--- a/db/repl/heartbeat.cpp
+++ b/db/repl/heartbeat.cpp
@@ -31,7 +31,7 @@
#include "../../util/unittest.h"
#include "../instance.h"
-namespace mongo {
+namespace mongo {
using namespace bson;
@@ -42,7 +42,7 @@ namespace mongo {
long long HeartbeatInfo::timeDown() const {
if( up() ) return 0;
- if( downSince == 0 )
+ if( downSince == 0 )
return 0; // still waiting on first heartbeat
return jsTime() - downSince;
}
@@ -53,10 +53,10 @@ namespace mongo {
virtual bool adminOnly() const { return false; }
CmdReplSetHeartbeat() : ReplSetCommand("replSetHeartbeat") { }
virtual bool run(const string& , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
- if( replSetBlind )
+ if( replSetBlind )
return false;
- /* we don't call ReplSetCommand::check() here because heartbeat
+ /* we don't call ReplSetCommand::check() here because heartbeat
checks many things that are pre-initialization. */
if( !replSet ) {
errmsg = "not running with --replSet";
@@ -65,12 +65,12 @@ namespace mongo {
/* we want to keep heartbeat connections open when relinquishing primary. tag them here. */
{
- MessagingPort *mp = cc()._mp;
- if( mp )
+ MessagingPort *mp = cc().port();
+ if( mp )
mp->tag |= 1;
}
- if( cmdObj["pv"].Int() != 1 ) {
+ if( cmdObj["pv"].Int() != 1 ) {
errmsg = "incompatible replset protocol version";
return false;
}
@@ -86,7 +86,7 @@ namespace mongo {
}
result.append("rs", true);
- if( cmdObj["checkEmpty"].trueValue() ) {
+ if( cmdObj["checkEmpty"].trueValue() ) {
result.append("hasData", replHasDatabases());
}
if( theReplSet == 0 ) {
@@ -98,7 +98,7 @@ namespace mongo {
return false;
}
- if( theReplSet->name() != cmdObj.getStringField("replSetHeartbeat") ) {
+ if( theReplSet->name() != cmdObj.getStringField("replSetHeartbeat") ) {
errmsg = "repl set names do not match (2)";
result.append("mismatch", true);
return false;
@@ -118,8 +118,8 @@ namespace mongo {
} cmdReplSetHeartbeat;
/* throws dbexception */
- bool requestHeartbeat(string setName, string from, string memberFullName, BSONObj& result, int myCfgVersion, int& theirCfgVersion, bool checkEmpty) {
- if( replSetBlind ) {
+ bool requestHeartbeat(string setName, string from, string memberFullName, BSONObj& result, int myCfgVersion, int& theirCfgVersion, bool checkEmpty) {
+ if( replSetBlind ) {
//sleepmillis( rand() );
return false;
}
@@ -144,8 +144,8 @@ namespace mongo {
public:
ReplSetHealthPollTask(const HostAndPort& hh, const HeartbeatInfo& mm) : h(hh), m(mm) { }
- string name() { return "ReplSetHealthPollTask"; }
- void doWork() {
+ string name() const { return "ReplSetHealthPollTask"; }
+ void doWork() {
if ( !theReplSet ) {
log(2) << "theReplSet not initialized yet, skipping health poll this round" << rsLog;
return;
@@ -153,7 +153,7 @@ namespace mongo {
HeartbeatInfo mem = m;
HeartbeatInfo old = mem;
- try {
+ try {
BSONObj info;
int theirConfigVersion = -10000;
@@ -163,15 +163,17 @@ namespace mongo {
time_t after = mem.lastHeartbeat = time(0); // we set this on any response - we don't get this far if couldn't connect because exception is thrown
- try {
- mem.skew = 0;
- long long t = info["time"].Long();
- if( t > after )
+ if ( info["time"].isNumber() ) {
+ long long t = info["time"].numberLong();
+ if( t > after )
mem.skew = (int) (t - after);
- else if( t < before )
+ else if( t < before )
mem.skew = (int) (t - before); // negative
}
- catch(...) {
+ else {
+ // it won't be there if remote hasn't initialized yet
+ if( info.hasElement("time") )
+                        warning() << "heartbeat.time isn't a number: " << info << endl;
mem.skew = INT_MIN;
}
@@ -182,7 +184,7 @@ namespace mongo {
}
if( ok ) {
if( mem.upSince == 0 ) {
- log() << "replSet info " << h.toString() << " is now up" << rsLog;
+ log() << "replSet info " << h.toString() << " is up" << rsLog;
mem.upSince = mem.lastHeartbeat;
}
mem.health = 1.0;
@@ -193,17 +195,20 @@ namespace mongo {
be cfg = info["config"];
if( cfg.ok() ) {
// received a new config
- boost::function<void()> f =
+ boost::function<void()> f =
boost::bind(&Manager::msgReceivedNewConfig, theReplSet->mgr, cfg.Obj().copy());
theReplSet->mgr->send(f);
}
}
- else {
+ else {
down(mem, info.getStringField("errmsg"));
}
}
- catch(...) {
- down(mem, "connect/transport error");
+ catch(DBException& e) {
+ down(mem, e.what());
+ }
+ catch(...) {
+ down(mem, "something unusual went wrong");
}
m = mem;
@@ -212,9 +217,9 @@ namespace mongo {
static time_t last = 0;
time_t now = time(0);
bool changed = mem.changed(old);
- if( changed ) {
- if( old.hbstate != mem.hbstate )
- log() << "replSet " << h.toString() << ' ' << mem.hbstate.toString() << rsLog;
+ if( changed ) {
+ if( old.hbstate != mem.hbstate )
+ log() << "replSet member " << h.toString() << ' ' << mem.hbstate.toString() << rsLog;
}
if( changed || now-last>4 ) {
last = now;
@@ -228,18 +233,18 @@ namespace mongo {
if( mem.upSince || mem.downSince == 0 ) {
mem.upSince = 0;
mem.downSince = jsTime();
- log() << "replSet info " << h.toString() << " is now down (or slow to respond)" << rsLog;
+ log() << "replSet info " << h.toString() << " is down (or slow to respond): " << msg << rsLog;
}
mem.lastHeartbeatMsg = msg;
}
};
- void ReplSetImpl::endOldHealthTasks() {
+ void ReplSetImpl::endOldHealthTasks() {
unsigned sz = healthTasks.size();
for( set<ReplSetHealthPollTask*>::iterator i = healthTasks.begin(); i != healthTasks.end(); i++ )
(*i)->halt();
healthTasks.clear();
- if( sz )
+ if( sz )
DEV log() << "replSet debug: cleared old tasks " << sz << endl;
}
@@ -251,8 +256,8 @@ namespace mongo {
void startSyncThread();
- /** called during repl set startup. caller expects it to return fairly quickly.
- note ReplSet object is only created once we get a config - so this won't run
+ /** called during repl set startup. caller expects it to return fairly quickly.
+ note ReplSet object is only created once we get a config - so this won't run
until the initiation.
*/
void ReplSetImpl::startThreads() {
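
The heartbeat handler above estimates clock skew by bracketing the request with two local clock samples and comparing the remote's reported time against that bracket. A standalone sketch of the same estimate, assuming millisecond timestamps:

#include <cstdint>

// before/after: local clock samples (ms) taken just before and just after the
// heartbeat; remote: the time the other member reported. If the remote clock falls
// inside the bracket we cannot distinguish skew from network latency, so report 0.
int estimateSkewMillis(std::int64_t before, std::int64_t after, std::int64_t remote) {
    if (remote > after)
        return static_cast<int>(remote - after);   // remote clock is ahead
    if (remote < before)
        return static_cast<int>(remote - before);  // remote clock is behind (negative)
    return 0;
}
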
diff --git a/db/repl/manager.cpp b/db/repl/manager.cpp
index 862ac46..ed39c31 100644
--- a/db/repl/manager.cpp
+++ b/db/repl/manager.cpp
@@ -1,4 +1,4 @@
-/* @file manager.cpp
+/* @file manager.cpp
*/
/**
@@ -23,20 +23,20 @@
namespace mongo {
- enum {
+ enum {
NOPRIMARY = -2,
SELFPRIMARY = -1
};
/* check members OTHER THAN US to see if they think they are primary */
- const Member * Manager::findOtherPrimary(bool& two) {
+ const Member * Manager::findOtherPrimary(bool& two) {
two = false;
Member *m = rs->head();
Member *p = 0;
while( m ) {
DEV assert( m != rs->_self );
if( m->state().primary() && m->hbinfo().up() ) {
- if( p ) {
+ if( p ) {
two = true;
return 0;
}
@@ -44,33 +44,36 @@ namespace mongo {
}
m = m->next();
}
- if( p )
+ if( p )
noteARemoteIsPrimary(p);
return p;
}
- Manager::Manager(ReplSetImpl *_rs) :
- task::Server("rs Manager"), rs(_rs), busyWithElectSelf(false), _primary(NOPRIMARY)
- {
+ Manager::Manager(ReplSetImpl *_rs) :
+ task::Server("rs Manager"), rs(_rs), busyWithElectSelf(false), _primary(NOPRIMARY) {
}
-
- Manager::~Manager() {
- log() << "ERROR: ~Manager should never be called" << rsLog;
+
+ Manager::~Manager() {
+ /* we don't destroy the replset object we sit in; however, the destructor could have thrown on init.
+ the log message below is just a reminder to come back one day and review this code more, and to
+ make it cleaner.
+ */
+ log() << "info: ~Manager called" << rsLog;
rs->mgr = 0;
- assert(false);
}
- void Manager::starting() {
+ void Manager::starting() {
Client::initThread("rs Manager");
}
- void Manager::noteARemoteIsPrimary(const Member *m) {
+ void Manager::noteARemoteIsPrimary(const Member *m) {
if( rs->box.getPrimary() == m )
return;
rs->_self->lhb() = "";
if( rs->iAmArbiterOnly() ) {
rs->box.set(MemberState::RS_ARBITER, m);
- } else {
+ }
+ else {
rs->box.noteRemoteIsPrimary(m);
}
}
@@ -87,9 +90,8 @@ namespace mongo {
const Member *p = rs->box.getPrimary();
if( p && p != rs->_self ) {
- if( !p->hbinfo().up() ||
- !p->hbinfo().hbstate.primary() )
- {
+ if( !p->hbinfo().up() ||
+ !p->hbinfo().hbstate.primary() ) {
p = 0;
rs->box.setOtherPrimary(0);
}
@@ -101,36 +103,36 @@ namespace mongo {
p2 = findOtherPrimary(two);
if( two ) {
/* two other nodes think they are primary (asynchronously polled) -- wait for things to settle down. */
- log() << "replSet warning DIAG two primaries (transiently)" << rsLog;
+ log() << "replSet info two primaries (transiently)" << rsLog;
return;
}
}
if( p2 ) {
/* someone else thinks they are primary. */
- if( p == p2 ) {
+ if( p == p2 ) {
// we thought the same; all set.
return;
}
if( p == 0 ) {
- noteARemoteIsPrimary(p2);
+ noteARemoteIsPrimary(p2);
return;
}
// todo xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
if( p != rs->_self ) {
// switch primary from oldremotep->newremotep2
- noteARemoteIsPrimary(p2);
+ noteARemoteIsPrimary(p2);
return;
}
/* we thought we were primary, yet now someone else thinks they are. */
if( !rs->elect.aMajoritySeemsToBeUp() ) {
/* we can't see a majority. so the other node is probably the right choice. */
- noteARemoteIsPrimary(p2);
+ noteARemoteIsPrimary(p2);
return;
}
- /* ignore for now, keep thinking we are master.
- this could just be timing (we poll every couple seconds) or could indicate
- a problem? if it happens consistently for a duration of time we should
+ /* ignore for now, keep thinking we are master.
+ this could just be timing (we poll every couple seconds) or could indicate
+ a problem? if it happens consistently for a duration of time we should
alert the sysadmin.
*/
return;
@@ -138,17 +140,17 @@ namespace mongo {
/* didn't find anyone who wants to be primary */
- if( p ) {
+ if( p ) {
/* we are already primary */
- if( p != rs->_self ) {
+ if( p != rs->_self ) {
rs->sethbmsg("error p != rs->self in checkNewState");
log() << "replSet " << p->fullName() << rsLog;
log() << "replSet " << rs->_self->fullName() << rsLog;
return;
}
- if( rs->elect.shouldRelinquish() ) {
+ if( rs->elect.shouldRelinquish() ) {
log() << "replSet can't see a majority of the set, relinquishing primary" << rsLog;
rs->relinquish();
}
@@ -162,7 +164,7 @@ namespace mongo {
/* TODO : CHECK PRIORITY HERE. can't be elected if priority zero. */
/* no one seems to be primary. shall we try to elect ourself? */
- if( !rs->elect.aMajoritySeemsToBeUp() ) {
+ if( !rs->elect.aMajoritySeemsToBeUp() ) {
static time_t last;
static int n;
int ll = 0;
@@ -175,15 +177,15 @@ namespace mongo {
busyWithElectSelf = true; // don't try to do further elections & such while we are already working on one.
}
- try {
- rs->elect.electSelf();
+ try {
+ rs->elect.electSelf();
}
catch(RetryAfterSleepException&) {
/* we want to process new inbounds before trying this again. so we just put a checkNewstate in the queue for eval later. */
requeue();
}
- catch(...) {
- log() << "replSet error unexpected assertion in rs manager" << rsLog;
+ catch(...) {
+ log() << "replSet error unexpected assertion in rs manager" << rsLog;
}
busyWithElectSelf = false;
}
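
Manager::msgCheckNewState above keys off a scan of the other members: at most one remote may look primary, and if two do (a transient artifact of asynchronous polling) the manager simply waits for the next pass. A minimal sketch of that scan, using a hypothetical Node struct in place of the real Member/HeartbeatInfo types:

    #include <cstddef>
    #include <vector>

    // Hypothetical stand-in for Member; only the two flags the scan needs.
    struct Node {
        bool claimsPrimary;   // m->state().primary()
        bool up;              // m->hbinfo().up()
    };

    // Returns the single remote node that claims primary, or NULL.
    // Sets 'two' when more than one remote claims primary; the caller should
    // then do nothing and let things settle before the next poll.
    const Node* findOtherPrimary(const std::vector<Node>& others, bool& two) {
        two = false;
        const Node* p = 0;
        for (size_t i = 0; i < others.size(); i++) {
            const Node& m = others[i];
            if (m.claimsPrimary && m.up) {
                if (p) { two = true; return 0; }
                p = &m;
            }
        }
        return p;
    }
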
diff --git a/db/repl/multicmd.h b/db/repl/multicmd.h
index 9eb9a17..df7c4e5 100644
--- a/db/repl/multicmd.h
+++ b/db/repl/multicmd.h
@@ -21,7 +21,7 @@
#include "../../util/background.h"
#include "connections.h"
-namespace mongo {
+namespace mongo {
struct Target {
Target(string hostport) : toHost(hostport), ok(false) { }
@@ -33,38 +33,37 @@ namespace mongo {
/* -- implementation ------------- */
- class _MultiCommandJob : public BackgroundJob {
+ class _MultiCommandJob : public BackgroundJob {
public:
BSONObj& cmd;
Target& d;
_MultiCommandJob(BSONObj& _cmd, Target& _d) : cmd(_cmd), d(_d) { }
+
private:
- string name() { return "MultiCommandJob"; }
+ string name() const { return "MultiCommandJob"; }
void run() {
- try {
+ try {
ScopedConn c(d.toHost);
d.ok = c.runCommand("admin", cmd, d.result);
}
- catch(DBException&) {
+ catch(DBException&) {
DEV log() << "dev caught dbexception on multiCommand " << d.toHost << rsLog;
}
}
};
- inline void multiCommand(BSONObj cmd, list<Target>& L) {
- typedef shared_ptr<_MultiCommandJob> P;
- list<P> jobs;
- list<BackgroundJob *> _jobs;
+ inline void multiCommand(BSONObj cmd, list<Target>& L) {
+ list<BackgroundJob *> jobs;
- for( list<Target>::iterator i = L.begin(); i != L.end(); i++ ) {
+ for( list<Target>::iterator i = L.begin(); i != L.end(); i++ ) {
Target& d = *i;
_MultiCommandJob *j = new _MultiCommandJob(cmd, d);
- jobs.push_back(P(j));
- _jobs.push_back(j);
+ j->go();
+ jobs.push_back(j);
}
- BackgroundJob::go(_jobs);
- BackgroundJob::wait(_jobs,5);
+ for( list<BackgroundJob*>::iterator i = jobs.begin(); i != jobs.end(); i++ ) {
+ (*i)->wait();
+ }
}
-
}
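
The reworked multiCommand() above starts one background job per target and then waits on each job before returning, so every Target ends up with its own ok flag and result. A rough equivalent of that fan-out/join pattern using std::thread instead of the BackgroundJob framework; runOnHost is a placeholder for the ScopedConn plus runCommand("admin", ...) call:

    #include <list>
    #include <string>
    #include <thread>
    #include <vector>

    struct Target {
        explicit Target(const std::string& hp) : toHost(hp), ok(false) {}
        std::string toHost;
        bool ok;
        std::string result;   // stands in for the BSONObj result
    };

    // Placeholder: a real version would connect to 'host' and run 'cmd'
    // against the admin db, as _MultiCommandJob::run() does.
    static bool runOnHost(const std::string& host, const std::string& cmd, std::string& result) {
        result = host + " ran " + cmd;
        return true;
    }

    void multiCommand(const std::string& cmd, std::list<Target>& targets) {
        std::vector<std::thread> jobs;
        for (std::list<Target>::iterator i = targets.begin(); i != targets.end(); ++i) {
            Target& t = *i;
            jobs.push_back(std::thread([&t, &cmd]() {
                try { t.ok = runOnHost(t.toHost, cmd, t.result); }
                catch (...) { /* leave t.ok == false, mirroring the DBException catch */ }
            }));
        }
        for (size_t i = 0; i < jobs.size(); ++i)
            jobs[i].join();   // wait for every target, as the new wait loop does
    }
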
diff --git a/db/repl/replset_commands.cpp b/db/repl/replset_commands.cpp
index 328b0ab..dc8567a 100644
--- a/db/repl/replset_commands.cpp
+++ b/db/repl/replset_commands.cpp
@@ -24,7 +24,9 @@
#include "../../util/mongoutils/html.h"
#include "../../client/dbclient.h"
-namespace mongo {
+using namespace bson;
+
+namespace mongo {
void checkMembersUpForConfigChange(const ReplSetConfig& cfg, bool initial);
@@ -50,7 +52,7 @@ namespace mongo {
}
// may not need this, but if removed check all tests still work:
- if( !check(errmsg, result) )
+ if( !check(errmsg, result) )
return false;
if( cmdObj.hasElement("blind") ) {
@@ -61,6 +63,7 @@ namespace mongo {
}
} cmdReplSetTest;
+ /** get rollback id */
class CmdReplSetGetRBID : public ReplSetCommand {
public:
/* todo: ideally this should only change on rollbacks NOT on mongod restarts also. fix... */
@@ -68,26 +71,28 @@ namespace mongo {
virtual void help( stringstream &help ) const {
help << "internal";
}
- CmdReplSetGetRBID() : ReplSetCommand("replSetGetRBID") {
+ CmdReplSetGetRBID() : ReplSetCommand("replSetGetRBID") {
rbid = (int) curTimeMillis();
}
virtual bool run(const string& , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
- if( !check(errmsg, result) )
+ if( !check(errmsg, result) )
return false;
result.append("rbid",rbid);
return true;
}
} cmdReplSetRBID;
- using namespace bson;
- void incRBID() {
+ /** we increment the rollback id on every rollback event. */
+ void incRBID() {
cmdReplSetRBID.rbid++;
}
- int getRBID(DBClientConnection *c) {
+
+ /** helper to get rollback id from another server. */
+ int getRBID(DBClientConnection *c) {
bo info;
c->simpleCommand("admin", &info, "replSetGetRBID");
return info["rbid"].numberInt();
- }
+ }
class CmdReplSetGetStatus : public ReplSetCommand {
public:
@@ -98,7 +103,10 @@ namespace mongo {
}
CmdReplSetGetStatus() : ReplSetCommand("replSetGetStatus", true) { }
virtual bool run(const string& , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
- if( !check(errmsg, result) )
+ if ( cmdObj["forShell"].trueValue() )
+ lastError.disableForCommand();
+
+ if( !check(errmsg, result) )
return false;
theReplSet->summarizeStatus(result);
return true;
@@ -115,7 +123,7 @@ namespace mongo {
}
CmdReplSetReconfig() : ReplSetCommand("replSetReconfig"), mutex("rsreconfig") { }
virtual bool run(const string& a, BSONObj& b, string& errmsg, BSONObjBuilder& c, bool d) {
- try {
+ try {
rwlock_try_write lk(mutex);
return _run(a,b,errmsg,c,d);
}
@@ -125,16 +133,16 @@ namespace mongo {
}
private:
bool _run(const string& , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
- if( !check(errmsg, result) )
+ if( !check(errmsg, result) )
return false;
- if( !theReplSet->box.getState().primary() ) {
+ if( !theReplSet->box.getState().primary() ) {
errmsg = "replSetReconfig command must be sent to the current replica set primary.";
return false;
}
{
- // just make sure we can get a write lock before doing anything else. we'll reacquire one
- // later. of course it could be stuck then, but this check lowers the risk if weird things
+ // just make sure we can get a write lock before doing anything else. we'll reacquire one
+ // later. of course it could be stuck then, but this check lowers the risk if weird things
// are up - we probably don't want a change to apply 30 minutes after the initial attempt.
time_t t = time(0);
writelock lk("");
@@ -159,7 +167,7 @@ namespace mongo {
log() << "replSet replSetReconfig config object parses ok, " << newConfig.members.size() << " members specified" << rsLog;
- if( !ReplSetConfig::legalChange(theReplSet->getConfig(), newConfig, errmsg) ) {
+ if( !ReplSetConfig::legalChange(theReplSet->getConfig(), newConfig, errmsg) ) {
return false;
}
@@ -170,7 +178,7 @@ namespace mongo {
theReplSet->haveNewConfig(newConfig, true);
ReplSet::startupStatusMsg = "replSetReconfig'd";
}
- catch( DBException& e ) {
+ catch( DBException& e ) {
log() << "replSet replSetReconfig exception: " << e.what() << rsLog;
throw;
}
@@ -182,8 +190,11 @@ namespace mongo {
class CmdReplSetFreeze : public ReplSetCommand {
public:
virtual void help( stringstream &help ) const {
- help << "Enable / disable failover for the set - locks current primary as primary even if issues occur.\nFor use during system maintenance.\n";
- help << "{ replSetFreeze : <bool> }";
+ help << "{ replSetFreeze : <seconds> }";
+ help << "'freeze' state of member to the extent we can do that. What this really means is that\n";
+ help << "this node will not attempt to become primary until the time period specified expires.\n";
+ help << "You can call again with {replSetFreeze:0} to unfreeze sooner.\n";
+ help << "A process restart unfreezes the member also.\n";
help << "\nhttp://www.mongodb.org/display/DOCS/Replica+Set+Commands";
}
@@ -191,15 +202,22 @@ namespace mongo {
virtual bool run(const string& , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
if( !check(errmsg, result) )
return false;
- errmsg = "not yet implemented"; /*TODO*/
- return false;
+ int secs = (int) cmdObj.firstElement().numberInt();
+ if( theReplSet->freeze(secs) ) {
+ if( secs == 0 )
+ result.append("info","unfreezing");
+ }
+ if( secs == 1 )
+ result.append("warning", "you really want to freeze for only 1 second?");
+ return true;
}
} cmdReplSetFreeze;
class CmdReplSetStepDown: public ReplSetCommand {
public:
virtual void help( stringstream &help ) const {
- help << "Step down as primary. Will not try to reelect self or 1 minute.\n";
+ help << "{ replSetStepDown : <seconds> }\n";
+ help << "Step down as primary. Will not try to reelect self for the specified time period (1 minute if no numeric secs value specified).\n";
help << "(If another member with same priority takes over in the meantime, it will stay primary.)\n";
help << "http://www.mongodb.org/display/DOCS/Replica+Set+Commands";
}
@@ -212,7 +230,10 @@ namespace mongo {
errmsg = "not primary so can't step down";
return false;
}
- return theReplSet->stepDown();
+ int secs = (int) cmdObj.firstElement().numberInt();
+ if( secs == 0 )
+ secs = 60;
+ return theReplSet->stepDown(secs);
}
} cmdReplSetStepDown;
@@ -222,45 +243,46 @@ namespace mongo {
class ReplSetHandler : public DbWebHandler {
public:
- ReplSetHandler() : DbWebHandler( "_replSet" , 1 , true ){}
+ ReplSetHandler() : DbWebHandler( "_replSet" , 1 , true ) {}
virtual bool handles( const string& url ) const {
return startsWith( url , "/_replSet" );
}
- virtual void handle( const char *rq, string url,
+ virtual void handle( const char *rq, string url, BSONObj params,
string& responseMsg, int& responseCode,
- vector<string>& headers, const SockAddr &from ){
-
- string s = str::after(url, "/_replSetOplog?");
- if( !s.empty() )
- responseMsg = _replSetOplog(s);
+ vector<string>& headers, const SockAddr &from ) {
+
+ if( url == "/_replSetOplog" ) {
+ responseMsg = _replSetOplog(params);
+ }
else
responseMsg = _replSet();
responseCode = 200;
}
+ string _replSetOplog(bo parms) {
+ int _id = (int) str::toUnsigned( parms["_id"].String() );
- string _replSetOplog(string parms) {
stringstream s;
string t = "Replication oplog";
s << start(t);
s << p(t);
- if( theReplSet == 0 ) {
- if( cmdLine._replSet.empty() )
+ if( theReplSet == 0 ) {
+ if( cmdLine._replSet.empty() )
s << p("Not using --replSet");
else {
- s << p("Still starting up, or else set is not yet " + a("http://www.mongodb.org/display/DOCS/Replica+Set+Configuration#InitialSetup", "", "initiated")
+ s << p("Still starting up, or else set is not yet " + a("http://www.mongodb.org/display/DOCS/Replica+Set+Configuration#InitialSetup", "", "initiated")
+ ".<br>" + ReplSet::startupStatusMsg);
}
}
else {
try {
- theReplSet->getOplogDiagsAsHtml(stringToNum(parms.c_str()), s);
+ theReplSet->getOplogDiagsAsHtml(_id, s);
}
- catch(std::exception& e) {
- s << "error querying oplog: " << e.what() << '\n';
+ catch(std::exception& e) {
+ s << "error querying oplog: " << e.what() << '\n';
}
}
@@ -269,20 +291,20 @@ namespace mongo {
}
/* /_replSet show replica set status in html format */
- string _replSet() {
+ string _replSet() {
stringstream s;
s << start("Replica Set Status " + prettyHostName());
- s << p( a("/", "back", "Home") + " | " +
+ s << p( a("/", "back", "Home") + " | " +
a("/local/system.replset/?html=1", "", "View Replset Config") + " | " +
- a("/replSetGetStatus?text", "", "replSetGetStatus") + " | " +
+ a("/replSetGetStatus?text=1", "", "replSetGetStatus") + " | " +
a("http://www.mongodb.org/display/DOCS/Replica+Sets", "", "Docs")
);
- if( theReplSet == 0 ) {
- if( cmdLine._replSet.empty() )
+ if( theReplSet == 0 ) {
+ if( cmdLine._replSet.empty() )
s << p("Not using --replSet");
else {
- s << p("Still starting up, or else set is not yet " + a("http://www.mongodb.org/display/DOCS/Replica+Set+Configuration#InitialSetup", "", "initiated")
+ s << p("Still starting up, or else set is not yet " + a("http://www.mongodb.org/display/DOCS/Replica+Set+Configuration#InitialSetup", "", "initiated")
+ ".<br>" + ReplSet::startupStatusMsg);
}
}
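
replSetFreeze and replSetStepDown now both take a seconds argument: freeze keeps this member from trying to become primary until the window expires (0 unfreezes immediately), and stepDown falls back to 60 seconds when no value is supplied. A small sketch of that time-window bookkeeping, independent of the real Consensus/ReplSetImpl classes where elect.steppedDown actually lives:

    #include <ctime>

    // Illustrative only: one epoch timestamp gates self-election attempts.
    struct ElectionHold {
        time_t steppedDown;                  // no self-election before this time
        ElectionHold() : steppedDown(0) {}

        void freeze(int secs) {              // { replSetFreeze: <secs> }
            steppedDown = (secs == 0) ? 0 : time(0) + secs;
        }
        void stepDown(int secs) {            // { replSetStepDown: <secs> }
            if (secs == 0) secs = 60;        // default matches cmdReplSetStepDown
            steppedDown = time(0) + secs;
        }
        bool mayTryToElectSelf() const {
            return time(0) >= steppedDown;
        }
    };
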
diff --git a/db/repl/rs.cpp b/db/repl/rs.cpp
index 1c0444a..90ed9f4 100644
--- a/db/repl/rs.cpp
+++ b/db/repl/rs.cpp
@@ -20,9 +20,12 @@
#include "../client.h"
#include "../../client/dbclient.h"
#include "../dbhelpers.h"
+#include "../../s/d_logic.h"
#include "rs.h"
+#include "connections.h"
+#include "../repl.h"
-namespace mongo {
+namespace mongo {
using namespace bson;
@@ -30,18 +33,18 @@ namespace mongo {
ReplSet *theReplSet = 0;
extern string *discoveredSeed;
- void ReplSetImpl::sethbmsg(string s, int logLevel) {
+ void ReplSetImpl::sethbmsg(string s, int logLevel) {
static time_t lastLogged;
_hbmsgTime = time(0);
- if( s == _hbmsg ) {
+ if( s == _hbmsg ) {
// unchanged
if( _hbmsgTime - lastLogged < 60 )
return;
}
unsigned sz = s.size();
- if( sz >= 256 )
+ if( sz >= 256 )
memcpy(_hbmsg, s.c_str(), 255);
else {
_hbmsg[sz] = 0;
@@ -53,7 +56,7 @@ namespace mongo {
}
}
- void ReplSetImpl::assumePrimary() {
+ void ReplSetImpl::assumePrimary() {
assert( iAmPotentiallyHot() );
writelock lk("admin."); // so we are synchronized with _logOp()
box.setSelfPrimary(_self);
@@ -62,17 +65,26 @@ namespace mongo {
void ReplSetImpl::changeState(MemberState s) { box.change(s, _self); }
- void ReplSetImpl::relinquish() {
+ const bool closeOnRelinquish = true;
+
+ void ReplSetImpl::relinquish() {
if( box.getState().primary() ) {
log() << "replSet relinquishing primary state" << rsLog;
- changeState(MemberState::RS_RECOVERING);
-
- /* close sockets that were talking to us */
- /*log() << "replSet closing sockets after reqlinquishing primary" << rsLog;
- MessagingPort::closeAllSockets(1);*/
+ changeState(MemberState::RS_SECONDARY);
+
+ if( closeOnRelinquish ) {
+                    /* close sockets that were talking to us so they don't blithely send many writes that will fail
+ with "not master" (of course client could check result code, but in case they are not)
+ */
+                    log() << "replSet closing client sockets after relinquishing primary" << rsLog;
+ MessagingPort::closeAllSockets(1);
+ }
+
+ // now that all connections were closed, strip this mongod from all sharding details
+ // if and when it gets promoted to a primary again, only then it should reload the sharding state
+ // the rationale here is that this mongod won't bring stale state when it regains primaryhood
+ shardingState.resetShardingState();
- // todo: >
- //changeState(MemberState::RS_SECONDARY);
}
else if( box.getState().startup2() ) {
// ? add comment
@@ -81,26 +93,48 @@ namespace mongo {
}
/* look freshly for who is primary - includes relinquishing ourself. */
- void ReplSetImpl::forgetPrimary() {
- if( box.getState().primary() )
+ void ReplSetImpl::forgetPrimary() {
+ if( box.getState().primary() )
relinquish();
else {
box.setOtherPrimary(0);
}
}
- bool ReplSetImpl::_stepDown() {
+ // for the replSetStepDown command
+ bool ReplSetImpl::_stepDown(int secs) {
lock lk(this);
- if( box.getState().primary() ) {
- changeState(MemberState::RS_RECOVERING);
- elect.steppedDown = time(0) + 60;
- log() << "replSet info stepped down as primary" << rsLog;
+ if( box.getState().primary() ) {
+ elect.steppedDown = time(0) + secs;
+ log() << "replSet info stepping down as primary secs=" << secs << rsLog;
+ relinquish();
return true;
}
return false;
}
- void ReplSetImpl::msgUpdateHBInfo(HeartbeatInfo h) {
+ bool ReplSetImpl::_freeze(int secs) {
+ lock lk(this);
+ /* note if we are primary we remain primary but won't try to elect ourself again until
+ this time period expires.
+ */
+ if( secs == 0 ) {
+ elect.steppedDown = 0;
+ log() << "replSet info 'unfreezing'" << rsLog;
+ }
+ else {
+ if( !box.getState().primary() ) {
+ elect.steppedDown = time(0) + secs;
+ log() << "replSet info 'freezing' for " << secs << " seconds" << rsLog;
+ }
+ else {
+ log() << "replSet info received freeze command but we are primary" << rsLog;
+ }
+ }
+ return true;
+ }
+
+ void ReplSetImpl::msgUpdateHBInfo(HeartbeatInfo h) {
for( Member *m = _members.head(); m; m=m->next() ) {
if( m->id() == h.id() ) {
m->_hbinfo = h;
@@ -109,7 +143,7 @@ namespace mongo {
}
}
- list<HostAndPort> ReplSetImpl::memberHostnames() const {
+ list<HostAndPort> ReplSetImpl::memberHostnames() const {
list<HostAndPort> L;
L.push_back(_self->h());
for( Member *m = _members.head(); m; m = m->next() )
@@ -118,6 +152,7 @@ namespace mongo {
}
void ReplSetImpl::_fillIsMasterHost(const Member *m, vector<string>& hosts, vector<string>& passives, vector<string>& arbiters) {
+ assert( m );
if( m->config().hidden )
return;
@@ -126,8 +161,9 @@ namespace mongo {
}
else if( !m->config().arbiterOnly ) {
if( m->config().slaveDelay ) {
- /* hmmm - we don't list these as they are stale. */
- } else {
+ /* hmmm - we don't list these as they are stale. */
+ }
+ else {
passives.push_back(m->h().toString());
}
}
@@ -147,6 +183,7 @@ namespace mongo {
_fillIsMasterHost(_self, hosts, passives, arbiters);
for( Member *m = _members.head(); m; m = m->next() ) {
+ assert( m );
_fillIsMasterHost(m, hosts, passives, arbiters);
}
@@ -161,23 +198,27 @@ namespace mongo {
}
}
- if( !isp ) {
+ if( !isp ) {
const Member *m = sp.primary;
if( m )
b.append("primary", m->h().toString());
}
if( myConfig().arbiterOnly )
b.append("arbiterOnly", true);
+ if( myConfig().priority == 0 )
+ b.append("passive", true);
if( myConfig().slaveDelay )
b.append("slaveDelay", myConfig().slaveDelay);
if( myConfig().hidden )
b.append("hidden", true);
+ if( !myConfig().buildIndexes )
+ b.append("buildIndexes", false);
}
/** @param cfgString <setname>/<seedhost1>,<seedhost2> */
- void parseReplsetCmdLine(string cfgString, string& setname, vector<HostAndPort>& seeds, set<HostAndPort>& seedSet ) {
- const char *p = cfgString.c_str();
+ void parseReplsetCmdLine(string cfgString, string& setname, vector<HostAndPort>& seeds, set<HostAndPort>& seedSet ) {
+ const char *p = cfgString.c_str();
const char *slash = strchr(p, '/');
if( slash )
setname = string(p, slash-p);
@@ -207,7 +248,8 @@ namespace mongo {
//uassert(13101, "can't use localhost in replset host list", !m.isLocalHost());
if( m.isSelf() ) {
log(1) << "replSet ignoring seed " << m.toString() << " (=self)" << rsLog;
- } else
+ }
+ else
seeds.push_back(m);
if( *comma == 0 )
break;
@@ -216,10 +258,9 @@ namespace mongo {
}
}
- ReplSetImpl::ReplSetImpl(ReplSetCmdline& replSetCmdline) : elect(this),
- _self(0),
- mgr( new Manager(this) )
- {
+ ReplSetImpl::ReplSetImpl(ReplSetCmdline& replSetCmdline) : elect(this),
+ _self(0),
+ mgr( new Manager(this) ) {
_cfg = 0;
memset(_hbmsg, 0, sizeof(_hbmsg));
*_hbmsg = '.'; // temp...just to see
@@ -240,20 +281,21 @@ namespace mongo {
}
for( set<HostAndPort>::iterator i = replSetCmdline.seedSet.begin(); i != replSetCmdline.seedSet.end(); i++ ) {
if( i->isSelf() ) {
- if( sss == 1 )
+ if( sss == 1 )
log(1) << "replSet warning self is listed in the seed list and there are no other seeds listed did you intend that?" << rsLog;
- } else
+ }
+ else
log() << "replSet warning command line seed " << i->toString() << " is not present in the current repl set config" << rsLog;
}
}
void newReplUp();
- void ReplSetImpl::loadLastOpTimeWritten() {
+ void ReplSetImpl::loadLastOpTimeWritten() {
//assert( lastOpTimeWritten.isNull() );
readlock lk(rsoplog);
BSONObj o;
- if( Helpers::getLast(rsoplog, o) ) {
+ if( Helpers::getLast(rsoplog, o) ) {
lastH = o["h"].numberLong();
lastOpTimeWritten = o["ts"]._opTime();
uassert(13290, "bad replSet oplog entry?", !lastOpTimeWritten.isNull());
@@ -261,11 +303,11 @@ namespace mongo {
}
/* call after constructing to start - returns fairly quickly after launching its threads */
- void ReplSetImpl::_go() {
- try {
+ void ReplSetImpl::_go() {
+ try {
loadLastOpTimeWritten();
}
- catch(std::exception& e) {
+ catch(std::exception& e) {
log() << "replSet error fatal couldn't query the local " << rsoplog << " collection. Terminating mongod after 30 seconds." << rsLog;
log() << e.what() << rsLog;
sleepsecs(30);
@@ -283,11 +325,17 @@ namespace mongo {
extern BSONObj *getLastErrorDefault;
+ void ReplSetImpl::setSelfTo(Member *m) {
+ _self = m;
+ if( m ) _buildIndexes = m->config().buildIndexes;
+ else _buildIndexes = true;
+ }
+
/** @param reconf true if this is a reconfiguration and not an initial load of the configuration.
@return true if ok; throws if config really bad; false if config doesn't include self
*/
bool ReplSetImpl::initFromConfig(ReplSetConfig& c, bool reconf) {
- /* NOTE: haveNewConfig() writes the new config to disk before we get here. So
+ /* NOTE: haveNewConfig() writes the new config to disk before we get here. So
we cannot error out at this point, except fatally. Check errors earlier.
*/
lock lk(this);
@@ -302,25 +350,24 @@ namespace mongo {
{
unsigned nfound = 0;
int me = 0;
- for( vector<ReplSetConfig::MemberCfg>::iterator i = c.members.begin(); i != c.members.end(); i++ ) {
+ for( vector<ReplSetConfig::MemberCfg>::iterator i = c.members.begin(); i != c.members.end(); i++ ) {
const ReplSetConfig::MemberCfg& m = *i;
if( m.h.isSelf() ) {
nfound++;
me++;
-
if( !reconf || (_self && _self->id() == (unsigned) m._id) )
;
- else {
+ else {
log() << "replSet " << _self->id() << ' ' << m._id << rsLog;
assert(false);
}
}
- else if( reconf ) {
+ else if( reconf ) {
const Member *old = findById(m._id);
- if( old ) {
+ if( old ) {
nfound++;
assert( (int) old->id() == m._id );
- if( old->config() == m ) {
+ if( old->config() == m ) {
additive = false;
}
}
@@ -328,16 +375,24 @@ namespace mongo {
newOnes.push_back(&m);
}
}
+
+ // change timeout settings, if necessary
+ ScopedConn conn(m.h.toString());
+ conn.setTimeout(c.ho.heartbeatTimeoutMillis/1000.0);
}
if( me == 0 ) {
+ // initial startup with fastsync
+ if (!reconf && replSettings.fastsync) {
+ return false;
+ }
// log() << "replSet config : " << _cfg->toString() << rsLog;
- log() << "replSet error can't find self in the repl set configuration:" << rsLog;
+ log() << "replSet error self not present in the repl set configuration:" << rsLog;
log() << c.toString() << rsLog;
- assert(false);
+ uasserted(13497, "replSet error self not present in the configuration");
}
uassert( 13302, "replSet error self appears twice in the repl set configuration", me<=1 );
- if( reconf && config().members.size() != nfound )
+ if( reconf && config().members.size() != nfound )
additive = false;
}
@@ -347,14 +402,14 @@ namespace mongo {
_name = _cfg->_id;
assert( !_name.empty() );
- if( additive ) {
+ if( additive ) {
log() << "replSet info : additive change to configuration" << rsLog;
for( list<const ReplSetConfig::MemberCfg*>::const_iterator i = newOnes.begin(); i != newOnes.end(); i++ ) {
const ReplSetConfig::MemberCfg* m = *i;
Member *mi = new Member(m->h, m->_id, m, false);
- /** we will indicate that new members are up() initially so that we don't relinquish our
- primary state because we can't (transiently) see a majority. they should be up as we
+ /** we will indicate that new members are up() initially so that we don't relinquish our
+ primary state because we can't (transiently) see a majority. they should be up as we
check that new members are up before getting here on reconfig anyway.
*/
mi->get_hbinfo().health = 0.1;
@@ -373,20 +428,30 @@ namespace mongo {
int oldPrimaryId = -1;
{
const Member *p = box.getPrimary();
- if( p )
+ if( p )
oldPrimaryId = p->id();
}
forgetPrimary();
- _self = 0;
- for( vector<ReplSetConfig::MemberCfg>::iterator i = _cfg->members.begin(); i != _cfg->members.end(); i++ ) {
+
+ bool iWasArbiterOnly = _self ? iAmArbiterOnly() : false;
+ setSelfTo(0);
+ for( vector<ReplSetConfig::MemberCfg>::iterator i = _cfg->members.begin(); i != _cfg->members.end(); i++ ) {
const ReplSetConfig::MemberCfg& m = *i;
Member *mi;
if( m.h.isSelf() ) {
assert( _self == 0 );
- mi = _self = new Member(m.h, m._id, &m, true);
+ mi = new Member(m.h, m._id, &m, true);
+ setSelfTo(mi);
+
+ // if the arbiter status changed
+ if (iWasArbiterOnly ^ iAmArbiterOnly()) {
+ _changeArbiterState();
+ }
+
if( (int)mi->id() == oldPrimaryId )
box.setSelfPrimary(mi);
- } else {
+ }
+ else {
mi = new Member(m.h, m._id, &m, false);
_members.push(mi);
startHealthTaskFor(mi);
@@ -397,26 +462,57 @@ namespace mongo {
return true;
}
+ void startSyncThread();
+
+ void ReplSetImpl::_changeArbiterState() {
+ if (iAmArbiterOnly()) {
+ changeState(MemberState::RS_ARBITER);
+
+ // if there is an oplog, free it
+ // not sure if this is necessary, maybe just leave the oplog and let
+ // the user delete it if they want the space?
+ writelock lk(rsoplog);
+ Client::Context c(rsoplog);
+ NamespaceDetails *d = nsdetails(rsoplog);
+ if (d) {
+ string errmsg;
+ bob res;
+ dropCollection(rsoplog, errmsg, res);
+
+ // clear last op time to force initial sync (if the arbiter
+ // becomes a "normal" server again)
+ lastOpTimeWritten = OpTime();
+ }
+ }
+ else {
+ changeState(MemberState::RS_RECOVERING);
+
+ // oplog will be allocated when sync begins
+ /* TODO : could this cause two sync threads to exist (race condition)? */
+ boost::thread t(startSyncThread);
+ }
+ }
+
// Our own config must be the first one.
- bool ReplSetImpl::_loadConfigFinish(vector<ReplSetConfig>& cfgs) {
+ bool ReplSetImpl::_loadConfigFinish(vector<ReplSetConfig>& cfgs) {
int v = -1;
ReplSetConfig *highest = 0;
int myVersion = -2000;
int n = 0;
- for( vector<ReplSetConfig>::iterator i = cfgs.begin(); i != cfgs.end(); i++ ) {
+ for( vector<ReplSetConfig>::iterator i = cfgs.begin(); i != cfgs.end(); i++ ) {
ReplSetConfig& cfg = *i;
if( ++n == 1 ) myVersion = cfg.version;
- if( cfg.ok() && cfg.version > v ) {
+ if( cfg.ok() && cfg.version > v ) {
highest = &cfg;
v = cfg.version;
}
}
assert( highest );
- if( !initFromConfig(*highest) )
+ if( !initFromConfig(*highest) )
return false;
- if( highest->version > myVersion && highest->version >= 0 ) {
+ if( highest->version > myVersion && highest->version >= 0 ) {
log() << "replSet got config version " << highest->version << " from a remote, saving locally" << rsLog;
writelock lk("admin.");
highest->saveConfigLocally(BSONObj());
@@ -430,7 +526,7 @@ namespace mongo {
startupStatusMsg = "loading " + rsConfigNs + " config (LOADINGCONFIG)";
try {
vector<ReplSetConfig> configs;
- try {
+ try {
configs.push_back( ReplSetConfig(HostAndPort::me()) );
}
catch(DBException& e) {
@@ -438,26 +534,26 @@ namespace mongo {
throw;
}
for( vector<HostAndPort>::const_iterator i = _seeds->begin(); i != _seeds->end(); i++ ) {
- try {
+ try {
configs.push_back( ReplSetConfig(*i) );
}
- catch( DBException& e ) {
+ catch( DBException& e ) {
log() << "replSet exception trying to load config from " << *i << " : " << e.toString() << rsLog;
}
}
- if( discoveredSeed ) {
+ if( discoveredSeed ) {
try {
configs.push_back( ReplSetConfig(HostAndPort(*discoveredSeed)) );
}
- catch( DBException& ) {
+ catch( DBException& ) {
log(1) << "replSet exception trying to load config from discovered seed " << *discoveredSeed << rsLog;
}
}
int nok = 0;
int nempty = 0;
- for( vector<ReplSetConfig>::iterator i = configs.begin(); i != configs.end(); i++ ) {
+ for( vector<ReplSetConfig>::iterator i = configs.begin(); i != configs.end(); i++ ) {
if( i->ok() )
nok++;
if( i->empty() )
@@ -469,7 +565,9 @@ namespace mongo {
startupStatus = EMPTYCONFIG;
startupStatusMsg = "can't get " + rsConfigNs + " config from self or any seed (EMPTYCONFIG)";
log() << "replSet can't get " << rsConfigNs << " config from self or any seed (EMPTYCONFIG)" << rsLog;
- log(1) << "replSet have you ran replSetInitiate yet?" << rsLog;
+ static unsigned once;
+ if( ++once == 1 )
+ log() << "replSet info you may need to run replSetInitiate -- rs.initiate() in the shell -- if that is not already done" << rsLog;
if( _seeds->size() == 0 )
log(1) << "replSet info no seed hosts were specified on the --replSet command line" << rsLog;
}
@@ -483,13 +581,13 @@ namespace mongo {
continue;
}
- if( !_loadConfigFinish(configs) ) {
+ if( !_loadConfigFinish(configs) ) {
log() << "replSet info Couldn't load config yet. Sleeping 20sec and will try again." << rsLog;
sleepsecs(20);
continue;
}
}
- catch(DBException& e) {
+ catch(DBException& e) {
startupStatus = BADCONFIG;
startupStatusMsg = "replSet error loading set config (BADCONFIG)";
log() << "replSet error loading configurations " << e.toString() << rsLog;
@@ -504,30 +602,34 @@ namespace mongo {
startupStatus = STARTED;
}
- void ReplSetImpl::_fatal()
- {
+ void ReplSetImpl::_fatal() {
//lock l(this);
box.set(MemberState::RS_FATAL, 0);
//sethbmsg("fatal error");
- log() << "replSet error fatal, stopping replication" << rsLog;
+ log() << "replSet error fatal, stopping replication" << rsLog;
}
- void ReplSet::haveNewConfig(ReplSetConfig& newConfig, bool addComment) {
+ void ReplSet::haveNewConfig(ReplSetConfig& newConfig, bool addComment) {
lock l(this); // convention is to lock replset before taking the db rwlock
writelock lk("");
bo comment;
if( addComment )
comment = BSON( "msg" << "Reconfig set" << "version" << newConfig.version );
newConfig.saveConfigLocally(comment);
- try {
+ try {
initFromConfig(newConfig, true);
log() << "replSet replSetReconfig new config saved locally" << rsLog;
}
- catch(DBException& e) {
+ catch(DBException& e) {
+ if( e.getCode() == 13497 /* removed from set */ ) {
+ cc().shutdown();
+ dbexit( EXIT_CLEAN , "removed from replica set" ); // never returns
+ assert(0);
+ }
log() << "replSet error unexpected exception in haveNewConfig() : " << e.toString() << rsLog;
_fatal();
}
- catch(...) {
+ catch(...) {
log() << "replSet error unexpected exception in haveNewConfig()" << rsLog;
_fatal();
}
@@ -538,30 +640,33 @@ namespace mongo {
ReplSetConfig c(o);
if( c.version > rs->config().version )
theReplSet->haveNewConfig(c, false);
- else {
- log() << "replSet info msgReceivedNewConfig but version isn't higher " <<
- c.version << ' ' << rs->config().version << rsLog;
+ else {
+ log() << "replSet info msgReceivedNewConfig but version isn't higher " <<
+ c.version << ' ' << rs->config().version << rsLog;
}
}
- /* forked as a thread during startup
- it can run quite a while looking for config. but once found,
+ /* forked as a thread during startup
+ it can run quite a while looking for config. but once found,
a separate thread takes over as ReplSetImpl::Manager, and this thread
terminates.
*/
void startReplSets(ReplSetCmdline *replSetCmdline) {
Client::initThread("startReplSets");
- try {
+ try {
assert( theReplSet == 0 );
if( replSetCmdline == 0 ) {
assert(!replSet);
return;
}
+ if( !noauth ) {
+ cc().getAuthenticationInfo()->authorize("local");
+ }
(theReplSet = new ReplSet(*replSetCmdline))->go();
}
- catch(std::exception& e) {
+ catch(std::exception& e) {
log() << "replSet caught exception in startReplSets thread: " << e.what() << rsLog;
- if( theReplSet )
+ if( theReplSet )
theReplSet->fatal();
}
cc().shutdown();
@@ -569,10 +674,9 @@ namespace mongo {
}
-namespace boost {
+namespace boost {
- void assertion_failed(char const * expr, char const * function, char const * file, long line)
- {
+ void assertion_failed(char const * expr, char const * function, char const * file, long line) {
mongo::log() << "boost assertion failure " << expr << ' ' << function << ' ' << file << ' ' << line << endl;
}
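
parseReplsetCmdLine above documents the --replSet argument as <setname>/<seedhost1>,<seedhost2>: the text before the slash names the set, the remainder is a comma-separated seed list, and any address that resolves to self is skipped. A standalone sketch of just the string split, leaving out the HostAndPort and isSelf() handling:

    #include <iostream>
    #include <sstream>
    #include <string>
    #include <vector>

    // Splits "rs0/alice:27017,bob:27017" into a set name and seed host strings.
    void parseReplSetArg(const std::string& cfg, std::string& setname,
                         std::vector<std::string>& seeds) {
        std::string::size_type slash = cfg.find('/');
        setname = (slash == std::string::npos) ? cfg : cfg.substr(0, slash);
        if (slash == std::string::npos)
            return;                          // set name only, no seeds listed
        std::stringstream rest(cfg.substr(slash + 1));
        std::string host;
        while (std::getline(rest, host, ','))
            if (!host.empty())
                seeds.push_back(host);
    }

    int main() {
        std::string name;
        std::vector<std::string> seeds;
        parseReplSetArg("rs0/alice:27017,bob:27017", name, seeds);
        std::cout << name << " has " << seeds.size() << " seeds\n";   // rs0 has 2 seeds
        return 0;
    }
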
diff --git a/db/repl/rs.h b/db/repl/rs.h
index 6c4d9a8..1419ad6 100644
--- a/db/repl/rs.h
+++ b/db/repl/rs.h
@@ -43,6 +43,7 @@ namespace mongo {
class Member : public List1<Member>::Base {
public:
Member(HostAndPort h, unsigned ord, const ReplSetConfig::MemberCfg *c, bool self);
+
string fullName() const { return h().toString(); }
const ReplSetConfig::MemberCfg& config() const { return _config; }
const HeartbeatInfo& hbinfo() const { return _hbinfo; }
@@ -51,10 +52,12 @@ namespace mongo {
MemberState state() const { return _hbinfo.hbstate; }
const HostAndPort& h() const { return _h; }
unsigned id() const { return _hbinfo.id(); }
+
bool potentiallyHot() const { return _config.potentiallyHot(); } // not arbiter, not priority 0
void summarizeMember(stringstream& s) const;
- friend class ReplSetImpl;
+
private:
+ friend class ReplSetImpl;
const ReplSetConfig::MemberCfg _config;
const HostAndPort _h;
HeartbeatInfo _hbinfo;
@@ -65,8 +68,8 @@ namespace mongo {
bool busyWithElectSelf;
int _primary;
- /** @param two - if true two primaries were seen. this can happen transiently, in addition to our
- polling being only occasional. in this case null is returned, but the caller should
+ /** @param two - if true two primaries were seen. this can happen transiently, in addition to our
+ polling being only occasional. in this case null is returned, but the caller should
not assume primary itself in that situation.
*/
const Member* findOtherPrimary(bool& two);
@@ -75,7 +78,7 @@ namespace mongo {
virtual void starting();
public:
Manager(ReplSetImpl *rs);
- ~Manager();
+ virtual ~Manager();
void msgReceivedNewConfig(BSONObj);
void msgCheckNewState();
};
@@ -84,7 +87,7 @@ namespace mongo {
class Consensus {
ReplSetImpl &rs;
- struct LastYea {
+ struct LastYea {
LastYea() : when(0), who(0xffffffff) { }
time_t when;
unsigned who;
@@ -96,12 +99,12 @@ namespace mongo {
bool weAreFreshest(bool& allUp, int& nTies);
bool sleptLast; // slept last elect() pass
public:
- Consensus(ReplSetImpl *t) : rs(*t) {
+ Consensus(ReplSetImpl *t) : rs(*t) {
sleptLast = false;
steppedDown = 0;
}
- /* if we've stepped down, this is when we are allowed to try to elect ourself again.
+ /* if we've stepped down, this is when we are allowed to try to elect ourself again.
todo: handle possible weirdnesses at clock skews etc.
*/
time_t steppedDown;
@@ -115,40 +118,40 @@ namespace mongo {
};
/** most operations on a ReplSet object should be done while locked. that logic implemented here. */
- class RSBase : boost::noncopyable {
+ class RSBase : boost::noncopyable {
public:
const unsigned magic;
void assertValid() { assert( magic == 0x12345677 ); }
private:
- mutex m;
+ mongo::mutex m;
int _locked;
ThreadLocalValue<bool> _lockedByMe;
protected:
RSBase() : magic(0x12345677), m("RSBase"), _locked(0) { }
- ~RSBase() {
+ ~RSBase() {
/* this can happen if we throw in the constructor; otherwise never happens. thus we log it as it is quite unusual. */
log() << "replSet ~RSBase called" << rsLog;
}
- class lock {
+ class lock {
RSBase& rsbase;
auto_ptr<scoped_lock> sl;
public:
- lock(RSBase* b) : rsbase(*b) {
+ lock(RSBase* b) : rsbase(*b) {
if( rsbase._lockedByMe.get() )
return; // recursive is ok...
sl.reset( new scoped_lock(rsbase.m) );
DEV assert(rsbase._locked == 0);
- rsbase._locked++;
+ rsbase._locked++;
rsbase._lockedByMe.set(true);
}
- ~lock() {
+ ~lock() {
if( sl.get() ) {
assert( rsbase._lockedByMe.get() );
DEV assert(rsbase._locked == 1);
rsbase._lockedByMe.set(false);
- rsbase._locked--;
+ rsbase._locked--;
}
}
};
@@ -157,11 +160,11 @@ namespace mongo {
/* for asserts */
bool locked() const { return _locked != 0; }
- /* if true, is locked, and was locked by this thread. note if false, it could be in the lock or not for another
+ /* if true, is locked, and was locked by this thread. note if false, it could be in the lock or not for another
just for asserts & such so we can make the contracts clear on who locks what when.
we don't use these locks that frequently, so the little bit of overhead is fine.
*/
- bool lockedByMe() { return _lockedByMe.get(); }
+ bool lockedByMe() { return _lockedByMe.get(); }
};
class ReplSetHealthPollTask;
@@ -174,19 +177,19 @@ namespace mongo {
MemberState state;
const Member *primary;
};
- const SP get() {
+ const SP get() {
scoped_lock lk(m);
return sp;
}
MemberState getState() const { return sp.state; }
const Member* getPrimary() const { return sp.primary; }
- void change(MemberState s, const Member *self) {
+ void change(MemberState s, const Member *self) {
scoped_lock lk(m);
- if( sp.state != s ) {
+ if( sp.state != s ) {
log() << "replSet " << s.toString() << rsLog;
}
sp.state = s;
- if( s.primary() ) {
+ if( s.primary() ) {
sp.primary = self;
}
else {
@@ -194,17 +197,17 @@ namespace mongo {
sp.primary = 0;
}
}
- void set(MemberState s, const Member *p) {
+ void set(MemberState s, const Member *p) {
scoped_lock lk(m);
sp.state = s; sp.primary = p;
}
void setSelfPrimary(const Member *self) { change(MemberState::RS_PRIMARY, self); }
- void setOtherPrimary(const Member *mem) {
+ void setOtherPrimary(const Member *mem) {
scoped_lock lk(m);
assert( !sp.state.primary() );
sp.primary = mem;
}
- void noteRemoteIsPrimary(const Member *remote) {
+ void noteRemoteIsPrimary(const Member *remote) {
scoped_lock lk(m);
if( !sp.state.secondary() && !sp.state.fatal() )
sp.state = MemberState::RS_RECOVERING;
@@ -212,10 +215,10 @@ namespace mongo {
}
StateBox() : m("StateBox") { }
private:
- mutex m;
+ mongo::mutex m;
SP sp;
};
-
+
void parseReplsetCmdLine(string cfgString, string& setname, vector<HostAndPort>& seeds, set<HostAndPort>& seedSet );
/** Parameter given to the --replSet command line option (parsed).
@@ -230,15 +233,15 @@ namespace mongo {
};
/* information about the entire repl set, such as the various servers in the set, and their state */
- /* note: We currently do not free mem when the set goes away - it is assumed the replset is a
+ /* note: We currently do not free mem when the set goes away - it is assumed the replset is a
singleton and long lived.
*/
class ReplSetImpl : protected RSBase {
public:
/** info on our state if the replset isn't yet "up". for example, if we are pre-initiation. */
- enum StartupStatus {
- PRESTART=0, LOADINGCONFIG=1, BADCONFIG=2, EMPTYCONFIG=3,
- EMPTYUNREACHABLE=4, STARTED=5, SOON=6
+ enum StartupStatus {
+ PRESTART=0, LOADINGCONFIG=1, BADCONFIG=2, EMPTYCONFIG=3,
+ EMPTYUNREACHABLE=4, STARTED=5, SOON=6
};
static StartupStatus startupStatus;
static string startupStatusMsg;
@@ -260,18 +263,21 @@ namespace mongo {
void relinquish();
void forgetPrimary();
protected:
- bool _stepDown();
+ bool _stepDown(int secs);
+ bool _freeze(int secs);
private:
void assumePrimary();
void loadLastOpTimeWritten();
void changeState(MemberState s);
+ const Member* getMemberToSyncTo();
+ void _changeArbiterState();
protected:
// "heartbeat message"
- // sent in requestHeartbeat respond in field "hbm"
+ // sent in requestHeartbeat respond in field "hbm"
char _hbmsg[256]; // we change this unlocked, thus not an stl::string
time_t _hbmsgTime; // when it was logged
public:
- void sethbmsg(string s, int logLevel = 0);
+ void sethbmsg(string s, int logLevel = 0);
protected:
bool initFromConfig(ReplSetConfig& c, bool reconf=false); // true if ok; throws if config really bad; false if config doesn't include self
void _fillIsMaster(BSONObjBuilder&);
@@ -281,7 +287,7 @@ namespace mongo {
MemberState state() const { return box.getState(); }
void _fatal();
void _getOplogDiagsAsHtml(unsigned server_id, stringstream& ss) const;
- void _summarizeAsHtml(stringstream&) const;
+ void _summarizeAsHtml(stringstream&) const;
void _summarizeStatus(BSONObjBuilder&) const; // for replSetGetStatus command
/* throws exception if a problem initializing. */
@@ -295,7 +301,7 @@ namespace mongo {
const vector<HostAndPort> *_seeds;
ReplSetConfig *_cfg;
- /** load our configuration from admin.replset. try seed machines too.
+ /** load our configuration from admin.replset. try seed machines too.
@return true if ok; throws if config really bad; false if config doesn't include self
*/
bool _loadConfigFinish(vector<ReplSetConfig>& v);
@@ -306,7 +312,9 @@ namespace mongo {
bool iAmArbiterOnly() const { return myConfig().arbiterOnly; }
bool iAmPotentiallyHot() const { return myConfig().potentiallyHot(); }
protected:
- Member *_self;
+ Member *_self;
+ bool _buildIndexes; // = _self->config().buildIndexes
+ void setSelfTo(Member *); // use this as it sets buildIndexes var
private:
List1<Member> _members; /* all members of the set EXCEPT self. */
@@ -330,7 +338,7 @@ namespace mongo {
private:
/* pulling data from primary related - see rs_sync.cpp */
- bool initialSyncOplogApplication(string hn, const Member *primary, OpTime applyGTE, OpTime minValid);
+ bool initialSyncOplogApplication(const Member *primary, OpTime applyGTE, OpTime minValid);
void _syncDoInitialSync();
void syncDoInitialSync();
void _syncThread();
@@ -340,21 +348,29 @@ namespace mongo {
unsigned _syncRollback(OplogReader& r);
void syncRollback(OplogReader& r);
void syncFixUp(HowToFixUp& h, OplogReader& r);
+ bool _getOplogReader(OplogReader& r, string& hn);
+ bool _isStale(OplogReader& r, const string& hn);
public:
void syncThread();
};
- class ReplSet : public ReplSetImpl {
+ class ReplSet : public ReplSetImpl {
public:
ReplSet(ReplSetCmdline& replSetCmdline) : ReplSetImpl(replSetCmdline) { }
- bool stepDown() { return _stepDown(); }
+ // for the replSetStepDown command
+ bool stepDown(int secs) { return _stepDown(secs); }
- string selfFullName() {
+ // for the replSetFreeze command
+ bool freeze(int secs) { return _freeze(secs); }
+
+ string selfFullName() {
lock lk(this);
return _self->fullName();
}
+ bool buildIndexes() const { return _buildIndexes; }
+
        /* call after constructing to start - returns fairly quickly after launching its threads */
void go() { _go(); }
@@ -369,7 +385,7 @@ namespace mongo {
void summarizeStatus(BSONObjBuilder& b) const { _summarizeStatus(b); }
void fillIsMaster(BSONObjBuilder& b) { _fillIsMaster(b); }
- /* we have a new config (reconfig) - apply it.
+ /* we have a new config (reconfig) - apply it.
@param comment write a no-op comment to the oplog about it. only makes sense if one is primary and initiating the reconf.
*/
void haveNewConfig(ReplSetConfig& c, bool comment);
@@ -380,16 +396,16 @@ namespace mongo {
bool lockedByMe() { return RSBase::lockedByMe(); }
// heartbeat msg to send to others; descriptive diagnostic info
- string hbmsg() const {
+ string hbmsg() const {
if( time(0)-_hbmsgTime > 120 ) return "";
- return _hbmsg;
+ return _hbmsg;
}
};
- /** base class for repl set commands. checks basic things such as in rs mode before the command
+ /** base class for repl set commands. checks basic things such as in rs mode before the command
does its real work
*/
- class ReplSetCommand : public Command {
+ class ReplSetCommand : public Command {
protected:
ReplSetCommand(const char * s, bool show=false) : Command(s, show) { }
virtual bool slaveOk() const { return true; }
@@ -398,14 +414,14 @@ namespace mongo {
virtual LockType locktype() const { return NONE; }
virtual void help( stringstream &help ) const { help << "internal"; }
bool check(string& errmsg, BSONObjBuilder& result) {
- if( !replSet ) {
+ if( !replSet ) {
errmsg = "not running with --replSet";
return false;
}
if( theReplSet == 0 ) {
result.append("startupStatus", ReplSet::startupStatus);
errmsg = ReplSet::startupStatusMsg.empty() ? "replset unknown error 2" : ReplSet::startupStatusMsg;
- if( ReplSet::startupStatus == 3 )
+ if( ReplSet::startupStatus == 3 )
result.append("info", "run rs.initiate(...) if not yet done for the set");
return false;
}
@@ -415,9 +431,8 @@ namespace mongo {
/** inlines ----------------- */
- inline Member::Member(HostAndPort h, unsigned ord, const ReplSetConfig::MemberCfg *c, bool self) :
- _config(*c), _h(h), _hbinfo(ord)
- {
+ inline Member::Member(HostAndPort h, unsigned ord, const ReplSetConfig::MemberCfg *c, bool self) :
+ _config(*c), _h(h), _hbinfo(ord) {
if( self )
_hbinfo.health = 1.0;
}
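
RSBase::lock above only takes the mutex when the current thread does not already hold it, tracking ownership in a thread-local flag so that re-entrant locking from the same thread is a harmless no-op. A sketch of the same idiom using standard C++ in place of mongo::mutex and ThreadLocalValue:

    #include <memory>
    #include <mutex>

    // Hypothetical stand-in for RSBase's locking discipline.
    class Guarded {
        std::mutex m;
        static thread_local bool lockedByMe;
    public:
        class lock {
            std::unique_ptr<std::lock_guard<std::mutex> > sl;
        public:
            explicit lock(Guarded& g) {
                if (lockedByMe)
                    return;                   // recursive use by the same thread is ok
                sl.reset(new std::lock_guard<std::mutex>(g.m));
                lockedByMe = true;
            }
            ~lock() {
                if (sl.get())
                    lockedByMe = false;       // only the outermost lock releases it
            }
        };
    };
    thread_local bool Guarded::lockedByMe = false;
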
diff --git a/db/repl/rs_config.cpp b/db/repl/rs_config.cpp
index 371507d..5998f51 100644
--- a/db/repl/rs_config.cpp
+++ b/db/repl/rs_config.cpp
@@ -27,11 +27,11 @@
using namespace bson;
-namespace mongo {
+namespace mongo {
void logOpInitiate(const bo&);
- void assertOnlyHas(BSONObj o, const set<string>& fields) {
+ void assertOnlyHas(BSONObj o, const set<string>& fields) {
BSONObj::iterator i(o);
while( i.more() ) {
BSONElement e = i.next();
@@ -41,7 +41,7 @@ namespace mongo {
}
}
- list<HostAndPort> ReplSetConfig::otherMemberHostnames() const {
+ list<HostAndPort> ReplSetConfig::otherMemberHostnames() const {
list<HostAndPort> L;
for( vector<MemberCfg>::const_iterator i = members.begin(); i != members.end(); i++ ) {
if( !i->h.isSelf() )
@@ -49,12 +49,12 @@ namespace mongo {
}
return L;
}
-
+
/* comment MUST only be set when initiating the set by the initiator */
- void ReplSetConfig::saveConfigLocally(bo comment) {
+ void ReplSetConfig::saveConfigLocally(bo comment) {
checkRsConfig();
log() << "replSet info saving a newer config version to local.system.replset" << rsLog;
- {
+ {
writelock lk("");
Client::Context cx( rsConfigNs );
cx.db()->flushFiles(true);
@@ -70,21 +70,21 @@ namespace mongo {
}
DEV log() << "replSet saveConfigLocally done" << rsLog;
}
-
- /*static*/
- /*void ReplSetConfig::receivedNewConfig(BSONObj cfg) {
+
+ /*static*/
+ /*void ReplSetConfig::receivedNewConfig(BSONObj cfg) {
if( theReplSet )
return; // this is for initial setup only, so far. todo
ReplSetConfig c(cfg);
writelock lk("admin.");
- if( theReplSet )
+ if( theReplSet )
return;
c.saveConfigLocally(bo());
}*/
- bo ReplSetConfig::MemberCfg::asBson() const {
+ bo ReplSetConfig::MemberCfg::asBson() const {
bob b;
b << "_id" << _id;
b.append("host", h.toString());
@@ -93,18 +93,28 @@ namespace mongo {
if( arbiterOnly ) b << "arbiterOnly" << true;
if( slaveDelay ) b << "slaveDelay" << slaveDelay;
if( hidden ) b << "hidden" << hidden;
+ if( !buildIndexes ) b << "buildIndexes" << buildIndexes;
+ if( !tags.empty() ) {
+ BSONArrayBuilder a;
+ for( set<string>::const_iterator i = tags.begin(); i != tags.end(); i++ )
+ a.append(*i);
+ b.appendArray("tags", a.done());
+ }
+ if( !initialSync.isEmpty() ) {
+ b << "initialSync" << initialSync;
+ }
return b.obj();
}
- bo ReplSetConfig::asBson() const {
+ bo ReplSetConfig::asBson() const {
bob b;
b.append("_id", _id).append("version", version);
if( !ho.isDefault() || !getLastErrorDefaults.isEmpty() ) {
bob settings;
if( !ho.isDefault() )
- settings << "heartbeatConnRetries " << ho.heartbeatConnRetries <<
- "heartbeatSleep" << ho.heartbeatSleepMillis / 1000 <<
- "heartbeatTimeout" << ho.heartbeatTimeoutMillis / 1000;
+ settings << "heartbeatConnRetries " << ho.heartbeatConnRetries <<
+ "heartbeatSleep" << ho.heartbeatSleepMillis / 1000.0 <<
+ "heartbeatTimeout" << ho.heartbeatTimeoutMillis / 1000.0;
if( !getLastErrorDefaults.isEmpty() )
settings << "getLastErrorDefaults" << getLastErrorDefaults;
b << "settings" << settings.obj();
@@ -122,7 +132,7 @@ namespace mongo {
uassert(13126, "bad Member config", expr);
}
- void ReplSetConfig::MemberCfg::check() const{
+ void ReplSetConfig::MemberCfg::check() const {
mchk(_id >= 0 && _id <= 255);
mchk(priority >= 0 && priority <= 1000);
mchk(votes >= 0 && votes <= 100);
@@ -130,41 +140,80 @@ namespace mongo {
uassert(13437, "slaveDelay requires priority be zero", slaveDelay == 0 || priority == 0);
uassert(13438, "bad slaveDelay value", slaveDelay >= 0 && slaveDelay <= 3600 * 24 * 366);
uassert(13439, "priority must be 0 when hidden=true", priority == 0 || !hidden);
+ uassert(13477, "priority must be 0 when buildIndexes=false", buildIndexes || priority == 0);
+
+ if (!initialSync.isEmpty()) {
+ static const string legal[] = {"state", "name", "_id","optime"};
+ static const set<string> legals(legal, legal + 4);
+ assertOnlyHas(initialSync, legals);
+
+ if (initialSync.hasElement("state")) {
+ uassert(13525, "initialSync source state must be 1 or 2",
+ initialSync["state"].isNumber() &&
+ (initialSync["state"].Number() == 1 ||
+ initialSync["state"].Number() == 2));
+ }
+ if (initialSync.hasElement("name")) {
+ uassert(13526, "initialSync source name must be a string",
+ initialSync["name"].type() == mongo::String);
+ }
+ if (initialSync.hasElement("_id")) {
+ uassert(13527, "initialSync source _id must be a number",
+ initialSync["_id"].isNumber());
+ }
+ if (initialSync.hasElement("optime")) {
+ uassert(13528, "initialSync source optime must be a timestamp",
+ initialSync["optime"].type() == mongo::Timestamp ||
+ initialSync["optime"].type() == mongo::Date);
+ }
+ }
}
/** @param o old config
- @param n new config
+ @param n new config
*/
- /*static*/ bool ReplSetConfig::legalChange(const ReplSetConfig& o, const ReplSetConfig& n, string& errmsg) {
+ /*static*/
+ bool ReplSetConfig::legalChange(const ReplSetConfig& o, const ReplSetConfig& n, string& errmsg) {
assert( theReplSet );
- if( o._id != n._id ) {
- errmsg = "set name may not change";
+ if( o._id != n._id ) {
+ errmsg = "set name may not change";
return false;
}
/* TODO : wonder if we need to allow o.version < n.version only, which is more lenient.
- if someone had some intermediate config this node doesnt have, that could be
+ if someone had some intermediate config this node doesnt have, that could be
necessary. but then how did we become primary? so perhaps we are fine as-is.
*/
- if( o.version + 1 != n.version ) {
+ if( o.version + 1 != n.version ) {
errmsg = "version number wrong";
return false;
}
map<HostAndPort,const ReplSetConfig::MemberCfg*> old;
- for( vector<ReplSetConfig::MemberCfg>::const_iterator i = o.members.begin(); i != o.members.end(); i++ ) {
+ for( vector<ReplSetConfig::MemberCfg>::const_iterator i = o.members.begin(); i != o.members.end(); i++ ) {
old[i->h] = &(*i);
}
int me = 0;
- for( vector<ReplSetConfig::MemberCfg>::const_iterator i = n.members.begin(); i != n.members.end(); i++ ) {
+ for( vector<ReplSetConfig::MemberCfg>::const_iterator i = n.members.begin(); i != n.members.end(); i++ ) {
const ReplSetConfig::MemberCfg& m = *i;
- if( old.count(m.h) ) {
- if( old[m.h]->_id != m._id ) {
+ if( old.count(m.h) ) {
+ const ReplSetConfig::MemberCfg& oldCfg = *old[m.h];
+ if( oldCfg._id != m._id ) {
log() << "replSet reconfig error with member: " << m.h.toString() << rsLog;
uasserted(13432, "_id may not change for members");
}
+ if( oldCfg.buildIndexes != m.buildIndexes ) {
+ log() << "replSet reconfig error with member: " << m.h.toString() << rsLog;
+ uasserted(13476, "buildIndexes may not change for members");
+ }
+ /* are transitions to and from arbiterOnly guaranteed safe? if not, we should disallow here.
+ there is a test at replsets/replsetarb3.js */
+ if( oldCfg.arbiterOnly != m.arbiterOnly ) {
+ log() << "replSet reconfig error with member: " << m.h.toString() << " arbiterOnly cannot change. remove and readd the member instead " << rsLog;
+ uasserted(13510, "arbiterOnly may not change for members");
+ }
}
- if( m.h.isSelf() )
+ if( m.h.isSelf() )
me++;
}
@@ -172,24 +221,33 @@ namespace mongo {
/* TODO : MORE CHECKS HERE */
- log() << "replSet TODO : don't allow removal of a node until we handle it at the removed node end?" << endl;
+ DEV log() << "replSet TODO : don't allow removal of a node until we handle it at the removed node end?" << endl;
// we could change its votes to zero perhaps instead as a short term...
return true;
}
- void ReplSetConfig::clear() {
+ void ReplSetConfig::clear() {
version = -5;
_ok = false;
}
- void ReplSetConfig::checkRsConfig() const {
+ void ReplSetConfig::checkRsConfig() const {
uassert(13132,
- "nonmatching repl set name in _id field; check --replSet command line",
- _id == cmdLine.ourSetName());
+ "nonmatching repl set name in _id field; check --replSet command line",
+ _id == cmdLine.ourSetName());
uassert(13308, "replSet bad config version #", version > 0);
uassert(13133, "replSet bad config no members", members.size() >= 1);
- uassert(13309, "replSet bad config maximum number of members is 7 (for now)", members.size() <= 7);
+ uassert(13309, "replSet bad config maximum number of members is 12", members.size() <= 12);
+ {
+ unsigned voters = 0;
+ for( vector<MemberCfg>::const_iterator i = members.begin(); i != members.end(); ++i ) {
+ if( i->votes )
+ voters++;
+ }
+ uassert(13612, "replSet bad config maximum number of voting members is 7", voters <= 7);
+ uassert(13613, "replSet bad config no voting members", voters > 0);
+ }
}
void ReplSetConfig::from(BSONObj o) {
@@ -213,7 +271,8 @@ namespace mongo {
if( settings["heartbeatTimeout"].ok() )
ho.heartbeatTimeoutMillis = (unsigned) (settings["heartbeatTimeout"].Number() * 1000);
ho.check();
- try { getLastErrorDefaults = settings["getLastErrorDefaults"].Obj().copy(); } catch(...) { }
+ try { getLastErrorDefaults = settings["getLastErrorDefaults"].Obj().copy(); }
+ catch(...) { }
}
set<string> hosts;
@@ -231,43 +290,57 @@ namespace mongo {
BSONObj mobj = members[i].Obj();
MemberCfg m;
try {
- static const string legal[] = {"_id","votes","priority","host","hidden","slaveDelay","arbiterOnly"};
- static const set<string> legals(legal, legal + 7);
+ static const string legal[] = {
+ "_id","votes","priority","host", "hidden","slaveDelay",
+ "arbiterOnly","buildIndexes","tags","initialSync"
+ };
+ static const set<string> legals(legal, legal + 10);
assertOnlyHas(mobj, legals);
- try {
+ try {
m._id = (int) mobj["_id"].Number();
- } catch(...) {
+ }
+ catch(...) {
/* TODO: use of string exceptions may be problematic for reconfig case! */
- throw "_id must be numeric";
+ throw "_id must be numeric";
}
string s;
try {
s = mobj["host"].String();
m.h = HostAndPort(s);
}
- catch(...) {
+ catch(...) {
throw string("bad or missing host field? ") + mobj.toString();
}
- if( m.h.isLocalHost() )
+ if( m.h.isLocalHost() )
localhosts++;
m.arbiterOnly = mobj.getBoolField("arbiterOnly");
m.slaveDelay = mobj["slaveDelay"].numberInt();
if( mobj.hasElement("hidden") )
m.hidden = mobj.getBoolField("hidden");
+ if( mobj.hasElement("buildIndexes") )
+ m.buildIndexes = mobj.getBoolField("buildIndexes");
if( mobj.hasElement("priority") )
m.priority = mobj["priority"].Number();
if( mobj.hasElement("votes") )
m.votes = (unsigned) mobj["votes"].Number();
+ if( mobj.hasElement("tags") ) {
+ vector<BSONElement> v = mobj["tags"].Array();
+ for( unsigned i = 0; i < v.size(); i++ )
+ m.tags.insert( v[i].String() );
+ }
+ if( mobj.hasElement("initialSync")) {
+ m.initialSync = mobj["initialSync"].Obj().getOwned();
+ }
m.check();
}
- catch( const char * p ) {
+ catch( const char * p ) {
log() << "replSet cfg parsing exception for members[" << i << "] " << p << rsLog;
stringstream ss;
ss << "replSet members[" << i << "] " << p;
uassert(13107, ss.str(), false);
}
- catch(DBException& e) {
+ catch(DBException& e) {
log() << "replSet cfg parsing exception for members[" << i << "] " << e.what() << rsLog;
stringstream ss;
ss << "bad config for member[" << i << "] " << e.what();
@@ -289,7 +362,7 @@ namespace mongo {
uassert(13122, "bad repl set config?", expr);
}
- ReplSetConfig::ReplSetConfig(BSONObj cfg) {
+ ReplSetConfig::ReplSetConfig(BSONObj cfg) {
clear();
from(cfg);
configAssert( version < 0 /*unspecified*/ || (version >= 1 && version <= 5000) );
@@ -315,18 +388,19 @@ namespace mongo {
BSONObj cmd = BSON( "replSetHeartbeat" << setname );
int theirVersion;
BSONObj info;
+ log() << "trying to contact " << h.toString() << rsLog;
bool ok = requestHeartbeat(setname, "", h.toString(), info, -2, theirVersion);
- if( info["rs"].trueValue() ) {
+ if( info["rs"].trueValue() ) {
// yes, it is a replicate set, although perhaps not yet initialized
}
else {
if( !ok ) {
log() << "replSet TEMP !ok heartbeating " << h.toString() << " on cfg load" << rsLog;
- if( !info.isEmpty() )
+ if( !info.isEmpty() )
log() << "replSet info " << h.toString() << " : " << info.toString() << rsLog;
return;
}
- {
+ {
stringstream ss;
ss << "replSet error: member " << h.toString() << " is not in --replSet mode";
msgassertedNoTrace(13260, ss.str().c_str()); // not caught as not a user exception - we want it not caught
@@ -343,7 +417,7 @@ namespace mongo {
cfg = conn.findOne(rsConfigNs, Query()).getOwned();
count = conn.count(rsConfigNs);
}
- catch ( DBException& e) {
+ catch ( DBException& ) {
if ( !h.isSelf() ) {
throw;
}
@@ -356,14 +430,14 @@ namespace mongo {
if( count > 1 )
uasserted(13109, str::stream() << "multiple rows in " << rsConfigNs << " not supported host: " << h.toString());
-
+
if( cfg.isEmpty() ) {
version = EMPTYCONFIG;
return;
}
version = -1;
}
- catch( DBException& e) {
+ catch( DBException& e) {
version = v;
log(level) << "replSet load config couldn't get from " << h.toString() << ' ' << e.what() << rsLog;
return;
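
checkRsConfig above now allows up to 12 members but still caps voting members at 7 (and requires at least one), and legalChange() refuses per-member changes to _id, buildIndexes and arbiterOnly across a reconfig. A compact sketch of the voter-count part of that validation, using a trimmed-down MemberCfg that is not the real config class:

    #include <stdexcept>
    #include <vector>

    struct MemberCfg {          // hypothetical: only the field the check needs
        unsigned votes;
    };

    void checkVoters(const std::vector<MemberCfg>& members) {
        if (members.size() < 1 || members.size() > 12)
            throw std::runtime_error("replSet bad config: need between 1 and 12 members");
        unsigned voters = 0;
        for (std::vector<MemberCfg>::const_iterator i = members.begin(); i != members.end(); ++i)
            if (i->votes)
                voters++;
        if (voters > 7)
            throw std::runtime_error("replSet bad config: maximum number of voting members is 7");
        if (voters == 0)
            throw std::runtime_error("replSet bad config: no voting members");
    }
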
diff --git a/db/repl/rs_config.h b/db/repl/rs_config.h
index e39dad7..7d43fe6 100644
--- a/db/repl/rs_config.h
+++ b/db/repl/rs_config.h
@@ -23,7 +23,7 @@
#include "../../util/hostandport.h"
#include "health.h"
-namespace mongo {
+namespace mongo {
/* singleton config object is stored here */
const string rsConfigNs = "local.system.replset";
@@ -31,7 +31,7 @@ namespace mongo {
class ReplSetConfig {
enum { EMPTYCONFIG = -2 };
public:
- /* if something is misconfigured, throws an exception.
+ /* if something is misconfigured, throws an exception.
if couldn't be queried or is just blank, ok() will be false.
*/
ReplSetConfig(const HostAndPort& h);
@@ -41,7 +41,7 @@ namespace mongo {
bool ok() const { return _ok; }
struct MemberCfg {
- MemberCfg() : _id(-1), votes(1), priority(1.0), arbiterOnly(false), slaveDelay(0), hidden(false) { }
+ MemberCfg() : _id(-1), votes(1), priority(1.0), arbiterOnly(false), slaveDelay(0), hidden(false), buildIndexes(true) { }
int _id; /* ordinal */
unsigned votes; /* how many votes this node gets. default 1. */
HostAndPort h;
@@ -49,15 +49,17 @@ namespace mongo {
bool arbiterOnly;
int slaveDelay; /* seconds. int rather than unsigned for convenient to/from bson conversion. */
bool hidden; /* if set, don't advertise to drivers in isMaster. for non-primaries (priority 0) */
+ bool buildIndexes; /* if false, do not create any non-_id indexes */
+ set<string> tags; /* tagging for data center, rack, etc. */
+ BSONObj initialSync; /* directions for initial sync source */
void check() const; /* check validity, assert if not. */
BSONObj asBson() const;
- bool potentiallyHot() const {
- return !arbiterOnly && priority > 0;
- }
- bool operator==(const MemberCfg& r) const {
- return _id==r._id && votes == r.votes && h == r.h && priority == r.priority &&
- arbiterOnly == r.arbiterOnly && slaveDelay == r.slaveDelay && hidden == r.hidden;
+ bool potentiallyHot() const { return !arbiterOnly && priority > 0; }
+ bool operator==(const MemberCfg& r) const {
+ return _id==r._id && votes == r.votes && h == r.h && priority == r.priority &&
+ arbiterOnly == r.arbiterOnly && slaveDelay == r.slaveDelay && hidden == r.hidden &&
+ buildIndexes == r.buildIndexes;
}
bool operator!=(const MemberCfg& r) const { return !(*this == r); }
};
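To make the new MemberCfg fields concrete, the sketch below shows how a members[] entry exercising them could be written with the codebase's BSON() and BSON_ARRAY macros (assuming the bson headers are included). The host name, tag strings, and _id are made-up illustrations, not values from this commit.

    // Hypothetical members[] entry using the fields added above: priority 0
    // plus buildIndexes:false for a non-electable copy that builds only the
    // _id index, hidden so drivers are not told about it, and tags for locality.
    BSONObj memberDoc = BSON( "_id" << 3
                              << "host" << "node3.example.com:27017"
                              << "priority" << 0
                              << "hidden" << true
                              << "buildIndexes" << false
                              << "tags" << BSON_ARRAY( "dc-east" << "rack-12" ) );

Such a member keeps a full copy of the data but never creates non-_id indexes and is not advertised to drivers in isMaster.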
diff --git a/db/repl/rs_exception.h b/db/repl/rs_exception.h
index e71cad2..fc372fc 100755..100644
--- a/db/repl/rs_exception.h
+++ b/db/repl/rs_exception.h
@@ -1,15 +1,15 @@
-// @file rs_exception.h
-
-#pragma once
-
-namespace mongo {
-
- class VoteException : public std::exception {
+// @file rs_exception.h
+
+#pragma once
+
+namespace mongo {
+
+ class VoteException : public std::exception {
public:
- const char * what() const throw () { return "VoteException"; }
+ const char * what() const throw () { return "VoteException"; }
};
- class RetryAfterSleepException : public std::exception {
+ class RetryAfterSleepException : public std::exception {
public:
const char * what() const throw () { return "RetryAfterSleepException"; }
};
diff --git a/db/repl/rs_initialsync.cpp b/db/repl/rs_initialsync.cpp
index 3851c66..5a54059 100644
--- a/db/repl/rs_initialsync.cpp
+++ b/db/repl/rs_initialsync.cpp
@@ -15,6 +15,7 @@
*/
#include "pch.h"
+#include "../repl.h"
#include "../client.h"
#include "../../client/dbclient.h"
#include "rs.h"
@@ -33,15 +34,17 @@ namespace mongo {
// add try/catch with sleep
- void isyncassert(const char *msg, bool expr) {
- if( !expr ) {
+ void isyncassert(const char *msg, bool expr) {
+ if( !expr ) {
string m = str::stream() << "initial sync " << msg;
theReplSet->sethbmsg(m, 0);
uasserted(13404, m);
}
}
- void ReplSetImpl::syncDoInitialSync() {
+ void ReplSetImpl::syncDoInitialSync() {
+ createOplog();
+
while( 1 ) {
try {
_syncDoInitialSync();
@@ -54,14 +57,14 @@ namespace mongo {
}
}
- bool cloneFrom(const char *masterHost, string& errmsg, const string& fromdb, bool logForReplication,
- bool slaveOk, bool useReplAuth, bool snapshot);
+ bool cloneFrom(const char *masterHost, string& errmsg, const string& fromdb, bool logForReplication,
+ bool slaveOk, bool useReplAuth, bool snapshot);
/* todo : progress metering to sethbmsg. */
static bool clone(const char *master, string db) {
string err;
- return cloneFrom(master, err, db, false,
- /*slaveok later can be true*/ false, true, false);
+ return cloneFrom(master, err, db, false,
+ /* slave_ok */ true, true, false);
}
void _logOpObjRS(const BSONObj& op);
@@ -71,11 +74,11 @@ namespace mongo {
static void emptyOplog() {
writelock lk(rsoplog);
Client::Context ctx(rsoplog);
- NamespaceDetails *d = nsdetails(rsoplog);
+ NamespaceDetails *d = nsdetails(rsoplog);
- // temp
- if( d && d->nrecords == 0 )
- return; // already empty, ok.
+ // temp
+ if( d && d->stats.nrecords == 0 )
+ return; // already empty, ok.
log(1) << "replSet empty oplog" << rsLog;
d->emptyCappedCollection(rsoplog);
@@ -84,10 +87,10 @@ namespace mongo {
string errmsg;
bob res;
dropCollection(rsoplog, errmsg, res);
- log() << "replSet recreated oplog so it is empty. todo optimize this..." << rsLog;
- createOplog();*/
+ log() << "replSet recreated oplog so it is empty. todo optimize this..." << rsLog;
+ createOplog();*/
- // TEMP: restart to recreate empty oplog
+ // TEMP: restart to recreate empty oplog
//log() << "replSet FATAL error during initial sync. mongod restart required." << rsLog;
//dbexit( EXIT_CLEAN );
@@ -100,106 +103,182 @@ namespace mongo {
*/
}
- void ReplSetImpl::_syncDoInitialSync() {
- sethbmsg("initial sync pending",0);
+ /**
+ * Choose a member to sync from.
+ *
+ * The initialSync option is an object with one or more of the following fields:
+ *
+ * "state" : 1|2
+ * "name" : "host"
+ * "_id" : N
+ * "optime" : t
+ *
+ * All except optime are exact matches. "optime" will find a secondary with
+ * an optime >= to the optime given.
+ */
+ */
+ const Member* ReplSetImpl::getMemberToSyncTo() {
+ BSONObj sync = myConfig().initialSync;
+ bool secondaryOnly = false, isOpTime = false;
+ char *name = 0;
+ int id = -1;
+ OpTime optime;
StateBox::SP sp = box.get();
assert( !sp.state.primary() ); // wouldn't make sense if we were.
- const Member *cp = sp.primary;
- if( cp == 0 ) {
- sethbmsg("initial sync need a member to be primary",0);
+ // if it exists, we've already checked that these fields are valid in
+ // rs_config.cpp
+ if ( !sync.isEmpty() ) {
+ if (sync.hasElement("state")) {
+ if (sync["state"].Number() == 1) {
+ if (sp.primary) {
+ sethbmsg( str::stream() << "syncing to primary: " << sp.primary->fullName(), 0);
+ return const_cast<Member*>(sp.primary);
+ }
+ else {
+ sethbmsg("couldn't clone from primary");
+ return NULL;
+ }
+ }
+ else {
+ secondaryOnly = true;
+ }
+ }
+ if (sync.hasElement("name")) {
+ name = (char*)sync["name"].valuestr();
+ }
+ if (sync.hasElement("_id")) {
+ id = (int)sync["_id"].Number();
+ }
+ if (sync.hasElement("optime")) {
+ isOpTime = true;
+ optime = sync["optime"]._opTime();
+ }
+ }
+
+ for( Member *m = head(); m; m = m->next() ) {
+ if (!m->hbinfo().up() ||
+ (m->state() != MemberState::RS_SECONDARY &&
+ m->state() != MemberState::RS_PRIMARY) ||
+ (secondaryOnly && m->state() != MemberState::RS_SECONDARY) ||
+ (id != -1 && (int)m->id() != id) ||
+ (name != 0 && strcmp(name, m->fullName().c_str()) != 0) ||
+ (isOpTime && optime >= m->hbinfo().opTime)) {
+ continue;
+ }
+
+ sethbmsg( str::stream() << "syncing to: " << m->fullName(), 0);
+ return const_cast<Member*>(m);
+ }
+
+ sethbmsg( str::stream() << "couldn't find a member matching the sync criteria: " <<
+ "\nstate? " << (secondaryOnly ? "2" : "none") <<
+ "\nname? " << (name ? name : "none") <<
+ "\n_id? " << id <<
+ "\noptime? " << optime.toStringPretty() );
+
+ return NULL;
+ }
+
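As a concrete illustration of the selection rules documented above (an assumption-laden sketch, not part of this commit), an initialSync directive in a member's config entry could look like the following; getMemberToSyncTo() would then only accept an up, readable member that is a secondary with the given name.

    // Hypothetical initialSync directive: restrict the initial-sync source to
    // the secondary named below. "state" uses the numeric MemberState values
    // (1 = primary, 2 = secondary); "_id" and "optime" criteria could be added too.
    BSONObj initialSyncSpec = BSON( "state" << 2
                                    << "name" << "node2.example.com:27018" );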
+ /**
+ * Do the initial sync for this member.
+ */
+ void ReplSetImpl::_syncDoInitialSync() {
+ sethbmsg("initial sync pending",0);
+
+ const Member *source = getMemberToSyncTo();
+ if (!source) {
+ sethbmsg("initial sync need a member to be primary or secondary to do our initial sync", 0);
sleepsecs(15);
return;
}
- string masterHostname = cp->h().toString();
+ string sourceHostname = source->h().toString();
OplogReader r;
- if( !r.connect(masterHostname) ) {
- sethbmsg( str::stream() << "initial sync couldn't connect to " << cp->h().toString() , 0);
+ if( !r.connect(sourceHostname) ) {
+ sethbmsg( str::stream() << "initial sync couldn't connect to " << source->h().toString() , 0);
sleepsecs(15);
return;
}
BSONObj lastOp = r.getLastOp(rsoplog);
- if( lastOp.isEmpty() ) {
+ if( lastOp.isEmpty() ) {
sethbmsg("initial sync couldn't read remote oplog", 0);
sleepsecs(15);
return;
}
OpTime startingTS = lastOp["ts"]._opTime();
-
- {
- /* make sure things aren't too flappy */
- sleepsecs(5);
- isyncassert( "flapping?", box.getPrimary() == cp );
- BSONObj o = r.getLastOp(rsoplog);
- isyncassert( "flapping [2]?", !o.isEmpty() );
- }
-
- sethbmsg("initial sync drop all databases", 0);
- dropAllDatabasesExceptLocal();
-// sethbmsg("initial sync drop oplog", 0);
-// emptyOplog();
-
- list<string> dbs = r.conn()->getDatabaseNames();
- for( list<string>::iterator i = dbs.begin(); i != dbs.end(); i++ ) {
- string db = *i;
- if( db != "local" ) {
- sethbmsg( str::stream() << "initial sync cloning db: " << db , 0);
- bool ok;
- {
- writelock lk(db);
- Client::Context ctx(db);
- ok = clone(masterHostname.c_str(), db);
- }
- if( !ok ) {
- sethbmsg( str::stream() << "initial sync error clone of " << db << " failed sleeping 5 minutes" ,0);
- sleepsecs(300);
- return;
+ if (replSettings.fastsync) {
+ log() << "fastsync: skipping database clone" << rsLog;
+ }
+ else {
+ sethbmsg("initial sync drop all databases", 0);
+ dropAllDatabasesExceptLocal();
+
+ sethbmsg("initial sync clone all databases", 0);
+
+ list<string> dbs = r.conn()->getDatabaseNames();
+ for( list<string>::iterator i = dbs.begin(); i != dbs.end(); i++ ) {
+ string db = *i;
+ if( db != "local" ) {
+ sethbmsg( str::stream() << "initial sync cloning db: " << db , 0);
+ bool ok;
+ {
+ writelock lk(db);
+ Client::Context ctx(db);
+ ok = clone(sourceHostname.c_str(), db);
+ }
+ if( !ok ) {
+ sethbmsg( str::stream() << "initial sync error clone of " << db << " failed sleeping 5 minutes" ,0);
+ sleepsecs(300);
+ return;
+ }
}
}
}
sethbmsg("initial sync query minValid",0);
- /* our cloned copy will be strange until we apply oplog events that occurred
+ isyncassert( "initial sync source must remain readable throughout our initial sync", source->state().readable() );
+
+ /* our cloned copy will be strange until we apply oplog events that occurred
through the process. we note that time point here. */
BSONObj minValid = r.getLastOp(rsoplog);
- assert( !minValid.isEmpty() );
+ isyncassert( "getLastOp is empty ", !minValid.isEmpty() );
OpTime mvoptime = minValid["ts"]._opTime();
assert( !mvoptime.isNull() );
- /* copy the oplog
+ /* apply relevant portion of the oplog
*/
{
- sethbmsg("initial sync copy+apply oplog");
- if( ! initialSyncOplogApplication(masterHostname, cp, startingTS, mvoptime) ) { // note we assume here that this call does not throw
+ sethbmsg("initial sync initial oplog application");
+ isyncassert( "initial sync source must remain readable throughout our initial sync [2]", source->state().readable() );
+ if( ! initialSyncOplogApplication(source, /*applyGTE*/startingTS, /*minValid*/mvoptime) ) { // note we assume here that this call does not throw
log() << "replSet initial sync failed during applyoplog" << rsLog;
emptyOplog(); // otherwise we'll be up!
- lastOpTimeWritten = OpTime();
- lastH = 0;
+ lastOpTimeWritten = OpTime();
+ lastH = 0;
log() << "replSet cleaning up [1]" << rsLog;
{
writelock lk("local.");
Client::Context cx( "local." );
- cx.db()->flushFiles(true);
+ cx.db()->flushFiles(true);
}
log() << "replSet cleaning up [2]" << rsLog;
- sleepsecs(2);
+ sleepsecs(5);
return;
}
}
sethbmsg("initial sync finishing up",0);
-
+
assert( !box.getState().primary() ); // wouldn't make sense if we were.
{
writelock lk("local.");
Client::Context cx( "local." );
- cx.db()->flushFiles(true);
+ cx.db()->flushFiles(true);
try {
log() << "replSet set minValid=" << minValid["ts"]._opTime().toString() << rsLog;
}
diff --git a/db/repl/rs_initiate.cpp b/db/repl/rs_initiate.cpp
index 9c74be0..cf1941f 100644
--- a/db/repl/rs_initiate.cpp
+++ b/db/repl/rs_initiate.cpp
@@ -26,47 +26,63 @@
#include "rs.h"
#include "rs_config.h"
#include "../dbhelpers.h"
+#include "../oplog.h"
using namespace bson;
using namespace mongoutils;
-namespace mongo {
+namespace mongo {
/* called on a reconfig AND on initiate
- throws
+ throws
@param initial true when initiating
*/
void checkMembersUpForConfigChange(const ReplSetConfig& cfg, bool initial) {
int failures = 0;
int me = 0;
+ stringstream selfs;
for( vector<ReplSetConfig::MemberCfg>::const_iterator i = cfg.members.begin(); i != cfg.members.end(); i++ ) {
if( i->h.isSelf() ) {
me++;
- if( !i->potentiallyHot() ) {
+ if( me > 1 )
+ selfs << ',';
+ selfs << i->h.toString();
+ if( !i->potentiallyHot() ) {
uasserted(13420, "initiation and reconfiguration of a replica set must be sent to a node that can become primary");
}
}
}
- uassert(13278, "bad config - dups?", me <= 1); // dups?
- uassert(13279, "can't find self in the replset config", me == 1);
+ uassert(13278, "bad config: isSelf is true for multiple hosts: " + selfs.str(), me <= 1); // dups?
+ if( me != 1 ) {
+ stringstream ss;
+ ss << "can't find self in the replset config";
+ if( !cmdLine.isDefaultPort() ) ss << " my port: " << cmdLine.port;
+ if( me != 0 ) ss << " found: " << me;
+ uasserted(13279, ss.str());
+ }
for( vector<ReplSetConfig::MemberCfg>::const_iterator i = cfg.members.begin(); i != cfg.members.end(); i++ ) {
+ // we know we're up
+ if (i->h.isSelf()) {
+ continue;
+ }
+
BSONObj res;
{
bool ok = false;
try {
int theirVersion = -1000;
- ok = requestHeartbeat(cfg._id, "", i->h.toString(), res, -1, theirVersion, initial/*check if empty*/);
- if( theirVersion >= cfg.version ) {
+ ok = requestHeartbeat(cfg._id, "", i->h.toString(), res, -1, theirVersion, initial/*check if empty*/);
+ if( theirVersion >= cfg.version ) {
stringstream ss;
ss << "replSet member " << i->h.toString() << " has too new a config version (" << theirVersion << ") to reconfigure";
uasserted(13259, ss.str());
}
}
- catch(DBException& e) {
+ catch(DBException& e) {
log() << "replSet cmufcc requestHeartbeat " << i->h.toString() << " : " << e.toString() << rsLog;
}
- catch(...) {
+ catch(...) {
log() << "replSet cmufcc error exception in requestHeartbeat?" << rsLog;
}
if( res.getBoolField("mismatch") )
@@ -96,7 +112,7 @@ namespace mongo {
trying to keep change small as release is near.
*/
const Member* m = theReplSet->findById( i->_id );
- if( m ) {
+ if( m ) {
// ok, so this was an existing member (wouldn't make sense to add to config a new member that is down)
assert( m->h().toString() == i->h.toString() );
allowFailure = true;
@@ -113,24 +129,24 @@ namespace mongo {
}
if( initial ) {
bool hasData = res["hasData"].Bool();
- uassert(13311, "member " + i->h.toString() + " has data already, cannot initiate set. All members except initiator must be empty.",
- !hasData || i->h.isSelf());
+ uassert(13311, "member " + i->h.toString() + " has data already, cannot initiate set. All members except initiator must be empty.",
+ !hasData || i->h.isSelf());
}
}
}
- class CmdReplSetInitiate : public ReplSetCommand {
+ class CmdReplSetInitiate : public ReplSetCommand {
public:
virtual LockType locktype() const { return NONE; }
CmdReplSetInitiate() : ReplSetCommand("replSetInitiate") { }
- virtual void help(stringstream& h) const {
- h << "Initiate/christen a replica set.";
+ virtual void help(stringstream& h) const {
+ h << "Initiate/christen a replica set.";
h << "\nhttp://www.mongodb.org/display/DOCS/Replica+Set+Commands";
}
virtual bool run(const string& , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
log() << "replSet replSetInitiate admin command received from client" << rsLog;
- if( !replSet ) {
+ if( !replSet ) {
errmsg = "server is not running with --replSet";
return false;
}
@@ -141,12 +157,12 @@ namespace mongo {
}
{
- // just make sure we can get a write lock before doing anything else. we'll reacquire one
- // later. of course it could be stuck then, but this check lowers the risk if weird things
+ // just make sure we can get a write lock before doing anything else. we'll reacquire one
+ // later. of course it could be stuck then, but this check lowers the risk if weird things
// are up.
time_t t = time(0);
writelock lk("");
- if( time(0)-t > 10 ) {
+ if( time(0)-t > 10 ) {
errmsg = "took a long time to get write lock, so not initiating. Initiate when server less busy?";
return false;
}
@@ -155,7 +171,7 @@ namespace mongo {
it is ok if the initiating member has *other* data than that.
*/
BSONObj o;
- if( Helpers::getFirst(rsoplog, o) ) {
+ if( Helpers::getFirst(rsoplog, o) ) {
errmsg = rsoplog + string(" is not empty on the initiating member. cannot initiate.");
return false;
}
@@ -194,7 +210,7 @@ namespace mongo {
configObj = b.obj();
log() << "replSet created this configuration for initiation : " << configObj.toString() << rsLog;
}
- else {
+ else {
configObj = cmdObj["replSetInitiate"].Obj();
}
@@ -203,7 +219,7 @@ namespace mongo {
ReplSetConfig newConfig(configObj);
parsed = true;
- if( newConfig.version > 1 ) {
+ if( newConfig.version > 1 ) {
errmsg = "can't initiate with a version number greater than 1";
return false;
}
@@ -214,6 +230,8 @@ namespace mongo {
log() << "replSet replSetInitiate all members seem up" << rsLog;
+ createOplog();
+
writelock lk("");
bo comment = BSON( "msg" << "initiating set");
newConfig.saveConfigLocally(comment);
@@ -222,9 +240,9 @@ namespace mongo {
ReplSet::startupStatus = ReplSet::SOON;
ReplSet::startupStatusMsg = "Received replSetInitiate - should come online shortly.";
}
- catch( DBException& e ) {
+ catch( DBException& e ) {
log() << "replSet replSetInitiate exception: " << e.what() << rsLog;
- if( !parsed )
+ if( !parsed )
errmsg = string("couldn't parse cfg object ") + e.what();
else
errmsg = string("couldn't initiate : ") + e.what();
diff --git a/db/repl/rs_member.h b/db/repl/rs_member.h
index 099cb22..017b6ea 100644
--- a/db/repl/rs_member.h
+++ b/db/repl/rs_member.h
@@ -30,18 +30,18 @@ namespace mongo {
RS_FATAL something bad has occurred and server is not completely offline with regard to the replica set. fatal error.
RS_STARTUP2 loaded config, still determining who is primary
*/
- struct MemberState {
- enum MS {
- RS_STARTUP,
- RS_PRIMARY,
- RS_SECONDARY,
- RS_RECOVERING,
- RS_FATAL,
- RS_STARTUP2,
- RS_UNKNOWN, /* remote node not yet reached */
- RS_ARBITER,
- RS_DOWN, /* node not reachable for a report */
- RS_ROLLBACK
+ struct MemberState {
+ enum MS {
+ RS_STARTUP = 0,
+ RS_PRIMARY = 1,
+ RS_SECONDARY = 2,
+ RS_RECOVERING = 3,
+ RS_FATAL = 4,
+ RS_STARTUP2 = 5,
+ RS_UNKNOWN = 6, /* remote node not yet reached */
+ RS_ARBITER = 7,
+ RS_DOWN = 8, /* node not reachable for a report */
+ RS_ROLLBACK = 9
} s;
MemberState(MS ms = RS_UNKNOWN) : s(ms) { }
@@ -53,6 +53,7 @@ namespace mongo {
bool startup2() const { return s == RS_STARTUP2; }
bool fatal() const { return s == RS_FATAL; }
bool rollback() const { return s == RS_ROLLBACK; }
+ bool readable() const { return s == RS_PRIMARY || s == RS_SECONDARY; }
string toString() const;
@@ -60,9 +61,9 @@ namespace mongo {
bool operator!=(const MemberState& r) const { return s != r.s; }
};
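A short usage sketch of the new readable() helper (not from this commit; it assumes this header is included). The now-explicit enum values also line up with the numeric "state" codes accepted by the initialSync option parsed in rs_initialsync.cpp.

    // readable() is shorthand for "primary or secondary" -- the only states
    // a member can be read from or used as a sync source.
    MemberState s( MemberState::RS_SECONDARY );
    assert( s.secondary() && s.readable() );   // RS_SECONDARY == 2
    MemberState arb( MemberState::RS_ARBITER );
    assert( !arb.readable() );                 // arbiters hold no data to read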
- /* this is supposed to be just basic information on a member,
+ /* this is supposed to be just basic information on a member,
and copy constructable. */
- class HeartbeatInfo {
+ class HeartbeatInfo {
unsigned _id;
public:
HeartbeatInfo() : _id(0xffffffff),hbstate(MemberState::RS_UNKNOWN),health(-1.0),downSince(0),skew(INT_MIN) { }
@@ -88,15 +89,15 @@ namespace mongo {
bool changed(const HeartbeatInfo& old) const;
};
- inline HeartbeatInfo::HeartbeatInfo(unsigned id) : _id(id) {
+ inline HeartbeatInfo::HeartbeatInfo(unsigned id) : _id(id) {
hbstate = MemberState::RS_UNKNOWN;
health = -1.0;
downSince = 0;
- lastHeartbeat = upSince = 0;
+ lastHeartbeat = upSince = 0;
skew = INT_MIN;
}
- inline bool HeartbeatInfo::changed(const HeartbeatInfo& old) const {
+ inline bool HeartbeatInfo::changed(const HeartbeatInfo& old) const {
return health != old.health ||
hbstate != old.hbstate;
}
diff --git a/db/repl/rs_optime.h b/db/repl/rs_optime.h
index b3607fa..f0ca569 100644
--- a/db/repl/rs_optime.h
+++ b/db/repl/rs_optime.h
@@ -1,58 +1,58 @@
-// @file rs_optime.h
-
-/*
- * Copyright (C) 2010 10gen Inc.
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the GNU Affero General Public License, version 3,
- * as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU Affero General Public License for more details.
- *
- * You should have received a copy of the GNU Affero General Public License
- * along with this program. If not, see <http://www.gnu.org/licenses/>.
- */
-
-#pragma once
-
-#include "../../util/optime.h"
-
-namespace mongo {
-
+// @file rs_optime.h
+
+/*
+ * Copyright (C) 2010 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "../../util/optime.h"
+
+namespace mongo {
+
const char rsoplog[] = "local.oplog.rs";
-
- /*
- class RSOpTime : public OpTime {
- public:
- bool initiated() const { return getSecs() != 0; }
- };*/
-
- /*struct RSOpTime {
- unsigned long long ord;
-
- RSOpTime() : ord(0) { }
-
- bool initiated() const { return ord > 0; }
-
- void initiate() {
- assert( !initiated() );
- ord = 1000000;
- }
-
- ReplTime inc() {
- DEV assertInWriteLock();
- return ++ord;
- }
-
- string toString() const { return str::stream() << ord; }
-
- // query the oplog and set the highest value herein. acquires a db read lock. throws.
- void load();
- };
-
- extern RSOpTime rsOpTime;*/
-
-}
+
+ /*
+ class RSOpTime : public OpTime {
+ public:
+ bool initiated() const { return getSecs() != 0; }
+ };*/
+
+ /*struct RSOpTime {
+ unsigned long long ord;
+
+ RSOpTime() : ord(0) { }
+
+ bool initiated() const { return ord > 0; }
+
+ void initiate() {
+ assert( !initiated() );
+ ord = 1000000;
+ }
+
+ ReplTime inc() {
+ DEV assertInWriteLock();
+ return ++ord;
+ }
+
+ string toString() const { return str::stream() << ord; }
+
+ // query the oplog and set the highest value herein. acquires a db read lock. throws.
+ void load();
+ };
+
+ extern RSOpTime rsOpTime;*/
+
+}
diff --git a/db/repl/rs_rollback.cpp b/db/repl/rs_rollback.cpp
index 6b2544c..0b4cc28 100644
--- a/db/repl/rs_rollback.cpp
+++ b/db/repl/rs_rollback.cpp
@@ -1,5 +1,5 @@
/* @file rs_rollback.cpp
-*
+*
* Copyright (C) 2008 10gen Inc.
*
* This program is free software: you can redistribute it and/or modify
@@ -25,7 +25,7 @@
/* Scenarios
We went offline with ops not replicated out.
-
+
F = node that failed and coming back.
P = node that took over, new primary
@@ -33,11 +33,11 @@
F : a b c d e f g
P : a b c d q
- The design is "keep P". One could argue here that "keep F" has some merits, however, in most cases P
- will have significantly more data. Also note that P may have a proper subset of F's stream if there were
+ The design is "keep P". One could argue here that "keep F" has some merits, however, in most cases P
+ will have significantly more data. Also note that P may have a proper subset of F's stream if there were
no subsequent writes.
- For now the model is simply : get F back in sync with P. If P was really behind or something, we should have
+ For now the model is simply : get F back in sync with P. If P was really behind or something, we should have
just chosen not to fail over anyway.
#2:
@@ -50,9 +50,9 @@
Steps
find an event in common. 'd'.
- undo our events beyond that by:
+ undo our events beyond that by:
(1) taking copy from other server of those objects
- (2) do not consider copy valid until we pass reach an optime after when we fetched the new version of object
+ (2) do not consider copy valid until we reach an optime after the point when we fetched the new version of the object
-- i.e., reset minvalid.
(3) we could skip operations on objects that are previous in time to our capture of the object as an optimization.
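The following toy program (purely illustrative, not code from this commit) walks the two example streams above backward to locate the common event 'd'; integer timestamps stand in for optimes, and the newer side is stepped back first, which is the same idea syncRollbackFindCommonPoint implements against the real oplogs further down.

    // Toy common-point search over the example streams F and P.
    #include <iostream>
    #include <vector>
    struct Op { int ts; char id; };
    int main() {
        std::vector<Op> F, P;                          // F: failed node, P: new primary
        int fts[] = { 1,2,3,4,5,6,7 }; const char *fid = "abcdefg";
        int pts[] = { 1,2,3,4,8 };     const char *pid = "abcdq";
        for( int k = 0; k < 7; k++ ) { Op o = { fts[k], fid[k] }; F.push_back(o); }
        for( int k = 0; k < 5; k++ ) { Op o = { pts[k], pid[k] }; P.push_back(o); }
        size_t i = F.size() - 1, j = P.size() - 1;
        while( F[i].ts != P[j].ts || F[i].id != P[j].id ) {
            if( F[i].ts >= P[j].ts ) i--; else j--;    // step back the newer side
        }
        std::cout << "common point: " << F[i].id << std::endl;   // prints: d
        // ops e, f, g on F lie beyond the common point and would be rolled back
        return 0;
    }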
@@ -65,15 +65,15 @@ namespace mongo {
bool copyCollectionFromRemote(const string& host, const string& ns, const BSONObj& query, string& errmsg, bool logforrepl);
void incRBID();
- class rsfatal : public std::exception {
+ class rsfatal : public std::exception {
public:
- virtual const char* what() const throw(){ return "replica set fatal exception"; }
+ virtual const char* what() const throw() { return "replica set fatal exception"; }
};
struct DocID {
const char *ns;
be _id;
- bool operator<(const DocID& d) const {
+ bool operator<(const DocID& d) const {
int c = strcmp(ns, d.ns);
if( c < 0 ) return true;
if( c > 0 ) return false;
@@ -82,7 +82,7 @@ namespace mongo {
};
struct HowToFixUp {
- /* note this is a set -- if there are many $inc's on a single document we need to rollback, we only
+ /* note this is a set -- if there are many $inc's on a single document we need to rollback, we only
need to refetch it once. */
set<DocID> toRefetch;
@@ -97,9 +97,9 @@ namespace mongo {
int rbid; // remote server's current rollback sequence #
};
- static void refetch(HowToFixUp& h, const BSONObj& ourObj) {
+ static void refetch(HowToFixUp& h, const BSONObj& ourObj) {
const char *op = ourObj.getStringField("op");
- if( *op == 'n' )
+ if( *op == 'n' )
return;
unsigned long long totSize = 0;
@@ -108,53 +108,54 @@ namespace mongo {
throw "rollback too large";
DocID d;
+ // NOTE The assigned ns value may become invalid if we yield.
d.ns = ourObj.getStringField("ns");
- if( *d.ns == 0 ) {
+ if( *d.ns == 0 ) {
log() << "replSet WARNING ignoring op on rollback no ns TODO : " << ourObj.toString() << rsLog;
return;
}
bo o = ourObj.getObjectField(*op=='u' ? "o2" : "o");
- if( o.isEmpty() ) {
+ if( o.isEmpty() ) {
log() << "replSet warning ignoring op on rollback : " << ourObj.toString() << rsLog;
return;
}
- if( *op == 'c' ) {
+ if( *op == 'c' ) {
be first = o.firstElement();
NamespaceString s(d.ns); // foo.$cmd
string cmdname = first.fieldName();
Command *cmd = Command::findCommand(cmdname.c_str());
- if( cmd == 0 ) {
+ if( cmd == 0 ) {
log() << "replSet warning rollback no suchcommand " << first.fieldName() << " - different mongod versions perhaps?" << rsLog;
return;
}
else {
/* findandmodify - translated?
- godinsert?,
+ godinsert?,
renamecollection a->b. just resync a & b
*/
if( cmdname == "create" ) {
- /* Create collection operation
- { ts: ..., h: ..., op: "c", ns: "foo.$cmd", o: { create: "abc", ... } }
+ /* Create collection operation
+ { ts: ..., h: ..., op: "c", ns: "foo.$cmd", o: { create: "abc", ... } }
*/
string ns = s.db + '.' + o["create"].String(); // -> foo.abc
h.toDrop.insert(ns);
return;
}
- else if( cmdname == "drop" ) {
+ else if( cmdname == "drop" ) {
string ns = s.db + '.' + first.valuestr();
h.collectionsToResync.insert(ns);
return;
}
- else if( cmdname == "dropIndexes" || cmdname == "deleteIndexes" ) {
+ else if( cmdname == "dropIndexes" || cmdname == "deleteIndexes" ) {
/* TODO: this is bad. we simply full resync the collection here, which could be very slow. */
log() << "replSet info rollback of dropIndexes is slow in this version of mongod" << rsLog;
string ns = s.db + '.' + first.valuestr();
h.collectionsToResync.insert(ns);
return;
}
- else if( cmdname == "renameCollection" ) {
+ else if( cmdname == "renameCollection" ) {
/* TODO: slow. */
log() << "replSet info rollback of renameCollection is slow in this version of mongod" << rsLog;
string from = first.valuestr();
@@ -163,15 +164,15 @@ namespace mongo {
h.collectionsToResync.insert(to);
return;
}
- else if( cmdname == "reIndex" ) {
+ else if( cmdname == "reIndex" ) {
return;
}
- else if( cmdname == "dropDatabase" ) {
+ else if( cmdname == "dropDatabase" ) {
log() << "replSet error rollback : can't rollback drop database full resync will be required" << rsLog;
log() << "replSet " << o.toString() << rsLog;
throw rsfatal();
}
- else {
+ else {
log() << "replSet error can't rollback this command yet: " << o.toString() << rsLog;
log() << "replSet cmdname=" << cmdname << rsLog;
throw rsfatal();
@@ -190,15 +191,15 @@ namespace mongo {
int getRBID(DBClientConnection*);
- static void syncRollbackFindCommonPoint(DBClientConnection *them, HowToFixUp& h) {
+ static void syncRollbackFindCommonPoint(DBClientConnection *them, HowToFixUp& h) {
static time_t last;
- if( time(0)-last < 60 ) {
+ if( time(0)-last < 60 ) {
throw "findcommonpoint waiting a while before trying again";
}
last = time(0);
assert( dbMutex.atLeastReadLocked() );
- Client::Context c(rsoplog, dbpath, 0, false);
+ Client::Context c(rsoplog);
NamespaceDetails *nsd = nsdetails(rsoplog);
assert(nsd);
ReverseCappedCursor u(nsd);
@@ -226,7 +227,7 @@ namespace mongo {
log() << "replSet info rollback our last optime: " << ourTime.toStringPretty() << rsLog;
log() << "replSet info rollback their last optime: " << theirTime.toStringPretty() << rsLog;
log() << "replSet info rollback diff in end of log times: " << diff << " seconds" << rsLog;
- if( diff > 3600 ) {
+ if( diff > 3600 ) {
log() << "replSet rollback too long a time period for a rollback." << rsLog;
throw "error not willing to roll back more than one hour of data";
}
@@ -236,8 +237,8 @@ namespace mongo {
while( 1 ) {
scanned++;
/* todo add code to assure no excessive scanning for too long */
- if( ourTime == theirTime ) {
- if( ourObj["h"].Long() == theirObj["h"].Long() ) {
+ if( ourTime == theirTime ) {
+ if( ourObj["h"].Long() == theirObj["h"].Long() ) {
// found the point back in time where we match.
// todo : check a few more just to be careful about hash collisions.
log() << "replSet rollback found matching events at " << ourTime.toStringPretty() << rsLog;
@@ -249,7 +250,7 @@ namespace mongo {
refetch(h, ourObj);
- if( !t->more() ) {
+ if( !t->more() ) {
log() << "replSet rollback error RS100 reached beginning of remote oplog" << rsLog;
log() << "replSet them: " << them->toString() << " scanned: " << scanned << rsLog;
log() << "replSet theirTime: " << theirTime.toStringLong() << rsLog;
@@ -270,8 +271,8 @@ namespace mongo {
ourObj = u.current();
ourTime = ourObj["ts"]._opTime();
}
- else if( theirTime > ourTime ) {
- if( !t->more() ) {
+ else if( theirTime > ourTime ) {
+ if( !t->more() ) {
log() << "replSet rollback error RS100 reached beginning of remote oplog" << rsLog;
log() << "replSet them: " << them->toString() << " scanned: " << scanned << rsLog;
log() << "replSet theirTime: " << theirTime.toStringLong() << rsLog;
@@ -281,11 +282,11 @@ namespace mongo {
theirObj = t->nextSafe();
theirTime = theirObj["ts"]._opTime();
}
- else {
+ else {
// theirTime < ourTime
refetch(h, ourObj);
u.advance();
- if( !u.ok() ) {
+ if( !u.ok() ) {
log() << "replSet rollback error RS101 reached beginning of local oplog" << rsLog;
log() << "replSet them: " << them->toString() << " scanned: " << scanned << rsLog;
log() << "replSet theirTime: " << theirTime.toStringLong() << rsLog;
@@ -298,299 +299,303 @@ namespace mongo {
}
}
- struct X {
+ struct X {
const bson::bo *op;
bson::bo goodVersionOfObject;
};
- static void setMinValid(bo newMinValid) {
- try {
- log() << "replSet minvalid=" << newMinValid["ts"]._opTime().toStringLong() << rsLog;
- }
- catch(...) { }
- {
- Helpers::putSingleton("local.replset.minvalid", newMinValid);
- Client::Context cx( "local." );
- cx.db()->flushFiles(true);
- }
+ static void setMinValid(bo newMinValid) {
+ try {
+ log() << "replSet minvalid=" << newMinValid["ts"]._opTime().toStringLong() << rsLog;
+ }
+ catch(...) { }
+ {
+ Helpers::putSingleton("local.replset.minvalid", newMinValid);
+ Client::Context cx( "local." );
+ cx.db()->flushFiles(true);
+ }
}
void ReplSetImpl::syncFixUp(HowToFixUp& h, OplogReader& r) {
- DBClientConnection *them = r.conn();
-
- // fetch all first so we needn't handle interruption in a fancy way
-
- unsigned long long totSize = 0;
-
- list< pair<DocID,bo> > goodVersions;
-
- bo newMinValid;
-
- /* fetch all the goodVersions of each document from current primary */
- DocID d;
- unsigned long long n = 0;
- try {
- for( set<DocID>::iterator i = h.toRefetch.begin(); i != h.toRefetch.end(); i++ ) {
- d = *i;
-
- assert( !d._id.eoo() );
-
- {
- /* TODO : slow. lots of round trips. */
- n++;
- bo good= them->findOne(d.ns, d._id.wrap()).getOwned();
- totSize += good.objsize();
- uassert( 13410, "replSet too much data to roll back", totSize < 300 * 1024 * 1024 );
-
- // note good might be eoo, indicating we should delete it
- goodVersions.push_back(pair<DocID,bo>(d,good));
- }
- }
- newMinValid = r.getLastOp(rsoplog);
- if( newMinValid.isEmpty() ) {
- sethbmsg("rollback error newMinValid empty?");
- return;
- }
- }
- catch(DBException& e) {
- sethbmsg(str::stream() << "rollback re-get objects: " << e.toString(),0);
- log() << "rollback couldn't re-get ns:" << d.ns << " _id:" << d._id << ' ' << n << '/' << h.toRefetch.size() << rsLog;
- throw e;
- }
-
- MemoryMappedFile::flushAll(true);
-
- sethbmsg("rollback 3.5");
- if( h.rbid != getRBID(r.conn()) ) {
- // our source rolled back itself. so the data we received isn't necessarily consistent.
- sethbmsg("rollback rbid on source changed during rollback, cancelling this attempt");
- return;
- }
-
- // update them
- sethbmsg(str::stream() << "rollback 4 n:" << goodVersions.size());
-
- bool warn = false;
-
- assert( !h.commonPointOurDiskloc.isNull() );
-
- dbMutex.assertWriteLocked();
-
- /* we have items we are writing that aren't from a point-in-time. thus best not to come online
- until we get to that point in freshness. */
- setMinValid(newMinValid);
-
- /** any full collection resyncs required? */
- if( !h.collectionsToResync.empty() ) {
- for( set<string>::iterator i = h.collectionsToResync.begin(); i != h.collectionsToResync.end(); i++ ) {
- string ns = *i;
- sethbmsg(str::stream() << "rollback 4.1 coll resync " << ns);
- Client::Context c(*i, dbpath, 0, /*doauth*/false);
- try {
- bob res;
- string errmsg;
- dropCollection(ns, errmsg, res);
- {
- dbtemprelease r;
- bool ok = copyCollectionFromRemote(them->getServerAddress(), ns, bo(), errmsg, false);
- if( !ok ) {
- log() << "replSet rollback error resyncing collection " << ns << ' ' << errmsg << rsLog;
- throw "rollback error resyncing rollection [1]";
- }
- }
- }
- catch(...) {
- log() << "replset rollback error resyncing collection " << ns << rsLog;
- throw "rollback error resyncing rollection [2]";
- }
- }
-
- /* we did more reading from primary, so check it again for a rollback (which would mess us up), and
- make minValid newer.
- */
- sethbmsg("rollback 4.2");
- {
- string err;
- try {
- newMinValid = r.getLastOp(rsoplog);
- if( newMinValid.isEmpty() ) {
- err = "can't get minvalid from primary";
- } else {
- setMinValid(newMinValid);
- }
- }
- catch(...) {
- err = "can't get/set minvalid";
- }
- if( h.rbid != getRBID(r.conn()) ) {
- // our source rolled back itself. so the data we received isn't necessarily consistent.
- // however, we've now done writes. thus we have a problem.
- err += "rbid at primary changed during resync/rollback";
- }
- if( !err.empty() ) {
- log() << "replSet error rolling back : " << err << ". A full resync will be necessary." << rsLog;
- /* todo: reset minvalid so that we are permanently in fatal state */
- /* todo: don't be fatal, but rather, get all the data first. */
- sethbmsg("rollback error");
- throw rsfatal();
- }
- }
- sethbmsg("rollback 4.3");
- }
-
- sethbmsg("rollback 4.6");
- /** drop collections to drop before doing individual fixups - that might make things faster below actually if there were subsequent inserts to rollback */
- for( set<string>::iterator i = h.toDrop.begin(); i != h.toDrop.end(); i++ ) {
- Client::Context c(*i, dbpath, 0, /*doauth*/false);
- try {
- bob res;
- string errmsg;
- log(1) << "replSet rollback drop: " << *i << rsLog;
- dropCollection(*i, errmsg, res);
- }
- catch(...) {
- log() << "replset rollback error dropping collection " << *i << rsLog;
- }
- }
-
- sethbmsg("rollback 4.7");
- Client::Context c(rsoplog, dbpath, 0, /*doauth*/false);
- NamespaceDetails *oplogDetails = nsdetails(rsoplog);
- uassert(13423, str::stream() << "replSet error in rollback can't find " << rsoplog, oplogDetails);
-
- map<string,shared_ptr<RemoveSaver> > removeSavers;
-
- unsigned deletes = 0, updates = 0;
- for( list<pair<DocID,bo> >::iterator i = goodVersions.begin(); i != goodVersions.end(); i++ ) {
- const DocID& d = i->first;
- bo pattern = d._id.wrap(); // { _id : ... }
- try {
- assert( d.ns && *d.ns );
- if( h.collectionsToResync.count(d.ns) ) {
- /* we just synced this entire collection */
- continue;
- }
-
- /* keep an archive of items rolled back */
- shared_ptr<RemoveSaver>& rs = removeSavers[d.ns];
- if ( ! rs )
- rs.reset( new RemoveSaver( "rollback" , "" , d.ns ) );
-
- // todo: lots of overhead in context, this can be faster
- Client::Context c(d.ns, dbpath, 0, /*doauth*/false);
- if( i->second.isEmpty() ) {
- // wasn't on the primary; delete.
- /* TODO1.6 : can't delete from a capped collection. need to handle that here. */
- deletes++;
-
- NamespaceDetails *nsd = nsdetails(d.ns);
- if( nsd ) {
- if( nsd->capped ) {
- /* can't delete from a capped collection - so we truncate instead. if this item must go,
- so must all successors!!! */
- try {
- /** todo: IIRC cappedTrunateAfter does not handle completely empty. todo. */
- // this will crazy slow if no _id index.
- long long start = Listener::getElapsedTimeMillis();
- DiskLoc loc = Helpers::findOne(d.ns, pattern, false);
- if( Listener::getElapsedTimeMillis() - start > 200 )
- log() << "replSet warning roll back slow no _id index for " << d.ns << " perhaps?" << rsLog;
- //would be faster but requires index: DiskLoc loc = Helpers::findById(nsd, pattern);
- if( !loc.isNull() ) {
- try {
- nsd->cappedTruncateAfter(d.ns, loc, true);
- }
- catch(DBException& e) {
- if( e.getCode() == 13415 ) {
- // hack: need to just make cappedTruncate do this...
- nsd->emptyCappedCollection(d.ns);
- } else {
- throw;
- }
- }
- }
- }
- catch(DBException& e) {
- log() << "replSet error rolling back capped collection rec " << d.ns << ' ' << e.toString() << rsLog;
- }
- }
- else {
- try {
- deletes++;
- deleteObjects(d.ns, pattern, /*justone*/true, /*logop*/false, /*god*/true, rs.get() );
- }
- catch(...) {
- log() << "replSet error rollback delete failed ns:" << d.ns << rsLog;
- }
- }
- // did we just empty the collection? if so let's check if it even exists on the source.
- if( nsd->nrecords == 0 ) {
- try {
- string sys = cc().database()->name + ".system.namespaces";
- bo o = them->findOne(sys, QUERY("name"<<d.ns));
- if( o.isEmpty() ) {
- // we should drop
- try {
- bob res;
- string errmsg;
- dropCollection(d.ns, errmsg, res);
- }
- catch(...) {
- log() << "replset error rolling back collection " << d.ns << rsLog;
- }
- }
- }
- catch(DBException& ) {
- /* this isn't *that* big a deal, but is bad. */
- log() << "replSet warning rollback error querying for existence of " << d.ns << " at the primary, ignoring" << rsLog;
- }
- }
- }
- }
- else {
- // todo faster...
- OpDebug debug;
- updates++;
- _updateObjects(/*god*/true, d.ns, i->second, pattern, /*upsert=*/true, /*multi=*/false , /*logtheop=*/false , debug, rs.get() );
- }
- }
- catch(DBException& e) {
- log() << "replSet exception in rollback ns:" << d.ns << ' ' << pattern.toString() << ' ' << e.toString() << " ndeletes:" << deletes << rsLog;
- warn = true;
- }
- }
-
- removeSavers.clear(); // this effectively closes all of them
-
- sethbmsg(str::stream() << "rollback 5 d:" << deletes << " u:" << updates);
- MemoryMappedFile::flushAll(true);
- sethbmsg("rollback 6");
-
- // clean up oplog
- log(2) << "replSet rollback truncate oplog after " << h.commonPoint.toStringPretty() << rsLog;
- // todo: fatal error if this throws?
- oplogDetails->cappedTruncateAfter(rsoplog, h.commonPointOurDiskloc, false);
-
- /* reset cached lastoptimewritten and h value */
- loadLastOpTimeWritten();
-
- sethbmsg("rollback 7");
- MemoryMappedFile::flushAll(true);
-
- // done
- if( warn )
- sethbmsg("issues during syncRollback, see log");
- else
- sethbmsg("rollback done");
- }
-
- void ReplSetImpl::syncRollback(OplogReader&r) {
+ DBClientConnection *them = r.conn();
+
+ // fetch all first so we needn't handle interruption in a fancy way
+
+ unsigned long long totSize = 0;
+
+ list< pair<DocID,bo> > goodVersions;
+
+ bo newMinValid;
+
+ /* fetch all the goodVersions of each document from current primary */
+ DocID d;
+ unsigned long long n = 0;
+ try {
+ for( set<DocID>::iterator i = h.toRefetch.begin(); i != h.toRefetch.end(); i++ ) {
+ d = *i;
+
+ assert( !d._id.eoo() );
+
+ {
+ /* TODO : slow. lots of round trips. */
+ n++;
+ bo good= them->findOne(d.ns, d._id.wrap()).getOwned();
+ totSize += good.objsize();
+ uassert( 13410, "replSet too much data to roll back", totSize < 300 * 1024 * 1024 );
+
+ // note good might be eoo, indicating we should delete it
+ goodVersions.push_back(pair<DocID,bo>(d,good));
+ }
+ }
+ newMinValid = r.getLastOp(rsoplog);
+ if( newMinValid.isEmpty() ) {
+ sethbmsg("rollback error newMinValid empty?");
+ return;
+ }
+ }
+ catch(DBException& e) {
+ sethbmsg(str::stream() << "rollback re-get objects: " << e.toString(),0);
+ log() << "rollback couldn't re-get ns:" << d.ns << " _id:" << d._id << ' ' << n << '/' << h.toRefetch.size() << rsLog;
+ throw e;
+ }
+
+ MemoryMappedFile::flushAll(true);
+
+ sethbmsg("rollback 3.5");
+ if( h.rbid != getRBID(r.conn()) ) {
+ // our source rolled back itself. so the data we received isn't necessarily consistent.
+ sethbmsg("rollback rbid on source changed during rollback, cancelling this attempt");
+ return;
+ }
+
+ // update them
+ sethbmsg(str::stream() << "rollback 4 n:" << goodVersions.size());
+
+ bool warn = false;
+
+ assert( !h.commonPointOurDiskloc.isNull() );
+
+ dbMutex.assertWriteLocked();
+
+ /* we have items we are writing that aren't from a point-in-time. thus best not to come online
+ until we get to that point in freshness. */
+ setMinValid(newMinValid);
+
+ /** any full collection resyncs required? */
+ if( !h.collectionsToResync.empty() ) {
+ for( set<string>::iterator i = h.collectionsToResync.begin(); i != h.collectionsToResync.end(); i++ ) {
+ string ns = *i;
+ sethbmsg(str::stream() << "rollback 4.1 coll resync " << ns);
+ Client::Context c(*i);
+ try {
+ bob res;
+ string errmsg;
+ dropCollection(ns, errmsg, res);
+ {
+ dbtemprelease r;
+ bool ok = copyCollectionFromRemote(them->getServerAddress(), ns, bo(), errmsg, false);
+ if( !ok ) {
+ log() << "replSet rollback error resyncing collection " << ns << ' ' << errmsg << rsLog;
+ throw "rollback error resyncing rollection [1]";
+ }
+ }
+ }
+ catch(...) {
+ log() << "replset rollback error resyncing collection " << ns << rsLog;
+ throw "rollback error resyncing rollection [2]";
+ }
+ }
+
+ /* we did more reading from primary, so check it again for a rollback (which would mess us up), and
+ make minValid newer.
+ */
+ sethbmsg("rollback 4.2");
+ {
+ string err;
+ try {
+ newMinValid = r.getLastOp(rsoplog);
+ if( newMinValid.isEmpty() ) {
+ err = "can't get minvalid from primary";
+ }
+ else {
+ setMinValid(newMinValid);
+ }
+ }
+ catch(...) {
+ err = "can't get/set minvalid";
+ }
+ if( h.rbid != getRBID(r.conn()) ) {
+ // our source rolled back itself. so the data we received isn't necessarily consistent.
+ // however, we've now done writes. thus we have a problem.
+ err += "rbid at primary changed during resync/rollback";
+ }
+ if( !err.empty() ) {
+ log() << "replSet error rolling back : " << err << ". A full resync will be necessary." << rsLog;
+ /* todo: reset minvalid so that we are permanently in fatal state */
+ /* todo: don't be fatal, but rather, get all the data first. */
+ sethbmsg("rollback error");
+ throw rsfatal();
+ }
+ }
+ sethbmsg("rollback 4.3");
+ }
+
+ sethbmsg("rollback 4.6");
+ /** drop collections to drop before doing individual fixups - that might make things faster below actually if there were subsequent inserts to rollback */
+ for( set<string>::iterator i = h.toDrop.begin(); i != h.toDrop.end(); i++ ) {
+ Client::Context c(*i);
+ try {
+ bob res;
+ string errmsg;
+ log(1) << "replSet rollback drop: " << *i << rsLog;
+ dropCollection(*i, errmsg, res);
+ }
+ catch(...) {
+ log() << "replset rollback error dropping collection " << *i << rsLog;
+ }
+ }
+
+ sethbmsg("rollback 4.7");
+ Client::Context c(rsoplog);
+ NamespaceDetails *oplogDetails = nsdetails(rsoplog);
+ uassert(13423, str::stream() << "replSet error in rollback can't find " << rsoplog, oplogDetails);
+
+ map<string,shared_ptr<RemoveSaver> > removeSavers;
+
+ unsigned deletes = 0, updates = 0;
+ for( list<pair<DocID,bo> >::iterator i = goodVersions.begin(); i != goodVersions.end(); i++ ) {
+ const DocID& d = i->first;
+ bo pattern = d._id.wrap(); // { _id : ... }
+ try {
+ assert( d.ns && *d.ns );
+ if( h.collectionsToResync.count(d.ns) ) {
+ /* we just synced this entire collection */
+ continue;
+ }
+
+ getDur().commitIfNeeded();
+
+ /* keep an archive of items rolled back */
+ shared_ptr<RemoveSaver>& rs = removeSavers[d.ns];
+ if ( ! rs )
+ rs.reset( new RemoveSaver( "rollback" , "" , d.ns ) );
+
+ // todo: lots of overhead in context, this can be faster
+ Client::Context c(d.ns);
+ if( i->second.isEmpty() ) {
+ // wasn't on the primary; delete.
+ /* TODO1.6 : can't delete from a capped collection. need to handle that here. */
+ deletes++;
+
+ NamespaceDetails *nsd = nsdetails(d.ns);
+ if( nsd ) {
+ if( nsd->capped ) {
+ /* can't delete from a capped collection - so we truncate instead. if this item must go,
+ so must all successors!!! */
+ try {
+ /** todo: IIRC cappedTruncateAfter does not handle completely empty. todo. */
+ // this will be crazy slow if there is no _id index.
+ long long start = Listener::getElapsedTimeMillis();
+ DiskLoc loc = Helpers::findOne(d.ns, pattern, false);
+ if( Listener::getElapsedTimeMillis() - start > 200 )
+ log() << "replSet warning roll back slow no _id index for " << d.ns << " perhaps?" << rsLog;
+ //would be faster but requires index: DiskLoc loc = Helpers::findById(nsd, pattern);
+ if( !loc.isNull() ) {
+ try {
+ nsd->cappedTruncateAfter(d.ns, loc, true);
+ }
+ catch(DBException& e) {
+ if( e.getCode() == 13415 ) {
+ // hack: need to just make cappedTruncate do this...
+ nsd->emptyCappedCollection(d.ns);
+ }
+ else {
+ throw;
+ }
+ }
+ }
+ }
+ catch(DBException& e) {
+ log() << "replSet error rolling back capped collection rec " << d.ns << ' ' << e.toString() << rsLog;
+ }
+ }
+ else {
+ try {
+ deletes++;
+ deleteObjects(d.ns, pattern, /*justone*/true, /*logop*/false, /*god*/true, rs.get() );
+ }
+ catch(...) {
+ log() << "replSet error rollback delete failed ns:" << d.ns << rsLog;
+ }
+ }
+ // did we just empty the collection? if so let's check if it even exists on the source.
+ if( nsd->stats.nrecords == 0 ) {
+ try {
+ string sys = cc().database()->name + ".system.namespaces";
+ bo o = them->findOne(sys, QUERY("name"<<d.ns));
+ if( o.isEmpty() ) {
+ // we should drop
+ try {
+ bob res;
+ string errmsg;
+ dropCollection(d.ns, errmsg, res);
+ }
+ catch(...) {
+ log() << "replset error rolling back collection " << d.ns << rsLog;
+ }
+ }
+ }
+ catch(DBException& ) {
+ /* this isn't *that* big a deal, but is bad. */
+ log() << "replSet warning rollback error querying for existence of " << d.ns << " at the primary, ignoring" << rsLog;
+ }
+ }
+ }
+ }
+ else {
+ // todo faster...
+ OpDebug debug;
+ updates++;
+ _updateObjects(/*god*/true, d.ns, i->second, pattern, /*upsert=*/true, /*multi=*/false , /*logtheop=*/false , debug, rs.get() );
+ }
+ }
+ catch(DBException& e) {
+ log() << "replSet exception in rollback ns:" << d.ns << ' ' << pattern.toString() << ' ' << e.toString() << " ndeletes:" << deletes << rsLog;
+ warn = true;
+ }
+ }
+
+ removeSavers.clear(); // this effectively closes all of them
+
+ sethbmsg(str::stream() << "rollback 5 d:" << deletes << " u:" << updates);
+ MemoryMappedFile::flushAll(true);
+ sethbmsg("rollback 6");
+
+ // clean up oplog
+ log(2) << "replSet rollback truncate oplog after " << h.commonPoint.toStringPretty() << rsLog;
+ // todo: fatal error if this throws?
+ oplogDetails->cappedTruncateAfter(rsoplog, h.commonPointOurDiskloc, false);
+
+ /* reset cached lastoptimewritten and h value */
+ loadLastOpTimeWritten();
+
+ sethbmsg("rollback 7");
+ MemoryMappedFile::flushAll(true);
+
+ // done
+ if( warn )
+ sethbmsg("issues during syncRollback, see log");
+ else
+ sethbmsg("rollback done");
+ }
+
+ void ReplSetImpl::syncRollback(OplogReader&r) {
unsigned s = _syncRollback(r);
- if( s )
+ if( s )
sleepsecs(s);
}
- unsigned ReplSetImpl::_syncRollback(OplogReader&r) {
+ unsigned ReplSetImpl::_syncRollback(OplogReader&r) {
assert( !lockedByMe() );
assert( !dbMutex.atLeastReadLocked() );
@@ -604,7 +609,7 @@ namespace mongo {
if( box.getState().secondary() ) {
/* by doing this, we will not service reads (return an error as we aren't in secondary state.
- that perhaps is moot becasue of the write lock above, but that write lock probably gets deferred
+ that perhaps is moot because of the write lock above, but that write lock probably gets deferred
or removed or yielded later anyway.
also, this is better for status reporting - we know what is happening.
@@ -618,7 +623,7 @@ namespace mongo {
r.resetCursor();
/*DBClientConnection us(false, 0, 0);
string errmsg;
- if( !us.connect(HostAndPort::me().toString(),errmsg) ) {
+ if( !us.connect(HostAndPort::me().toString(),errmsg) ) {
sethbmsg("rollback connect to self failure" + errmsg);
return;
}*/
@@ -627,15 +632,15 @@ namespace mongo {
try {
syncRollbackFindCommonPoint(r.conn(), how);
}
- catch( const char *p ) {
+ catch( const char *p ) {
sethbmsg(string("rollback 2 error ") + p);
return 10;
}
- catch( rsfatal& ) {
+ catch( rsfatal& ) {
_fatal();
return 2;
}
- catch( DBException& e ) {
+ catch( DBException& e ) {
sethbmsg(string("rollback 2 exception ") + e.toString() + "; sleeping 1 min");
dbtemprelease r;
sleepsecs(60);
@@ -647,20 +652,20 @@ namespace mongo {
{
incRBID();
- try {
+ try {
syncFixUp(how, r);
}
- catch( rsfatal& ) {
+ catch( rsfatal& ) {
sethbmsg("rollback fixup error");
_fatal();
return 2;
}
- catch(...) {
+ catch(...) {
incRBID(); throw;
}
incRBID();
- /* success - leave "ROLLBACK" state
+ /* success - leave "ROLLBACK" state
can go to SECONDARY once minvalid is achieved
*/
box.change(MemberState::RS_RECOVERING, _self);
diff --git a/db/repl/rs_sync.cpp b/db/repl/rs_sync.cpp
index 9de3f60..8d06fcc 100644
--- a/db/repl/rs_sync.cpp
+++ b/db/repl/rs_sync.cpp
@@ -19,30 +19,21 @@
#include "../../client/dbclient.h"
#include "rs.h"
#include "../repl.h"
-
+#include "connections.h"
namespace mongo {
using namespace bson;
-
extern unsigned replSetForceInitialSyncFailure;
- void startSyncThread() {
- Client::initThread("rs_sync");
- cc().iAmSyncThread();
- theReplSet->syncThread();
- cc().shutdown();
- }
-
+ /* apply the log op that is in param o */
void ReplSetImpl::syncApply(const BSONObj &o) {
- //const char *op = o.getStringField("op");
-
- char db[MaxDatabaseLen];
+ char db[MaxDatabaseNameLen];
const char *ns = o.getStringField("ns");
nsToDatabase(ns, db);
if ( *ns == '.' || *ns == 0 ) {
- if( *o.getStringField("op") == 'n' )
- return;
+ if( *o.getStringField("op") == 'n' )
+ return;
log() << "replSet skipping bad op in oplog: " << o.toString() << endl;
return;
}
@@ -54,19 +45,21 @@ namespace mongo {
applyOperation_inlock(o);
}
+ /* initial oplog application, during initial sync, after cloning.
+ @return false on failure.
+ this method returns an error and doesn't throw exceptions (i think).
+ */
bool ReplSetImpl::initialSyncOplogApplication(
- string hn,
- const Member *primary,
+ const Member *source,
OpTime applyGTE,
- OpTime minValid)
- {
- if( primary == 0 ) return false;
+ OpTime minValid) {
+ if( source == 0 ) return false;
- OpTime ts;
+ const string hn = source->h().toString();
+ OplogReader r;
try {
- OplogReader r;
- if( !r.connect(hn) ) {
- log(2) << "replSet can't connect to " << hn << " to read operations" << rsLog;
+ if( !r.connect(hn) ) {
+ log() << "replSet initial sync error can't connect to " << hn << " to read " << rsoplog << rsLog;
return false;
}
@@ -80,48 +73,63 @@ namespace mongo {
}
assert( r.haveCursor() );
- /* we lock outside the loop to avoid the overhead of locking on every operation. server isn't usable yet anyway! */
- writelock lk("");
-
{
- if( !r.more() ) {
+ if( !r.more() ) {
sethbmsg("replSet initial sync error reading remote oplog");
+ log() << "replSet initial sync error remote oplog (" << rsoplog << ") on host " << hn << " is empty?" << rsLog;
return false;
}
bo op = r.next();
OpTime t = op["ts"]._opTime();
r.putBack(op);
- assert( !t.isNull() );
+
+ if( op.firstElement().fieldName() == string("$err") ) {
+ log() << "replSet initial sync error querying " << rsoplog << " on " << hn << " : " << op.toString() << rsLog;
+ return false;
+ }
+
+ uassert( 13508 , str::stream() << "no 'ts' in first op in oplog: " << op , !t.isNull() );
if( t > applyGTE ) {
sethbmsg(str::stream() << "error " << hn << " oplog wrapped during initial sync");
+ log() << "replSet initial sync expected first optime of " << applyGTE << rsLog;
+ log() << "replSet initial sync but received a first optime of " << t << " from " << hn << rsLog;
return false;
}
}
+ }
+ catch(DBException& e) {
+ log() << "replSet initial sync failing: " << e.toString() << rsLog;
+ return false;
+ }
- // todo : use exhaust
- unsigned long long n = 0;
- while( 1 ) {
+ /* we lock outside the loop to avoid the overhead of locking on every operation. */
+ writelock lk("");
+ // todo : use exhaust
+ OpTime ts;
+ unsigned long long n = 0;
+ while( 1 ) {
+ try {
if( !r.more() )
break;
BSONObj o = r.nextSafe(); /* note we might get "not master" at some point */
{
- //writelock lk("");
-
ts = o["ts"]._opTime();
/* if we have become primary, we don't want to apply things from elsewhere
- anymore. assumePrimary is in the db lock so we are safe as long as
+ anymore. assumePrimary is in the db lock so we are safe as long as
we check after we locked above. */
- const Member *p1 = box.getPrimary();
- if( p1 != primary || replSetForceInitialSyncFailure ) {
+ if( (source->state() != MemberState::RS_PRIMARY &&
+ source->state() != MemberState::RS_SECONDARY) ||
+ replSetForceInitialSyncFailure ) {
+
int f = replSetForceInitialSyncFailure;
if( f > 0 ) {
replSetForceInitialSyncFailure = f-1;
log() << "replSet test code invoked, replSetForceInitialSyncFailure" << rsLog;
+ throw DBException("forced error",0);
}
- log() << "replSet primary was:" << primary->fullName() << " now:" <<
- (p1 != 0 ? p1->fullName() : "none") << rsLog;
+ log() << "replSet we are now primary" << rsLog;
throw DBException("primary changed",0);
}
@@ -131,38 +139,48 @@ namespace mongo {
}
_logOpObjRS(o); /* with repl sets we write the ops to our oplog too */
}
- if( ++n % 100000 == 0 ) {
+ if( ++n % 100000 == 0 ) {
// simple progress metering
log() << "replSet initialSyncOplogApplication " << n << rsLog;
}
+
+ getDur().commitIfNeeded();
}
- }
- catch(DBException& e) {
- if( ts <= minValid ) {
- // didn't make it far enough
- log() << "replSet initial sync failing, error applying oplog " << e.toString() << rsLog;
- return false;
+ catch (DBException& e) {
+ if( e.getCode() == 11000 || e.getCode() == 11001 ) {
+ // skip duplicate key exceptions
+ continue;
+ }
+
+ if( ts <= minValid ) {
+ // didn't make it far enough
+ log() << "replSet initial sync failing, error applying oplog " << e.toString() << rsLog;
+ return false;
+ }
+
+ // otherwise, whatever
+ break;
}
}
return true;
}
- /* should be in RECOVERING state on arrival here.
+ /* should be in RECOVERING state on arrival here.
readlocks
@return true if transitioned to SECONDARY
*/
- bool ReplSetImpl::tryToGoLiveAsASecondary(OpTime& /*out*/ minvalid) {
- bool golive = false;
+ bool ReplSetImpl::tryToGoLiveAsASecondary(OpTime& /*out*/ minvalid) {
+ bool golive = false;
{
readlock lk("local.replset.minvalid");
BSONObj mv;
- if( Helpers::getSingleton("local.replset.minvalid", mv) ) {
+ if( Helpers::getSingleton("local.replset.minvalid", mv) ) {
minvalid = mv["ts"]._opTime();
- if( minvalid <= lastOpTimeWritten ) {
+ if( minvalid <= lastOpTimeWritten ) {
golive=true;
}
}
- else
+ else
golive = true; /* must have been the original member */
}
if( golive ) {
@@ -172,44 +190,104 @@ namespace mongo {
return golive;
}
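The go-live decision above boils down to a single comparison; here is a minimal restatement (a sketch assuming util/optime.h, not new code in this commit): the node stays in RECOVERING until its own applied oplog has caught up to the recorded minvalid point.

    // Sketch of the rule applied above: safe to advertise SECONDARY only once
    // our last applied op has reached the stored minvalid optime.
    bool safeToGoLive( const OpTime& minvalid, const OpTime& lastOpTimeWritten ) {
        return minvalid <= lastOpTimeWritten;
    }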
- /* tail the primary's oplog. ok to return, will be re-called. */
- void ReplSetImpl::syncTail() {
- // todo : locking vis a vis the mgr...
+ /**
+ * Checks if the oplog given is too far ahead to read from.
+ *
+ * @param r the oplog
+ * @param hn the hostname (for log messages)
+ *
+ * @return if we are stale compared to the oplog on hn
+ */
+ bool ReplSetImpl::_isStale(OplogReader& r, const string& hn) {
+ BSONObj remoteOldestOp = r.findOne(rsoplog, Query());
+ OpTime ts = remoteOldestOp["ts"]._opTime();
+ DEV log() << "replSet remoteOldestOp: " << ts.toStringLong() << rsLog;
+ else log(3) << "replSet remoteOldestOp: " << ts.toStringLong() << rsLog;
+ DEV {
+ // debugging sync1.js...
+ log() << "replSet lastOpTimeWritten: " << lastOpTimeWritten.toStringLong() << rsLog;
+ log() << "replSet our state: " << state().toString() << rsLog;
+ }
+ if( lastOpTimeWritten < ts ) {
+ log() << "replSet error RS102 too stale to catch up, at least from " << hn << rsLog;
+ log() << "replSet our last optime : " << lastOpTimeWritten.toStringLong() << rsLog;
+ log() << "replSet oldest at " << hn << " : " << ts.toStringLong() << rsLog;
+ log() << "replSet See http://www.mongodb.org/display/DOCS/Resyncing+a+Very+Stale+Replica+Set+Member" << rsLog;
+ sethbmsg("error RS102 too stale to catch up");
+ changeState(MemberState::RS_RECOVERING);
+ sleepsecs(120);
+ return true;
+ }
+ return false;
+ }
- const Member *primary = box.getPrimary();
- if( primary == 0 ) return;
- string hn = primary->h().toString();
- OplogReader r;
- if( !r.connect(primary->h().toString()) ) {
+ /**
+ * Tries to connect the oplog reader to a potential sync source. If
+ * successful, it checks that we are not stale compared to this source.
+ *
+ * @param r reader to populate
+ * @param hn hostname to try
+ *
+ * @return if both checks pass, it returns true, otherwise false.
+ */
+ bool ReplSetImpl::_getOplogReader(OplogReader& r, string& hn) {
+ assert(r.conn() == 0);
+
+ if( !r.connect(hn) ) {
log(2) << "replSet can't connect to " << hn << " to read operations" << rsLog;
- return;
+ r.resetConnection();
+ return false;
+ }
+ if( _isStale(r, hn)) {
+ r.resetConnection();
+ return false;
}
+ return true;
+ }
- /* first make sure we are not hopelessly out of sync by being very stale. */
- {
- BSONObj remoteOldestOp = r.findOne(rsoplog, Query());
- OpTime ts = remoteOldestOp["ts"]._opTime();
- DEV log() << "replSet remoteOldestOp: " << ts.toStringLong() << rsLog;
- else log(3) << "replSet remoteOldestOp: " << ts.toStringLong() << rsLog;
- DEV {
- // debugging sync1.js...
- log() << "replSet lastOpTimeWritten: " << lastOpTimeWritten.toStringLong() << rsLog;
- log() << "replSet our state: " << state().toString() << rsLog;
+ /* tail an oplog. ok to return, will be re-called. */
+ void ReplSetImpl::syncTail() {
+ // todo : locking vis a vis the mgr...
+ OplogReader r;
+ string hn;
+
+ const Member *target = box.getPrimary();
+ if (target != 0) {
+ hn = target->h().toString();
+ if (!_getOplogReader(r, hn)) {
+ // we might be stale wrt the primary, but could still sync from
+ // a secondary
+ target = 0;
+ }
+ }
+
+ // if we cannot reach the master but someone else is more up-to-date
+ // than we are, sync from them.
+ if( target == 0 ) {
+ for(Member *m = head(); m; m=m->next()) {
+ hn = m->h().toString();
+ if (m->hbinfo().up() && m->state().readable() &&
+ (m->hbinfo().opTime > lastOpTimeWritten) &&
+ m->config().slaveDelay == 0 &&
+ _getOplogReader(r, hn)) {
+ target = m;
+ break;
+ }
}
- if( lastOpTimeWritten < ts ) {
- log() << "replSet error RS102 too stale to catch up, at least from primary: " << hn << rsLog;
- log() << "replSet our last optime : " << lastOpTimeWritten.toStringLong() << rsLog;
- log() << "replSet oldest at " << hn << " : " << ts.toStringLong() << rsLog;
- log() << "replSet See http://www.mongodb.org/display/DOCS/Resyncing+a+Very+Stale+Replica+Set+Member" << rsLog;
- sethbmsg("error RS102 too stale to catch up");
- sleepsecs(120);
+
+ // no server found
+ if (target == 0) {
+ // if there is no one to sync from
+ OpTime minvalid;
+ tryToGoLiveAsASecondary(minvalid);
return;
}
}
r.tailingQueryGTE(rsoplog, lastOpTimeWritten);
assert( r.haveCursor() );
- assert( r.awaitCapable() );
+
+ uassert(1000, "replSet source for syncing doesn't seem to be await capable -- is it an older version of mongodb?", r.awaitCapable() );
{
if( !r.more() ) {
@@ -222,7 +300,7 @@ namespace mongo {
return;
}
OpTime theirTS = theirLastOp["ts"]._opTime();
- if( theirTS < lastOpTimeWritten ) {
+ if( theirTS < lastOpTimeWritten ) {
log() << "replSet we are ahead of the primary, will try to roll back" << rsLog;
syncRollback(r);
return;
@@ -231,7 +309,7 @@ namespace mongo {
log() << "replSet syncTail condition 1" << rsLog;
sleepsecs(1);
}
- catch(DBException& e) {
+ catch(DBException& e) {
log() << "replSet error querying " << hn << ' ' << e.toString() << rsLog;
sleepsecs(2);
}
@@ -249,12 +327,9 @@ namespace mongo {
BSONObj o = r.nextSafe();
OpTime ts = o["ts"]._opTime();
long long h = o["h"].numberLong();
- if( ts != lastOpTimeWritten || h != lastH ) {
- log(1) << "TEMP our last op time written: " << lastOpTimeWritten.toStringPretty() << endl;
- log(1) << "TEMP primary's GTE: " << ts.toStringPretty() << endl;
- /*
- }*/
-
+ if( ts != lastOpTimeWritten || h != lastH ) {
+ log() << "replSet our last op time written: " << lastOpTimeWritten.toStringPretty() << endl;
+                log() << "replSet source's GTE: " << ts.toStringPretty() << endl;
syncRollback(r);
return;
}
@@ -268,49 +343,45 @@ namespace mongo {
while( 1 ) {
while( 1 ) {
- if( !r.moreInCurrentBatch() ) {
- /* we need to occasionally check some things. between
+ if( !r.moreInCurrentBatch() ) {
+ /* we need to occasionally check some things. between
batches is probably a good time. */
/* perhaps we should check this earlier? but not before the rollback checks. */
- if( state().recovering() ) {
+ if( state().recovering() ) {
/* can we go to RS_SECONDARY state? we can if not too old and if minvalid achieved */
OpTime minvalid;
bool golive = ReplSetImpl::tryToGoLiveAsASecondary(minvalid);
if( golive ) {
;
}
- else {
+ else {
sethbmsg(str::stream() << "still syncing, not yet to minValid optime" << minvalid.toString());
}
/* todo: too stale capability */
}
- if( box.getPrimary() != primary )
- return;
+ {
+ const Member *primary = box.getPrimary();
+
+ if( !target->hbinfo().hbstate.readable() ||
+ // if we are not syncing from the primary, return (if
+ // it's up) so that we can try accessing it again
+ (target != primary && primary != 0)) {
+ return;
+ }
+ }
}
if( !r.more() )
break;
- {
+ {
BSONObj o = r.nextSafe(); /* note we might get "not master" at some point */
- {
- writelock lk("");
- /* if we have become primary, we dont' want to apply things from elsewhere
- anymore. assumePrimary is in the db lock so we are safe as long as
- we check after we locked above. */
- if( box.getPrimary() != primary ) {
- if( box.getState().primary() )
- log(0) << "replSet stopping syncTail we are now primary" << rsLog;
- return;
- }
-
- syncApply(o);
- _logOpObjRS(o); /* with repl sets we write the ops to our oplog too: */
- }
int sd = myConfig().slaveDelay;
- if( sd ) {
+ // ignore slaveDelay if the box is still initializing. once
+ // it becomes secondary we can worry about it.
+ if( sd && box.getState().secondary() ) {
const OpTime ts = o["ts"]._opTime();
long long a = ts.getSecs();
long long b = time(0);
@@ -329,13 +400,30 @@ namespace mongo {
sleepsecs(6);
if( time(0) >= waitUntil )
break;
- if( box.getPrimary() != primary )
+ if( !target->hbinfo().hbstate.readable() ) {
break;
+ }
if( myConfig().slaveDelay != sd ) // reconf
break;
}
}
}
+
+ }
+
+ {
+ writelock lk("");
+
+                /* if we have become primary, we don't want to apply things from elsewhere
+ anymore. assumePrimary is in the db lock so we are safe as long as
+ we check after we locked above. */
+ if( box.getState().primary() ) {
+ log(0) << "replSet stopping syncTail we are now primary" << rsLog;
+ return;
+ }
+
+ syncApply(o);
+ _logOpObjRS(o); /* with repl sets we write the ops to our oplog too: */
}
}
}
@@ -345,8 +433,9 @@ namespace mongo {
// TODO : reuse our connection to the primary.
return;
}
- if( box.getPrimary() != primary )
+ if( !target->hbinfo().hbstate.readable() ) {
return;
+ }
// looping back is ok because this is a tailable cursor
}
}
@@ -357,15 +446,11 @@ namespace mongo {
sleepsecs(1);
return;
}
- if( sp.state.fatal() ) {
+ if( sp.state.fatal() ) {
sleepsecs(5);
return;
}
- /* later, we can sync from up secondaries if we want. tbd. */
- if( sp.primary == 0 )
- return;
-
/* do we have anything at all? */
if( lastOpTimeWritten.isNull() ) {
syncDoInitialSync();
@@ -377,23 +462,64 @@ namespace mongo {
}
void ReplSetImpl::syncThread() {
- if( myConfig().arbiterOnly )
- return;
- while( 1 ) {
+ /* test here was to force a receive timeout
+ ScopedConn c("localhost");
+ bo info;
+ try {
+ log() << "this is temp" << endl;
+ c.runCommand("admin", BSON("sleep"<<120), info);
+ log() << info.toString() << endl;
+ c.runCommand("admin", BSON("sleep"<<120), info);
+ log() << "temp" << endl;
+ }
+ catch( DBException& e ) {
+ log() << e.toString() << endl;
+ c.runCommand("admin", BSON("sleep"<<120), info);
+ log() << "temp" << endl;
+ }
+ */
+
+ while( 1 ) {
+ if( myConfig().arbiterOnly )
+ return;
+
try {
_syncThread();
}
- catch(DBException& e) {
+ catch(DBException& e) {
sethbmsg("syncThread: " + e.toString());
sleepsecs(10);
}
- catch(...) {
+ catch(...) {
sethbmsg("unexpected exception in syncThread()");
- // TODO : SET NOT SECONDARY here.
+ // TODO : SET NOT SECONDARY here?
sleepsecs(60);
}
sleepsecs(1);
+
+ /* normally msgCheckNewState gets called periodically, but in a single node repl set there
+ are no heartbeat threads, so we do it here to be sure. this is relevant if the singleton
+ member has done a stepDown() and needs to come back up.
+ */
+ OCCASIONALLY mgr->send( boost::bind(&Manager::msgCheckNewState, theReplSet->mgr) );
+ }
+ }
+
+ void startSyncThread() {
+ static int n;
+ if( n != 0 ) {
+ log() << "replSet ERROR : more than one sync thread?" << rsLog;
+ assert( n == 0 );
+ }
+ n++;
+
+ Client::initThread("replica set sync");
+ cc().iAmSyncThread();
+ if (!noauth) {
+ cc().getAuthenticationInfo()->authorize("local");
}
+ theReplSet->syncThread();
+ cc().shutdown();
}
}
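Note for readers skimming the sync-source changes above: the new logic in syncTail amounts to "prefer the primary, otherwise fall back to any readable, non-delayed member that is ahead of us." A minimal standalone sketch of that rule follows; CandidateSource and its fields are simplified stand-ins for the real Member/HeartbeatInfo types, not actual replica-set classes.

    // Illustrative sketch of the sync-source rule used in syncTail above.
    // CandidateSource is a hypothetical stand-in for Member + HeartbeatInfo.
    struct CandidateSource {
        bool up;               // heartbeat says the member is reachable
        bool readable;         // state is PRIMARY or SECONDARY
        long long opTime;      // last optime the member has applied
        int slaveDelaySecs;    // configured slave delay
    };

    // A member is a usable fallback source only if it is up, readable,
    // not slave-delayed, and strictly ahead of our own last applied optime.
    inline bool isUsableSyncSource(const CandidateSource& m, long long ourLastOpTime) {
        return m.up && m.readable && m.slaveDelaySecs == 0 && m.opTime > ourLastOpTime;
    }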
diff --git a/db/repl_block.cpp b/db/repl_block.cpp
index 9cff24f..05be343 100644
--- a/db/repl_block.cpp
+++ b/db/repl_block.cpp
@@ -35,13 +35,13 @@ namespace mongo {
class SlaveTracking : public BackgroundJob {
public:
- string name() { return "SlaveTracking"; }
+ string name() const { return "SlaveTracking"; }
static const char * NS;
struct Ident {
-
- Ident(BSONObj r,string h,string n){
+
+ Ident(BSONObj r,string h,string n) {
BSONObjBuilder b;
b.appendElements( r );
b.append( "host" , h );
@@ -52,18 +52,18 @@ namespace mongo {
bool operator<( const Ident& other ) const {
return obj.woCompare( other.obj ) < 0;
}
-
+
BSONObj obj;
};
struct Info {
- Info() : loc(0){}
- ~Info(){
- if ( loc && owned ){
+ Info() : loc(0) {}
+ ~Info() {
+ if ( loc && owned ) {
delete loc;
}
}
- bool owned;
+            bool owned; // true if loc is a pointer of our creation (and not a pointer into an MMF)
OpTime * loc;
};
@@ -72,33 +72,33 @@ namespace mongo {
_started = false;
}
- void run(){
+ void run() {
Client::initThread( "slaveTracking" );
DBDirectClient db;
- while ( ! inShutdown() ){
+ while ( ! inShutdown() ) {
sleepsecs( 1 );
if ( ! _dirty )
continue;
-
+
writelock lk(NS);
list< pair<BSONObj,BSONObj> > todo;
-
+
{
scoped_lock mylk(_mutex);
-
- for ( map<Ident,Info>::iterator i=_slaves.begin(); i!=_slaves.end(); i++ ){
+
+ for ( map<Ident,Info>::iterator i=_slaves.begin(); i!=_slaves.end(); i++ ) {
BSONObjBuilder temp;
temp.appendTimestamp( "syncedTo" , i->second.loc[0].asDate() );
- todo.push_back( pair<BSONObj,BSONObj>( i->first.obj.getOwned() ,
+ todo.push_back( pair<BSONObj,BSONObj>( i->first.obj.getOwned() ,
BSON( "$set" << temp.obj() ).getOwned() ) );
}
-
+
_slaves.clear();
}
- for ( list< pair<BSONObj,BSONObj> >::iterator i=todo.begin(); i!=todo.end(); i++ ){
+ for ( list< pair<BSONObj,BSONObj> >::iterator i=todo.begin(); i!=todo.end(); i++ ) {
db.update( NS , i->first , i->second , true );
}
@@ -106,52 +106,54 @@ namespace mongo {
}
}
- void reset(){
+ void reset() {
scoped_lock mylk(_mutex);
_slaves.clear();
}
- void update( const BSONObj& rid , const string& host , const string& ns , OpTime last ){
+ void update( const BSONObj& rid , const string& host , const string& ns , OpTime last ) {
REPLDEBUG( host << " " << rid << " " << ns << " " << last );
scoped_lock mylk(_mutex);
-
+
#ifdef _DEBUG
MongoFileAllowWrites allowWrites;
#endif
Ident ident(rid,host,ns);
Info& i = _slaves[ ident ];
- if ( i.loc ){
- i.loc[0] = last;
+ if ( i.loc ) {
+ if( i.owned )
+ i.loc[0] = last;
+ else
+ getDur().setNoJournal(i.loc, &last, sizeof(last));
return;
}
-
+
dbMutex.assertAtLeastReadLocked();
BSONObj res;
- if ( Helpers::findOne( NS , ident.obj , res ) ){
+ if ( Helpers::findOne( NS , ident.obj , res ) ) {
assert( res["syncedTo"].type() );
i.owned = false;
i.loc = (OpTime*)res["syncedTo"].value();
- i.loc[0] = last;
+ getDur().setNoJournal(i.loc, &last, sizeof(last));
return;
}
-
+
i.owned = true;
- i.loc = new OpTime[1];
- i.loc[0] = last;
+ i.loc = new OpTime(last);
_dirty = true;
- if ( ! _started ){
+ if ( ! _started ) {
// start background thread here since we definitely need it
_started = true;
go();
}
}
-
- bool opReplicatedEnough( OpTime op , int w ){
+
+ bool opReplicatedEnough( OpTime op , int w ) {
RARELY {
REPLDEBUG( "looking for : " << op << " w=" << w );
}
@@ -161,9 +163,9 @@ namespace mongo {
w--; // now this is the # of slaves i need
scoped_lock mylk(_mutex);
- for ( map<Ident,Info>::iterator i=_slaves.begin(); i!=_slaves.end(); i++){
+ for ( map<Ident,Info>::iterator i=_slaves.begin(); i!=_slaves.end(); i++) {
OpTime s = *(i->second.loc);
- if ( s < op ){
+ if ( s < op ) {
continue;
}
if ( --w == 0 )
@@ -171,9 +173,15 @@ namespace mongo {
}
return w <= 0;
}
-
+
+ unsigned getSlaveCount() const {
+ scoped_lock mylk(_mutex);
+
+ return _slaves.size();
+ }
+
// need to be careful not to deadlock with this
- mongo::mutex _mutex;
+ mutable mongo::mutex _mutex;
map<Ident,Info> _slaves;
bool _dirty;
bool _started;
@@ -182,12 +190,12 @@ namespace mongo {
const char * SlaveTracking::NS = "local.slaves";
- void updateSlaveLocation( CurOp& curop, const char * ns , OpTime lastOp ){
+ void updateSlaveLocation( CurOp& curop, const char * ns , OpTime lastOp ) {
if ( lastOp.isNull() )
return;
-
+
assert( str::startsWith(ns, "local.oplog.") );
-
+
Client * c = curop.getClient();
assert(c);
BSONObj rid = c->getRemoteID();
@@ -197,11 +205,15 @@ namespace mongo {
slaveTracking.update( rid , curop.getRemoteString( false ) , ns , lastOp );
}
- bool opReplicatedEnough( OpTime op , int w ){
+ bool opReplicatedEnough( OpTime op , int w ) {
return slaveTracking.opReplicatedEnough( op , w );
}
- void resetSlaveCache(){
+ void resetSlaveCache() {
slaveTracking.reset();
}
+
+ unsigned getSlaveCount() {
+ return slaveTracking.getSlaveCount();
+ }
}
diff --git a/db/repl_block.h b/db/repl_block.h
index e9a990a..978932d 100644
--- a/db/repl_block.h
+++ b/db/repl_block.h
@@ -24,11 +24,15 @@
/**
local.slaves - current location for all slaves
-
+
*/
namespace mongo {
-
- void updateSlaveLocation( CurOp& curop, const char * ns , OpTime lastOp );
+
+ void updateSlaveLocation( CurOp& curop, const char * oplog_ns , OpTime lastOp );
+
+ /** @return true if op has made it to w servers */
bool opReplicatedEnough( OpTime op , int w );
+
void resetSlaveCache();
+ unsigned getSlaveCount();
}
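The repl_block changes above expose opReplicatedEnough() (true once an op has reached w servers) and getSlaveCount(). Below is a rough sketch of a w-wait built on that predicate; the polling loop, timeout handling, and function name are illustrative assumptions, not the actual getLastError implementation, and it assumes the usual mongo headers (OpTime, sleepmillis) are in scope.

    // Hypothetical w:<n> wait built on the opReplicatedEnough() predicate
    // declared above; the timeout strategy is an assumption for illustration.
    bool waitForReplication(OpTime op, int w, int timeoutSecs) {
        time_t deadline = time(0) + timeoutSecs;
        while (!opReplicatedEnough(op, w)) {
            if (time(0) >= deadline)
                return false;    // w servers did not acknowledge in time
            sleepmillis(10);     // the check is cheap, so poll frequently
        }
        return true;
    }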
diff --git a/db/replpair.h b/db/replpair.h
index 1da8b78..a551308 100644
--- a/db/replpair.h
+++ b/db/replpair.h
@@ -55,8 +55,8 @@ namespace mongo {
int remotePort;
string remoteHost;
string remote; // host:port if port specified.
- // int date; // -1 not yet set; 0=slave; 1=master
-
+ // int date; // -1 not yet set; 0=slave; 1=master
+
string getInfo() {
stringstream ss;
ss << " state: ";
@@ -113,12 +113,12 @@ namespace mongo {
*/
inline bool _isMaster() {
if( replSet ) {
- if( theReplSet )
+ if( theReplSet )
return theReplSet->isPrimary();
return false;
}
- if( ! replSettings.slave )
+ if( ! replSettings.slave )
return true;
if ( replAllDead )
@@ -128,17 +128,17 @@ namespace mongo {
if( replPair->state == ReplPair::State_Master )
return true;
}
- else {
+ else {
if( replSettings.master ) {
- // if running with --master --slave, allow. note that master is also true
+ // if running with --master --slave, allow. note that master is also true
// for repl pairs so the check for replPair above is important.
return true;
}
}
-
+
if ( cc().isGod() )
return true;
-
+
return false;
}
inline bool isMaster(const char *client = 0) {
@@ -152,20 +152,22 @@ namespace mongo {
return strcmp( client, "local" ) == 0;
}
- inline void notMasterUnless(bool expr) {
+ inline void notMasterUnless(bool expr) {
uassert( 10107 , "not master" , expr );
}
- /* we allow queries to SimpleSlave's -- but not to the slave (nonmaster) member of a replica pair
- so that queries to a pair are realtime consistent as much as possible. use setSlaveOk() to
+ /* we allow queries to SimpleSlave's -- but not to the slave (nonmaster) member of a replica pair
+ so that queries to a pair are realtime consistent as much as possible. use setSlaveOk() to
query the nonmaster member of a replica pair.
*/
inline void replVerifyReadsOk(ParsedQuery& pq) {
if( replSet ) {
- /* todo: speed up the secondary case. as written here there are 2 mutex entries, it can be 1. */
+            /* todo: speed up the secondary case. as written here there are 2 mutex entries, it can be 1. */
if( isMaster() ) return;
- notMasterUnless( pq.hasOption(QueryOption_SlaveOk) && theReplSet && theReplSet->isSecondary() );
- } else {
+ uassert(13435, "not master and slaveok=false", pq.hasOption(QueryOption_SlaveOk));
+ uassert(13436, "not master or secondary, can't read", theReplSet && theReplSet->isSecondary() );
+ }
+ else {
notMasterUnless(isMaster() || pq.hasOption(QueryOption_SlaveOk) || replSettings.slave == SimpleSlave );
}
}
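The replVerifyReadsOk change above replaces a single notMasterUnless() with two uasserts so clients can see which condition failed. Restated as a minimal sketch of the replica-set branch (the error codes are the ones added in the diff; the function name and boolean parameters are hypothetical, not the real query/ReplSet types):

    // Illustrative restatement of the replica-set read check above;
    // the parameters are hypothetical flags standing in for the real state.
    void verifyReplicaSetReadAllowed(bool isPrimary, bool slaveOkSet, bool isSecondary) {
        if (isPrimary)
            return;                                           // primaries always serve reads
        uassert(13435, "not master and slaveok=false", slaveOkSet);
        uassert(13436, "not master or secondary, can't read", isSecondary);
    }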
diff --git a/db/resource.h b/db/resource.h
index bee8d30..9ba1ed2 100755..100644
--- a/db/resource.h
+++ b/db/resource.h
@@ -1,16 +1,16 @@
-//{{NO_DEPENDENCIES}}
-// Microsoft Visual C++ generated include file.
-// Used by db.rc
-//
-#define IDI_ICON2 102
-
-// Next default values for new objects
-//
-#ifdef APSTUDIO_INVOKED
-#ifndef APSTUDIO_READONLY_SYMBOLS
-#define _APS_NEXT_RESOURCE_VALUE 104
-#define _APS_NEXT_COMMAND_VALUE 40001
-#define _APS_NEXT_CONTROL_VALUE 1001
-#define _APS_NEXT_SYMED_VALUE 101
-#endif
-#endif
+//{{NO_DEPENDENCIES}}
+// Microsoft Visual C++ generated include file.
+// Used by db.rc
+//
+#define IDI_ICON2 102
+
+// Next default values for new objects
+//
+#ifdef APSTUDIO_INVOKED
+#ifndef APSTUDIO_READONLY_SYMBOLS
+#define _APS_NEXT_RESOURCE_VALUE 104
+#define _APS_NEXT_COMMAND_VALUE 40001
+#define _APS_NEXT_CONTROL_VALUE 1001
+#define _APS_NEXT_SYMED_VALUE 101
+#endif
+#endif
diff --git a/db/restapi.cpp b/db/restapi.cpp
index e9a7ae2..7460c94 100644
--- a/db/restapi.cpp
+++ b/db/restapi.cpp
@@ -29,6 +29,8 @@
#include "clientcursor.h"
#include "background.h"
+#include "restapi.h"
+
namespace mongo {
extern const char *replInfo;
@@ -39,17 +41,17 @@ namespace mongo {
class RESTHandler : public DbWebHandler {
public:
- RESTHandler() : DbWebHandler( "DUMMY REST" , 1000 , true ){}
+ RESTHandler() : DbWebHandler( "DUMMY REST" , 1000 , true ) {}
- virtual bool handles( const string& url ) const {
- return
+ virtual bool handles( const string& url ) const {
+ return
url[0] == '/' &&
url.find_last_of( '/' ) > 0;
}
- virtual void handle( const char *rq, string url,
+ virtual void handle( const char *rq, string url, BSONObj params,
string& responseMsg, int& responseCode,
- vector<string>& headers, const SockAddr &from ){
+ vector<string>& headers, const SockAddr &from ) {
string::size_type first = url.find( "/" , 1 );
if ( first == string::npos ) {
@@ -62,12 +64,6 @@ namespace mongo {
string coll = url.substr( first + 1 );
string action = "";
- BSONObj params;
- if ( coll.find( "?" ) != string::npos ) {
- MiniWebServer::parseParams( params , coll.substr( coll.find( "?" ) + 1 ) );
- coll = coll.substr( 0 , coll.find( "?" ) );
- }
-
string::size_type last = coll.find_last_of( "/" );
if ( last == string::npos ) {
action = coll;
@@ -107,7 +103,7 @@ namespace mongo {
out() << "don't know how to handle a [" << method << "]" << endl;
}
- if( html )
+ if( html )
headers.push_back("Content-Type: text/html;charset=utf-8");
else
headers.push_back("Content-Type: text/plain;charset=utf-8");
@@ -118,7 +114,7 @@ namespace mongo {
bool handleRESTQuery( string ns , string action , BSONObj & params , int & responseCode , stringstream & out ) {
Timer t;
- int html = _getOption( params["html"] , 0 );
+ int html = _getOption( params["html"] , 0 );
int skip = _getOption( params["skip"] , 0 );
int num = _getOption( params["limit"] , _getOption( params["count" ] , 1000 ) ); // count is old, limit is new
@@ -131,7 +127,7 @@ namespace mongo {
BSONObjBuilder queryBuilder;
BSONObjIterator i(params);
- while ( i.more() ){
+ while ( i.more() ) {
BSONElement e = i.next();
string name = e.fieldName();
if ( ! name.find( "filter_" ) == 0 )
@@ -167,10 +163,11 @@ namespace mongo {
if( html ) {
string title = string("query ") + ns;
- out << start(title)
+ out << start(title)
<< p(title)
<< "<pre>";
- } else {
+ }
+ else {
out << "{\n";
out << " \"offset\" : " << skip << ",\n";
out << " \"rows\": [\n";
@@ -195,7 +192,7 @@ namespace mongo {
}
}
- if( html ) {
+ if( html ) {
out << "</pre>\n";
if( howMany == 0 ) out << p("Collection is empty");
out << _end();
@@ -216,7 +213,8 @@ namespace mongo {
try {
BSONObj obj = fromjson( body );
db.insert( ns.c_str(), obj );
- } catch ( ... ) {
+ }
+ catch ( ... ) {
responseCode = 400; // Bad Request. Seems reasonable for now.
out << "{ \"ok\" : false }";
return;
@@ -233,18 +231,18 @@ namespace mongo {
return atoi( e.valuestr() );
return def;
}
-
+
DBDirectClient db;
} restHandler;
- bool webHaveAdminUsers(){
+ bool RestAdminAccess::haveAdminUsers() const {
readlocktryassert rl("admin.system.users", 10000);
- Client::Context cx( "admin.system.users" );
- return ! Helpers::isEmpty("admin.system.users");
+ Client::Context cx( "admin.system.users", dbpath, NULL, false );
+ return ! Helpers::isEmpty("admin.system.users", false);
}
- BSONObj webGetAdminUser( const string& username ){
+ BSONObj RestAdminAccess::getAdminUser( const string& username ) const {
Client::GodScope gs;
readlocktryassert rl("admin.system.users", 10000);
Client::Context cx( "admin.system.users" );
@@ -256,19 +254,19 @@ namespace mongo {
class LowLevelMongodStatus : public WebStatusPlugin {
public:
- LowLevelMongodStatus() : WebStatusPlugin( "low level" , 5 , "requires read lock" ){}
+ LowLevelMongodStatus() : WebStatusPlugin( "low level" , 5 , "requires read lock" ) {}
- virtual void init(){}
+ virtual void init() {}
- void _gotLock( int millis , stringstream& ss ){
+ void _gotLock( int millis , stringstream& ss ) {
ss << "<pre>\n";
ss << "time to get readlock: " << millis << "ms\n";
-
+
ss << "# databases: " << dbHolder.size() << '\n';
-
+
if( ClientCursor::numCursors()>500 )
ss << "# Cursors: " << ClientCursor::numCursors() << '\n';
-
+
ss << "\nreplication: ";
if( *replInfo )
ss << "\nreplInfo: " << replInfo << "\n\n";
@@ -296,10 +294,10 @@ namespace mongo {
ss << "</pre>\n";
}
- virtual void run( stringstream& ss ){
+ virtual void run( stringstream& ss ) {
Timer t;
readlocktry lk( "" , 300 );
- if ( lk.got() ){
+ if ( lk.got() ) {
_gotLock( t.millis() , ss );
}
else {
diff --git a/db/restapi.h b/db/restapi.h
new file mode 100644
index 0000000..e5ac520
--- /dev/null
+++ b/db/restapi.h
@@ -0,0 +1,34 @@
+/** @file restapi.h
+ */
+
+/**
+* Copyright (C) 2010 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+#include "../util/admin_access.h"
+
+namespace mongo {
+
+ class RestAdminAccess : public AdminAccess {
+ public:
+ virtual ~RestAdminAccess() { }
+
+ virtual bool haveAdminUsers() const;
+ virtual BSONObj getAdminUser( const string& username ) const;
+ };
+
+} // namespace mongo
diff --git a/db/scanandorder.h b/db/scanandorder.h
index 8d63b9a..4c491fa 100644
--- a/db/scanandorder.h
+++ b/db/scanandorder.h
@@ -50,34 +50,25 @@ namespace mongo {
_ response size limit from runquery; push it up a bit.
*/
- inline void fillQueryResultFromObj(BufBuilder& bb, FieldMatcher *filter, BSONObj& js, DiskLoc* loc=NULL) {
+ inline void fillQueryResultFromObj(BufBuilder& bb, Projection *filter, const BSONObj& js, DiskLoc* loc=NULL) {
if ( filter ) {
BSONObjBuilder b( bb );
- BSONObjIterator i( js );
- while ( i.more() ){
- BSONElement e = i.next();
- const char * fname = e.fieldName();
-
- if ( strcmp( fname , "_id" ) == 0 ){
- if (filter->includeID())
- b.append( e );
- } else {
- filter->append( b , e );
- }
- }
+ filter->transform( js , b );
if (loc)
b.append("$diskLoc", loc->toBSONObj());
b.done();
- } else if (loc) {
+ }
+ else if (loc) {
BSONObjBuilder b( bb );
b.appendElements(js);
b.append("$diskLoc", loc->toBSONObj());
b.done();
- } else {
+ }
+ else {
bb.appendBuf((void*) js.objdata(), js.objsize());
}
}
-
+
typedef multimap<BSONObj,BSONObj,BSONObjCmp> BestMap;
class ScanAndOrder {
BestMap best; // key -> full object
@@ -87,9 +78,10 @@ namespace mongo {
unsigned approxSize;
void _add(BSONObj& k, BSONObj o, DiskLoc* loc) {
- if (!loc){
+ if (!loc) {
best.insert(make_pair(k.getOwned(),o.getOwned()));
- } else {
+ }
+ else {
BSONObjBuilder b;
b.appendElements(o);
b.append("$diskLoc", loc->toBSONObj());
@@ -110,8 +102,8 @@ namespace mongo {
public:
ScanAndOrder(int _startFrom, int _limit, BSONObj _order) :
- best( BSONObjCmp( _order ) ),
- startFrom(_startFrom), order(_order) {
+ best( BSONObjCmp( _order ) ),
+ startFrom(_startFrom), order(_order) {
limit = _limit > 0 ? _limit + startFrom : 0x7fffffff;
approxSize = 0;
}
@@ -140,7 +132,7 @@ namespace mongo {
_addIfBetter(k, o, i, loc);
}
- void _fill(BufBuilder& b, FieldMatcher *filter, int& nout, BestMap::iterator begin, BestMap::iterator end) {
+ void _fill(BufBuilder& b, Projection *filter, int& nout, BestMap::iterator begin, BestMap::iterator end) {
int n = 0;
int nFilled = 0;
for ( BestMap::iterator i = begin; i != end; i++ ) {
@@ -158,7 +150,7 @@ namespace mongo {
}
/* scanning complete. stick the query result in b for n objects. */
- void fill(BufBuilder& b, FieldMatcher *filter, int& nout) {
+ void fill(BufBuilder& b, Projection *filter, int& nout) {
_fill(b, filter, nout, best.begin(), best.end());
}
diff --git a/db/security.cpp b/db/security.cpp
index c552b53..1ec4218 100644
--- a/db/security.cpp
+++ b/db/security.cpp
@@ -20,19 +20,17 @@
#include "security.h"
#include "instance.h"
#include "client.h"
-#include "curop.h"
+#include "curop-inl.h"
#include "db.h"
#include "dbhelpers.h"
namespace mongo {
- bool noauth = true;
-
- int AuthenticationInfo::warned = 0;
+ int AuthenticationInfo::warned = 0;
- void AuthenticationInfo::print(){
+ void AuthenticationInfo::print() {
cout << "AuthenticationInfo: " << this << '\n';
- for ( map<string,Auth>::iterator i=m.begin(); i!=m.end(); i++ ){
+ for ( map<string,Auth>::iterator i=m.begin(); i!=m.end(); i++ ) {
cout << "\t" << i->first << "\t" << i->second.level << '\n';
}
cout << "END" << endl;
@@ -40,16 +38,16 @@ namespace mongo {
bool AuthenticationInfo::_isAuthorizedSpecialChecks( const string& dbname ) {
- if ( cc().isGod() ){
+ if ( cc().isGod() ) {
return true;
}
-
- if ( isLocalHost ){
- atleastreadlock l("");
+
+ if ( isLocalHost ) {
+ atleastreadlock l("");
Client::GodScope gs;
Client::Context c("admin.system.users");
BSONObj result;
- if( ! Helpers::getSingleton("admin.system.users", result) ){
+ if( ! Helpers::getSingleton("admin.system.users", result) ) {
if( warned == 0 ) {
warned++;
log() << "note: no users configured in admin.system.users, allowing localhost access" << endl;
diff --git a/db/security.h b/db/security.h
index a6a9103..2b947c1 100644
--- a/db/security.h
+++ b/db/security.h
@@ -20,12 +20,10 @@
#include "nonce.h"
#include "concurrency.h"
+#include "security_key.h"
namespace mongo {
- // --noauth cmd line option
- extern bool noauth;
-
/* for a particular db */
struct Auth {
Auth() { level = 0; }
@@ -35,36 +33,36 @@ namespace mongo {
class AuthenticationInfo : boost::noncopyable {
mongo::mutex _lock;
map<string, Auth> m; // dbname -> auth
- static int warned;
+ static int warned;
public:
- bool isLocalHost;
+ bool isLocalHost;
AuthenticationInfo() : _lock("AuthenticationInfo") { isLocalHost = false; }
~AuthenticationInfo() {
}
- void logout(const string& dbname ) {
+ void logout(const string& dbname ) {
scoped_lock lk(_lock);
- m.erase(dbname);
- }
- void authorize(const string& dbname ) {
+ m.erase(dbname);
+ }
+ void authorize(const string& dbname ) {
scoped_lock lk(_lock);
m[dbname].level = 2;
}
void authorizeReadOnly(const string& dbname) {
scoped_lock lk(_lock);
- m[dbname].level = 1;
+ m[dbname].level = 1;
}
bool isAuthorized(const string& dbname) { return _isAuthorized( dbname, 2 ); }
bool isAuthorizedReads(const string& dbname) { return _isAuthorized( dbname, 1 ); }
bool isAuthorizedForLock(const string& dbname, int lockType ) { return _isAuthorized( dbname , lockType > 0 ? 2 : 1 ); }
-
+
void print();
protected:
- bool _isAuthorized(const string& dbname, int level) {
+ bool _isAuthorized(const string& dbname, int level) {
if( m[dbname].level >= level ) return true;
- if( noauth ) return true;
- if( m["admin"].level >= level ) return true;
- if( m["local"].level >= level ) return true;
+ if( noauth ) return true;
+ if( m["admin"].level >= level ) return true;
+ if( m["local"].level >= level ) return true;
return _isAuthorizedSpecialChecks( dbname );
}
diff --git a/db/security_commands.cpp b/db/security_commands.cpp
index 7bf2813..67605aa 100644
--- a/db/security_commands.cpp
+++ b/db/security_commands.cpp
@@ -22,7 +22,7 @@
#include "pch.h"
#include "security.h"
#include "../util/md5.hpp"
-#include "json.h"
+#include "json.h"
#include "pdfile.h"
#include "db.h"
#include "dbhelpers.h"
@@ -32,17 +32,17 @@
namespace mongo {
-/* authentication
+ /* authentication
- system.users contains
- { user : <username>, pwd : <pwd_digest>, ... }
+ system.users contains
+ { user : <username>, pwd : <pwd_digest>, ... }
- getnonce sends nonce to client
+ getnonce sends nonce to client
- client then sends { authenticate:1, nonce:<nonce_str>, user:<username>, key:<key> }
+ client then sends { authenticate:1, nonce:<nonce_str>, user:<username>, key:<key> }
- where <key> is md5(<nonce_str><username><pwd_digest_str>) as a string
-*/
+ where <key> is md5(<nonce_str><username><pwd_digest_str>) as a string
+ */
boost::thread_specific_ptr<nonce> lastNonce;
@@ -83,7 +83,7 @@ namespace mongo {
return true;
}
} cmdLogout;
-
+
class CmdAuthenticate : public Command {
public:
virtual bool requiresAuth() { return false; }
@@ -93,7 +93,7 @@ namespace mongo {
virtual bool slaveOk() const {
return true;
}
- virtual LockType locktype() const { return WRITE; } // TODO: make this READ
+ virtual LockType locktype() const { return WRITE; }
virtual void help(stringstream& ss) const { ss << "internal"; }
CmdAuthenticate() : Command("authenticate") {}
bool run(const string& dbname , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
@@ -102,16 +102,16 @@ namespace mongo {
string user = cmdObj.getStringField("user");
string key = cmdObj.getStringField("key");
string received_nonce = cmdObj.getStringField("nonce");
-
- if( user.empty() || key.empty() || received_nonce.empty() ) {
- log() << "field missing/wrong type in received authenticate command "
- << dbname
- << endl;
+
+ if( user.empty() || key.empty() || received_nonce.empty() ) {
+ log() << "field missing/wrong type in received authenticate command "
+ << dbname
+ << endl;
errmsg = "auth fails";
sleepmillis(10);
return false;
}
-
+
stringstream digestBuilder;
{
@@ -120,12 +120,13 @@ namespace mongo {
if ( ln == 0 ) {
reject = true;
log(1) << "auth: no lastNonce" << endl;
- } else {
+ }
+ else {
digestBuilder << hex << *ln;
reject = digestBuilder.str() != received_nonce;
if ( reject ) log(1) << "auth: different lastNonce" << endl;
}
-
+
if ( reject ) {
log() << "auth: bad nonce received or getnonce not called. could be a driver bug or a security attack. db:" << cc().database()->name << endl;
errmsg = "auth fails";
@@ -134,52 +135,60 @@ namespace mongo {
}
}
- static BSONObj userPattern = fromjson("{\"user\":1}");
- string systemUsers = dbname + ".system.users";
- OCCASIONALLY Helpers::ensureIndex(systemUsers.c_str(), userPattern, false, "user_1");
-
BSONObj userObj;
- {
- BSONObjBuilder b;
- b << "user" << user;
- BSONObj query = b.done();
- if( !Helpers::findOne(systemUsers.c_str(), query, userObj) ) {
- log() << "auth: couldn't find user " << user << ", " << systemUsers << endl;
- errmsg = "auth fails";
- return false;
+ string pwd;
+
+ if (user == internalSecurity.user) {
+ pwd = internalSecurity.pwd;
+ }
+ else {
+ static BSONObj userPattern = fromjson("{\"user\":1}");
+ string systemUsers = dbname + ".system.users";
+ OCCASIONALLY Helpers::ensureIndex(systemUsers.c_str(), userPattern, false, "user_1");
+ {
+ BSONObjBuilder b;
+ b << "user" << user;
+ BSONObj query = b.done();
+ if( !Helpers::findOne(systemUsers.c_str(), query, userObj) ) {
+ log() << "auth: couldn't find user " << user << ", " << systemUsers << endl;
+ errmsg = "auth fails";
+ return false;
+ }
}
+
+ pwd = userObj.getStringField("pwd");
}
-
+
+
md5digest d;
{
-
- string pwd = userObj.getStringField("pwd");
digestBuilder << user << pwd;
string done = digestBuilder.str();
-
+
md5_state_t st;
md5_init(&st);
md5_append(&st, (const md5_byte_t *) done.c_str(), done.size());
md5_finish(&st, d);
}
-
+
string computed = digestToString( d );
-
- if ( key != computed ){
+
+ if ( key != computed ) {
log() << "auth: key mismatch " << user << ", ns:" << dbname << endl;
errmsg = "auth fails";
return false;
}
AuthenticationInfo *ai = cc().getAuthenticationInfo();
-
+
if ( userObj[ "readOnly" ].isBoolean() && userObj[ "readOnly" ].boolean() ) {
ai->authorizeReadOnly( cc().database()->name.c_str() );
- } else {
+ }
+ else {
ai->authorize( cc().database()->name.c_str() );
}
return true;
}
} cmdAuthenticate;
-
+
} // namespace mongo
diff --git a/db/security_key.cpp b/db/security_key.cpp
new file mode 100644
index 0000000..1ea7021
--- /dev/null
+++ b/db/security_key.cpp
@@ -0,0 +1,105 @@
+// security_key.cpp
+/*
+ * Copyright (C) 2010 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+/**
+ * This file contains inter-mongo instance security helpers. Due to the
+ * requirement that it be possible to compile this into mongos and mongod, it
+ * should not depend on much external stuff.
+ */
+
+#include "pch.h"
+#include "security_key.h"
+#include "../client/dbclient.h"
+
+#include <sys/stat.h>
+
+namespace mongo {
+
+ bool noauth = true;
+ AuthInfo internalSecurity;
+
+ bool setUpSecurityKey(const string& filename) {
+ struct stat stats;
+
+ // check obvious file errors
+ if (stat(filename.c_str(), &stats) == -1) {
+ log() << "error getting file " << filename << ": " << strerror(errno) << endl;
+ return false;
+ }
+
+#if !defined(WIN32)
+ // check permissions: must be X00, where X is >= 4
+ if ((stats.st_mode & (S_IRWXG|S_IRWXO)) != 0) {
+ log() << "permissions on " << filename << " are too open" << endl;
+ return false;
+ }
+#endif
+
+ const unsigned long long fileLength = stats.st_size;
+ if (fileLength < 6 || fileLength > 1024) {
+ log() << " key file " << filename << " has length " << stats.st_size
+ << ", must be between 6 and 1024 chars" << endl;
+ return false;
+ }
+
+ FILE* file = fopen( filename.c_str(), "rb" );
+ if (!file) {
+ log() << "error opening file: " << filename << ": " << strerror(errno) << endl;
+ return false;
+ }
+
+ string str = "";
+
+ // strip key file
+ unsigned long long read = 0;
+ while (read < fileLength) {
+ char buf;
+ int readLength = fread(&buf, 1, 1, file);
+ if (readLength < 1) {
+ log() << "error reading file " << filename << endl;
+ return false;
+ }
+ read++;
+
+ // check for whitespace
+ if ((buf >= '\x09' && buf <= '\x0D') || buf == ' ') {
+ continue;
+ }
+
+ // check valid base64
+ if ((buf < 'A' || buf > 'Z') && (buf < 'a' || buf > 'z') && (buf < '0' || buf > '9') && buf != '+' && buf != '/') {
+ log() << "invalid char in key file " << filename << ": " << buf << endl;
+ return false;
+ }
+
+ str += buf;
+ }
+
+ if (str.size() < 6) {
+ log() << "security key must be at least 6 characters" << endl;
+ return false;
+ }
+
+ log(1) << "security key: " << str << endl;
+
+ // createPWDigest should really not be a member func
+ DBClientConnection conn;
+ internalSecurity.pwd = conn.createPasswordDigest(internalSecurity.user, str);
+
+ return true;
+ }
+} // namespace mongo
diff --git a/db/security_key.h b/db/security_key.h
new file mode 100644
index 0000000..86f1307
--- /dev/null
+++ b/db/security_key.h
@@ -0,0 +1,47 @@
+// security_key.h
+
+/**
+* Copyright (C) 2009 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+namespace mongo {
+
+ /**
+ * Internal secret key info.
+ */
+ struct AuthInfo {
+ AuthInfo() {
+ user = "__system";
+ }
+ string user;
+ string pwd;
+ };
+
+ // --noauth cmd line option
+ extern bool noauth;
+ extern AuthInfo internalSecurity;
+
+ /**
+ * This method checks the validity of filename as a security key, hashes its
+ * contents, and stores it in the internalSecurity variable. Prints an
+ * error message to the logs if there's an error.
+ * @param filename the file containing the key
+ * @return if the key was successfully stored
+ */
+ bool setUpSecurityKey(const string& filename);
+
+} // namespace mongo
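The new security_key files above add setUpSecurityKey(), which validates a key file (restrictive permissions, 6-1024 base64 characters after whitespace is stripped) and stores the derived password digest in internalSecurity. A minimal startup-usage sketch follows; the wrapper name, option plumbing, and exit path are assumptions for illustration, not the actual mongod option handling.

    // Hypothetical startup helper showing how setUpSecurityKey() might be
    // wired in; only setUpSecurityKey(), noauth, and internalSecurity come
    // from the header above -- everything else is an illustrative assumption.
    void initSecurityFromKeyFile(const string& keyFilePath) {
        if (keyFilePath.empty())
            return;                      // no key file, nothing to do
        if (!setUpSecurityKey(keyFilePath)) {
            // setUpSecurityKey() already logged why (permissions, length, or
            // invalid characters), so just refuse to start
            dbexit(EXIT_BADOPTIONS);
        }
        noauth = false;                  // a key file implies auth is enabled
    }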
diff --git a/db/stats/counters.cpp b/db/stats/counters.cpp
index a2d4cfb..889e8a8 100644
--- a/db/stats/counters.cpp
+++ b/db/stats/counters.cpp
@@ -22,7 +22,7 @@
namespace mongo {
- OpCounters::OpCounters(){
+ OpCounters::OpCounters() {
int zero = 0;
BSONObjBuilder b;
@@ -42,16 +42,16 @@ namespace mongo {
_command = (AtomicUInt*)_obj["command"].value();
}
- void OpCounters::gotOp( int op , bool isCommand ){
- switch ( op ){
+ void OpCounters::gotOp( int op , bool isCommand ) {
+ switch ( op ) {
case dbInsert: /*gotInsert();*/ break; // need to handle multi-insert
- case dbQuery:
+ case dbQuery:
if ( isCommand )
gotCommand();
- else
- gotQuery();
+ else
+ gotQuery();
break;
-
+
case dbUpdate: gotUpdate(); break;
case dbDelete: gotDelete(); break;
case dbGetMore: gotGetMore(); break;
@@ -62,24 +62,48 @@ namespace mongo {
default: log() << "OpCounters::gotOp unknown op: " << op << endl;
}
}
-
- IndexCounters::IndexCounters(){
+
+ BSONObj& OpCounters::getObj() {
+ const unsigned MAX = 1 << 30;
+ RARELY {
+ bool wrap =
+ _insert->get() > MAX ||
+ _query->get() > MAX ||
+ _update->get() > MAX ||
+ _delete->get() > MAX ||
+ _getmore->get() > MAX ||
+ _command->get() > MAX;
+
+ if ( wrap ) {
+ _insert->zero();
+ _query->zero();
+ _update->zero();
+ _delete->zero();
+ _getmore->zero();
+ _command->zero();
+ }
+
+ }
+ return _obj;
+ }
+
+ IndexCounters::IndexCounters() {
_memSupported = _pi.blockCheckSupported();
-
+
_btreeMemHits = 0;
_btreeMemMisses = 0;
_btreeAccesses = 0;
-
-
+
+
_maxAllowed = ( numeric_limits< long long >::max() ) / 2;
_resets = 0;
_sampling = 0;
_samplingrate = 100;
}
-
- void IndexCounters::append( BSONObjBuilder& b ){
- if ( ! _memSupported ){
+
+ void IndexCounters::append( BSONObjBuilder& b ) {
+ if ( ! _memSupported ) {
b.append( "note" , "not supported on this platform" );
return;
}
@@ -90,33 +114,33 @@ namespace mongo {
bb.appendNumber( "misses" , _btreeMemMisses );
bb.append( "resets" , _resets );
-
+
bb.append( "missRatio" , (_btreeAccesses ? (_btreeMemMisses / (double)_btreeAccesses) : 0) );
-
+
bb.done();
-
- if ( _btreeAccesses > _maxAllowed ){
+
+ if ( _btreeAccesses > _maxAllowed ) {
_btreeAccesses = 0;
_btreeMemMisses = 0;
_btreeMemHits = 0;
_resets++;
}
}
-
+
FlushCounters::FlushCounters()
: _total_time(0)
, _flushes(0)
, _last()
{}
- void FlushCounters::flushed(int ms){
+ void FlushCounters::flushed(int ms) {
_flushes++;
_total_time += ms;
_last_time = ms;
_last = jsTime();
}
- void FlushCounters::append( BSONObjBuilder& b ){
+ void FlushCounters::append( BSONObjBuilder& b ) {
b.appendNumber( "flushes" , _flushes );
b.appendNumber( "total_ms" , _total_time );
b.appendNumber( "average_ms" , (_flushes ? (_total_time / double(_flushes)) : 0.0) );
@@ -125,25 +149,59 @@ namespace mongo {
}
- void GenericCounter::hit( const string& name , int count ){
+ void GenericCounter::hit( const string& name , int count ) {
scoped_lock lk( _mutex );
_counts[name]++;
}
-
+
BSONObj GenericCounter::getObj() {
BSONObjBuilder b(128);
{
mongo::mutex::scoped_lock lk( _mutex );
- for ( map<string,long long>::iterator i=_counts.begin(); i!=_counts.end(); i++ ){
+ for ( map<string,long long>::iterator i=_counts.begin(); i!=_counts.end(); i++ ) {
b.appendNumber( i->first , i->second );
}
}
return b.obj();
}
-
+
+ void NetworkCounter::hit( long long bytesIn , long long bytesOut ) {
+ const long long MAX = 1ULL << 60;
+
+        // don't care about the race as it's just a counter
+ bool overflow = _bytesIn > MAX || _bytesOut > MAX;
+
+ if ( overflow ) {
+ _lock.lock();
+ _overflows++;
+ _bytesIn = bytesIn;
+ _bytesOut = bytesOut;
+ _requests = 1;
+ _lock.unlock();
+ }
+ else {
+ _lock.lock();
+ _bytesIn += bytesIn;
+ _bytesOut += bytesOut;
+ _requests++;
+ _lock.unlock();
+ }
+ }
+
+ void NetworkCounter::append( BSONObjBuilder& b ) {
+ _lock.lock();
+ b.appendNumber( "bytesIn" , _bytesIn );
+ b.appendNumber( "bytesOut" , _bytesOut );
+ b.appendNumber( "numRequests" , _requests );
+ _lock.unlock();
+ }
+
OpCounters globalOpCounters;
+ OpCounters replOpCounters;
IndexCounters globalIndexCounters;
FlushCounters globalFlushCounters;
+ NetworkCounter networkCounter;
+
}
diff --git a/db/stats/counters.h b/db/stats/counters.h
index 2704464..b5cad85 100644
--- a/db/stats/counters.h
+++ b/db/stats/counters.h
@@ -21,6 +21,7 @@
#include "../jsobj.h"
#include "../../util/message.h"
#include "../../util/processinfo.h"
+#include "../../util/concurrency/spin_lock.h"
namespace mongo {
@@ -30,28 +31,33 @@ namespace mongo {
*/
class OpCounters {
public:
-
+
OpCounters();
- AtomicUInt * getInsert(){ return _insert; }
- AtomicUInt * getQuery(){ return _query; }
- AtomicUInt * getUpdate(){ return _update; }
- AtomicUInt * getDelete(){ return _delete; }
- AtomicUInt * getGetMore(){ return _getmore; }
- AtomicUInt * getCommand(){ return _command; }
-
- void gotInsert(){ _insert[0]++; }
- void gotQuery(){ _query[0]++; }
- void gotUpdate(){ _update[0]++; }
- void gotDelete(){ _delete[0]++; }
- void gotGetMore(){ _getmore[0]++; }
- void gotCommand(){ _command[0]++; }
+ AtomicUInt * getInsert() { return _insert; }
+ AtomicUInt * getQuery() { return _query; }
+ AtomicUInt * getUpdate() { return _update; }
+ AtomicUInt * getDelete() { return _delete; }
+ AtomicUInt * getGetMore() { return _getmore; }
+ AtomicUInt * getCommand() { return _command; }
+
+ void incInsertInWriteLock(int n) { _insert->x += n; }
+ void gotInsert() { _insert[0]++; }
+ void gotQuery() { _query[0]++; }
+ void gotUpdate() { _update[0]++; }
+ void gotDelete() { _delete[0]++; }
+ void gotGetMore() { _getmore[0]++; }
+ void gotCommand() { _command[0]++; }
void gotOp( int op , bool isCommand );
- BSONObj& getObj(){ return _obj; }
+ BSONObj& getObj();
+
private:
BSONObj _obj;
+
+ // todo: there will be a lot of cache line contention on these. need to do something
+ // else eventually.
AtomicUInt * _insert;
AtomicUInt * _query;
AtomicUInt * _update;
@@ -59,14 +65,16 @@ namespace mongo {
AtomicUInt * _getmore;
AtomicUInt * _command;
};
-
+
extern OpCounters globalOpCounters;
+ extern OpCounters replOpCounters;
+
class IndexCounters {
public:
IndexCounters();
-
- void btree( char * node ){
+
+ void btree( char * node ) {
if ( ! _memSupported )
return;
if ( _sampling++ % _samplingrate )
@@ -74,28 +82,28 @@ namespace mongo {
btree( _pi.blockInMemory( node ) );
}
- void btree( bool memHit ){
+ void btree( bool memHit ) {
if ( memHit )
_btreeMemHits++;
else
_btreeMemMisses++;
_btreeAccesses++;
}
- void btreeHit(){ _btreeMemHits++; _btreeAccesses++; }
- void btreeMiss(){ _btreeMemMisses++; _btreeAccesses++; }
-
+ void btreeHit() { _btreeMemHits++; _btreeAccesses++; }
+ void btreeMiss() { _btreeMemMisses++; _btreeAccesses++; }
+
void append( BSONObjBuilder& b );
-
+
private:
ProcessInfo _pi;
bool _memSupported;
int _sampling;
int _samplingrate;
-
+
int _resets;
long long _maxAllowed;
-
+
long long _btreeMemMisses;
long long _btreeMemHits;
long long _btreeAccesses;
@@ -108,7 +116,7 @@ namespace mongo {
FlushCounters();
void flushed(int ms);
-
+
void append( BSONObjBuilder& b );
private:
@@ -130,4 +138,21 @@ namespace mongo {
map<string,long long> _counts; // TODO: replace with thread safe map
mongo::mutex _mutex;
};
+
+ class NetworkCounter {
+ public:
+ NetworkCounter() : _bytesIn(0), _bytesOut(0), _requests(0), _overflows(0) {}
+ void hit( long long bytesIn , long long bytesOut );
+ void append( BSONObjBuilder& b );
+ private:
+ long long _bytesIn;
+ long long _bytesOut;
+ long long _requests;
+
+ long long _overflows;
+
+ SpinLock _lock;
+ };
+
+ extern NetworkCounter networkCounter;
}
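The counters changes above make OpCounters::getObj() zero every counter once any of them passes 2^30, so the 32-bit AtomicUInt values cannot overflow between reports; NetworkCounter::hit() applies an analogous reset to the byte totals. A standalone sketch of the same wrap-on-threshold idea with plain integers (illustrative only, not the AtomicUInt code):

    #include <algorithm>

    // Standalone illustration of the wrap-on-threshold reset used by
    // OpCounters::getObj(); plain unsigned ints stand in for AtomicUInt.
    struct SimpleOpCounters {
        unsigned insert, query, update;
        static const unsigned kMax = 1u << 30;

        void maybeWrap() {
            // if any counter nears overflow, zero them all so the reported
            // values remain mutually comparable
            if (std::max(insert, std::max(query, update)) > kMax) {
                insert = query = update = 0;
            }
        }
    };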
diff --git a/db/stats/fine_clock.h b/db/stats/fine_clock.h
index 1f23175..02600e7 100644
--- a/db/stats/fine_clock.h
+++ b/db/stats/fine_clock.h
@@ -36,29 +36,30 @@ namespace mongo {
* Really, you shouldn't be using this class in hot code paths for
* platforms you're not sure whether the overhead is low.
*/
- class FineClock{
+ class FineClock {
public:
typedef timespec WallTime;
- static WallTime now(){
+ static WallTime now() {
struct timespec ts;
clock_gettime(CLOCK_MONOTONIC, &ts);
return ts;
}
- static uint64_t diffInNanos( WallTime end, WallTime start ){
+ static uint64_t diffInNanos( WallTime end, WallTime start ) {
uint64_t diff;
- if ( end.tv_nsec < start.tv_nsec ){
+ if ( end.tv_nsec < start.tv_nsec ) {
diff = 1000000000 * ( end.tv_sec - start.tv_sec - 1);
diff += 1000000000 + end.tv_nsec - start.tv_nsec;
- } else {
+ }
+ else {
diff = 1000000000 * ( end.tv_sec - start.tv_sec );
diff += end.tv_nsec - start.tv_nsec;
}
return diff;
}
-
+
};
}
diff --git a/db/stats/service_stats.cpp b/db/stats/service_stats.cpp
index 5574ecb..d69147f 100644
--- a/db/stats/service_stats.cpp
+++ b/db/stats/service_stats.cpp
@@ -25,7 +25,7 @@ namespace mongo {
using std::ostringstream;
- ServiceStats::ServiceStats(){
+ ServiceStats::ServiceStats() {
// Time histogram covers up to 128msec in exponential intervals
// starting at 125usec.
Histogram::Options timeOpts;
@@ -43,12 +43,12 @@ namespace mongo {
_spaceHistogram = new Histogram( spaceOpts );
}
- ServiceStats::~ServiceStats(){
+ ServiceStats::~ServiceStats() {
delete _timeHistogram;
delete _spaceHistogram;
}
- void ServiceStats::logResponse( uint64_t duration, uint64_t bytes ){
+ void ServiceStats::logResponse( uint64_t duration, uint64_t bytes ) {
_spinLock.lock();
_timeHistogram->insert( duration / 1000 /* in usecs */ );
_spaceHistogram->insert( bytes );
diff --git a/db/stats/snapshots.cpp b/db/stats/snapshots.cpp
index 3ce80ca..a81568d 100644
--- a/db/stats/snapshots.cpp
+++ b/db/stats/snapshots.cpp
@@ -27,28 +27,27 @@
handles snapshotting performance metrics and other such things
*/
namespace mongo {
- void SnapshotData::takeSnapshot(){
- _created = curTimeMicros64();
- _globalUsage = Top::global.getGlobalData();
+ void SnapshotData::takeSnapshot() {
+ _created = curTimeMicros64();
+ _globalUsage = Top::global.getGlobalData();
_totalWriteLockedTime = dbMutex.info().getTimeLocked();
Top::global.cloneMap(_usage);
}
SnapshotDelta::SnapshotDelta( const SnapshotData& older , const SnapshotData& newer )
- : _older( older ) , _newer( newer )
- {
+ : _older( older ) , _newer( newer ) {
assert( _newer._created > _older._created );
_elapsed = _newer._created - _older._created;
-
+
}
-
- Top::CollectionData SnapshotDelta::globalUsageDiff(){
+
+ Top::CollectionData SnapshotDelta::globalUsageDiff() {
return Top::CollectionData( _older._globalUsage , _newer._globalUsage );
}
- Top::UsageMap SnapshotDelta::collectionUsageDiff(){
+ Top::UsageMap SnapshotDelta::collectionUsageDiff() {
Top::UsageMap u;
-
- for ( Top::UsageMap::const_iterator i=_newer._usage.begin(); i != _newer._usage.end(); i++ ){
+
+ for ( Top::UsageMap::const_iterator i=_newer._usage.begin(); i != _newer._usage.end(); i++ ) {
Top::UsageMap::const_iterator j = _older._usage.find(i->first);
if (j != _older._usage.end())
u[i->first] = Top::CollectionData( j->second , i->second );
@@ -62,8 +61,8 @@ namespace mongo {
, _loc(0)
, _stored(0)
{}
-
- const SnapshotData* Snapshots::takeSnapshot(){
+
+ const SnapshotData* Snapshots::takeSnapshot() {
scoped_lock lk(_lock);
_loc = ( _loc + 1 ) % _n;
_snapshots[_loc].takeSnapshot();
@@ -72,7 +71,7 @@ namespace mongo {
return &_snapshots[_loc];
}
- auto_ptr<SnapshotDelta> Snapshots::computeDelta( int numBack ){
+ auto_ptr<SnapshotDelta> Snapshots::computeDelta( int numBack ) {
scoped_lock lk(_lock);
auto_ptr<SnapshotDelta> p;
if ( numBack < numDeltas() )
@@ -80,43 +79,43 @@ namespace mongo {
return p;
}
- const SnapshotData& Snapshots::getPrev( int numBack ){
+ const SnapshotData& Snapshots::getPrev( int numBack ) {
int x = _loc - numBack;
if ( x < 0 )
x += _n;
return _snapshots[x];
}
- void Snapshots::outputLockInfoHTML( stringstream& ss ){
+ void Snapshots::outputLockInfoHTML( stringstream& ss ) {
scoped_lock lk(_lock);
ss << "\n<div>";
- for ( int i=0; i<numDeltas(); i++ ){
+ for ( int i=0; i<numDeltas(); i++ ) {
SnapshotDelta d( getPrev(i+1) , getPrev(i) );
unsigned e = (unsigned) d.elapsed() / 1000;
ss << (unsigned)(100*d.percentWriteLocked());
- if( e < 3900 || e > 4100 )
+ if( e < 3900 || e > 4100 )
ss << '(' << e / 1000.0 << "s)";
ss << ' ';
}
ss << "</div>\n";
}
- void SnapshotThread::run(){
+ void SnapshotThread::run() {
Client::initThread("snapshotthread");
Client& client = cc();
long long numLoops = 0;
-
+
const SnapshotData* prev = 0;
- while ( ! inShutdown() ){
+ while ( ! inShutdown() ) {
try {
const SnapshotData* s = statsSnapshots.takeSnapshot();
-
- if ( prev ){
+
+ if ( prev ) {
unsigned long long elapsed = s->_created - prev->_created;
- if ( cmdLine.cpu ){
+ if ( cmdLine.cpu ) {
SnapshotDelta d( *prev , *s );
log() << "cpu: elapsed:" << (elapsed/1000) <<" writelock: " << (int)(100*d.percentWriteLocked()) << "%" << endl;
}
@@ -125,14 +124,14 @@ namespace mongo {
prev = s;
}
- catch ( std::exception& e ){
+ catch ( std::exception& e ) {
log() << "ERROR in SnapshotThread: " << e.what() << endl;
}
-
+
numLoops++;
sleepsecs(4);
}
-
+
client.shutdown();
}
@@ -140,15 +139,15 @@ namespace mongo {
class WriteLockStatus : public WebStatusPlugin {
public:
- WriteLockStatus() : WebStatusPlugin( "write lock" , 51 , "% time in write lock, by 4 sec periods" ){}
- virtual void init(){}
+ WriteLockStatus() : WebStatusPlugin( "write lock" , 51 , "% time in write lock, by 4 sec periods" ) {}
+ virtual void init() {}
- virtual void run( stringstream& ss ){
+ virtual void run( stringstream& ss ) {
statsSnapshots.outputLockInfoHTML( ss );
ss << "<a "
- "href=\"http://www.mongodb.org/pages/viewpage.action?pageId=7209296\" "
- "title=\"snapshot: was the db in the write lock when this page was generated?\">";
+ "href=\"http://www.mongodb.org/pages/viewpage.action?pageId=7209296\" "
+ "title=\"snapshot: was the db in the write lock when this page was generated?\">";
ss << "write locked now:</a> " << (dbMutex.info().isLocked() ? "true" : "false") << "\n";
}
@@ -156,22 +155,26 @@ namespace mongo {
class DBTopStatus : public WebStatusPlugin {
public:
- DBTopStatus() : WebStatusPlugin( "dbtop" , 50 , "(occurences|percent of elapsed)" ){}
+        DBTopStatus() : WebStatusPlugin( "dbtop" , 50 , "(occurrences|percent of elapsed)" ) {}
- void display( stringstream& ss , double elapsed , const Top::UsageData& usage ){
+ void display( stringstream& ss , double elapsed , const Top::UsageData& usage ) {
ss << "<td>";
ss << usage.count;
ss << "</td><td>";
double per = 100 * ((double)usage.time)/elapsed;
- ss << setprecision(1) << fixed << per << "%";
+ if( per == (int) per )
+ ss << (int) per;
+ else
+ ss << setprecision(1) << fixed << per;
+ ss << '%';
ss << "</td>";
}
- void display( stringstream& ss , double elapsed , const string& ns , const Top::CollectionData& data ){
- if ( ns != "GLOBAL" && data.total.count == 0 )
+ void display( stringstream& ss , double elapsed , const string& ns , const Top::CollectionData& data ) {
+ if ( ns != "TOTAL" && data.total.count == 0 )
return;
ss << "<tr><th>" << ns << "</th>";
-
+
display( ss , elapsed , data.total );
display( ss , elapsed , data.readLock );
@@ -182,43 +185,43 @@ namespace mongo {
display( ss , elapsed , data.insert );
display( ss , elapsed , data.update );
display( ss , elapsed , data.remove );
-
+
ss << "</tr>\n";
}
- void run( stringstream& ss ){
+ void run( stringstream& ss ) {
auto_ptr<SnapshotDelta> delta = statsSnapshots.computeDelta();
if ( ! delta.get() )
return;
-
+
ss << "<table border=1 cellpadding=2 cellspacing=0>";
ss << "<tr align='left'><th>";
- ss << a("http://www.mongodb.org/display/DOCS/Developer+FAQ#DeveloperFAQ-What%27sa%22namespace%22%3F", "namespace") <<
- "NS</a></th>"
- "<th colspan=2>total</th>"
- "<th colspan=2>Reads</th>"
- "<th colspan=2>Writes</th>"
- "<th colspan=2>Queries</th>"
- "<th colspan=2>GetMores</th>"
- "<th colspan=2>Inserts</th>"
- "<th colspan=2>Updates</th>"
- "<th colspan=2>Removes</th>";
+ ss << a("http://www.mongodb.org/display/DOCS/Developer+FAQ#DeveloperFAQ-What%27sa%22namespace%22%3F", "namespace") <<
+ "NS</a></th>"
+ "<th colspan=2>total</th>"
+ "<th colspan=2>Reads</th>"
+ "<th colspan=2>Writes</th>"
+ "<th colspan=2>Queries</th>"
+ "<th colspan=2>GetMores</th>"
+ "<th colspan=2>Inserts</th>"
+ "<th colspan=2>Updates</th>"
+ "<th colspan=2>Removes</th>";
ss << "</tr>\n";
-
- display( ss , (double) delta->elapsed() , "GLOBAL" , delta->globalUsageDiff() );
-
+
+ display( ss , (double) delta->elapsed() , "TOTAL" , delta->globalUsageDiff() );
+
Top::UsageMap usage = delta->collectionUsageDiff();
- for ( Top::UsageMap::iterator i=usage.begin(); i != usage.end(); i++ ){
+ for ( Top::UsageMap::iterator i=usage.begin(); i != usage.end(); i++ ) {
display( ss , (double) delta->elapsed() , i->first , i->second );
}
-
+
ss << "</table>";
-
+
}
- virtual void init(){}
+ virtual void init() {}
} dbtopStatus;
Snapshots statsSnapshots;
- SnapshotThread snapshotThread;
+ SnapshotThread snapshotThread;
}
diff --git a/db/stats/snapshots.h b/db/stats/snapshots.h
index 6d8e23d..d9b8e5e 100644
--- a/db/stats/snapshots.h
+++ b/db/stats/snapshots.h
@@ -28,7 +28,7 @@
namespace mongo {
class SnapshotThread;
-
+
/**
* stores a point in time snapshot
* i.e. all counters at a given time
@@ -45,14 +45,14 @@ namespace mongo {
friend class SnapshotDelta;
friend class Snapshots;
};
-
+
/**
* contains performance information for a time period
*/
class SnapshotDelta {
public:
SnapshotDelta( const SnapshotData& older , const SnapshotData& newer );
-
+
unsigned long long start() const {
return _older._created;
}
@@ -60,7 +60,7 @@ namespace mongo {
unsigned long long elapsed() const {
return _elapsed;
}
-
+
unsigned long long timeInWriteLock() const {
return _newer._totalWriteLockedTime - _older._totalWriteLockedTime;
}
@@ -83,15 +83,15 @@ namespace mongo {
class Snapshots {
public:
Snapshots(int n=100);
-
+
const SnapshotData* takeSnapshot();
-
+
int numDeltas() const { return _stored-1; }
const SnapshotData& getPrev( int numBack = 0 );
auto_ptr<SnapshotDelta> computeDelta( int numBack = 0 );
-
-
+
+
void outputLockInfoHTML( stringstream& ss );
private:
mongo::mutex _lock;
@@ -103,10 +103,10 @@ namespace mongo {
class SnapshotThread : public BackgroundJob {
public:
- string name() { return "snapshot"; }
+ virtual string name() const { return "snapshot"; }
void run();
};
-
+
extern Snapshots statsSnapshots;
extern SnapshotThread snapshotThread;
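For reference, a minimal sketch of how the Snapshots/SnapshotDelta interface above can be consumed: take a snapshot periodically, then compute a delta to derive interval statistics such as write-lock percentage. The function name sampleLockStats and the surrounding scaffolding are illustrative, not part of the codebase.

    #include <memory>
    #include <sstream>

    // assumes the declarations from db/stats/snapshots.h above
    void sampleLockStats( mongo::Snapshots& snaps , std::stringstream& out ) {
        snaps.takeSnapshot();                                          // record current counters
        std::auto_ptr<mongo::SnapshotDelta> d = snaps.computeDelta();  // most recent interval
        if ( ! d.get() || d->elapsed() == 0 )
            return;                                                    // need at least two snapshots
        double pctWrite = 100.0 * d->timeInWriteLock() / d->elapsed();
        out << "write lock % over last interval: " << pctWrite;
    }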
diff --git a/db/stats/top.cpp b/db/stats/top.cpp
index 3e65261..77aef0d 100644
--- a/db/stats/top.cpp
+++ b/db/stats/top.cpp
@@ -22,16 +22,16 @@
#include "../commands.h"
namespace mongo {
-
- Top::UsageData::UsageData( const UsageData& older , const UsageData& newer )
- : time(newer.time-older.time) ,
- count(newer.count-older.count)
- {
-
+
+ Top::UsageData::UsageData( const UsageData& older , const UsageData& newer ) {
+ // this won't be 100% accurate on rollovers and drop(), but at least it won't be negative
+ time = (newer.time > older.time) ? (newer.time - older.time) : newer.time;
+ count = (newer.count > older.count) ? (newer.count - older.count) : newer.count;
+
}
Top::CollectionData::CollectionData( const CollectionData& older , const CollectionData& newer )
- : total( older.total , newer.total ) ,
+ : total( older.total , newer.total ) ,
readLock( older.readLock , newer.readLock ) ,
writeLock( older.writeLock , newer.writeLock ) ,
queries( older.queries , newer.queries ) ,
@@ -39,17 +39,18 @@ namespace mongo {
insert( older.insert , newer.insert ) ,
update( older.update , newer.update ) ,
remove( older.remove , newer.remove ),
- commands( older.commands , newer.commands )
- {
-
+ commands( older.commands , newer.commands ) {
+
}
-
- void Top::record( const string& ns , int op , int lockType , long long micros , bool command ){
+ void Top::record( const string& ns , int op , int lockType , long long micros , bool command ) {
+ if ( ns[0] == '?' )
+ return;
+
//cout << "record: " << ns << "\t" << op << "\t" << command << endl;
scoped_lock lk(_lock);
-
- if ( ( command || op == dbQuery ) && ns == _lastDropped ){
+
+ if ( ( command || op == dbQuery ) && ns == _lastDropped ) {
_lastDropped = "";
return;
}
@@ -59,22 +60,15 @@ namespace mongo {
_record( _global , op , lockType , micros , command );
}
- void Top::collectionDropped( const string& ns ){
- //cout << "collectionDropped: " << ns << endl;
- scoped_lock lk(_lock);
- _usage.erase(ns);
- _lastDropped = ns;
- }
-
- void Top::_record( CollectionData& c , int op , int lockType , long long micros , bool command ){
+ void Top::_record( CollectionData& c , int op , int lockType , long long micros , bool command ) {
c.total.inc( micros );
-
+
if ( lockType > 0 )
c.writeLock.inc( micros );
else if ( lockType < 0 )
c.readLock.inc( micros );
-
- switch ( op ){
+
+ switch ( op ) {
case 0:
// use 0 for unknown, non-specific
break;
@@ -98,7 +92,7 @@ namespace mongo {
break;
case dbKillCursors:
break;
- case opReply:
+ case opReply:
case dbMsg:
log() << "unexpected op in Top::record: " << op << endl;
break;
@@ -108,55 +102,62 @@ namespace mongo {
}
- void Top::cloneMap(Top::UsageMap& out){
+ void Top::collectionDropped( const string& ns ) {
+ //cout << "collectionDropped: " << ns << endl;
+ scoped_lock lk(_lock);
+ _usage.erase(ns);
+ _lastDropped = ns;
+ }
+
+ void Top::cloneMap(Top::UsageMap& out) const {
scoped_lock lk(_lock);
out = _usage;
}
- void Top::append( BSONObjBuilder& b ){
+ void Top::append( BSONObjBuilder& b ) {
scoped_lock lk( _lock );
- append( b , _usage );
+ _appendToUsageMap( b , _usage );
}
- void Top::append( BSONObjBuilder& b , const char * name , const UsageData& map ){
- BSONObjBuilder bb( b.subobjStart( name ) );
- bb.appendNumber( "time" , map.time );
- bb.appendNumber( "count" , map.count );
- bb.done();
- }
+ void Top::_appendToUsageMap( BSONObjBuilder& b , const UsageMap& map ) const {
+ for ( UsageMap::const_iterator i=map.begin(); i!=map.end(); i++ ) {
+ BSONObjBuilder bb( b.subobjStart( i->first ) );
- void Top::append( BSONObjBuilder& b , const UsageMap& map ){
- for ( UsageMap::const_iterator i=map.begin(); i!=map.end(); i++ ){
- BSONObjBuilder bb( b.subobjStart( i->first.c_str() ) );
-
const CollectionData& coll = i->second;
-
- append( b , "total" , coll.total );
-
- append( b , "readLock" , coll.readLock );
- append( b , "writeLock" , coll.writeLock );
-
- append( b , "queries" , coll.queries );
- append( b , "getmore" , coll.getmore );
- append( b , "insert" , coll.insert );
- append( b , "update" , coll.update );
- append( b , "remove" , coll.remove );
- append( b , "commands" , coll.commands );
-
+
+ _appendStatsEntry( b , "total" , coll.total );
+
+ _appendStatsEntry( b , "readLock" , coll.readLock );
+ _appendStatsEntry( b , "writeLock" , coll.writeLock );
+
+ _appendStatsEntry( b , "queries" , coll.queries );
+ _appendStatsEntry( b , "getmore" , coll.getmore );
+ _appendStatsEntry( b , "insert" , coll.insert );
+ _appendStatsEntry( b , "update" , coll.update );
+ _appendStatsEntry( b , "remove" , coll.remove );
+ _appendStatsEntry( b , "commands" , coll.commands );
+
bb.done();
}
}
+ void Top::_appendStatsEntry( BSONObjBuilder& b , const char * statsName , const UsageData& map ) const {
+ BSONObjBuilder bb( b.subobjStart( statsName ) );
+ bb.appendNumber( "time" , map.time );
+ bb.appendNumber( "count" , map.count );
+ bb.done();
+ }
+
class TopCmd : public Command {
public:
- TopCmd() : Command( "top", true ){}
+ TopCmd() : Command( "top", true ) {}
virtual bool slaveOk() const { return true; }
virtual bool adminOnly() const { return true; }
- virtual LockType locktype() const { return READ; }
+ virtual LockType locktype() const { return READ; }
virtual void help( stringstream& help ) const { help << "usage by collection"; }
- virtual bool run(const string& , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl){
+ virtual bool run(const string& , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
{
BSONObjBuilder b( result.subobjStart( "totals" ) );
Top::global.append( b );
@@ -164,11 +165,11 @@ namespace mongo {
}
return true;
}
-
+
} topCmd;
Top Top::global;
-
+
TopOld::T TopOld::_snapshotStart = TopOld::currentTime();
TopOld::D TopOld::_snapshotDuration;
TopOld::UsageMap TopOld::_totalUsage;
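For reference, the rollover guard introduced in Top::UsageData's diff constructor above reduces to the following rule, shown standalone (safeDelta is an illustrative name): when a counter has gone backwards (e.g. after drop() or a rollover), report the newer value instead of a negative difference.

    // mirrors: time = (newer.time > older.time) ? (newer.time - older.time) : newer.time;
    long long safeDelta( long long older , long long newer ) {
        return ( newer > older ) ? ( newer - older ) : newer;
    }
    // e.g. safeDelta( 100 , 140 ) == 40; after a reset, safeDelta( 100 , 40 ) == 40 rather than -60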
diff --git a/db/stats/top.h b/db/stats/top.h
index 135e8f8..9645ed1 100644
--- a/db/stats/top.h
+++ b/db/stats/top.h
@@ -31,29 +31,27 @@ namespace mongo {
public:
Top() : _lock("Top") { }
- class UsageData {
- public:
- UsageData() : time(0) , count(0){}
+ struct UsageData {
+ UsageData() : time(0) , count(0) {}
UsageData( const UsageData& older , const UsageData& newer );
long long time;
long long count;
- void inc( long long micros ){
+ void inc( long long micros ) {
count++;
time += micros;
}
};
- class CollectionData {
- public:
+ struct CollectionData {
/**
* constructs a diff
*/
- CollectionData(){}
+ CollectionData() {}
CollectionData( const CollectionData& older , const CollectionData& newer );
-
+
UsageData total;
-
+
UsageData readLock;
UsageData writeLock;
@@ -66,25 +64,23 @@ namespace mongo {
};
typedef map<string,CollectionData> UsageMap;
-
+
public:
void record( const string& ns , int op , int lockType , long long micros , bool command );
void append( BSONObjBuilder& b );
- void cloneMap(UsageMap& out);
- CollectionData getGlobalData(){ return _global; }
+ void cloneMap(UsageMap& out) const;
+ CollectionData getGlobalData() const { return _global; }
void collectionDropped( const string& ns );
public: // static stuff
static Top global;
-
- void append( BSONObjBuilder& b , const char * name , const UsageData& map );
- void append( BSONObjBuilder& b , const UsageMap& map );
-
+
private:
-
+ void _appendToUsageMap( BSONObjBuilder& b , const UsageMap& map ) const;
+ void _appendStatsEntry( BSONObjBuilder& b , const char * statsName , const UsageData& map ) const;
void _record( CollectionData& c , int op , int lockType , long long micros , bool command );
- mongo::mutex _lock;
+ mutable mongo::mutex _lock;
CollectionData _global;
UsageMap _usage;
string _lastDropped;
@@ -99,9 +95,9 @@ namespace mongo {
typedef boost::tuple< D, int, int, int > UsageData;
public:
TopOld() : _read(false), _write(false) { }
-
+
/* these are used to record activity: */
-
+
void clientStart( const char *client ) {
clientStop();
_currentStart = currentTime();
@@ -130,11 +126,11 @@ namespace mongo {
/* these are used to fetch the stats: */
- struct Usage {
- string ns;
- D time;
- double pct;
- int reads, writes, calls;
+ struct Usage {
+ string ns;
+ D time;
+ double pct;
+ int reads, writes, calls;
};
static void usage( vector< Usage > &res ) {
@@ -145,7 +141,7 @@ namespace mongo {
UsageMap totalUsage;
fillParentNamespaces( snapshot, _snapshot );
fillParentNamespaces( totalUsage, _totalUsage );
-
+
multimap< D, string, more > sorted;
for( UsageMap::iterator i = snapshot.begin(); i != snapshot.end(); ++i )
sorted.insert( make_pair( i->second.get<0>(), i->first ) );
@@ -181,7 +177,8 @@ namespace mongo {
if ( &_snapshot == &_snapshotA ) {
_snapshot = _snapshotB;
_nextSnapshot = _snapshotA;
- } else {
+ }
+ else {
_snapshot = _snapshotA;
_nextSnapshot = _snapshotB;
}
@@ -211,7 +208,7 @@ namespace mongo {
g.get< 1 >()++;
else if ( !_read && _write )
g.get< 2 >()++;
- g.get< 3 >()++;
+ g.get< 3 >()++;
}
static void fillParentNamespaces( UsageMap &to, const UsageMap &from ) {
for( UsageMap::const_iterator i = from.begin(); i != from.end(); ++i ) {
@@ -224,8 +221,8 @@ namespace mongo {
current = current.substr( 0, dot );
inc( to[ current ], i->second );
dot = current.rfind( "." );
- }
- }
+ }
+ }
}
static void inc( UsageData &to, const UsageData &from ) {
to.get<0>() += from.get<0>();
diff --git a/db/storage.cpp b/db/storage.cpp
deleted file mode 100644
index 63e7639..0000000
--- a/db/storage.cpp
+++ /dev/null
@@ -1,81 +0,0 @@
-// storage.cpp
-/*
- * Copyright (C) 2010 10gen Inc.
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the GNU Affero General Public License, version 3,
- * as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU Affero General Public License for more details.
- *
- * You should have received a copy of the GNU Affero General Public License
- * along with this program. If not, see <http://www.gnu.org/licenses/>.
- */
-
-
-#include "pch.h"
-#include "pdfile.h"
-//#include "reccache.h"
-#include "rec.h"
-#include "db.h"
-
-namespace mongo {
-
-// pick your store for indexes by setting this typedef
-// this doesn't need to be an ifdef, we can make it dynamic
-#if defined(_RECSTORE)
-RecStoreInterface *btreeStore = new CachedBasicRecStore();
-#else
-MongoMemMapped_RecStore *btreeStore = new MongoMemMapped_RecStore();
-#endif
-
-#if 0
-
-#if defined(_RECSTORE)
- static int inited;
-#endif
-
-void writerThread();
-
-void BasicRecStore::init(const char *fn, unsigned recsize)
-{
- massert( 10394 , "compile packing problem recstore?", sizeof(RecStoreHeader) == 8192);
- filename = fn;
- f.open(fn);
- uassert( 10130 , string("couldn't open file:")+fn, f.is_open() );
- len = f.len();
- if( len == 0 ) {
- log() << "creating recstore file " << fn << '\n';
- h.recsize = recsize;
- len = sizeof(RecStoreHeader);
- f.write(0, (const char *) &h, sizeof(RecStoreHeader));
- }
- else {
- f.read(0, (char *) &h, sizeof(RecStoreHeader));
- massert( 10395 , string("recstore was not closed cleanly: ")+fn, h.cleanShutdown==0);
- massert( 10396 , string("recstore recsize mismatch, file:")+fn, h.recsize == recsize);
- massert( 10397 , string("bad recstore [1], file:")+fn, (h.leof-sizeof(RecStoreHeader)) % recsize == 0);
- if( h.leof > len ) {
- stringstream ss;
- ss << "bad recstore, file:" << fn << " leof:" << h.leof << " len:" << len;
- massert( 10398 , ss.str(), false);
- }
- if( h.cleanShutdown )
- log() << "warning: non-clean shutdown for file " << fn << '\n';
- h.cleanShutdown = 2;
- writeHeader();
- f.fsync();
- }
-#if defined(_RECSTORE)
- if( inited++ == 0 ) {
- boost::thread t(writerThread);
- }
-#endif
-}
-
-#endif
-
-}
diff --git a/db/taskqueue.h b/db/taskqueue.h
new file mode 100644
index 0000000..c6a5667
--- /dev/null
+++ b/db/taskqueue.h
@@ -0,0 +1,106 @@
+// @file taskqueue.h
+
+/**
+ * Copyright (C) 2008 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "mongomutex.h"
+
+namespace mongo {
+
+ /** defer work items by queueing them for invocation by another thread. presumption is that
+ consumer thread is outside of locks more than the source thread. Additional presumption
+ is that several objects or micro-tasks will be queued and that having a single thread
+ processing them in batch is helpful as they (in the first use case) use a common data
+ structure that can then be in local cpu caches.
+
+ this class is in db/ as it is dbMutex (mongomutex) specific (so far).
+
+ using a functor instead of go() might be more elegant too; once again, we would like to test any
+ performance differential. also a worry that operator() hides things?
+
+ MT - copyable "micro task" object we can queue
+ must have a static method void MT::go(const MT&)
+
+ see DefInvoke in dbtests/ for an example.
+ */
+ template< class MT >
+ class TaskQueue {
+ public:
+ TaskQueue() : _which(0), _invokeMutex("deferredinvoker") { }
+
+ void defer(MT mt) {
+ // only one writer allowed. however the invoke processing below can occur concurrently with
+ // writes (for the most part)
+ DEV dbMutex.assertWriteLocked();
+
+ _queues[_which].push_back(mt);
+ }
+
+ /** call to process deferrals.
+
+ concurrency: handled herein. multiple threads could call invoke(), but their efforts will be
+ serialized. the common case is that there is a single processor calling invoke().
+
+ normally, you call this outside of any lock. but if you want to fully drain the queue,
+ call from within a read lock. for example:
+ {
+ // drain with minimal time in lock
+ d.invoke();
+ readlock lk;
+ d.invoke();
+ ...
+ }
+ you can also call invoke() periodically to do some work and then pick up more later.
+ */
+ void invoke() {
+ mutex::scoped_lock lk2(_invokeMutex);
+ int toDrain = 0;
+ {
+ // flip queueing to the other queue (we are double buffered)
+ readlocktry lk("", 5);
+ if( !lk.got() )
+ return;
+ toDrain = _which;
+ _which = _which ^ 1;
+ wassert( _queues[_which].empty() ); // we are in dbMutex, so it should be/stay empty til we exit dbMutex
+ }
+
+ _drain( _queues[toDrain] );
+ assert( _queues[toDrain].empty() );
+ }
+
+ private:
+ int _which; // 0 or 1
+ typedef vector< MT > Queue;
+ Queue _queues[2];
+
+ // lock order when multiple locks: dbMutex, _invokeMutex
+ mongo::mutex _invokeMutex;
+
+ void _drain(Queue& queue) {
+ unsigned oldCap = queue.capacity();
+ for( typename Queue::iterator i = queue.begin(); i != queue.end(); i++ ) {
+ const MT& v = *i;
+ MT::go(v);
+ }
+ queue.clear();
+ DEV assert( queue.capacity() == oldCap ); // just checking that clear() doesn't deallocate, we don't want that
+ }
+ };
+
+}
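For reference, a sketch of a micro-task type satisfying the MT contract above (copyable, with static void MT::go(const MT&)) and of the defer()/invoke() call pattern. CountTask and its members are illustrative only; the real example is DefInvoke in dbtests/, as the header comment notes.

    #include <map>
    #include <string>

    struct CountTask {
        std::string ns;                                   // illustrative payload
        static std::map<std::string,int> counts;          // touched only by the draining thread
        static void go( const CountTask& t ) { counts[t.ns]++; }
    };
    std::map<std::string,int> CountTask::counts;

    // producer (must hold the db write lock, see the assert in defer()):
    //     mongo::TaskQueue<CountTask> q;
    //     CountTask t; t.ns = "test.foo";
    //     q.defer( t );
    // consumer (normally outside the lock):
    //     q.invoke();   // flips the double buffer and drains the inactive queue in batch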
diff --git a/db/tests.cpp b/db/tests.cpp
index 1218f1b..00f299e 100644
--- a/db/tests.cpp
+++ b/db/tests.cpp
@@ -32,7 +32,7 @@ namespace mongo {
MemoryMappedFile f;
- long len = 64*1024*1024;
+ unsigned long long len = 64*1024*1024;
char *p = (char *) f.map("/tmp/test.dat", len);
char *start = p;
char *end = p + 64*1024*1024-2;
diff --git a/db/update.cpp b/db/update.cpp
index e178e0f..7de9bb1 100644
--- a/db/update.cpp
+++ b/db/update.cpp
@@ -31,21 +31,25 @@
namespace mongo {
const char* Mod::modNames[] = { "$inc", "$set", "$push", "$pushAll", "$pull", "$pullAll" , "$pop", "$unset" ,
- "$bitand" , "$bitor" , "$bit" , "$addToSet" };
+ "$bitand" , "$bitor" , "$bit" , "$addToSet", "$rename", "$rename"
+ };
unsigned Mod::modNamesNum = sizeof(Mod::modNames)/sizeof(char*);
bool Mod::_pullElementMatch( BSONElement& toMatch ) const {
-
- if ( elt.type() != Object ){
+
+ if ( elt.type() != Object ) {
// if elt isn't an object, then comparison will work
return toMatch.woCompare( elt , false ) == 0;
}
- if ( toMatch.type() != Object ){
+ if ( matcherOnPrimitive )
+ return matcher->matches( toMatch.wrap( "" ) );
+
+ if ( toMatch.type() != Object ) {
// looking for an object, so this can't match
return false;
}
-
+
// now we have an object on both sides
return matcher->matches( toMatch.embeddedObject() );
}
@@ -54,41 +58,53 @@ namespace mongo {
void Mod::appendIncremented( Builder& bb , const BSONElement& in, ModState& ms ) const {
BSONType a = in.type();
BSONType b = elt.type();
-
- if ( a == NumberDouble || b == NumberDouble ){
+
+ if ( a == NumberDouble || b == NumberDouble ) {
ms.incType = NumberDouble;
ms.incdouble = elt.numberDouble() + in.numberDouble();
}
- else if ( a == NumberLong || b == NumberLong ){
+ else if ( a == NumberLong || b == NumberLong ) {
ms.incType = NumberLong;
ms.inclong = elt.numberLong() + in.numberLong();
}
else {
- ms.incType = NumberInt;
- ms.incint = elt.numberInt() + in.numberInt();
+ int x = elt.numberInt() + in.numberInt();
+ if ( x < 0 && elt.numberInt() > 0 && in.numberInt() > 0 ) {
+ // overflow
+ ms.incType = NumberLong;
+ ms.inclong = elt.numberLong() + in.numberLong();
+ }
+ else {
+ ms.incType = NumberInt;
+ ms.incint = elt.numberInt() + in.numberInt();
+ }
}
-
+
ms.appendIncValue( bb , false );
}
template< class Builder >
void appendUnset( Builder &b ) {
}
-
+
template<>
void appendUnset( BSONArrayBuilder &b ) {
b.appendNull();
}
-
+
template< class Builder >
void Mod::apply( Builder& b , BSONElement in , ModState& ms ) const {
- switch ( op ){
-
+ if ( ms.dontApply ) {
+ return;
+ }
+
+ switch ( op ) {
+
case INC: {
appendIncremented( b , in , ms );
break;
}
-
+
case SET: {
_checkForAppending( elt );
b.appendAs( elt , shortFieldName );
@@ -99,13 +115,13 @@ namespace mongo {
appendUnset( b );
break;
}
-
+
case PUSH: {
uassert( 10131 , "$push can only be applied to an array" , in.type() == Array );
BSONObjBuilder bb( b.subarrayStart( shortFieldName ) );
BSONObjIterator i( in.embeddedObject() );
int n=0;
- while ( i.more() ){
+ while ( i.more() ) {
bb.append( i.next() );
n++;
}
@@ -116,28 +132,35 @@ namespace mongo {
bb.done();
break;
}
-
+
case ADDTOSET: {
uassert( 12592 , "$addToSet can only be applied to an array" , in.type() == Array );
BSONObjBuilder bb( b.subarrayStart( shortFieldName ) );
-
+
BSONObjIterator i( in.embeddedObject() );
- int n=0;
+ int n=0;
+
+ if ( isEach() ) {
- if ( isEach() ){
-
BSONElementSet toadd;
parseEach( toadd );
-
- while ( i.more() ){
+
+ while ( i.more() ) {
BSONElement cur = i.next();
bb.append( cur );
- n++;
+ n++;
toadd.erase( cur );
}
-
- for ( BSONElementSet::iterator j=toadd.begin(); j!=toadd.end(); j++ ){
- bb.appendAs( *j , BSONObjBuilder::numStr( n++ ) );
+
+ {
+ BSONObjIterator i( getEach() );
+ while ( i.more() ) {
+ BSONElement e = i.next();
+ if ( toadd.count(e) ) {
+ bb.appendAs( e , BSONObjBuilder::numStr( n++ ) );
+ toadd.erase( e );
+ }
+ }
}
}
@@ -145,34 +168,34 @@ namespace mongo {
bool found = false;
- while ( i.more() ){
+ while ( i.more() ) {
BSONElement cur = i.next();
bb.append( cur );
n++;
if ( elt.woCompare( cur , false ) == 0 )
found = true;
}
-
+
if ( ! found )
bb.appendAs( elt , bb.numStr( n ) );
-
+
}
-
+
bb.done();
break;
}
-
+
case PUSH_ALL: {
uassert( 10132 , "$pushAll can only be applied to an array" , in.type() == Array );
uassert( 10133 , "$pushAll has to be passed an array" , elt.type() );
BSONObjBuilder bb( b.subarrayStart( shortFieldName ) );
-
+
BSONObjIterator i( in.embeddedObject() );
int n=0;
- while ( i.more() ){
+ while ( i.more() ) {
bb.append( i.next() );
n++;
}
@@ -180,34 +203,34 @@ namespace mongo {
ms.pushStartSize = n;
i = BSONObjIterator( elt.embeddedObject() );
- while ( i.more() ){
+ while ( i.more() ) {
bb.appendAs( i.next() , bb.numStr( n++ ) );
}
bb.done();
break;
}
-
+
case PULL:
case PULL_ALL: {
uassert( 10134 , "$pull/$pullAll can only be applied to an array" , in.type() == Array );
BSONObjBuilder bb( b.subarrayStart( shortFieldName ) );
-
+
int n = 0;
BSONObjIterator i( in.embeddedObject() );
- while ( i.more() ){
+ while ( i.more() ) {
BSONElement e = i.next();
bool allowed = true;
- if ( op == PULL ){
+ if ( op == PULL ) {
allowed = ! _pullElementMatch( e );
}
else {
BSONObjIterator j( elt.embeddedObject() );
while( j.more() ) {
BSONElement arrJ = j.next();
- if ( e.woCompare( arrJ, false ) == 0 ){
+ if ( e.woCompare( arrJ, false ) == 0 ) {
allowed = false;
break;
}
@@ -217,7 +240,7 @@ namespace mongo {
if ( allowed )
bb.appendAs( e , bb.numStr( n++ ) );
}
-
+
bb.done();
break;
}
@@ -225,13 +248,13 @@ namespace mongo {
case POP: {
uassert( 10135 , "$pop can only be applied to an array" , in.type() == Array );
BSONObjBuilder bb( b.subarrayStart( shortFieldName ) );
-
+
int n = 0;
BSONObjIterator i( in.embeddedObject() );
- if ( elt.isNumber() && elt.number() < 0 ){
+ if ( elt.isNumber() && elt.number() < 0 ) {
// pop from front
- if ( i.more() ){
+ if ( i.more() ) {
i.next();
n++;
}
@@ -246,7 +269,7 @@ namespace mongo {
while( i.more() ) {
n++;
BSONElement arrI = i.next();
- if ( i.more() ){
+ if ( i.more() ) {
bb.append( arrI );
}
}
@@ -262,23 +285,23 @@ namespace mongo {
uassert( 10136 , "$bit needs an array" , elt.type() == Object );
uassert( 10137 , "$bit can only be applied to numbers" , in.isNumber() );
uassert( 10138 , "$bit can't use a double" , in.type() != NumberDouble );
-
+
int x = in.numberInt();
long long y = in.numberLong();
BSONObjIterator it( elt.embeddedObject() );
- while ( it.more() ){
+ while ( it.more() ) {
BSONElement e = it.next();
uassert( 10139 , "$bit field must be number" , e.isNumber() );
- if ( strcmp( e.fieldName() , "and" ) == 0 ){
- switch( in.type() ){
+ if ( strcmp( e.fieldName() , "and" ) == 0 ) {
+ switch( in.type() ) {
case NumberInt: x = x&e.numberInt(); break;
case NumberLong: y = y&e.numberLong(); break;
default: assert( 0 );
}
}
- else if ( strcmp( e.fieldName() , "or" ) == 0 ){
- switch( in.type() ){
+ else if ( strcmp( e.fieldName() , "or" ) == 0 ) {
+ switch( in.type() ) {
case NumberInt: x = x|e.numberInt(); break;
case NumberLong: y = y|e.numberLong(); break;
default: assert( 0 );
@@ -289,8 +312,8 @@ namespace mongo {
throw UserException( 9016, (string)"unknown bit mod:" + e.fieldName() );
}
}
-
- switch( in.type() ){
+
+ switch( in.type() ) {
case NumberInt: b.append( shortFieldName , x ); break;
case NumberLong: b.append( shortFieldName , y ); break;
default: assert( 0 );
@@ -299,6 +322,15 @@ namespace mongo {
break;
}
+ case RENAME_FROM: {
+ break;
+ }
+
+ case RENAME_TO: {
+ ms.handleRename( b, shortFieldName );
+ break;
+ }
+
default:
stringstream ss;
ss << "Mod::apply can't handle type: " << op;
@@ -306,11 +338,30 @@ namespace mongo {
}
}
+ // validRenamePath() return values:
+ //   -1 : an intermediate path component exists but is not an object (it could be an array)
+ //    0 : the field is missing
+ //    1 : the field is found
+ int validRenamePath( BSONObj obj, const char *path ) {
+ while( const char *p = strchr( path, '.' ) ) {
+ string left( path, p - path );
+ BSONElement e = obj.getField( left );
+ if ( e.eoo() ) {
+ return 0;
+ }
+ if ( e.type() != Object ) {
+ return -1;
+ }
+ obj = e.embeddedObject();
+ path = p + 1;
+ }
+ return !obj.getField( path ).eoo();
+ }
+
auto_ptr<ModSetState> ModSet::prepare(const BSONObj &obj) const {
DEBUGUPDATE( "\t start prepare" );
- ModSetState * mss = new ModSetState( obj );
-
-
+ auto_ptr<ModSetState> mss( new ModSetState( obj ) );
+
+
// Perform this check first, so that we don't leave a partially modified object on uassert.
for ( ModHolder::const_iterator i = _mods.begin(); i != _mods.end(); ++i ) {
DEBUGUPDATE( "\t\t prepare : " << i->first );
@@ -318,23 +369,51 @@ namespace mongo {
const Mod& m = i->second;
BSONElement e = obj.getFieldDotted(m.fieldName);
-
+
ms.m = &m;
ms.old = e;
+ if ( m.op == Mod::RENAME_FROM ) {
+ int source = validRenamePath( obj, m.fieldName );
+ uassert( 13489, "$rename source field invalid", source != -1 );
+ if ( source != 1 ) {
+ ms.dontApply = true;
+ }
+ continue;
+ }
+
+ if ( m.op == Mod::RENAME_TO ) {
+ int source = validRenamePath( obj, m.renameFrom() );
+ if ( source == 1 ) {
+ int target = validRenamePath( obj, m.fieldName );
+ uassert( 13490, "$rename target field invalid", target != -1 );
+ ms.newVal = obj.getFieldDotted( m.renameFrom() );
+ mss->amIInPlacePossible( false );
+ }
+ else {
+ ms.dontApply = true;
+ }
+ continue;
+ }
+
if ( e.eoo() ) {
mss->amIInPlacePossible( m.op == Mod::UNSET );
continue;
- }
-
+ }
+
switch( m.op ) {
case Mod::INC:
uassert( 10140 , "Cannot apply $inc modifier to non-number", e.isNumber() || e.eoo() );
- if ( mss->amIInPlacePossible( e.isNumber() ) ){
+ if ( mss->amIInPlacePossible( e.isNumber() ) ) {
// check more typing info here
- if ( m.elt.type() != e.type() ){
+ if ( m.elt.type() != e.type() ) {
// if i'm incrementing with a double, then the storage has to be a double
- mss->amIInPlacePossible( m.elt.type() != NumberDouble );
+ mss->amIInPlacePossible( m.elt.type() != NumberDouble );
+ }
+
+ // check for overflow
+ if ( e.type() == NumberInt && e.numberLong() + m.elt.numberLong() > numeric_limits<int>::max() ) {
+ mss->amIInPlacePossible( false );
}
}
break;
@@ -343,7 +422,7 @@ namespace mongo {
mss->amIInPlacePossible( m.elt.type() == e.type() &&
m.elt.valuesize() == e.valuesize() );
break;
-
+
case Mod::PUSH:
case Mod::PUSH_ALL:
uassert( 10141 , "Cannot apply $push/$pushAll modifier to non-array", e.type() == Array || e.eoo() );
@@ -358,7 +437,7 @@ namespace mongo {
BSONElement arrI = i.next();
if ( m.op == Mod::PULL ) {
mss->amIInPlacePossible( ! m._pullElementMatch( arrI ) );
- }
+ }
else if ( m.op == Mod::PULL_ALL ) {
BSONObjIterator j( m.elt.embeddedObject() );
while( mss->_inPlacePossible && j.moreWithEOO() ) {
@@ -377,12 +456,12 @@ namespace mongo {
mss->amIInPlacePossible( e.embeddedObject().isEmpty() );
break;
}
-
+
case Mod::ADDTOSET: {
uassert( 12591 , "Cannot apply $addToSet modifier to non-array", e.type() == Array || e.eoo() );
-
+
BSONObjIterator i( e.embeddedObject() );
- if ( m.isEach() ){
+ if ( m.isEach() ) {
BSONElementSet toadd;
m.parseEach( toadd );
while( i.more() ) {
@@ -395,7 +474,7 @@ namespace mongo {
bool found = false;
while( i.more() ) {
BSONElement arrI = i.next();
- if ( arrI.woCompare( m.elt , false ) == 0 ){
+ if ( arrI.woCompare( m.elt , false ) == 0 ) {
found = true;
break;
}
@@ -404,7 +483,7 @@ namespace mongo {
}
break;
}
-
+
default:
// mods we don't know about shouldn't be done in place
mss->amIInPlacePossible( false );
@@ -412,28 +491,49 @@ namespace mongo {
}
DEBUGUPDATE( "\t mss\n" << mss->toString() << "\t--" );
-
- return auto_ptr<ModSetState>( mss );
+
+ return mss;
}
void ModState::appendForOpLog( BSONObjBuilder& b ) const {
- if ( incType ){
+ if ( dontApply ) {
+ return;
+ }
+
+ if ( incType ) {
DEBUGUPDATE( "\t\t\t\t\t appendForOpLog inc fieldname: " << m->fieldName << " short:" << m->shortFieldName );
BSONObjBuilder bb( b.subobjStart( "$set" ) );
appendIncValue( bb , true );
bb.done();
return;
}
-
+
+ if ( m->op == Mod::RENAME_FROM ) {
+ DEBUGUPDATE( "\t\t\t\t\t appendForOpLog RENAME_FROM fielName:" << m->fieldName );
+ BSONObjBuilder bb( b.subobjStart( "$unset" ) );
+ bb.append( m->fieldName, 1 );
+ bb.done();
+ return;
+ }
+
+ if ( m->op == Mod::RENAME_TO ) {
+ DEBUGUPDATE( "\t\t\t\t\t appendForOpLog RENAME_TO fielName:" << m->fieldName );
+ BSONObjBuilder bb( b.subobjStart( "$set" ) );
+ bb.appendAs( newVal, m->fieldName );
+ return;
+ }
+
const char * name = fixedOpName ? fixedOpName : Mod::modNames[op()];
DEBUGUPDATE( "\t\t\t\t\t appendForOpLog name:" << name << " fixed: " << fixed << " fn: " << m->fieldName );
BSONObjBuilder bb( b.subobjStart( name ) );
- if ( fixed )
+ if ( fixed ) {
bb.appendAs( *fixed , m->fieldName );
- else
+ }
+ else {
bb.appendAs( m->elt , m->fieldName );
+ }
bb.done();
}
@@ -445,30 +545,55 @@ namespace mongo {
ss << " fixed: " << fixed;
return ss.str();
}
-
- void ModSetState::applyModsInPlace() {
+
+ template< class Builder >
+ void ModState::handleRename( Builder &newObjBuilder, const char *shortFieldName ) {
+ newObjBuilder.appendAs( newVal , shortFieldName );
+ BSONObjBuilder b;
+ b.appendAs( newVal, shortFieldName );
+ assert( _objData.isEmpty() );
+ _objData = b.obj();
+ newVal = _objData.firstElement();
+ }
+
+ void ModSetState::applyModsInPlace( bool isOnDisk ) {
+ // TODO i think this assert means that we can get rid of the isOnDisk param
+ // and just use isOwned as the determination
+ DEV assert( isOnDisk == ! _obj.isOwned() );
+
for ( ModStateHolder::iterator i = _mods.begin(); i != _mods.end(); ++i ) {
ModState& m = i->second;
-
- switch ( m.m->op ){
+
+ if ( m.dontApply ) {
+ continue;
+ }
+
+ switch ( m.m->op ) {
case Mod::UNSET:
case Mod::PULL:
case Mod::PULL_ALL:
case Mod::ADDTOSET:
+ case Mod::RENAME_FROM:
+ case Mod::RENAME_TO:
// this should have been handled by prepare
break;
-
- // [dm] the BSONElementManipulator statements below are for replication (correct?)
+ // [dm] the BSONElementManipulator statements below are for replication (correct?)
case Mod::INC:
- m.m->incrementMe( m.old );
+ if ( isOnDisk )
+ m.m->IncrementMe( m.old );
+ else
+ m.m->incrementMe( m.old );
m.fixedOpName = "$set";
m.fixed = &(m.old);
break;
case Mod::SET:
- BSONElementManipulator( m.old ).replaceTypeAndValue( m.m->elt );
+ if ( isOnDisk )
+ BSONElementManipulator( m.old ).ReplaceTypeAndValue( m.m->elt );
+ else
+ BSONElementManipulator( m.old ).replaceTypeAndValue( m.m->elt );
break;
default:
- uassert( 10144 , "can't apply mod in place - shouldn't have gotten here" , 0 );
+ uassert( 13478 , "can't apply mod in place - shouldn't have gotten here" , 0 );
}
}
}
@@ -488,61 +613,62 @@ namespace mongo {
empty = false;
}
if ( empty )
- fields[ base + top.fieldName() ] = top;
+ fields[ base + top.fieldName() ] = top;
}
-
+
template< class Builder >
- void ModSetState::_appendNewFromMods( const string& root , ModState& m , Builder& b , set<string>& onedownseen ){
+ void ModSetState::_appendNewFromMods( const string& root , ModState& m , Builder& b , set<string>& onedownseen ) {
const char * temp = m.fieldName();
temp += root.size();
const char * dot = strchr( temp , '.' );
- if ( dot ){
+ if ( dot ) {
string nr( m.fieldName() , 0 , 1 + ( dot - m.fieldName() ) );
string nf( temp , 0 , dot - temp );
if ( onedownseen.count( nf ) )
return;
onedownseen.insert( nf );
- BSONObjBuilder bb ( b.subobjStart( nf.c_str() ) );
+ BSONObjBuilder bb ( b.subobjStart( nf ) );
createNewFromMods( nr , bb , BSONObj() ); // don't infer an array from name
bb.done();
}
else {
appendNewFromMod( m , b );
}
-
+
}
-
+
template< class Builder >
- void ModSetState::createNewFromMods( const string& root , Builder& b , const BSONObj &obj ){
+ void ModSetState::createNewFromMods( const string& root , Builder& b , const BSONObj &obj ) {
DEBUGUPDATE( "\t\t createNewFromMods root: " << root );
BSONObjIteratorSorted es( obj );
BSONElement e = es.next();
-
+
ModStateHolder::iterator m = _mods.lower_bound( root );
StringBuilder buf(root.size() + 2 );
buf << root << (char)255;
ModStateHolder::iterator mend = _mods.lower_bound( buf.str() );
-
+
set<string> onedownseen;
-
- while ( e.type() && m != mend ){
+
+ while ( e.type() && m != mend ) {
string field = root + e.fieldName();
FieldCompareResult cmp = compareDottedFieldNames( m->second.m->fieldName , field );
DEBUGUPDATE( "\t\t\t field:" << field << "\t mod:" << m->second.m->fieldName << "\t cmp:" << cmp << "\t short: " << e.fieldName() );
-
- switch ( cmp ){
-
+
+ switch ( cmp ) {
+
case LEFT_SUBFIELD: { // Mod is embeddeed under this element
- uassert( 10145 , "LEFT_SUBFIELD only supports Object" , e.type() == Object || e.type() == Array );
- if ( onedownseen.count( e.fieldName() ) == 0 ){
+ uassert( 10145 , str::stream() << "LEFT_SUBFIELD only supports Object: " << field << " not: " << e.type() , e.type() == Object || e.type() == Array );
+ if ( onedownseen.count( e.fieldName() ) == 0 ) {
onedownseen.insert( e.fieldName() );
if ( e.type() == Object ) {
BSONObjBuilder bb( b.subobjStart( e.fieldName() ) );
stringstream nr; nr << root << e.fieldName() << ".";
createNewFromMods( nr.str() , bb , e.embeddedObject() );
- bb.done();
- } else {
+ bb.done();
+ }
+ else {
BSONArrayBuilder ba( b.subarrayStart( e.fieldName() ) );
stringstream nr; nr << root << e.fieldName() << ".";
createNewFromMods( nr.str() , ba , e.embeddedObject() );
@@ -578,22 +704,22 @@ namespace mongo {
e = es.next();
continue;
case RIGHT_SUBFIELD:
- massert( 10399 , "ModSet::createNewFromMods - RIGHT_SUBFIELD should be impossible" , 0 );
+ massert( 10399 , "ModSet::createNewFromMods - RIGHT_SUBFIELD should be impossible" , 0 );
break;
default:
massert( 10400 , "unhandled case" , 0 );
}
}
-
+
// finished looping the mods, just adding the rest of the elements
- while ( e.type() ){
+ while ( e.type() ) {
DEBUGUPDATE( "\t\t\t copying: " << e.fieldName() );
b.append( e ); // if array, ignore field name
e = es.next();
}
-
+
// do mods that don't have fields already
- for ( ; m != mend; m++ ){
+ for ( ; m != mend; m++ ) {
DEBUGUPDATE( "\t\t\t\t appending from mod at end: " << m->second.m->fieldName );
_appendNewFromMods( root , m->second , b , onedownseen );
}
@@ -602,30 +728,30 @@ namespace mongo {
BSONObj ModSetState::createNewFromMods() {
BSONObjBuilder b( (int)(_obj.objsize() * 1.1) );
createNewFromMods( "" , b , _obj );
- return b.obj();
+ return _newFromMods = b.obj();
}
string ModSetState::toString() const {
stringstream ss;
- for ( ModStateHolder::const_iterator i=_mods.begin(); i!=_mods.end(); ++i ){
+ for ( ModStateHolder::const_iterator i=_mods.begin(); i!=_mods.end(); ++i ) {
ss << "\t\t" << i->first << "\t" << i->second.toString() << "\n";
}
return ss.str();
}
- BSONObj ModSet::createNewFromQuery( const BSONObj& query ){
+ BSONObj ModSet::createNewFromQuery( const BSONObj& query ) {
BSONObj newObj;
{
BSONObjBuilder bb;
EmbeddedBuilder eb( &bb );
BSONObjIteratorSorted i( query );
- while ( i.more() ){
+ while ( i.more() ) {
BSONElement e = i.next();
if ( e.fieldName()[0] == '$' ) // for $atomic and anything else we add
continue;
- if ( e.type() == Object && e.embeddedObject().firstElement().fieldName()[0] == '$' ){
+ if ( e.type() == Object && e.embeddedObject().firstElement().fieldName()[0] == '$' ) {
// this means this is a $gt type filter, so don't make it part of the new object
continue;
}
@@ -635,17 +761,17 @@ namespace mongo {
eb.done();
newObj = bb.obj();
}
-
+
auto_ptr<ModSetState> mss = prepare( newObj );
if ( mss->canApplyInPlace() )
- mss->applyModsInPlace();
+ mss->applyModsInPlace( false );
else
newObj = mss->createNewFromMods();
-
+
return newObj;
}
-
+
/* get special operations like $inc
{ $inc: { a:1, b:1 } }
{ $set: { a:77 } }
@@ -656,21 +782,21 @@ namespace mongo {
NOTE: MODIFIES source from object!
*/
ModSet::ModSet(
- const BSONObj &from ,
+ const BSONObj &from ,
const set<string>& idxKeys,
const set<string> *backgroundKeys)
: _isIndexed(0) , _hasDynamicArray( false ) {
-
+
BSONObjIterator it(from);
-
+
while ( it.more() ) {
BSONElement e = it.next();
const char *fn = e.fieldName();
-
+
uassert( 10147 , "Invalid modifier specified" + string( fn ), e.type() == Object );
BSONObj j = e.embeddedObject();
DEBUGUPDATE( "\t" << j );
-
+
BSONObjIterator jt(j);
Mod::Op op = opFromStr( fn );
@@ -685,18 +811,45 @@ namespace mongo {
uassert( 10151 , "have conflicting mods in update" , ! haveConflictingMod( fieldName ) );
uassert( 10152 , "Modifier $inc allowed for numbers only", f.isNumber() || op != Mod::INC );
uassert( 10153 , "Modifier $pushAll/pullAll allowed for arrays only", f.type() == Array || ( op != Mod::PUSH_ALL && op != Mod::PULL_ALL ) );
-
+
+ if ( op == Mod::RENAME_TO ) {
+ uassert( 13494, "$rename target must be a string", f.type() == String );
+ const char *target = f.valuestr();
+ uassert( 13495, "$rename source must differ from target", strcmp( fieldName, target ) != 0 );
+ uassert( 13496, "invalid mod field name, source may not be empty", fieldName[0] );
+ uassert( 13479, "invalid mod field name, target may not be empty", target[0] );
+ uassert( 13480, "invalid mod field name, source may not begin or end in period", fieldName[0] != '.' && fieldName[ strlen( fieldName ) - 1 ] != '.' );
+ uassert( 13481, "invalid mod field name, target may not begin or end in period", target[0] != '.' && target[ strlen( target ) - 1 ] != '.' );
+ uassert( 13482, "$rename affecting _id not allowed", !( fieldName[0] == '_' && fieldName[1] == 'i' && fieldName[2] == 'd' && ( !fieldName[3] || fieldName[3] == '.' ) ) );
+ uassert( 13483, "$rename affecting _id not allowed", !( target[0] == '_' && target[1] == 'i' && target[2] == 'd' && ( !target[3] || target[3] == '.' ) ) );
+ uassert( 13484, "field name duplication not allowed with $rename target", !haveModForField( target ) );
+ uassert( 13485, "conflicting mods not allowed with $rename target", !haveConflictingMod( target ) );
+ uassert( 13486, "$rename target may not be a parent of source", !( strncmp( fieldName, target, strlen( target ) ) == 0 && fieldName[ strlen( target ) ] == '.' ) );
+ uassert( 13487, "$rename source may not be dynamic array", strstr( fieldName , ".$" ) == 0 );
+ uassert( 13488, "$rename target may not be dynamic array", strstr( target , ".$" ) == 0 );
+
+ Mod from;
+ from.init( Mod::RENAME_FROM, f );
+ from.setFieldName( fieldName );
+ updateIsIndexed( from, idxKeys, backgroundKeys );
+ _mods[ from.fieldName ] = from;
+
+ Mod to;
+ to.init( Mod::RENAME_TO, f );
+ to.setFieldName( target );
+ updateIsIndexed( to, idxKeys, backgroundKeys );
+ _mods[ to.fieldName ] = to;
+
+ DEBUGUPDATE( "\t\t " << fieldName << "\t" << from.fieldName << "\t" << to.fieldName );
+ continue;
+ }
+
_hasDynamicArray = _hasDynamicArray || strstr( fieldName , ".$" ) > 0;
-
+
Mod m;
m.init( op , f );
m.setFieldName( f.fieldName() );
-
- if ( m.isIndexed( idxKeys ) ||
- (backgroundKeys && m.isIndexed(*backgroundKeys)) ) {
- _isIndexed++;
- }
-
+ updateIsIndexed( m, idxKeys, backgroundKeys );
_mods[m.fieldName] = m;
DEBUGUPDATE( "\t\t " << fieldName << "\t" << m.fieldName << "\t" << _hasDynamicArray );
@@ -709,10 +862,10 @@ namespace mongo {
ModSet * n = new ModSet();
n->_isIndexed = _isIndexed;
n->_hasDynamicArray = _hasDynamicArray;
- for ( ModHolder::const_iterator i=_mods.begin(); i!=_mods.end(); i++ ){
+ for ( ModHolder::const_iterator i=_mods.begin(); i!=_mods.end(); i++ ) {
string s = i->first;
size_t idx = s.find( ".$" );
- if ( idx == string::npos ){
+ if ( idx == string::npos ) {
n->_mods[s] = i->second;
continue;
}
@@ -726,7 +879,7 @@ namespace mongo {
}
return n;
}
-
+
void checkNoMods( BSONObj o ) {
BSONObjIterator i( o );
while( i.moreWithEOO() ) {
@@ -736,10 +889,10 @@ namespace mongo {
uassert( 10154 , "Modifiers and non-modifiers cannot be mixed", e.fieldName()[ 0 ] != '$' );
}
}
-
+
class UpdateOp : public MultiCursor::CursorOp {
public:
- UpdateOp( bool hasPositionalField ) : _nscanned(), _hasPositionalField( hasPositionalField ){}
+ UpdateOp( bool hasPositionalField ) : _nscanned(), _hasPositionalField( hasPositionalField ) {}
virtual void _init() {
_c = qp().newCursor();
if ( ! _c->ok() ) {
@@ -751,14 +904,18 @@ namespace mongo {
_cc.reset( new ClientCursor( QueryOption_NoCursorTimeout , _c , qp().ns() ) );
}
return _cc->prepareToYield( _yieldData );
- }
+ }
virtual void recoverFromYield() {
if ( !ClientCursor::recoverFromYield( _yieldData ) ) {
_c.reset();
_cc.reset();
massert( 13339, "cursor dropped during update", false );
}
- }
+ }
+ virtual long long nscanned() {
+ assert( _c.get() );
+ return _c->nscanned();
+ }
virtual void next() {
if ( ! _c->ok() ) {
setComplete();
@@ -789,64 +946,62 @@ namespace mongo {
};
static void checkTooLarge(const BSONObj& newObj) {
- uassert( 12522 , "$ operator made object too large" , newObj.objsize() <= ( 4 * 1024 * 1024 ) );
+ uassert( 12522 , "$ operator made object too large" , newObj.objsize() <= BSONObjMaxUserSize );
}
- /* note: this is only (as-is) called for
+ /* note: this is only (as-is) called for
- not multi
- not mods is indexed
- not upsert
*/
- static UpdateResult _updateById(bool isOperatorUpdate, int idIdxNo, ModSet *mods, int profile, NamespaceDetails *d,
+ static UpdateResult _updateById(bool isOperatorUpdate, int idIdxNo, ModSet *mods, int profile, NamespaceDetails *d,
NamespaceDetailsTransient *nsdt,
- bool god, const char *ns,
- const BSONObj& updateobj, BSONObj patternOrig, bool logop, OpDebug& debug)
- {
+ bool god, const char *ns,
+ const BSONObj& updateobj, BSONObj patternOrig, bool logop, OpDebug& debug) {
DiskLoc loc;
{
IndexDetails& i = d->idx(idIdxNo);
BSONObj key = i.getKeyFromQuery( patternOrig );
loc = i.head.btree()->findSingle(i, i.head, key);
- if( loc.isNull() ) {
+ if( loc.isNull() ) {
// no upsert support in _updateById yet, so we are done.
return UpdateResult(0, 0, 0);
}
}
Record *r = loc.rec();
-
+
/* look for $inc etc. note: as listed here, all fields to inc must be of this type; you can't set some
regular ones at the moment. */
- if ( isOperatorUpdate ) {
- const BSONObj& onDisk = loc.obj();
+ if ( isOperatorUpdate ) {
+ const BSONObj& onDisk = loc.obj();
auto_ptr<ModSetState> mss = mods->prepare( onDisk );
-
+
if( mss->canApplyInPlace() ) {
- mss->applyModsInPlace();
+ mss->applyModsInPlace(true);
DEBUGUPDATE( "\t\t\t updateById doing in place update" );
/*if ( profile )
ss << " fastmod "; */
- }
+ }
else {
BSONObj newObj = mss->createNewFromMods();
checkTooLarge(newObj);
- bool changedId;
assert(nsdt);
- DiskLoc newLoc = theDataFileMgr.updateRecord(ns, d, nsdt, r, loc , newObj.objdata(), newObj.objsize(), debug, changedId);
+ DiskLoc newLoc = theDataFileMgr.updateRecord(ns, d, nsdt, r, loc , newObj.objdata(), newObj.objsize(), debug);
}
-
+
if ( logop ) {
DEV assert( mods->size() );
-
+
BSONObj pattern = patternOrig;
if ( mss->haveArrayDepMod() ) {
BSONObjBuilder patternBuilder;
patternBuilder.appendElements( pattern );
mss->appendSizeSpecForArrayDepMods( patternBuilder );
- pattern = patternBuilder.obj();
+ pattern = patternBuilder.obj();
}
-
+
if( mss->needOpLogRewrite() ) {
DEBUGUPDATE( "\t rewrite update: " << mss->getOpLogRewrite() );
logOp("u", ns, mss->getOpLogRewrite() , &pattern );
@@ -857,24 +1012,18 @@ namespace mongo {
}
return UpdateResult( 1 , 1 , 1);
} // end $operator update
-
+
// regular update
BSONElementManipulator::lookForTimestamps( updateobj );
checkNoMods( updateobj );
- bool changedId = false;
assert(nsdt);
- theDataFileMgr.updateRecord(ns, d, nsdt, r, loc , updateobj.objdata(), updateobj.objsize(), debug, changedId);
+ theDataFileMgr.updateRecord(ns, d, nsdt, r, loc , updateobj.objdata(), updateobj.objsize(), debug );
if ( logop ) {
- if ( !changedId ) {
- logOp("u", ns, updateobj, &patternOrig );
- } else {
- logOp("d", ns, patternOrig );
- logOp("i", ns, updateobj );
- }
+ logOp("u", ns, updateobj, &patternOrig );
}
return UpdateResult( 1 , 0 , 1 );
}
-
+
UpdateResult _updateObjects(bool god, const char *ns, const BSONObj& updateobj, BSONObj patternOrig, bool upsert, bool multi, bool logop , OpDebug& debug, RemoveSaver* rs ) {
DEBUGUPDATE( "update: " << ns << " update: " << updateobj << " query: " << patternOrig << " upsert: " << upsert << " multi: " << multi );
Client& client = cc();
@@ -883,20 +1032,20 @@ namespace mongo {
if ( logLevel > 2 )
ss << " update: " << updateobj.toString();
-
+
/* idea with these here is to make them loop invariant for multi updates, and thus be a bit faster for that case */
/* NOTE: when yield() is added herein, these must be refreshed after each call to yield! */
NamespaceDetails *d = nsdetails(ns); // can be null if an upsert...
NamespaceDetailsTransient *nsdt = &NamespaceDetailsTransient::get_w(ns);
/* end note */
-
+
auto_ptr<ModSet> mods;
bool isOperatorUpdate = updateobj.firstElement().fieldName()[0] == '$';
int modsIsIndexed = false; // really the # of indexes
- if ( isOperatorUpdate ){
- if( d && d->backgroundIndexBuildInProgress ) {
+ if ( isOperatorUpdate ) {
+ if( d && d->indexBuildInProgress ) {
set<string> bgKeys;
- d->backgroundIdx().keyPattern().getFieldNames(bgKeys);
+ d->inProgIdx().keyPattern().getFieldNames(bgKeys);
mods.reset( new ModSet(updateobj, nsdt->indexKeys(), &bgKeys) );
}
else {
@@ -914,30 +1063,30 @@ namespace mongo {
}
set<DiskLoc> seenObjects;
-
+
int numModded = 0;
long long nscanned = 0;
MatchDetails details;
shared_ptr< MultiCursor::CursorOp > opPtr( new UpdateOp( mods.get() && mods->hasDynamicArray() ) );
shared_ptr< MultiCursor > c( new MultiCursor( ns, patternOrig, BSONObj(), opPtr, true ) );
-
+
auto_ptr<ClientCursor> cc;
-
+
while ( c->ok() ) {
nscanned++;
bool atomic = c->matcher()->docMatcher().atomic();
-
+
// May have already matched in UpdateOp, but do again to get details set correctly
- if ( ! c->matcher()->matches( c->currKey(), c->currLoc(), &details ) ){
+ if ( ! c->matcher()->matches( c->currKey(), c->currLoc(), &details ) ) {
c->advance();
-
- if ( nscanned % 256 == 0 && ! atomic ){
+
+ if ( nscanned % 256 == 0 && ! atomic ) {
if ( cc.get() == 0 ) {
shared_ptr< Cursor > cPtr = c;
cc.reset( new ClientCursor( QueryOption_NoCursorTimeout , cPtr , ns ) );
}
- if ( ! cc->yield() ){
+ if ( ! cc->yield() ) {
cc.release();
// TODO should we assert or something?
break;
@@ -948,20 +1097,20 @@ namespace mongo {
}
continue;
}
-
+
Record *r = c->_current();
DiskLoc loc = c->currLoc();
-
+
// TODO Maybe this is unnecessary since we have seenObjects
- if ( c->getsetdup( loc ) ){
+ if ( c->getsetdup( loc ) ) {
c->advance();
continue;
}
-
+
BSONObj js(r);
-
+
BSONObj pattern = patternOrig;
-
+
if ( logop ) {
BSONObjBuilder idPattern;
BSONElement id;
@@ -977,80 +1126,79 @@ namespace mongo {
uassert( 10157 , "multi-update requires all modified objects to have an _id" , ! multi );
}
}
-
+
if ( profile )
ss << " nscanned:" << nscanned;
-
+
/* look for $inc etc. note: as listed here, all fields to inc must be of this type; you can't set some
regular ones at the moment. */
if ( isOperatorUpdate ) {
-
- if ( multi ){
+
+ if ( multi ) {
c->advance(); // go to next record in case this one moves
if ( seenObjects.count( loc ) )
continue;
}
-
+
const BSONObj& onDisk = loc.obj();
-
+
ModSet * useMods = mods.get();
bool forceRewrite = false;
-
+
auto_ptr<ModSet> mymodset;
- if ( details.elemMatchKey && mods->hasDynamicArray() ){
+ if ( details.elemMatchKey && mods->hasDynamicArray() ) {
useMods = mods->fixDynamicArray( details.elemMatchKey );
mymodset.reset( useMods );
forceRewrite = true;
}
-
+
auto_ptr<ModSetState> mss = useMods->prepare( onDisk );
-
+
bool indexHack = multi && ( modsIsIndexed || ! mss->canApplyInPlace() );
-
- if ( indexHack ){
+
+ if ( indexHack ) {
if ( cc.get() )
cc->updateLocation();
else
c->noteLocation();
}
-
- if ( modsIsIndexed <= 0 && mss->canApplyInPlace() ){
- mss->applyModsInPlace();// const_cast<BSONObj&>(onDisk) );
-
+
+ if ( modsIsIndexed <= 0 && mss->canApplyInPlace() ) {
+ mss->applyModsInPlace( true );// const_cast<BSONObj&>(onDisk) );
+
DEBUGUPDATE( "\t\t\t doing in place update" );
if ( profile )
ss << " fastmod ";
-
- if ( modsIsIndexed ){
+
+ if ( modsIsIndexed ) {
seenObjects.insert( loc );
}
- }
+ }
else {
if ( rs )
rs->goingToDelete( onDisk );
BSONObj newObj = mss->createNewFromMods();
checkTooLarge(newObj);
- bool changedId;
- DiskLoc newLoc = theDataFileMgr.updateRecord(ns, d, nsdt, r, loc , newObj.objdata(), newObj.objsize(), debug, changedId);
+ DiskLoc newLoc = theDataFileMgr.updateRecord(ns, d, nsdt, r, loc , newObj.objdata(), newObj.objsize(), debug);
if ( newLoc != loc || modsIsIndexed ) {
// object moved, need to make sure we don't get it again
seenObjects.insert( newLoc );
}
-
+
}
-
+
if ( logop ) {
DEV assert( mods->size() );
-
+
if ( mss->haveArrayDepMod() ) {
BSONObjBuilder patternBuilder;
patternBuilder.appendElements( pattern );
mss->appendSizeSpecForArrayDepMods( patternBuilder );
- pattern = patternBuilder.obj();
+ pattern = patternBuilder.obj();
}
-
- if ( forceRewrite || mss->needOpLogRewrite() ){
+
+ if ( forceRewrite || mss->needOpLogRewrite() ) {
DEBUGUPDATE( "\t rewrite update: " << mss->getOpLogRewrite() );
logOp("u", ns, mss->getOpLogRewrite() , &pattern );
}
@@ -1063,13 +1211,13 @@ namespace mongo {
return UpdateResult( 1 , 1 , numModded );
if ( indexHack )
c->checkLocation();
-
- if ( nscanned % 64 == 0 && ! atomic ){
+
+ if ( nscanned % 64 == 0 && ! atomic ) {
if ( cc.get() == 0 ) {
shared_ptr< Cursor > cPtr = c;
cc.reset( new ClientCursor( QueryOption_NoCursorTimeout , cPtr , ns ) );
}
- if ( ! cc->yield() ){
+ if ( ! cc->yield() ) {
cc.release();
break;
}
@@ -1077,35 +1225,32 @@ namespace mongo {
break;
}
}
-
+
+ if (atomic)
+ getDur().commitIfNeeded();
+
continue;
- }
-
+ }
+
uassert( 10158 , "multi update only works with $ operators" , ! multi );
-
+
BSONElementManipulator::lookForTimestamps( updateobj );
checkNoMods( updateobj );
- bool changedId = false;
- theDataFileMgr.updateRecord(ns, d, nsdt, r, loc , updateobj.objdata(), updateobj.objsize(), debug, changedId, god);
+ theDataFileMgr.updateRecord(ns, d, nsdt, r, loc , updateobj.objdata(), updateobj.objsize(), debug, god);
if ( logop ) {
DEV if( god ) log() << "REALLY??" << endl; // god doesn't get logged, this would be bad.
- if ( !changedId ) {
- logOp("u", ns, updateobj, &pattern );
- } else {
- logOp("d", ns, pattern );
- logOp("i", ns, updateobj );
- }
+ logOp("u", ns, updateobj, &pattern );
}
return UpdateResult( 1 , 0 , 1 );
}
-
+
if ( numModded )
return UpdateResult( 1 , 1 , numModded );
-
+
if ( profile )
ss << " nscanned:" << nscanned;
-
+
if ( upsert ) {
if ( updateobj.firstElement().fieldName()[0] == '$' ) {
/* upsert of an $inc. build a default */
@@ -1115,7 +1260,7 @@ namespace mongo {
theDataFileMgr.insertWithObjMod(ns, newObj, god);
if ( logop )
logOp( "i", ns, newObj );
-
+
return UpdateResult( 0 , 1 , 1 , newObj );
}
uassert( 10159 , "multi update only works with $ operators" , ! multi );
@@ -1130,14 +1275,14 @@ namespace mongo {
}
return UpdateResult( 0 , 0 , 0 );
}
-
+
UpdateResult updateObjects(const char *ns, const BSONObj& updateobj, BSONObj patternOrig, bool upsert, bool multi, bool logop , OpDebug& debug ) {
uassert( 10155 , "cannot update reserved $ collection", strchr(ns, '$') == 0 );
if ( strstr(ns, ".system.") ) {
/* dm: it's very important that system.indexes is never updated as IndexDetails has pointers into it */
- uassert( 10156 , "cannot update system collection", legalClientSystemNS( ns , true ) );
+ uassert( 10156 , str::stream() << "cannot update system collection: " << ns << " q: " << patternOrig << " u: " << updateobj , legalClientSystemNS( ns , true ) );
}
return _updateObjects(false, ns, updateobj, patternOrig, upsert, multi, logop, debug);
}
-
+
}
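For reference, an illustrative walk-through of the $rename plumbing added above (the field names a and b are made up): one $rename entry is split by the ModSet constructor into a RENAME_FROM mod on the source and a RENAME_TO mod on the target, and appendForOpLog() records the pair for replication using only existing modifiers.

    // update spec, built with this codebase's BSON() macro:
    //     BSONObj updateobj = BSON( "$rename" << BSON( "a" << "b" ) );
    // ModSet's constructor stores two mods keyed by field name:
    //     _mods["a"] : Mod::RENAME_FROM  -- prepare() only validates the source path
    //     _mods["b"] : Mod::RENAME_TO    -- apply()/handleRename() writes the old value of "a" under "b"
    // appendForOpLog() then logs the operation as
    //     { $unset : { a : 1 } , $set : { b : <old value of "a"> } }
    // so the oplog entry stays in terms of plain $unset/$set.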
diff --git a/db/update.h b/db/update.h
index b7950de..d8396b5 100644
--- a/db/update.h
+++ b/db/update.h
@@ -26,32 +26,42 @@ namespace mongo {
class ModState;
class ModSetState;
- /* Used for modifiers such as $inc, $set, $push, ...
+ /* Used for modifiers such as $inc, $set, $push, ...
* stores the info about a single operation
* once created should never be modified
*/
struct Mod {
// See opFromStr below
- // 0 1 2 3 4 5 6 7 8 9 10 11
- enum Op { INC, SET, PUSH, PUSH_ALL, PULL, PULL_ALL , POP, UNSET, BITAND, BITOR , BIT , ADDTOSET } op;
-
+ // 0 1 2 3 4 5 6 7 8 9 10 11 12 13
+ enum Op { INC, SET, PUSH, PUSH_ALL, PULL, PULL_ALL , POP, UNSET, BITAND, BITOR , BIT , ADDTOSET, RENAME_FROM, RENAME_TO } op;
+
static const char* modNames[];
static unsigned modNamesNum;
const char *fieldName;
const char *shortFieldName;
-
+
BSONElement elt; // x:5 note: this is the actual element from the updateobj
boost::shared_ptr<Matcher> matcher;
+ bool matcherOnPrimitive;
- void init( Op o , BSONElement& e ){
+ void init( Op o , BSONElement& e ) {
op = o;
elt = e;
- if ( op == PULL && e.type() == Object )
- matcher.reset( new Matcher( e.embeddedObject() ) );
+ if ( op == PULL && e.type() == Object ) {
+ BSONObj t = e.embeddedObject();
+ if ( t.firstElement().getGtLtOp() == 0 ) {
+ matcher.reset( new Matcher( t ) );
+ matcherOnPrimitive = false;
+ }
+ else {
+ matcher.reset( new Matcher( BSON( "" << t ) ) );
+ matcherOnPrimitive = true;
+ }
+ }
}
- void setFieldName( const char * s ){
+ void setFieldName( const char * s ) {
fieldName = s;
shortFieldName = strrchr( fieldName , '.' );
if ( shortFieldName )
@@ -59,14 +69,13 @@ namespace mongo {
else
shortFieldName = fieldName;
}
-
+
/**
* @param in increments the actual value inside in
*/
void incrementMe( BSONElement& in ) const {
BSONElementManipulator manip( in );
-
- switch ( in.type() ){
+ switch ( in.type() ) {
case NumberDouble:
manip.setNumber( elt.numberDouble() + in.numberDouble() );
break;
@@ -79,18 +88,33 @@ namespace mongo {
default:
assert(0);
}
-
}
-
+ void IncrementMe( BSONElement& in ) const {
+ BSONElementManipulator manip( in );
+ switch ( in.type() ) {
+ case NumberDouble:
+ manip.SetNumber( elt.numberDouble() + in.numberDouble() );
+ break;
+ case NumberLong:
+ manip.SetLong( elt.numberLong() + in.numberLong() );
+ break;
+ case NumberInt:
+ manip.SetInt( elt.numberInt() + in.numberInt() );
+ break;
+ default:
+ assert(0);
+ }
+ }
+
template< class Builder >
void appendIncremented( Builder& bb , const BSONElement& in, ModState& ms ) const;
-
+
bool operator<( const Mod &other ) const {
return strcmp( fieldName, other.fieldName ) < 0;
}
-
+
bool arrayDep() const {
- switch (op){
+ switch (op) {
case PUSH:
case PUSH_ALL:
case POP:
@@ -99,8 +123,8 @@ namespace mongo {
return false;
}
}
-
- static bool isIndexed( const string& fullName , const set<string>& idxKeys ){
+
+ static bool isIndexed( const string& fullName , const set<string>& idxKeys ) {
const char * fieldName = fullName.c_str();
// check if there is an index key that is a parent of mod
for( const char *dot = strchr( fieldName, '.' ); dot; dot = strchr( dot + 1, '.' ) )
@@ -117,23 +141,23 @@ namespace mongo {
return false;
}
-
+
bool isIndexed( const set<string>& idxKeys ) const {
string fullName = fieldName;
-
+
if ( isIndexed( fullName , idxKeys ) )
return true;
-
- if ( strstr( fieldName , "." ) ){
+
+ if ( strstr( fieldName , "." ) ) {
// check for a.0.1
StringBuilder buf( fullName.size() + 1 );
- for ( size_t i=0; i<fullName.size(); i++ ){
+ for ( size_t i=0; i<fullName.size(); i++ ) {
char c = fullName[i];
-
- if ( c == '$' &&
- i > 0 && fullName[i-1] == '.' &&
- i+1<fullName.size() &&
- fullName[i+1] == '.' ){
+
+ if ( c == '$' &&
+ i > 0 && fullName[i-1] == '.' &&
+ i+1<fullName.size() &&
+ fullName[i+1] == '.' ) {
i++;
continue;
}
@@ -145,10 +169,10 @@ namespace mongo {
if ( ! isdigit( fullName[i+1] ) )
continue;
-
+
bool possible = true;
size_t j=i+2;
- for ( ; j<fullName.size(); j++ ){
+ for ( ; j<fullName.size(); j++ ) {
char d = fullName[j];
if ( d == '.' )
break;
@@ -157,7 +181,7 @@ namespace mongo {
possible = false;
break;
}
-
+
if ( possible )
i = j;
}
@@ -168,25 +192,25 @@ namespace mongo {
return false;
}
-
+
template< class Builder >
void apply( Builder& b , BSONElement in , ModState& ms ) const;
-
+
/**
* @return true iff toMatch should be removed from the array
*/
bool _pullElementMatch( BSONElement& toMatch ) const;
void _checkForAppending( const BSONElement& e ) const {
- if ( e.type() == Object ){
+ if ( e.type() == Object ) {
// this is a tiny bit slow, but rare and important
// only when setting something TO an object, not setting something in an object
- // and it checks for { $set : { x : { 'a.b' : 1 } } }
+ // and it checks for { $set : { x : { 'a.b' : 1 } } }
// which we feel has been common
uassert( 12527 , "not okForStorage" , e.embeddedObject().okForStorage() );
}
}
-
+
bool isEach() const {
if ( elt.type() != Object )
return false;
@@ -199,14 +223,18 @@ namespace mongo {
BSONObj getEach() const {
return elt.embeddedObjectUserCheck().firstElement().embeddedObjectUserCheck();
}
-
+
void parseEach( BSONElementSet& s ) const {
BSONObjIterator i(getEach());
- while ( i.more() ){
+ while ( i.more() ) {
s.insert( i.next() );
}
}
-
+
+ const char *renameFrom() const {
+ massert( 13492, "mod must be RENAME_TO type", op == Mod::RENAME_TO );
+ return elt.fieldName();
+ }
};
/**
@@ -220,7 +248,7 @@ namespace mongo {
bool _hasDynamicArray;
static void extractFields( map< string, BSONElement > &fields, const BSONElement &top, const string &base );
-
+
FieldCompareResult compare( const ModHolder::iterator &m, map< string, BSONElement >::iterator &p, const map< string, BSONElement >::iterator &pEnd ) const {
bool mDone = ( m == _mods.end() );
bool pDone = ( p == pEnd );
@@ -236,11 +264,11 @@ namespace mongo {
return compareDottedFieldNames( m->first, p->first.c_str() );
}
-
+
bool mayAddEmbedded( map< string, BSONElement > &existing, string right ) {
for( string left = EmbeddedBuilder::splitDot( right );
- left.length() > 0 && left[ left.length() - 1 ] != '.';
- left += "." + EmbeddedBuilder::splitDot( right ) ) {
+ left.length() > 0 && left[ left.length() - 1 ] != '.';
+ left += "." + EmbeddedBuilder::splitDot( right ) ) {
if ( existing.count( left ) > 0 && existing[ left ].type() != Object )
return false;
if ( haveModForField( left.c_str() ) )
@@ -250,7 +278,7 @@ namespace mongo {
}
static Mod::Op opFromStr( const char *fn ) {
assert( fn[0] == '$' );
- switch( fn[1] ){
+ switch( fn[1] ) {
case 'i': {
if ( fn[2] == 'n' && fn[3] == 'c' && fn[4] == 0 )
return Mod::INC;
@@ -262,14 +290,14 @@ namespace mongo {
break;
}
case 'p': {
- if ( fn[2] == 'u' ){
- if ( fn[3] == 's' && fn[4] == 'h' ){
+ if ( fn[2] == 'u' ) {
+ if ( fn[3] == 's' && fn[4] == 'h' ) {
if ( fn[5] == 0 )
return Mod::PUSH;
if ( fn[5] == 'A' && fn[6] == 'l' && fn[7] == 'l' && fn[8] == 0 )
return Mod::PUSH_ALL;
}
- else if ( fn[3] == 'l' && fn[4] == 'l' ){
+ else if ( fn[3] == 'l' && fn[4] == 'l' ) {
if ( fn[5] == 0 )
return Mod::PULL;
if ( fn[5] == 'A' && fn[6] == 'l' && fn[7] == 'l' && fn[8] == 0 )
@@ -286,7 +314,7 @@ namespace mongo {
break;
}
case 'b': {
- if ( fn[2] == 'i' && fn[3] == 't' ){
+ if ( fn[2] == 'i' && fn[3] == 't' ) {
if ( fn[4] == 0 )
return Mod::BIT;
if ( fn[4] == 'a' && fn[5] == 'n' && fn[6] == 'd' && fn[7] == 0 )
@@ -297,27 +325,41 @@ namespace mongo {
break;
}
case 'a': {
- if ( fn[2] == 'd' && fn[3] == 'd' ){
+ if ( fn[2] == 'd' && fn[3] == 'd' ) {
// add
if ( fn[4] == 'T' && fn[5] == 'o' && fn[6] == 'S' && fn[7] == 'e' && fn[8] == 't' && fn[9] == 0 )
return Mod::ADDTOSET;
-
+
+ }
+ break;
+ }
+ case 'r': {
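+            // e.g. { $rename : { "a" : "b" } } maps to RENAME_TO here; splitting it into
+            // the RENAME_FROM / RENAME_TO pair of mods presumably happens in the ModSet
+            // constructor, which is outside this hunk.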
+            if ( fn[2] == 'e' && fn[3] == 'n' && fn[4] == 'a' && fn[5] == 'm' && fn[6] == 'e' ) {
+ return Mod::RENAME_TO; // with this return code we handle both RENAME_TO and RENAME_FROM
}
+ break;
}
default: break;
}
uassert( 10161 , "Invalid modifier specified " + string( fn ), false );
return Mod::INC;
}
-
- ModSet(){}
+
+ ModSet() {}
+
+ void updateIsIndexed( const Mod &m, const set<string> &idxKeys, const set<string> *backgroundKeys ) {
+ if ( m.isIndexed( idxKeys ) ||
+ (backgroundKeys && m.isIndexed(*backgroundKeys)) ) {
+ _isIndexed++;
+ }
+ }
public:
-
- ModSet( const BSONObj &from ,
- const set<string>& idxKeys = set<string>(),
- const set<string>* backgroundKeys = 0
- );
+
+ ModSet( const BSONObj &from ,
+ const set<string>& idxKeys = set<string>(),
+ const set<string>* backgroundKeys = 0
+ );
        // TODO: this is inefficient - should probably just handle this when iterating
ModSet * fixDynamicArray( const char * elemMatchKey ) const;
@@ -329,7 +371,7 @@ namespace mongo {
         * doesn't change or modify this ModSet or any underlying Mod
*/
auto_ptr<ModSetState> prepare( const BSONObj& obj ) const;
-
+
/**
* given a query pattern, builds an object suitable for an upsert
* will take the query spec and combine all $ operators
@@ -349,15 +391,15 @@ namespace mongo {
return _mods.find( fieldName ) != _mods.end();
}
- bool haveConflictingMod( const string& fieldName ){
+ bool haveConflictingMod( const string& fieldName ) {
size_t idx = fieldName.find( '.' );
if ( idx == string::npos )
idx = fieldName.size();
-
+
ModHolder::const_iterator start = _mods.lower_bound(fieldName.substr(0,idx));
- for ( ; start != _mods.end(); start++ ){
+ for ( ; start != _mods.end(); start++ ) {
FieldCompareResult r = compareDottedFieldNames( fieldName , start->first );
- switch ( r ){
+ switch ( r ) {
case LEFT_SUBFIELD: return true;
case LEFT_BEFORE: return false;
case SAME: return true;
@@ -367,9 +409,9 @@ namespace mongo {
}
return false;
-
+
}
-
+
};
/**
@@ -379,23 +421,28 @@ namespace mongo {
public:
const Mod * m;
BSONElement old;
-
+ BSONElement newVal;
+ BSONObj _objData;
+
const char * fixedOpName;
BSONElement * fixed;
int pushStartSize;
-
+
BSONType incType;
int incint;
double incdouble;
long long inclong;
-
- ModState(){
+
+            bool dontApply;   // presumably set when a mod should be skipped entirely (e.g. a $rename whose source field is missing)
+
+ ModState() {
fixedOpName = 0;
fixed = 0;
pushStartSize = -1;
incType = EOO;
+ dontApply = false;
}
-
+
Mod::Op op() const {
return m->op;
}
@@ -403,12 +450,18 @@ namespace mongo {
const char * fieldName() const {
return m->fieldName;
}
-
+
bool needOpLogRewrite() const {
+ if ( dontApply )
+ return false;
+
if ( fixed || fixedOpName || incType )
return true;
-
- switch( op() ){
+
+ switch( op() ) {
+ case Mod::RENAME_FROM:
+ case Mod::RENAME_TO:
+ return true;
case Mod::BIT:
case Mod::BITAND:
case Mod::BITOR:
@@ -418,19 +471,19 @@ namespace mongo {
return false;
}
}
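+            // Note (an interpretation of the cases above): renames, bit ops, and $inc
+            // mods fixed up via fixedOpName cannot be replayed verbatim from the client's
+            // update document, so the oplog entry is rewritten with concrete values
+            // (see getOpLogRewrite in ModSetState below).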
-
+
void appendForOpLog( BSONObjBuilder& b ) const;
template< class Builder >
- void apply( Builder& b , BSONElement in ){
+ void apply( Builder& b , BSONElement in ) {
m->apply( b , in , *this );
}
-
+
template< class Builder >
void appendIncValue( Builder& b , bool useFullName ) const {
const char * n = useFullName ? m->fieldName : m->shortFieldName;
- switch ( incType ){
+ switch ( incType ) {
case NumberDouble:
b.append( n , incdouble ); break;
case NumberLong:
@@ -443,8 +496,11 @@ namespace mongo {
}
string toString() const;
+
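+            // Presumably appends the value captured for a $rename (newVal) under the new
+            // short field name and keeps a copy alive in _objData for oplog generation;
+            // the definition lives outside this hunk.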
+ template< class Builder >
+ void handleRename( Builder &newObjBuilder, const char *shortFieldName );
};
-
+
/**
         * this is used to hold state and metadata while applying a ModSet to a BSONObj
         * the goal is to make ModSet const so it's re-usable
@@ -459,15 +515,16 @@ namespace mongo {
const BSONObj& _obj;
ModStateHolder _mods;
bool _inPlacePossible;
-
- ModSetState( const BSONObj& obj )
- : _obj( obj ) , _inPlacePossible(true){
+ BSONObj _newFromMods; // keep this data alive, as oplog generation may depend on it
+
+ ModSetState( const BSONObj& obj )
+ : _obj( obj ) , _inPlacePossible(true) {
}
-
+
/**
* @return if in place is still possible
*/
- bool amIInPlacePossible( bool inPlacePossible ){
+ bool amIInPlacePossible( bool inPlacePossible ) {
if ( ! inPlacePossible )
_inPlacePossible = false;
return _inPlacePossible;
@@ -478,17 +535,21 @@ namespace mongo {
template< class Builder >
void _appendNewFromMods( const string& root , ModState& m , Builder& b , set<string>& onedownseen );
-
+
template< class Builder >
- void appendNewFromMod( ModState& ms , Builder& b ){
+ void appendNewFromMod( ModState& ms , Builder& b ) {
+ if ( ms.dontApply ) {
+ return;
+ }
+
//const Mod& m = *(ms.m); // HACK
Mod& m = *((Mod*)(ms.m)); // HACK
-
- switch ( m.op ){
-
- case Mod::PUSH:
- case Mod::ADDTOSET: {
- if ( m.isEach() ){
+
+ switch ( m.op ) {
+
+ case Mod::PUSH:
+ case Mod::ADDTOSET: {
+ if ( m.isEach() ) {
b.appendArray( m.shortFieldName , m.getEach() );
}
else {
@@ -497,19 +558,19 @@ namespace mongo {
arr.done();
}
break;
- }
-
+ }
+
case Mod::PUSH_ALL: {
b.appendAs( m.elt, m.shortFieldName );
break;
- }
-
+ }
+
case Mod::UNSET:
case Mod::PULL:
case Mod::PULL_ALL:
// no-op b/c unset/pull of nothing does nothing
break;
-
+
case Mod::INC:
ms.fixedOpName = "$set";
case Mod::SET: {
@@ -517,24 +578,29 @@ namespace mongo {
b.appendAs( m.elt, m.shortFieldName );
break;
}
- default:
+ // shouldn't see RENAME_FROM here
+ case Mod::RENAME_TO:
+ ms.handleRename( b, m.shortFieldName );
+ break;
+ default:
stringstream ss;
ss << "unknown mod in appendNewFromMod: " << m.op;
throw UserException( 9015, ss.str() );
}
-
+
}
public:
-
+
bool canApplyInPlace() const {
return _inPlacePossible;
}
-
+
/**
         * modifies the underlying _obj
+         * @param isOnDisk - true means this is an on-disk object, and this update needs to be made durable
*/
- void applyModsInPlace();
+ void applyModsInPlace( bool isOnDisk );
BSONObj createNewFromMods();
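+        // Typical flow (a hedged usage sketch, not part of this change; assumes an
+        // existing BSONObj obj):
+        //   ModSet mods( fromjson( "{ $inc : { x : 1 } }" ) );
+        //   auto_ptr<ModSetState> mss = mods.prepare( obj );
+        //   BSONObj newObj;
+        //   if ( mss->canApplyInPlace() )
+        //       mss->applyModsInPlace( false );      // patch obj's buffer directly
+        //   else
+        //       newObj = mss->createNewFromMods();   // rebuild when the size changes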
@@ -544,9 +610,9 @@ namespace mongo {
for ( ModStateHolder::const_iterator i = _mods.begin(); i != _mods.end(); i++ )
if ( i->second.needOpLogRewrite() )
return true;
- return false;
+ return false;
}
-
+
BSONObj getOpLogRewrite() const {
BSONObjBuilder b;
for ( ModStateHolder::const_iterator i = _mods.begin(); i != _mods.end(); i++ )
@@ -564,7 +630,7 @@ namespace mongo {
void appendSizeSpecForArrayDepMods( BSONObjBuilder &b ) const {
for ( ModStateHolder::const_iterator i = _mods.begin(); i != _mods.end(); i++ ) {
const ModState& m = i->second;
- if ( m.m->arrayDep() ){
+ if ( m.m->arrayDep() ) {
if ( m.pushStartSize == -1 )
b.appendNull( m.fieldName() );
else
@@ -577,6 +643,6 @@ namespace mongo {
friend class ModSet;
};
-
+
}