author    Antonin Kral <a.kral@bobek.cz>  2011-09-14 17:08:06 +0200
committer Antonin Kral <a.kral@bobek.cz>  2011-09-14 17:08:06 +0200
commit    5d342a758c6095b4d30aba0750b54f13b8916f51 (patch)
tree      762e9aa84781f5e3b96db2c02d356c29cf0217c0 /db
parent    cbe2d992e9cd1ea66af9fa91df006106775d3073 (diff)
download  mongodb-5d342a758c6095b4d30aba0750b54f13b8916f51.tar.gz

Imported Upstream version 2.0.0
Diffstat (limited to 'db')
-rw-r--r--  db/btree.cpp | 1198
-rw-r--r--  db/btree.h | 942
-rw-r--r--  db/btreebuilder.cpp | 184
-rw-r--r--  db/btreebuilder.h | 53
-rw-r--r--  db/btreecursor.cpp | 394
-rw-r--r--  db/cap.cpp | 12
-rw-r--r--  db/client.cpp | 241
-rw-r--r--  db/client.h | 28
-rw-r--r--  db/clientcursor.cpp | 276
-rw-r--r--  db/clientcursor.h | 56
-rw-r--r--  db/cloner.cpp | 123
-rw-r--r--  db/cloner.h | 39
-rw-r--r--  db/cmdline.cpp | 125
-rw-r--r--  db/cmdline.h | 69
-rw-r--r--  db/commands.cpp | 51
-rw-r--r--  db/commands.h | 34
-rw-r--r--  db/commands/distinct.cpp | 29
-rw-r--r--  db/commands/find_and_modify.cpp | 153
-rw-r--r--  db/commands/group.cpp | 37
-rw-r--r--  db/commands/isself.cpp | 18
-rw-r--r--  db/commands/mr.cpp | 465
-rw-r--r--  db/commands/mr.h | 49
-rw-r--r--  db/common.cpp | 39
-rw-r--r--  db/compact.cpp | 361
-rw-r--r--  db/compact.h | 50
-rw-r--r--  db/concurrency.h | 9
-rw-r--r--  db/curop.h | 98
-rw-r--r--  db/cursor.h | 20
-rw-r--r--  db/database.cpp | 106
-rw-r--r--  db/database.h | 20
-rw-r--r--  db/db.cpp | 489
-rw-r--r--  db/db.h | 38
-rwxr-xr-x [-rw-r--r--]  db/db.vcxproj | 1627
-rwxr-xr-x  db/db.vcxproj.filters | 96
-rwxr-xr-x  db/db_10.sln | 28
-rw-r--r--  db/dbcommands.cpp | 495
-rw-r--r--  db/dbcommands_admin.cpp | 226
-rw-r--r--  db/dbcommands_generic.cpp | 150
-rw-r--r--  db/dbeval.cpp | 6
-rw-r--r--  db/dbhelpers.cpp | 36
-rw-r--r--  db/dbmessage.cpp | 108
-rw-r--r--  db/dbmessage.h | 137
-rw-r--r--  db/dbwebserver.cpp | 35
-rw-r--r--  db/diskloc.h | 16
-rw-r--r--  db/driverHelpers.cpp | 2
-rw-r--r--  db/dur.cpp | 292
-rw-r--r--  db/dur.h | 10
-rw-r--r--  db/dur_commitjob.cpp | 19
-rw-r--r--  db/dur_commitjob.h | 19
-rw-r--r--  db/dur_journal.cpp | 240
-rw-r--r--  db/dur_journal.h | 16
-rw-r--r--  db/dur_journalformat.h | 58
-rw-r--r--  db/dur_journalimpl.h | 22
-rw-r--r--  db/dur_preplogbuffer.cpp | 52
-rw-r--r--  db/dur_recover.cpp | 186
-rw-r--r--  db/dur_recover.h | 15
-rw-r--r--  db/dur_stats.h | 3
-rw-r--r--  db/dur_writetodatafiles.cpp | 24
-rw-r--r--  db/durop.cpp | 3
-rw-r--r--  db/durop.h | 2
-rw-r--r--  db/extsort.cpp | 15
-rw-r--r--  db/extsort.h | 59
-rw-r--r--  db/geo/2d.cpp | 2937
-rw-r--r--  db/geo/core.h | 90
-rw-r--r--  db/geo/haystack.cpp | 18
-rw-r--r--  db/index.cpp | 140
-rw-r--r--  db/index.h | 85
-rw-r--r--  db/indexkey.cpp | 381
-rw-r--r--  db/indexkey.h | 29
-rw-r--r--  db/instance.cpp | 285
-rw-r--r--  db/instance.h | 14
-rw-r--r--  db/introspect.cpp | 61
-rw-r--r--  db/introspect.h | 3
-rw-r--r--  db/jsobj.cpp | 477
-rw-r--r--  db/json.cpp | 21
-rw-r--r--  db/key.cpp | 671
-rw-r--r--  db/key.h | 112
-rw-r--r--  db/lasterror.cpp | 4
-rw-r--r--  db/matcher.cpp | 610
-rw-r--r--  db/matcher.h | 132
-rw-r--r--  db/matcher_covered.cpp | 32
-rw-r--r--  db/modules/mms.cpp | 4
-rw-r--r--  db/mongommf.cpp | 118
-rw-r--r--  db/mongommf.h | 5
-rw-r--r--  db/namespace-inl.h | 12
-rw-r--r--  db/namespace.cpp | 27
-rw-r--r--  db/namespace.h | 77
-rw-r--r--  db/nonce.cpp | 50
-rw-r--r--  db/nonce.h | 20
-rw-r--r--  db/oplog.cpp | 216
-rw-r--r--  db/oplog.h | 141
-rw-r--r--  db/oplogreader.h | 51
-rw-r--r--  db/ops/delete.cpp | 242
-rw-r--r--  db/ops/delete.h | 33
-rw-r--r--  db/ops/query.cpp (renamed from db/query.cpp) | 358
-rw-r--r--  db/ops/query.h (renamed from db/query.h) | 154
-rw-r--r--  db/ops/update.cpp (renamed from db/update.cpp) | 431
-rw-r--r--  db/ops/update.h (renamed from db/update.h) | 47
-rw-r--r--  db/pdfile.cpp | 880
-rw-r--r--  db/pdfile.h | 94
-rw-r--r--  db/projection.cpp | 2
-rw-r--r--  db/projection.h | 2
-rw-r--r--  db/queryoptimizer.cpp | 829
-rw-r--r--  db/queryoptimizer.h | 543
-rw-r--r--  db/queryoptimizercursor.cpp | 387
-rw-r--r--  db/querypattern.cpp | 54
-rw-r--r--  db/querypattern.h | 76
-rw-r--r--  db/queryutil-inl.h | 153
-rw-r--r--  db/queryutil.cpp | 716
-rw-r--r--  db/queryutil.h | 744
-rw-r--r--  db/record.cpp | 230
-rw-r--r--  db/repl.cpp | 832
-rw-r--r--  db/repl.h | 212
-rw-r--r--  db/repl/connections.h | 15
-rw-r--r--  db/repl/consensus.cpp | 118
-rw-r--r--  db/repl/health.cpp | 55
-rw-r--r--  db/repl/health.h | 14
-rw-r--r--  db/repl/heartbeat.cpp | 89
-rw-r--r--  db/repl/manager.cpp | 62
-rw-r--r--  db/repl/multicmd.h | 6
-rw-r--r--  db/repl/replset_commands.cpp | 97
-rw-r--r--  db/repl/rs.cpp | 270
-rw-r--r--  db/repl/rs.h | 256
-rw-r--r--  db/repl/rs_config.cpp | 348
-rw-r--r--  db/repl/rs_config.h | 144
-rw-r--r--  db/repl/rs_initialsync.cpp | 135
-rw-r--r--  db/repl/rs_initiate.cpp | 41
-rw-r--r--  db/repl/rs_member.h | 19
-rw-r--r--  db/repl/rs_rollback.cpp | 28
-rw-r--r--  db/repl/rs_sync.cpp | 341
-rw-r--r--  db/repl_block.cpp | 49
-rw-r--r--  db/repl_block.h | 1
-rw-r--r--  db/replpair.h | 238
-rw-r--r--  db/replutil.h | 98
-rw-r--r--  db/restapi.cpp | 12
-rw-r--r--  db/scanandorder.cpp | 93
-rw-r--r--  db/scanandorder.h | 120
-rw-r--r--  db/security.cpp | 74
-rwxr-xr-x [-rw-r--r--]  db/security.h | 87
-rw-r--r--  db/security_commands.cpp | 175
-rw-r--r--  db/security_common.cpp (renamed from db/security_key.cpp) | 35
-rw-r--r--  db/security_common.h | 83
-rw-r--r--  db/security_key.h | 47
-rw-r--r--  db/stats/counters.h | 2
-rw-r--r--  db/stats/snapshots.cpp | 14
-rw-r--r--  db/stats/top.cpp | 12
146 files changed, 17299 insertions, 9524 deletions
diff --git a/db/btree.cpp b/db/btree.cpp
index 299c212..bf9926e 100644
--- a/db/btree.cpp
+++ b/db/btree.cpp
@@ -27,33 +27,23 @@
#include "curop-inl.h"
#include "stats/counters.h"
#include "dur_commitjob.h"
+#include "btreebuilder.h"
+#include "../util/unittest.h"
namespace mongo {
-#define VERIFYTHISLOC dassert( thisLoc.btree() == this );
+ BOOST_STATIC_ASSERT( Record::HeaderSize == 16 );
+ BOOST_STATIC_ASSERT( Record::HeaderSize + BtreeData_V1::BucketSize == 8192 );
- /**
- * give us a writable version of the btree bucket (declares write intent).
- * note it is likely more efficient to declare write intent on something smaller when you can.
- */
- BtreeBucket* DiskLoc::btreemod() const {
- assert( _a != -1 );
- BtreeBucket *b = const_cast< BtreeBucket * >( btree() );
- return static_cast< BtreeBucket* >( getDur().writingPtr( b, BucketSize ) );
- }
+#define VERIFYTHISLOC dassert( thisLoc.btree<V>() == this );
- _KeyNode& _KeyNode::writing() const {
- return *getDur().writing( const_cast< _KeyNode* >( this ) );
+ template< class Loc >
+ __KeyNode<Loc> & __KeyNode<Loc>::writing() const {
+ return *getDur().writing( const_cast< __KeyNode<Loc> * >( this ) );
}
- KeyNode::KeyNode(const BucketBasics& bb, const _KeyNode &k) :
- prevChildBucket(k.prevChildBucket),
- recordLoc(k.recordLoc), key(bb.data+k.keyDataOfs())
- { }
-
- // largest key size we allow. note we very much need to support bigger keys (somehow) in the future.
- static const int KeyMax = BucketSize / 10;
-
+ // BucketBasics::lowWaterMark()
+ //
// We define this value as the maximum number of bytes such that, if we have
// fewer than this many bytes, we must be able to either merge with or receive
// keys from any neighboring node. If our utilization goes below this value we
@@ -65,18 +55,15 @@ namespace mongo {
// rebalancedSeparatorPos(). The conditions for lowWaterMark - 1 are as
// follows: We know we cannot merge with the neighbor, so the total data size
// for us, the neighbor, and the separator must be at least
- // BtreeBucket::bodySize() + 1. We must be able to accept one key of any
+ // BtreeBucket<V>::bodySize() + 1. We must be able to accept one key of any
// allowed size, so our size plus storage for that additional key must be
- // <= BtreeBucket::bodySize() / 2. This way, with the extra key we'll have a
+ // <= BtreeBucket<V>::bodySize() / 2. This way, with the extra key we'll have a
// new bucket data size < half the total data size and by the implementation
// of rebalancedSeparatorPos() the key must be added.
- static const int lowWaterMark = BtreeBucket::bodySize() / 2 - KeyMax - sizeof( _KeyNode ) + 1;
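For concreteness, here is a minimal standalone sketch (not part of this patch) of the arithmetic above. The header, bucket, and _KeyNode sizes are assumed example values, not the real BtreeData_V0/V1 constants; only the formula itself is taken from the removed definition.

#include <cstdio>

int main() {
    // Assumed example geometry -- the real values live in BtreeData_V0/V1.
    const int BucketSize  = 8192;
    const int HeaderSize  = 40;               // hypothetical header size
    const int bodySize    = BucketSize - HeaderSize;
    const int KeyMax      = BucketSize / 10;  // largest key we allow
    const int KeyNodeSize = 16;               // hypothetical sizeof(_KeyNode)

    // Below this mark a bucket can always either merge with a neighbor or
    // accept one key of any allowed size from it, per the comment above.
    const int lowWaterMark = bodySize / 2 - KeyMax - KeyNodeSize + 1;
    std::printf("lowWaterMark = %d bytes\n", lowWaterMark);
    return 0;
}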
static const int split_debug = 0;
static const int insert_debug = 0;
- extern int otherTraceLevel;
-
/**
* this error is ok/benign when doing a background indexing -- that logic in pdfile checks explicitly
* for the 10287 error code.
@@ -88,47 +75,57 @@ namespace mongo {
/* BucketBasics --------------------------------------------------- */
- void BucketBasics::assertWritable() {
+ template< class V >
+ void BucketBasics<V>::assertWritable() {
if( cmdLine.dur )
- dur::assertAlreadyDeclared(this, sizeof(*this));
+ dur::assertAlreadyDeclared(this, V::BucketSize);
}
- string BtreeBucket::bucketSummary() const {
+ template< class V >
+ string BtreeBucket<V>::bucketSummary() const {
stringstream ss;
ss << " Bucket info:" << endl;
- ss << " n: " << n << endl;
- ss << " parent: " << parent.toString() << endl;
- ss << " nextChild: " << parent.toString() << endl;
- ss << " flags:" << flags << endl;
- ss << " emptySize: " << emptySize << " topSize: " << topSize << endl;
+ ss << " n: " << this->n << endl;
+ ss << " parent: " << this->parent.toString() << endl;
+ ss << " nextChild: " << this->parent.toString() << endl;
+ ss << " flags:" << this->flags << endl;
+ ss << " emptySize: " << this->emptySize << " topSize: " << this->topSize << endl;
return ss.str();
}
- int BucketBasics::Size() const {
- assert( _wasSize == BucketSize );
- return BucketSize;
+ template< class V >
+ int BucketBasics<V>::Size() const {
+ return V::BucketSize;
}
- void BucketBasics::_shape(int level, stringstream& ss) const {
+ template< class V >
+ void BucketBasics<V>::_shape(int level, stringstream& ss) const {
for ( int i = 0; i < level; i++ ) ss << ' ';
ss << "*\n";
- for ( int i = 0; i < n; i++ )
- if ( !k(i).prevChildBucket.isNull() )
- k(i).prevChildBucket.btree()->_shape(level+1,ss);
- if ( !nextChild.isNull() )
- nextChild.btree()->_shape(level+1,ss);
+ for ( int i = 0; i < this->n; i++ ) {
+ if ( !k(i).prevChildBucket.isNull() ) {
+ DiskLoc ll = k(i).prevChildBucket;
+ ll.btree<V>()->_shape(level+1,ss);
+ }
+ }
+ if ( !this->nextChild.isNull() ) {
+ DiskLoc ll = this->nextChild;
+ ll.btree<V>()->_shape(level+1,ss);
+ }
}
int bt_fv=0;
int bt_dmp=0;
- void BtreeBucket::dumpTree(const DiskLoc &thisLoc, const BSONObj &order) const {
+ template< class V >
+ void BtreeBucket<V>::dumpTree(const DiskLoc &thisLoc, const BSONObj &order) const {
bt_dmp=1;
fullValidate(thisLoc, order);
bt_dmp=0;
}
- int BtreeBucket::fullValidate(const DiskLoc& thisLoc, const BSONObj &order, int *unusedCount, bool strict) const {
+ template< class V >
+ long long BtreeBucket<V>::fullValidate(const DiskLoc& thisLoc, const BSONObj &order, long long *unusedCount, bool strict, unsigned depth) const {
{
bool f = false;
assert( f = true );
@@ -136,18 +133,18 @@ namespace mongo {
}
killCurrentOp.checkForInterrupt();
- assertValid(order, true);
+ this->assertValid(order, true);
if ( bt_dmp ) {
- out() << thisLoc.toString() << ' ';
- ((BtreeBucket *) this)->dump();
+ _log() << thisLoc.toString() << ' ';
+ ((BtreeBucket *) this)->dump(depth);
}
// keycount
- int kc = 0;
+ long long kc = 0;
- for ( int i = 0; i < n; i++ ) {
- const _KeyNode& kn = k(i);
+ for ( int i = 0; i < this->n; i++ ) {
+ const _KeyNode& kn = this->k(i);
if ( kn.isUsed() ) {
kc++;
@@ -159,25 +156,26 @@ namespace mongo {
}
if ( !kn.prevChildBucket.isNull() ) {
DiskLoc left = kn.prevChildBucket;
- const BtreeBucket *b = left.btree();
+ const BtreeBucket *b = left.btree<V>();
if ( strict ) {
assert( b->parent == thisLoc );
}
else {
wassert( b->parent == thisLoc );
}
- kc += b->fullValidate(kn.prevChildBucket, order, unusedCount, strict);
+ kc += b->fullValidate(kn.prevChildBucket, order, unusedCount, strict, depth+1);
}
}
- if ( !nextChild.isNull() ) {
- const BtreeBucket *b = nextChild.btree();
+ if ( !this->nextChild.isNull() ) {
+ DiskLoc ll = this->nextChild;
+ const BtreeBucket *b = ll.btree<V>();
if ( strict ) {
assert( b->parent == thisLoc );
}
else {
wassert( b->parent == thisLoc );
}
- kc += b->fullValidate(nextChild, order, unusedCount, strict);
+ kc += b->fullValidate(this->nextChild, order, unusedCount, strict, depth+1);
}
return kc;
@@ -185,12 +183,17 @@ namespace mongo {
int nDumped = 0;
- void BucketBasics::assertValid(const Ordering &order, bool force) const {
+ template< class V >
+ void BucketBasics<V>::assertValid(const Ordering &order, bool force) const {
if ( !debug && !force )
return;
- wassert( n >= 0 && n < Size() );
- wassert( emptySize >= 0 && emptySize < BucketSize );
- wassert( topSize >= n && topSize <= BucketSize );
+ {
+ int foo = this->n;
+ wassert( foo >= 0 && this->n < Size() );
+ foo = this->emptySize;
+ wassert( foo >= 0 && this->emptySize < V::BucketSize );
+ wassert( this->topSize >= this->n && this->topSize <= V::BucketSize );
+ }
// this is very slow so don't do often
{
@@ -201,26 +204,26 @@ namespace mongo {
DEV {
// slow:
- for ( int i = 0; i < n-1; i++ ) {
- BSONObj k1 = keyNode(i).key;
- BSONObj k2 = keyNode(i+1).key;
+ for ( int i = 0; i < this->n-1; i++ ) {
+ Key k1 = keyNode(i).key;
+ Key k2 = keyNode(i+1).key;
int z = k1.woCompare(k2, order); //OK
if ( z > 0 ) {
out() << "ERROR: btree key order corrupt. Keys:" << endl;
if ( ++nDumped < 5 ) {
- for ( int j = 0; j < n; j++ ) {
+ for ( int j = 0; j < this->n; j++ ) {
out() << " " << keyNode(j).key.toString() << endl;
}
- ((BtreeBucket *) this)->dump();
+ ((BtreeBucket<V> *) this)->dump();
}
wassert(false);
break;
}
else if ( z == 0 ) {
if ( !(k(i).recordLoc < k(i+1).recordLoc) ) {
- out() << "ERROR: btree key order corrupt (recordloc's wrong). Keys:" << endl;
- out() << " k(" << i << "):" << keyNode(i).key.toString() << " RL:" << k(i).recordLoc.toString() << endl;
- out() << " k(" << i+1 << "):" << keyNode(i+1).key.toString() << " RL:" << k(i+1).recordLoc.toString() << endl;
+ out() << "ERROR: btree key order corrupt (recordloc's wrong):" << endl;
+ out() << " k(" << i << ")" << keyNode(i).key.toString() << " RL:" << k(i).recordLoc.toString() << endl;
+ out() << " k(" << i+1 << ")" << keyNode(i+1).key.toString() << " RL:" << k(i+1).recordLoc.toString() << endl;
wassert( k(i).recordLoc < k(i+1).recordLoc );
}
}
@@ -228,15 +231,15 @@ namespace mongo {
}
else {
//faster:
- if ( n > 1 ) {
- BSONObj k1 = keyNode(0).key;
- BSONObj k2 = keyNode(n-1).key;
+ if ( this->n > 1 ) {
+ Key k1 = keyNode(0).key;
+ Key k2 = keyNode(this->n-1).key;
int z = k1.woCompare(k2, order);
//wassert( z <= 0 );
if ( z > 0 ) {
problem() << "btree keys out of order" << '\n';
ONCE {
- ((BtreeBucket *) this)->dump();
+ ((BtreeBucket<V> *) this)->dump();
}
assert(false);
}
@@ -244,53 +247,59 @@ namespace mongo {
}
}
- inline void BucketBasics::markUnused(int keypos) {
- assert( keypos >= 0 && keypos < n );
+ template< class V >
+ inline void BucketBasics<V>::markUnused(int keypos) {
+ assert( keypos >= 0 && keypos < this->n );
k(keypos).setUnused();
}
- inline int BucketBasics::totalDataSize() const {
- return (int) (Size() - (data-(char*)this));
+ template< class V >
+ inline int BucketBasics<V>::totalDataSize() const {
+ return (int) (Size() - (this->data-(char*)this));
}
- void BucketBasics::init() {
- parent.Null();
- nextChild.Null();
- _wasSize = BucketSize;
- _reserved1 = 0;
- flags = Packed;
- n = 0;
- emptySize = totalDataSize();
- topSize = 0;
- reserved = 0;
+ template< class V >
+ void BucketBasics<V>::init() {
+ this->_init();
+ this->parent.Null();
+ this->nextChild.Null();
+ this->flags = Packed;
+ this->n = 0;
+ this->emptySize = totalDataSize();
+ this->topSize = 0;
}
/** see _alloc */
- inline void BucketBasics::_unalloc(int bytes) {
- topSize -= bytes;
- emptySize += bytes;
+ template< class V >
+ inline void BucketBasics<V>::_unalloc(int bytes) {
+ this->topSize -= bytes;
+ this->emptySize += bytes;
}
/**
* we allocate space from the end of the buffer for data.
* the keynodes grow from the front.
*/
- inline int BucketBasics::_alloc(int bytes) {
- topSize += bytes;
- emptySize -= bytes;
- int ofs = totalDataSize() - topSize;
+ template< class V >
+ inline int BucketBasics<V>::_alloc(int bytes) {
+ assert( this->emptySize >= bytes );
+ this->topSize += bytes;
+ this->emptySize -= bytes;
+ int ofs = totalDataSize() - this->topSize;
assert( ofs > 0 );
return ofs;
}
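The two helpers above implement a small slotted-bucket allocator: fixed-size key slots grow from the front of the buffer while variable-length key data grows down from the end, with emptySize tracking the gap. A toy model of just that mechanism (assumed sizes, not the mongod classes):

#include <cassert>
#include <cstring>

struct MiniBucket {
    static const int Size = 256;  // assumed toy bucket size
    int topSize   = 0;            // bytes of key data packed at the back
    int emptySize = Size;         // gap left for slots and data
    char data[Size];

    // Mirrors _alloc(): reserve bytes at the end, return their offset.
    int alloc(int bytes) {
        assert(emptySize >= bytes);
        topSize   += bytes;
        emptySize -= bytes;
        return Size - topSize;
    }
    // Mirrors _unalloc(): release the most recently allocated bytes.
    void unalloc(int bytes) {
        topSize   -= bytes;
        emptySize += bytes;
    }
};

int main() {
    MiniBucket b;
    int ofs = b.alloc(5);
    std::memcpy(b.data + ofs, "hello", 5);
    b.unalloc(5);                 // e.g. after a popBack()
    assert(b.emptySize == MiniBucket::Size);
    return 0;
}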
- void BucketBasics::_delKeyAtPos(int keypos, bool mayEmpty) {
- assert( keypos >= 0 && keypos <= n );
+ template< class V >
+ void BucketBasics<V>::_delKeyAtPos(int keypos, bool mayEmpty) {
+ // TODO This should be keypos < n
+ assert( keypos >= 0 && keypos <= this->n );
assert( childForPos(keypos).isNull() );
// TODO audit cases where nextChild is null
- assert( ( mayEmpty && n > 0 ) || n > 1 || nextChild.isNull() );
- emptySize += sizeof(_KeyNode);
- n--;
- for ( int j = keypos; j < n; j++ )
+ assert( ( mayEmpty && this->n > 0 ) || this->n > 1 || this->nextChild.isNull() );
+ this->emptySize += sizeof(_KeyNode);
+ this->n--;
+ for ( int j = keypos; j < this->n; j++ )
k(j) = k(j+1);
setNotPacked();
}
@@ -299,38 +308,54 @@ namespace mongo {
* pull rightmost key from the bucket. this version requires its right child to be null so it
* does not bother returning that value.
*/
- void BucketBasics::popBack(DiskLoc& recLoc, BSONObj& key) {
- massert( 10282 , "n==0 in btree popBack()", n > 0 );
- assert( k(n-1).isUsed() ); // no unused skipping in this function at this point - btreebuilder doesn't require that
- KeyNode kn = keyNode(n-1);
+ template< class V >
+ void BucketBasics<V>::popBack(DiskLoc& recLoc, Key &key) {
+ massert( 10282 , "n==0 in btree popBack()", this->n > 0 );
+ assert( k(this->n-1).isUsed() ); // no unused skipping in this function at this point - btreebuilder doesn't require that
+ KeyNode kn = keyNode(this->n-1);
recLoc = kn.recordLoc;
- key = kn.key;
- int keysize = kn.key.objsize();
+ key.assign(kn.key);
+ int keysize = kn.key.dataSize();
- massert( 10283 , "rchild not null in btree popBack()", nextChild.isNull());
+ massert( 10283 , "rchild not null in btree popBack()", this->nextChild.isNull());
// weirdly, we also put the rightmost down pointer in nextchild, even when bucket isn't full.
- nextChild = kn.prevChildBucket;
+ this->nextChild = kn.prevChildBucket;
- n--;
- emptySize += sizeof(_KeyNode);
+ this->n--;
+ // This is risky because the key we are returning points to this unalloc'ed memory,
+ // and we are assuming that the last key points to the last allocated
+ // bson region.
+ this->emptySize += sizeof(_KeyNode);
_unalloc(keysize);
}
/** add a key. must be > all existing. be careful to set next ptr right. */
- bool BucketBasics::_pushBack(const DiskLoc recordLoc, const BSONObj& key, const Ordering &order, const DiskLoc prevChild) {
- int bytesNeeded = key.objsize() + sizeof(_KeyNode);
- if ( bytesNeeded > emptySize )
+ template< class V >
+ bool BucketBasics<V>::_pushBack(const DiskLoc recordLoc, const Key& key, const Ordering &order, const DiskLoc prevChild) {
+ int bytesNeeded = key.dataSize() + sizeof(_KeyNode);
+ if ( bytesNeeded > this->emptySize )
return false;
- assert( bytesNeeded <= emptySize );
- assert( n == 0 || keyNode(n-1).key.woCompare(key, order) <= 0 );
- emptySize -= sizeof(_KeyNode);
- _KeyNode& kn = k(n++);
+ assert( bytesNeeded <= this->emptySize );
+ if( this->n ) {
+ const KeyNode klast = keyNode(this->n-1);
+ if( klast.key.woCompare(key, order) > 0 ) {
+ log() << "btree bucket corrupt? consider reindexing or running validate command" << endl;
+ log() << " klast: " << keyNode(this->n-1).key.toString() << endl;
+ log() << " key: " << key.toString() << endl;
+ DEV klast.key.woCompare(key, order);
+ assert(false);
+ }
+ }
+ this->emptySize -= sizeof(_KeyNode);
+ _KeyNode& kn = k(this->n++);
kn.prevChildBucket = prevChild;
kn.recordLoc = recordLoc;
- kn.setKeyDataOfs( (short) _alloc(key.objsize()) );
- char *p = dataAt(kn.keyDataOfs());
- memcpy(p, key.objdata(), key.objsize());
+ kn.setKeyDataOfs( (short) _alloc(key.dataSize()) );
+ short ofs = kn.keyDataOfs();
+ char *p = dataAt(ofs);
+ memcpy(p, key.data(), key.dataSize());
+
return true;
}
@@ -342,19 +367,20 @@ namespace mongo {
/** insert a key in a bucket with no complexity -- no splits required
@return false if a split is required.
*/
- bool BucketBasics::basicInsert(const DiskLoc thisLoc, int &keypos, const DiskLoc recordLoc, const BSONObj& key, const Ordering &order) const {
- assert( keypos >= 0 && keypos <= n );
- int bytesNeeded = key.objsize() + sizeof(_KeyNode);
- if ( bytesNeeded > emptySize ) {
+ template< class V >
+ bool BucketBasics<V>::basicInsert(const DiskLoc thisLoc, int &keypos, const DiskLoc recordLoc, const Key& key, const Ordering &order) const {
+ assert( keypos >= 0 && keypos <= this->n );
+ int bytesNeeded = key.dataSize() + sizeof(_KeyNode);
+ if ( bytesNeeded > this->emptySize ) {
_pack(thisLoc, order, keypos);
- if ( bytesNeeded > emptySize )
+ if ( bytesNeeded > this->emptySize )
return false;
}
BucketBasics *b;
{
const char *p = (const char *) &k(keypos);
- const char *q = (const char *) &k(n+1);
+ const char *q = (const char *) &k(this->n+1);
// declare that we will write to [k(keypos),k(n)]
// todo: this writes a medium amount to the journal. we may want to add a verb "shift" to the redo log so
// we can log a very small amount.
@@ -364,39 +390,45 @@ namespace mongo {
// 1 4 9
// ->
// 1 4 _ 9
- for ( int j = n; j > keypos; j-- ) // make room
+ for ( int j = this->n; j > keypos; j-- ) // make room
b->k(j) = b->k(j-1);
}
- getDur().declareWriteIntent(&b->emptySize, 12); // [b->emptySize..b->n] is 12 bytes and we are going to write those
+ getDur().declareWriteIntent(&b->emptySize, sizeof(this->emptySize)+sizeof(this->topSize)+sizeof(this->n));
b->emptySize -= sizeof(_KeyNode);
b->n++;
+ // This _KeyNode was marked for writing above.
_KeyNode& kn = b->k(keypos);
kn.prevChildBucket.Null();
kn.recordLoc = recordLoc;
- kn.setKeyDataOfs((short) b->_alloc(key.objsize()) );
+ kn.setKeyDataOfs((short) b->_alloc(key.dataSize()) );
char *p = b->dataAt(kn.keyDataOfs());
- getDur().declareWriteIntent(p, key.objsize());
- memcpy(p, key.objdata(), key.objsize());
+ getDur().declareWriteIntent(p, key.dataSize());
+ memcpy(p, key.data(), key.dataSize());
return true;
}
- /** with this implementation, refPos == 0 disregards effect of refPos */
- bool BucketBasics::mayDropKey( int index, int refPos ) const {
+ /**
+ * With this implementation, refPos == 0 disregards effect of refPos.
+ * index > 0 prevents creation of an empty bucket.
+ */
+ template< class V >
+ bool BucketBasics<V>::mayDropKey( int index, int refPos ) const {
return index > 0 && ( index != refPos ) && k( index ).isUnused() && k( index ).prevChildBucket.isNull();
}
- int BucketBasics::packedDataSize( int refPos ) const {
- if ( flags & Packed ) {
- return BucketSize - emptySize - headerSize();
+ template< class V >
+ int BucketBasics<V>::packedDataSize( int refPos ) const {
+ if ( this->flags & Packed ) {
+ return V::BucketSize - this->emptySize - headerSize();
}
int size = 0;
- for( int j = 0; j < n; ++j ) {
+ for( int j = 0; j < this->n; ++j ) {
if ( mayDropKey( j, refPos ) ) {
continue;
}
- size += keyNode( j ).key.objsize() + sizeof( _KeyNode );
+ size += keyNode( j ).key.dataSize() + sizeof( _KeyNode );
}
return size;
}
@@ -405,8 +437,9 @@ namespace mongo {
* when we delete things we just leave empty space until the node is
* full and then we repack it.
*/
- void BucketBasics::_pack(const DiskLoc thisLoc, const Ordering &order, int &refPos) const {
- if ( flags & Packed )
+ template< class V >
+ void BucketBasics<V>::_pack(const DiskLoc thisLoc, const Ordering &order, int &refPos) const {
+ if ( this->flags & Packed )
return;
VERIFYTHISLOC
@@ -416,22 +449,23 @@ namespace mongo {
declaration anyway within the group commit interval, in which case we would just be adding
code and complexity without benefit.
*/
- thisLoc.btreemod()->_packReadyForMod(order, refPos);
+ thisLoc.btreemod<V>()->_packReadyForMod(order, refPos);
}
/** version when write intent already declared */
- void BucketBasics::_packReadyForMod( const Ordering &order, int &refPos ) {
+ template< class V >
+ void BucketBasics<V>::_packReadyForMod( const Ordering &order, int &refPos ) {
assertWritable();
- if ( flags & Packed )
+ if ( this->flags & Packed )
return;
int tdz = totalDataSize();
- char temp[BucketSize];
+ char temp[V::BucketSize];
int ofs = tdz;
- topSize = 0;
+ this->topSize = 0;
int i = 0;
- for ( int j = 0; j < n; j++ ) {
+ for ( int j = 0; j < this->n; j++ ) {
if( mayDropKey( j, refPos ) ) {
continue; // key is unused and has no children - drop it
}
@@ -442,36 +476,40 @@ namespace mongo {
k( i ) = k( j );
}
short ofsold = k(i).keyDataOfs();
- int sz = keyNode(i).key.objsize();
+ int sz = keyNode(i).key.dataSize();
ofs -= sz;
- topSize += sz;
+ this->topSize += sz;
memcpy(temp+ofs, dataAt(ofsold), sz);
k(i).setKeyDataOfsSavingUse( ofs );
++i;
}
- if ( refPos == n ) {
+ if ( refPos == this->n ) {
refPos = i;
}
- n = i;
+ this->n = i;
int dataUsed = tdz - ofs;
- memcpy(data + ofs, temp + ofs, dataUsed);
+ memcpy(this->data + ofs, temp + ofs, dataUsed);
// assertWritable();
// TEMP TEST getDur().declareWriteIntent(this, sizeof(*this));
- emptySize = tdz - dataUsed - n * sizeof(_KeyNode);
- assert( emptySize >= 0 );
+ this->emptySize = tdz - dataUsed - this->n * sizeof(_KeyNode);
+ {
+ int foo = this->emptySize;
+ assert( foo >= 0 );
+ }
setPacked();
assertValid( order );
}
- inline void BucketBasics::truncateTo(int N, const Ordering &order, int &refPos) {
+ template< class V >
+ inline void BucketBasics<V>::truncateTo(int N, const Ordering &order, int &refPos) {
dbMutex.assertWriteLocked();
assertWritable();
- n = N;
+ this->n = N;
setNotPacked();
_packReadyForMod( order, refPos );
}
@@ -489,19 +527,21 @@ namespace mongo {
* We just have a simple algorithm right now: if a key includes the
* halfway point (or 10% way point) in terms of bytes, split on that key;
* otherwise split on the key immediately to the left of the halfway
- * point.
+ * point (or 10% point).
*
* This function is expected to be called on a packed bucket.
*/
- int BucketBasics::splitPos( int keypos ) const {
- assert( n > 2 );
+ template< class V >
+ int BucketBasics<V>::splitPos( int keypos ) const {
+ assert( this->n > 2 );
int split = 0;
int rightSize = 0;
// when splitting a btree node, if the new key is greater than all the other keys, we should not do an even split, but a 90/10 split.
// see SERVER-983
- int rightSizeLimit = ( topSize + sizeof( _KeyNode ) * n ) / ( keypos == n ? 10 : 2 );
- for( int i = n - 1; i > -1; --i ) {
- rightSize += keyNode( i ).key.objsize() + sizeof( _KeyNode );
+ // TODO I think we only want to do the 90% split on the rhs node of the tree.
+ int rightSizeLimit = ( this->topSize + sizeof( _KeyNode ) * this->n ) / ( keypos == this->n ? 10 : 2 );
+ for( int i = this->n - 1; i > -1; --i ) {
+ rightSize += keyNode( i ).key.dataSize() + sizeof( _KeyNode );
if ( rightSize > rightSizeLimit ) {
split = i;
break;
@@ -511,37 +551,40 @@ namespace mongo {
if ( split < 1 ) {
split = 1;
}
- else if ( split > n - 2 ) {
- split = n - 2;
+ else if ( split > this->n - 2 ) {
+ split = this->n - 2;
}
return split;
}
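A simplified model of the split-point policy above (a sketch with per-key byte counts standing in for the real key nodes): bytes are accumulated from the right, and the right-hand share is capped at one half normally, or one tenth when the insert position is rightmost -- the SERVER-983 append case.

#include <cstdio>
#include <vector>

// Choose a split index given the byte size of each key in a full bucket.
int splitPos(const std::vector<int>& keySizes, int keypos) {
    const int n = (int)keySizes.size();
    int total = 0;
    for (int s : keySizes) total += s;
    // 90/10 split when appending at the far right, else an even split.
    const int rightSizeLimit = total / (keypos == n ? 10 : 2);
    int split = 0, rightSize = 0;
    for (int i = n - 1; i > -1; --i) {
        rightSize += keySizes[i];
        if (rightSize > rightSizeLimit) { split = i; break; }
    }
    // Keep at least one key on each side of the split.
    if (split < 1) split = 1;
    else if (split > n - 2) split = n - 2;
    return split;
}

int main() {
    std::vector<int> sizes(10, 100);  // ten 100-byte keys
    std::printf("middle insert    -> split at %d\n", splitPos(sizes, 5));   // 4
    std::printf("rightmost insert -> split at %d\n", splitPos(sizes, 10));  // 8
    return 0;
}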
- void BucketBasics::reserveKeysFront( int nAdd ) {
- assert( emptySize >= int( sizeof( _KeyNode ) * nAdd ) );
- emptySize -= sizeof( _KeyNode ) * nAdd;
- for( int i = n - 1; i > -1; --i ) {
+ template< class V >
+ void BucketBasics<V>::reserveKeysFront( int nAdd ) {
+ assert( this->emptySize >= int( sizeof( _KeyNode ) * nAdd ) );
+ this->emptySize -= sizeof( _KeyNode ) * nAdd;
+ for( int i = this->n - 1; i > -1; --i ) {
k( i + nAdd ) = k( i );
}
- n += nAdd;
+ this->n += nAdd;
}
- void BucketBasics::setKey( int i, const DiskLoc recordLoc, const BSONObj &key, const DiskLoc prevChildBucket ) {
+ template< class V >
+ void BucketBasics<V>::setKey( int i, const DiskLoc recordLoc, const Key &key, const DiskLoc prevChildBucket ) {
_KeyNode &kn = k( i );
kn.recordLoc = recordLoc;
kn.prevChildBucket = prevChildBucket;
- short ofs = (short) _alloc( key.objsize() );
+ short ofs = (short) _alloc( key.dataSize() );
kn.setKeyDataOfs( ofs );
char *p = dataAt( ofs );
- memcpy( p, key.objdata(), key.objsize() );
+ memcpy( p, key.data(), key.dataSize() );
}
- void BucketBasics::dropFront( int nDrop, const Ordering &order, int &refpos ) {
- for( int i = nDrop; i < n; ++i ) {
+ template< class V >
+ void BucketBasics<V>::dropFront( int nDrop, const Ordering &order, int &refpos ) {
+ for( int i = nDrop; i < this->n; ++i ) {
k( i - nDrop ) = k( i );
}
- n -= nDrop;
+ this->n -= nDrop;
setNotPacked();
_packReadyForMod( order, refpos );
}
@@ -549,10 +592,11 @@ namespace mongo {
/* - BtreeBucket --------------------------------------------------- */
/** @return largest key in the subtree. */
- void BtreeBucket::findLargestKey(const DiskLoc& thisLoc, DiskLoc& largestLoc, int& largestKey) {
+ template< class V >
+ void BtreeBucket<V>::findLargestKey(const DiskLoc& thisLoc, DiskLoc& largestLoc, int& largestKey) {
DiskLoc loc = thisLoc;
while ( 1 ) {
- const BtreeBucket *b = loc.btree();
+ const BtreeBucket *b = loc.btree<V>();
if ( !b->nextChild.isNull() ) {
loc = b->nextChild;
continue;
@@ -571,8 +615,16 @@ namespace mongo {
* not have more keys than an unsigned variable has bits. The same
* assumption is used in the implementation below with respect to the 'mask'
* variable.
+ *
+ * @param l a regular bsonobj
+ * @param rBegin composed partly of an existing bsonobj, and the remaining keys are taken from a vector of elements that frequently changes
+ *
+ * see
+ * jstests/index_check6.js
+ * https://jira.mongodb.org/browse/SERVER-371
*/
- int BtreeBucket::customBSONCmp( const BSONObj &l, const BSONObj &rBegin, int rBeginLen, bool rSup, const vector< const BSONElement * > &rEnd, const vector< bool > &rEndInclusive, const Ordering &o, int direction ) {
+ template< class V >
+ int BtreeBucket<V>::customBSONCmp( const BSONObj &l, const BSONObj &rBegin, int rBeginLen, bool rSup, const vector< const BSONElement * > &rEnd, const vector< bool > &rEndInclusive, const Ordering &o, int direction ) {
BSONObjIterator ll( l );
BSONObjIterator rr( rBegin );
vector< const BSONElement * >::const_iterator rr2 = rEnd.begin();
@@ -610,31 +662,29 @@ namespace mongo {
return 0;
}
- bool BtreeBucket::exists(const IndexDetails& idx, const DiskLoc &thisLoc, const BSONObj& key, const Ordering& order) const {
- int pos;
- bool found;
- DiskLoc b = locate(idx, thisLoc, key, order, pos, found, minDiskLoc);
+ template< class V >
+ bool BtreeBucket<V>::exists(const IndexDetails& idx, const DiskLoc &thisLoc, const Key& key, const Ordering& order) const {
+ int pos;
+ bool found;
+ DiskLoc b = locate(idx, thisLoc, key, order, pos, found, minDiskLoc);
- // skip unused keys
- while ( 1 ) {
- if( b.isNull() )
- break;
- const BtreeBucket *bucket = b.btree();
- const _KeyNode& kn = bucket->k(pos);
- if ( kn.isUsed() )
- return bucket->keyAt(pos).woEqual(key);
- b = bucket->advance(b, pos, 1, "BtreeBucket::exists");
+ // skip unused keys
+ while ( 1 ) {
+ if( b.isNull() )
+ break;
+ const BtreeBucket *bucket = b.btree<V>();
+ const _KeyNode& kn = bucket->k(pos);
+ if ( kn.isUsed() )
+ return bucket->keyAt(pos).woEqual(key);
+ b = bucket->advance(b, pos, 1, "BtreeBucket<V>::exists");
}
return false;
}
- /**
- * @param self - don't complain about ourself already being in the index case.
- * @return true = there is a duplicate.
- */
- bool BtreeBucket::wouldCreateDup(
+ template< class V >
+ bool BtreeBucket<V>::wouldCreateDup(
const IndexDetails& idx, const DiskLoc &thisLoc,
- const BSONObj& key, const Ordering& order,
+ const Key& key, const Ordering& order,
const DiskLoc &self) const {
int pos;
bool found;
@@ -642,24 +692,25 @@ namespace mongo {
while ( !b.isNull() ) {
// we skip unused keys
- const BtreeBucket *bucket = b.btree();
+ const BtreeBucket *bucket = b.btree<V>();
const _KeyNode& kn = bucket->k(pos);
if ( kn.isUsed() ) {
if( bucket->keyAt(pos).woEqual(key) )
return kn.recordLoc != self;
break;
}
- b = bucket->advance(b, pos, 1, "BtreeBucket::dupCheck");
+ b = bucket->advance(b, pos, 1, "BtreeBucket<V>::dupCheck");
}
return false;
}
- string BtreeBucket::dupKeyError( const IndexDetails& idx , const BSONObj& key ) {
+ template< class V >
+ string BtreeBucket<V>::dupKeyError( const IndexDetails& idx , const Key& key ) {
stringstream ss;
ss << "E11000 duplicate key error ";
ss << "index: " << idx.indexNamespace() << " ";
- ss << "dup key: " << key;
+ ss << "dup key: " << key.toString();
return ss.str();
}
@@ -677,30 +728,20 @@ namespace mongo {
* returns n if it goes after the last existing key.
* note result might be an Unused location!
*/
- char foo;
- bool BtreeBucket::find(const IndexDetails& idx, const BSONObj& key, const DiskLoc &recordLoc, const Ordering &order, int& pos, bool assertIfDup) const {
-#if defined(_EXPERIMENT1)
- {
- char *z = (char *) this;
- int i = 0;
- while( 1 ) {
- i += 4096;
- if( i >= BucketSize )
- break;
- foo += z[i];
- }
- }
-#endif
-
+ template< class V >
+ bool BtreeBucket<V>::find(const IndexDetails& idx, const Key& key, const DiskLoc &rl,
+ const Ordering &order, int& pos, bool assertIfDup) const {
+ Loc recordLoc;
+ recordLoc = rl;
globalIndexCounters.btree( (char*)this );
// binary search for this key
bool dupsChecked = false;
int l=0;
- int h=n-1;
+ int h=this->n-1;
while ( l <= h ) {
int m = (l+h)/2;
- KeyNode M = keyNode(m);
+ KeyNode M = this->keyNode(m);
int x = key.woCompare(M.key, order);
if ( x == 0 ) {
if( assertIfDup ) {
@@ -710,8 +751,8 @@ namespace mongo {
// coding effort in here to make this particularly fast
if( !dupsChecked ) {
dupsChecked = true;
- if( idx.head.btree()->exists(idx, idx.head, key, order) ) {
- if( idx.head.btree()->wouldCreateDup(idx, idx.head, key, order, recordLoc) )
+ if( idx.head.btree<V>()->exists(idx, idx.head, key, order) ) {
+ if( idx.head.btree<V>()->wouldCreateDup(idx, idx.head, key, order, recordLoc) )
uasserted( ASSERT_ID_DUPKEY , dupKeyError( idx , key ) );
else
alreadyInIndex();
@@ -726,7 +767,7 @@ namespace mongo {
}
// dup keys allowed. use recordLoc as if it is part of the key
- DiskLoc unusedRL = M.recordLoc;
+ Loc unusedRL = M.recordLoc;
unusedRL.GETOFS() &= ~1; // so we can test equality without the used bit messing us up
x = recordLoc.compare(unusedRL);
}
@@ -742,49 +783,59 @@ namespace mongo {
}
// not found
pos = l;
- if ( pos != n ) {
- BSONObj keyatpos = keyNode(pos).key;
+ if ( pos != this->n ) {
+ Key keyatpos = keyNode(pos).key;
wassert( key.woCompare(keyatpos, order) <= 0 );
if ( pos > 0 ) {
- wassert( keyNode(pos-1).key.woCompare(key, order) <= 0 );
+ if( !( keyNode(pos-1).key.woCompare(key, order) <= 0 ) ) {
+ DEV {
+ log() << key.toString() << endl;
+ log() << keyNode(pos-1).key.toString() << endl;
+ }
+ wassert(false);
+ }
}
}
return false;
}
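find() above is an ordinary binary search with one twist: when duplicate keys are allowed, the record location is compared as a tiebreaker, effectively making (key, recordLoc) the full sort key. A hedged sketch of that composite comparison, with toy types in place of Key/DiskLoc:

#include <cstdio>
#include <string>
#include <vector>

struct Entry { std::string key; long recordLoc; };

// Find the position of (key, recordLoc); mirrors the tiebreak in
// BtreeBucket::find() where recordLoc is compared once keys are equal.
bool find(const std::vector<Entry>& e, const std::string& key,
          long recordLoc, int& pos) {
    int l = 0, h = (int)e.size() - 1;
    while (l <= h) {
        int m = (l + h) / 2;
        int x = key.compare(e[m].key);
        if (x == 0)
            x = recordLoc < e[m].recordLoc ? -1
              : recordLoc > e[m].recordLoc ? 1 : 0;
        if (x == 0) { pos = m; return true; }   // found the exact entry
        if (x < 0) h = m - 1; else l = m + 1;
    }
    pos = l;        // not found: position where it would be inserted
    return false;
}

int main() {
    std::vector<Entry> e = { {"a", 1}, {"b", 7}, {"b", 9} };
    int pos;
    bool f = find(e, "b", 8, pos);
    std::printf("found=%d pos=%d\n", (int)f, pos);  // found=0 pos=2
    return 0;
}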
- void BtreeBucket::delBucket(const DiskLoc thisLoc, const IndexDetails& id) {
+ template< class V >
+ void BtreeBucket<V>::delBucket(const DiskLoc thisLoc, const IndexDetails& id) {
ClientCursor::informAboutToDeleteBucket(thisLoc); // slow...
assert( !isHead() );
- const BtreeBucket *p = parent.btree();
+ DiskLoc ll = this->parent;
+ const BtreeBucket *p = ll.btree<V>();
int parentIdx = indexInParent( thisLoc );
p->childForPos( parentIdx ).writing().Null();
deallocBucket( thisLoc, id );
}
- void BtreeBucket::deallocBucket(const DiskLoc thisLoc, const IndexDetails &id) {
+ template< class V >
+ void BtreeBucket<V>::deallocBucket(const DiskLoc thisLoc, const IndexDetails &id) {
#if 0
// as a temporary defensive measure, we zap the whole bucket, AND don't truly delete
// it (meaning it is ineligible for reuse).
memset(this, 0, Size());
#else
// defensive:
- n = -1;
- parent.Null();
+ this->n = -1;
+ this->parent.Null();
string ns = id.indexNamespace();
theDataFileMgr._deleteRecord(nsdetails(ns.c_str()), ns.c_str(), thisLoc.rec(), thisLoc);
#endif
}
/** note: may delete the entire bucket! this invalid upon return sometimes. */
- void BtreeBucket::delKeyAtPos( const DiskLoc thisLoc, IndexDetails& id, int p, const Ordering &order) {
- assert(n>0);
- DiskLoc left = childForPos(p);
-
- if ( n == 1 ) {
- if ( left.isNull() && nextChild.isNull() ) {
- _delKeyAtPos(p);
+ template< class V >
+ void BtreeBucket<V>::delKeyAtPos( const DiskLoc thisLoc, IndexDetails& id, int p, const Ordering &order) {
+ assert(this->n>0);
+ DiskLoc left = this->childForPos(p);
+
+ if ( this->n == 1 ) {
+ if ( left.isNull() && this->nextChild.isNull() ) {
+ this->_delKeyAtPos(p);
if ( isHead() ) {
// we don't delete the top bucket ever
}
@@ -803,7 +854,7 @@ namespace mongo {
}
if ( left.isNull() ) {
- _delKeyAtPos(p);
+ this->_delKeyAtPos(p);
mayBalanceWithNeighbors( thisLoc, id, order );
}
else {
@@ -833,53 +884,71 @@ namespace mongo {
* k by k', preserving the key's unused marking. This function is only
* expected to mark a key as unused when handling a legacy btree.
*/
- void BtreeBucket::deleteInternalKey( const DiskLoc thisLoc, int keypos, IndexDetails &id, const Ordering &order ) {
- DiskLoc lchild = childForPos( keypos );
- DiskLoc rchild = childForPos( keypos + 1 );
+ template< class V >
+ void BtreeBucket<V>::deleteInternalKey( const DiskLoc thisLoc, int keypos, IndexDetails &id, const Ordering &order ) {
+ DiskLoc lchild = this->childForPos( keypos );
+ DiskLoc rchild = this->childForPos( keypos + 1 );
assert( !lchild.isNull() || !rchild.isNull() );
int advanceDirection = lchild.isNull() ? 1 : -1;
int advanceKeyOfs = keypos;
DiskLoc advanceLoc = advance( thisLoc, advanceKeyOfs, advanceDirection, __FUNCTION__ );
-
- if ( !advanceLoc.btree()->childForPos( advanceKeyOfs ).isNull() ||
- !advanceLoc.btree()->childForPos( advanceKeyOfs + 1 ).isNull() ) {
+ // advanceLoc must be a descendant of thisLoc, because thisLoc has a
+ // child in the proper direction and all descendants of thisLoc must be
+ // nonempty because they are not the root.
+
+ if ( !advanceLoc.btree<V>()->childForPos( advanceKeyOfs ).isNull() ||
+ !advanceLoc.btree<V>()->childForPos( advanceKeyOfs + 1 ).isNull() ) {
// only expected with legacy btrees, see note above
- markUnused( keypos );
+ this->markUnused( keypos );
return;
}
- KeyNode kn = advanceLoc.btree()->keyNode( advanceKeyOfs );
- setInternalKey( thisLoc, keypos, kn.recordLoc, kn.key, order, childForPos( keypos ), childForPos( keypos + 1 ), id );
- advanceLoc.btreemod()->delKeyAtPos( advanceLoc, id, advanceKeyOfs, order );
+ KeyNode kn = advanceLoc.btree<V>()->keyNode( advanceKeyOfs );
+ // Because advanceLoc is a descendant of thisLoc, updating thisLoc will
+ // not affect packing or keys of advanceLoc and kn will be stable
+ // during the following setInternalKey()
+ setInternalKey( thisLoc, keypos, kn.recordLoc, kn.key, order, this->childForPos( keypos ), this->childForPos( keypos + 1 ), id );
+ advanceLoc.btreemod<V>()->delKeyAtPos( advanceLoc, id, advanceKeyOfs, order );
}
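The replace-with-adjacent-key trick used by deleteInternalKey() is the classic internal-node deletion from binary search trees: overwrite the doomed key with its in-order successor (or predecessor), then delete that key lower in the tree where the easy cases apply. A minimal BST version of the same idea:

#include <cstdio>

struct Node {
    int key;
    Node *l, *r;
    Node(int k, Node* L = 0, Node* R = 0) : key(k), l(L), r(R) {}
};

// Delete `key` from a BST, using successor promotion for internal nodes.
Node* erase(Node* t, int key) {
    if (!t) return t;
    if (key < t->key)      t->l = erase(t->l, key);
    else if (key > t->key) t->r = erase(t->r, key);
    else if (!t->l) { Node* r = t->r; delete t; return r; }
    else if (!t->r) { Node* l = t->l; delete t; return l; }
    else {
        Node* s = t->r;              // advance to the in-order successor
        while (s->l) s = s->l;
        t->key = s->key;             // overwrite the internal key
        t->r = erase(t->r, s->key);  // then delete it lower down
    }
    return t;
}

int main() {
    Node* t = new Node(5, new Node(2), new Node(8, new Node(6)));
    t = erase(t, 5);                 // internal delete: 6 is promoted
    std::printf("new root key: %d\n", t->key);  // prints 6
    return 0;
}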
- void BtreeBucket::replaceWithNextChild( const DiskLoc thisLoc, IndexDetails &id ) {
- assert( n == 0 && !nextChild.isNull() );
- if ( parent.isNull() ) {
+//#define BTREE(loc) (static_cast<DiskLoc>(loc).btree<V>())
+#define BTREE(loc) (loc.template btree<V>())
+//#define BTREEMOD(loc) (static_cast<DiskLoc>(loc).btreemod<V>())
+#define BTREEMOD(loc) (loc.template btreemod<V>())
+
+ template< class V >
+ void BtreeBucket<V>::replaceWithNextChild( const DiskLoc thisLoc, IndexDetails &id ) {
+ assert( this->n == 0 && !this->nextChild.isNull() );
+ if ( this->parent.isNull() ) {
assert( id.head == thisLoc );
- id.head.writing() = nextChild;
+ id.head.writing() = this->nextChild;
}
else {
- parent.btree()->childForPos( indexInParent( thisLoc ) ).writing() = nextChild;
+ DiskLoc ll = this->parent;
+ ll.btree<V>()->childForPos( indexInParent( thisLoc ) ).writing() = this->nextChild;
}
- nextChild.btree()->parent.writing() = parent;
+ BTREE(this->nextChild)->parent.writing() = this->parent;
+
+ BTREE(this->nextChild)->parent.writing() = this->parent;
+ //(static_cast<DiskLoc>(this->nextChild).btree<V>())->parent.writing() = this->parent;
ClientCursor::informAboutToDeleteBucket( thisLoc );
deallocBucket( thisLoc, id );
}
- bool BtreeBucket::canMergeChildren( const DiskLoc &thisLoc, int leftIndex ) const {
- assert( leftIndex >= 0 && leftIndex < n );
- DiskLoc leftNodeLoc = childForPos( leftIndex );
- DiskLoc rightNodeLoc = childForPos( leftIndex + 1 );
+ template< class V >
+ bool BtreeBucket<V>::canMergeChildren( const DiskLoc &thisLoc, int leftIndex ) const {
+ assert( leftIndex >= 0 && leftIndex < this->n );
+ DiskLoc leftNodeLoc = this->childForPos( leftIndex );
+ DiskLoc rightNodeLoc = this->childForPos( leftIndex + 1 );
if ( leftNodeLoc.isNull() || rightNodeLoc.isNull() ) {
// TODO if this situation is possible in long term implementation, maybe we should compact somehow anyway
return false;
}
int pos = 0;
{
- const BtreeBucket *l = leftNodeLoc.btree();
- const BtreeBucket *r = rightNodeLoc.btree();
- if ( ( headerSize() + l->packedDataSize( pos ) + r->packedDataSize( pos ) + keyNode( leftIndex ).key.objsize() + sizeof(_KeyNode) > unsigned( BucketSize ) ) ) {
+ const BtreeBucket *l = leftNodeLoc.btree<V>();
+ const BtreeBucket *r = rightNodeLoc.btree<V>();
+ if ( ( this->headerSize() + l->packedDataSize( pos ) + r->packedDataSize( pos ) + keyNode( leftIndex ).key.dataSize() + sizeof(_KeyNode) > unsigned( V::BucketSize ) ) ) {
return false;
}
}
@@ -890,33 +959,34 @@ namespace mongo {
* This implementation must respect the meaning and value of lowWaterMark.
* Also see comments in splitPos().
*/
- int BtreeBucket::rebalancedSeparatorPos( const DiskLoc &thisLoc, int leftIndex ) const {
+ template< class V >
+ int BtreeBucket<V>::rebalancedSeparatorPos( const DiskLoc &thisLoc, int leftIndex ) const {
int split = -1;
int rightSize = 0;
- const BtreeBucket *l = childForPos( leftIndex ).btree();
- const BtreeBucket *r = childForPos( leftIndex + 1 ).btree();
+ const BtreeBucket *l = BTREE(this->childForPos( leftIndex ));
+ const BtreeBucket *r = BTREE(this->childForPos( leftIndex + 1 ));
int KNS = sizeof( _KeyNode );
- int rightSizeLimit = ( l->topSize + l->n * KNS + keyNode( leftIndex ).key.objsize() + KNS + r->topSize + r->n * KNS ) / 2;
+ int rightSizeLimit = ( l->topSize + l->n * KNS + keyNode( leftIndex ).key.dataSize() + KNS + r->topSize + r->n * KNS ) / 2;
// This constraint should be ensured by only calling this function
// if we go below the low water mark.
- assert( rightSizeLimit < BtreeBucket::bodySize() );
+ assert( rightSizeLimit < BtreeBucket<V>::bodySize() );
for( int i = r->n - 1; i > -1; --i ) {
- rightSize += r->keyNode( i ).key.objsize() + KNS;
+ rightSize += r->keyNode( i ).key.dataSize() + KNS;
if ( rightSize > rightSizeLimit ) {
split = l->n + 1 + i;
break;
}
}
if ( split == -1 ) {
- rightSize += keyNode( leftIndex ).key.objsize() + KNS;
+ rightSize += keyNode( leftIndex ).key.dataSize() + KNS;
if ( rightSize > rightSizeLimit ) {
split = l->n;
}
}
if ( split == -1 ) {
for( int i = l->n - 1; i > -1; --i ) {
- rightSize += l->keyNode( i ).key.objsize() + KNS;
+ rightSize += l->keyNode( i ).key.dataSize() + KNS;
if ( rightSize > rightSizeLimit ) {
split = i;
break;
@@ -934,15 +1004,18 @@ namespace mongo {
return split;
}
- void BtreeBucket::doMergeChildren( const DiskLoc thisLoc, int leftIndex, IndexDetails &id, const Ordering &order ) {
- DiskLoc leftNodeLoc = childForPos( leftIndex );
- DiskLoc rightNodeLoc = childForPos( leftIndex + 1 );
- BtreeBucket *l = leftNodeLoc.btreemod();
- BtreeBucket *r = rightNodeLoc.btreemod();
+ template< class V >
+ void BtreeBucket<V>::doMergeChildren( const DiskLoc thisLoc, int leftIndex, IndexDetails &id, const Ordering &order ) {
+ DiskLoc leftNodeLoc = this->childForPos( leftIndex );
+ DiskLoc rightNodeLoc = this->childForPos( leftIndex + 1 );
+ BtreeBucket *l = leftNodeLoc.btreemod<V>();
+ BtreeBucket *r = rightNodeLoc.btreemod<V>();
int pos = 0;
l->_packReadyForMod( order, pos );
r->_packReadyForMod( order, pos ); // pack r in case there are droppable keys
+ // We know the additional keys below will fit in l because canMergeChildren()
+ // must be true.
int oldLNum = l->n;
{
KeyNode kn = keyNode( leftIndex );
@@ -955,10 +1028,10 @@ namespace mongo {
l->nextChild = r->nextChild;
l->fixParentPtrs( leftNodeLoc, oldLNum );
r->delBucket( rightNodeLoc, id );
- childForPos( leftIndex + 1 ) = leftNodeLoc;
- childForPos( leftIndex ) = DiskLoc();
- _delKeyAtPos( leftIndex, true );
- if ( n == 0 ) {
+ this->childForPos( leftIndex + 1 ) = leftNodeLoc;
+ this->childForPos( leftIndex ) = DiskLoc();
+ this->_delKeyAtPos( leftIndex, true );
+ if ( this->n == 0 ) {
// will trash this and thisLoc
// TODO To ensure all leaves are of equal height, we should ensure
// this is only called on the root.
@@ -970,9 +1043,10 @@ namespace mongo {
}
}
- int BtreeBucket::indexInParent( const DiskLoc &thisLoc ) const {
- assert( !parent.isNull() );
- const BtreeBucket *p = parent.btree();
+ template< class V >
+ int BtreeBucket<V>::indexInParent( const DiskLoc &thisLoc ) const {
+ assert( !this->parent.isNull() );
+ const BtreeBucket *p = BTREE(this->parent);
if ( p->nextChild == thisLoc ) {
return p->n;
}
@@ -986,27 +1060,33 @@ namespace mongo {
out() << "ERROR: can't find ref to child bucket.\n";
out() << "child: " << thisLoc << "\n";
dump();
- out() << "Parent: " << parent << "\n";
+ out() << "Parent: " << this->parent << "\n";
p->dump();
assert(false);
return -1; // just to compile
}
- bool BtreeBucket::tryBalanceChildren( const DiskLoc thisLoc, int leftIndex, IndexDetails &id, const Ordering &order ) const {
+ template< class V >
+ bool BtreeBucket<V>::tryBalanceChildren( const DiskLoc thisLoc, int leftIndex, IndexDetails &id, const Ordering &order ) const {
// If we can merge, then we must merge rather than balance to preserve
// bucket utilization constraints.
if ( canMergeChildren( thisLoc, leftIndex ) ) {
return false;
}
- thisLoc.btreemod()->doBalanceChildren( thisLoc, leftIndex, id, order );
+ thisLoc.btreemod<V>()->doBalanceChildren( thisLoc, leftIndex, id, order );
return true;
}
- void BtreeBucket::doBalanceLeftToRight( const DiskLoc thisLoc, int leftIndex, int split,
+ template< class V >
+ void BtreeBucket<V>::doBalanceLeftToRight( const DiskLoc thisLoc, int leftIndex, int split,
BtreeBucket *l, const DiskLoc lchild,
BtreeBucket *r, const DiskLoc rchild,
IndexDetails &id, const Ordering &order ) {
// TODO maybe do some audits the same way pushBack() does?
+ // As a precondition, rchild + the old separator are <= half a body size,
+ // and lchild is at most completely full. Based on the value of split,
+ // rchild will get <= half of the total bytes which is at most 75%
+ // of a full body. So rchild will have room for the following keys:
int rAdd = l->n - split;
r->reserveKeysFront( rAdd );
for( int i = split + 1, j = 0; i < l->n; ++i, ++j ) {
@@ -1021,16 +1101,26 @@ namespace mongo {
{
KeyNode kn = l->keyNode( split );
l->nextChild = kn.prevChildBucket;
+ // Because lchild is a descendant of thisLoc, updating thisLoc will
+ // not affect packing or keys of lchild and kn will be stable
+ // during the following setInternalKey()
setInternalKey( thisLoc, leftIndex, kn.recordLoc, kn.key, order, lchild, rchild, id );
}
int zeropos = 0;
+ // lchild and rchild cannot be merged, so there must be >0 (actually more)
+ // keys to the left of split.
l->truncateTo( split, order, zeropos );
}
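In array terms, doBalanceLeftToRight() rotates keys through the parent separator: everything after `split` in the left child moves to the right child, the old separator joins the right child, and the key at `split` becomes the new separator. A toy version over int keys (a sketch, ignoring children and byte sizes):

#include <cstdio>
#include <vector>

void balanceLeftToRight(std::vector<int>& left, int& separator,
                        std::vector<int>& right, int split) {
    std::vector<int> newRight;
    for (size_t i = split + 1; i < left.size(); ++i)
        newRight.push_back(left[i]);
    newRight.push_back(separator);            // old separator joins the right
    for (int k : right) newRight.push_back(k);
    separator = left[split];                  // key at split is promoted
    left.resize(split);                       // cf. l->truncateTo( split, ... )
    right = newRight;
}

int main() {
    std::vector<int> l = {1, 2, 3, 4, 5, 6}, r = {9};
    int sep = 7;
    balanceLeftToRight(l, sep, r, 3);
    std::printf("separator=%d left.n=%zu right.n=%zu\n",
                sep, l.size(), r.size());     // separator=4 left.n=3 right.n=4
    return 0;
}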
- void BtreeBucket::doBalanceRightToLeft( const DiskLoc thisLoc, int leftIndex, int split,
+ template< class V >
+ void BtreeBucket<V>::doBalanceRightToLeft( const DiskLoc thisLoc, int leftIndex, int split,
BtreeBucket *l, const DiskLoc lchild,
BtreeBucket *r, const DiskLoc rchild,
IndexDetails &id, const Ordering &order ) {
+ // As a precondition, lchild + the old separator are <= half a body size,
+ // and rchild is at most completely full. Based on the value of split,
+ // lchild will get less than half of the total bytes which is at most 75%
+ // of a full body. So lchild will have room for the following keys:
int lN = l->n;
{
KeyNode kn = keyNode( leftIndex );
@@ -1043,20 +1133,27 @@ namespace mongo {
{
KeyNode kn = r->keyNode( split - lN - 1 );
l->nextChild = kn.prevChildBucket;
+ // Child lN was lchild's old nextChild, so we don't need to fix that one.
l->fixParentPtrs( lchild, lN + 1, l->n );
+ // Because rchild is a descendant of thisLoc, updating thisLoc will
+ // not affect packing or keys of rchild and kn will be stable
+ // during the following setInternalKey()
setInternalKey( thisLoc, leftIndex, kn.recordLoc, kn.key, order, lchild, rchild, id );
}
int zeropos = 0;
+ // lchild and rchild cannot be merged, so there must be >0 (actually more)
+ // keys to the right of split.
r->dropFront( split - lN, order, zeropos );
}
- void BtreeBucket::doBalanceChildren( const DiskLoc thisLoc, int leftIndex, IndexDetails &id, const Ordering &order ) {
- DiskLoc lchild = childForPos( leftIndex );
- DiskLoc rchild = childForPos( leftIndex + 1 );
+ template< class V >
+ void BtreeBucket<V>::doBalanceChildren( const DiskLoc thisLoc, int leftIndex, IndexDetails &id, const Ordering &order ) {
+ DiskLoc lchild = this->childForPos( leftIndex );
+ DiskLoc rchild = this->childForPos( leftIndex + 1 );
int zeropos = 0;
- BtreeBucket *l = lchild.btreemod();
+ BtreeBucket *l = lchild.btreemod<V>();
l->_packReadyForMod( order, zeropos );
- BtreeBucket *r = rchild.btreemod();
+ BtreeBucket *r = rchild.btreemod<V>();
r->_packReadyForMod( order, zeropos );
int split = rebalancedSeparatorPos( thisLoc, leftIndex );
@@ -1071,16 +1168,17 @@ namespace mongo {
}
}
- bool BtreeBucket::mayBalanceWithNeighbors( const DiskLoc thisLoc, IndexDetails &id, const Ordering &order ) const {
- if ( parent.isNull() ) { // we are root, there are no neighbors
+ template< class V >
+ bool BtreeBucket<V>::mayBalanceWithNeighbors( const DiskLoc thisLoc, IndexDetails &id, const Ordering &order ) const {
+ if ( this->parent.isNull() ) { // we are root, there are no neighbors
return false;
}
- if ( packedDataSize( 0 ) >= lowWaterMark ) {
+ if ( this->packedDataSize( 0 ) >= this->lowWaterMark() ) {
return false;
}
- const BtreeBucket *p = parent.btree();
+ const BtreeBucket *p = BTREE(this->parent);
int parentIdx = indexInParent( thisLoc );
// TODO will missing neighbor case be possible long term? Should we try to merge/balance somehow in that case if so?
@@ -1091,21 +1189,21 @@ namespace mongo {
// to preserve btree bucket utilization constraints since that's a more
// heavy duty operation (especially if we must re-split later).
if ( mayBalanceRight &&
- p->tryBalanceChildren( parent, parentIdx, id, order ) ) {
+ p->tryBalanceChildren( this->parent, parentIdx, id, order ) ) {
return true;
}
if ( mayBalanceLeft &&
- p->tryBalanceChildren( parent, parentIdx - 1, id, order ) ) {
+ p->tryBalanceChildren( this->parent, parentIdx - 1, id, order ) ) {
return true;
}
- BtreeBucket *pm = parent.btreemod();
+ BtreeBucket *pm = BTREEMOD(this->parent);
if ( mayBalanceRight ) {
- pm->doMergeChildren( parent, parentIdx, id, order );
+ pm->doMergeChildren( this->parent, parentIdx, id, order );
return true;
}
else if ( mayBalanceLeft ) {
- pm->doMergeChildren( parent, parentIdx - 1, id, order );
+ pm->doMergeChildren( this->parent, parentIdx - 1, id, order );
return true;
}
@@ -1113,64 +1211,70 @@ namespace mongo {
}
/** remove a key from the index */
- bool BtreeBucket::unindex(const DiskLoc thisLoc, IndexDetails& id, const BSONObj& key, const DiskLoc recordLoc ) const {
+ template< class V >
+ bool BtreeBucket<V>::unindex(const DiskLoc thisLoc, IndexDetails& id, const BSONObj& key, const DiskLoc recordLoc ) const {
int pos;
bool found;
- DiskLoc loc = locate(id, thisLoc, key, Ordering::make(id.keyPattern()), pos, found, recordLoc, 1);
+ const Ordering ord = Ordering::make(id.keyPattern());
+ DiskLoc loc = locate(id, thisLoc, key, ord, pos, found, recordLoc, 1);
if ( found ) {
-
- if ( key.objsize() > KeyMax ) {
+ if ( key.objsize() > this->KeyMax ) {
OCCASIONALLY problem() << "unindex: key too large to index but was found for " << id.indexNamespace() << " reIndex suggested" << endl;
- }
-
- loc.btreemod()->delKeyAtPos(loc, id, pos, Ordering::make(id.keyPattern()));
-
+ }
+ loc.btreemod<V>()->delKeyAtPos(loc, id, pos, ord);
return true;
}
return false;
}
- BtreeBucket* BtreeBucket::allocTemp() {
- BtreeBucket *b = (BtreeBucket*) malloc(BucketSize);
+ template< class V >
+ BtreeBucket<V> * BtreeBucket<V>::allocTemp() {
+ BtreeBucket *b = (BtreeBucket*) malloc(V::BucketSize);
b->init();
return b;
}
- inline void BtreeBucket::fix(const DiskLoc thisLoc, const DiskLoc child) {
+ template< class V >
+ inline void BtreeBucket<V>::fix(const DiskLoc thisLoc, const DiskLoc child) {
if ( !child.isNull() ) {
if ( insert_debug )
- out() << " " << child.toString() << ".parent=" << thisLoc.toString() << endl;
- child.btree()->parent.writing() = thisLoc;
+ out() << " fix " << child.toString() << ".parent=" << thisLoc.toString() << endl;
+ child.btree<V>()->parent.writing() = thisLoc;
}
}
- /** this sucks. maybe get rid of parent ptrs. */
- void BtreeBucket::fixParentPtrs(const DiskLoc thisLoc, int firstIndex, int lastIndex) const {
+ /**
+ * This can cause a lot of additional page writes when we assign buckets to
+ * different parents. Maybe get rid of parent ptrs?
+ */
+ template< class V >
+ void BtreeBucket<V>::fixParentPtrs(const DiskLoc thisLoc, int firstIndex, int lastIndex) const {
VERIFYTHISLOC
if ( lastIndex == -1 ) {
- lastIndex = n;
+ lastIndex = this->n;
}
for ( int i = firstIndex; i <= lastIndex; i++ ) {
- fix(thisLoc, childForPos(i));
+ fix(thisLoc, this->childForPos(i));
}
}
- void BtreeBucket::setInternalKey( const DiskLoc thisLoc, int keypos,
- const DiskLoc recordLoc, const BSONObj &key, const Ordering &order,
+ template< class V >
+ void BtreeBucket<V>::setInternalKey( const DiskLoc thisLoc, int keypos,
+ const DiskLoc recordLoc, const Key &key, const Ordering &order,
const DiskLoc lchild, const DiskLoc rchild, IndexDetails &idx ) {
- childForPos( keypos ).Null();
+ this->childForPos( keypos ).Null();
// This may leave the bucket empty (n == 0) which is ok only as a
// transient state. In the instant case, the implementation of
// insertHere behaves correctly when n == 0 and as a side effect
// increments n.
- _delKeyAtPos( keypos, true );
+ this->_delKeyAtPos( keypos, true );
// Ensure we do not orphan neighbor's old child.
- assert( childForPos( keypos ) == rchild );
+ assert( this->childForPos( keypos ) == rchild );
// Just set temporarily - required to pass validation in insertHere()
- childForPos( keypos ) = lchild;
+ this->childForPos( keypos ) = lchild;
insertHere( thisLoc, keypos, recordLoc, key, order, lchild, rchild, idx );
}
@@ -1180,127 +1284,137 @@ namespace mongo {
* @keypos - where to insert the key in range 0..n. 0=make leftmost, n=make rightmost.
* NOTE this function may free some data, and as a result the value passed for keypos may
* be invalid after calling insertHere()
+ *
+ * Some of the write intent signaling below relies on the implementation of
+ * the optimized write intent code in basicInsert().
*/
- void BtreeBucket::insertHere( const DiskLoc thisLoc, int keypos,
- const DiskLoc recordLoc, const BSONObj& key, const Ordering& order,
+ template< class V >
+ void BtreeBucket<V>::insertHere( const DiskLoc thisLoc, int keypos,
+ const DiskLoc recordLoc, const Key& key, const Ordering& order,
const DiskLoc lchild, const DiskLoc rchild, IndexDetails& idx) const {
if ( insert_debug )
out() << " " << thisLoc.toString() << ".insertHere " << key.toString() << '/' << recordLoc.toString() << ' '
<< lchild.toString() << ' ' << rchild.toString() << " keypos:" << keypos << endl;
- if ( !basicInsert(thisLoc, keypos, recordLoc, key, order) ) {
- thisLoc.btreemod()->split(thisLoc, keypos, recordLoc, key, order, lchild, rchild, idx);
+ if ( !this->basicInsert(thisLoc, keypos, recordLoc, key, order) ) {
+ // If basicInsert() fails, the bucket will be packed as required by split().
+ thisLoc.btreemod<V>()->split(thisLoc, keypos, recordLoc, key, order, lchild, rchild, idx);
return;
}
{
const _KeyNode *_kn = &k(keypos);
_KeyNode *kn = (_KeyNode *) getDur().alreadyDeclared((_KeyNode*) _kn); // already declared intent in basicInsert()
- if ( keypos+1 == n ) { // last key
- if ( nextChild != lchild ) {
+ if ( keypos+1 == this->n ) { // last key
+ if ( this->nextChild != lchild ) {
out() << "ERROR nextChild != lchild" << endl;
out() << " thisLoc: " << thisLoc.toString() << ' ' << idx.indexNamespace() << endl;
- out() << " keyPos: " << keypos << " n:" << n << endl;
- out() << " nextChild: " << nextChild.toString() << " lchild: " << lchild.toString() << endl;
+ out() << " keyPos: " << keypos << " n:" << this->n << endl;
+ out() << " nextChild: " << this->nextChild.toString() << " lchild: " << lchild.toString() << endl;
out() << " recordLoc: " << recordLoc.toString() << " rchild: " << rchild.toString() << endl;
out() << " key: " << key.toString() << endl;
dump();
assert(false);
}
- kn->prevChildBucket = nextChild;
+ kn->prevChildBucket = this->nextChild;
assert( kn->prevChildBucket == lchild );
- nextChild.writing() = rchild;
+ this->nextChild.writing() = rchild;
if ( !rchild.isNull() )
- rchild.btree()->parent.writing() = thisLoc;
+ BTREE(rchild)->parent.writing() = thisLoc;
}
else {
kn->prevChildBucket = lchild;
if ( k(keypos+1).prevChildBucket != lchild ) {
out() << "ERROR k(keypos+1).prevChildBucket != lchild" << endl;
out() << " thisLoc: " << thisLoc.toString() << ' ' << idx.indexNamespace() << endl;
- out() << " keyPos: " << keypos << " n:" << n << endl;
+ out() << " keyPos: " << keypos << " n:" << this->n << endl;
out() << " k(keypos+1).pcb: " << k(keypos+1).prevChildBucket.toString() << " lchild: " << lchild.toString() << endl;
out() << " recordLoc: " << recordLoc.toString() << " rchild: " << rchild.toString() << endl;
out() << " key: " << key.toString() << endl;
dump();
assert(false);
}
- const DiskLoc *pc = &k(keypos+1).prevChildBucket;
- *getDur().alreadyDeclared((DiskLoc*) pc) = rchild; // declared in basicInsert()
+ const Loc *pc = &k(keypos+1).prevChildBucket;
+ *getDur().alreadyDeclared( const_cast<Loc*>(pc) ) = rchild; // declared in basicInsert()
if ( !rchild.isNull() )
- rchild.btree()->parent.writing() = thisLoc;
+ rchild.btree<V>()->parent.writing() = thisLoc;
}
return;
}
}
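
The alreadyDeclared() calls above work because basicInsert() has already journaled the bytes it is about to touch. A minimal sketch of that declare-once, reuse-later write intent pattern, using toy types rather than mongo's DurableInterface:

    #include <cassert>
    #include <cstddef>
    #include <set>
    #include <vector>

    struct ToyDur {
        std::set<void*> declared;                 // regions with write intent
        std::vector<std::vector<char> > journal;  // journaled pre-images
        void* writing(void* p, size_t len) {      // declare intent, journal bytes
            journal.push_back(std::vector<char>((char*)p, (char*)p + len));
            declared.insert(p);
            return p;
        }
        void* alreadyDeclared(void* p) {          // reuse an earlier declaration
            assert(declared.count(p));            // must have been declared first
            return p;
        }
    };

    int main() {
        ToyDur dur;
        int bucket[4] = {1, 2, 3, 4};
        int* w = (int*)dur.writing(bucket, sizeof(bucket)); // declare once
        w[1] = 20;                                          // first write
        int* again = (int*)dur.alreadyDeclared(bucket);     // no re-journaling
        again[2] = 30;                                      // later write, same intent
        assert(bucket[1] == 20 && bucket[2] == 30);
        assert(dur.journal.size() == 1);                    // journaled exactly once
    }
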
- void BtreeBucket::split(const DiskLoc thisLoc, int keypos, const DiskLoc recordLoc, const BSONObj& key, const Ordering& order, const DiskLoc lchild, const DiskLoc rchild, IndexDetails& idx) {
- assertWritable();
+ template< class V >
+ void BtreeBucket<V>::split(const DiskLoc thisLoc, int keypos, const DiskLoc recordLoc, const Key& key, const Ordering& order, const DiskLoc lchild, const DiskLoc rchild, IndexDetails& idx) {
+ this->assertWritable();
if ( split_debug )
out() << " " << thisLoc.toString() << ".split" << endl;
- int split = splitPos( keypos );
+ int split = this->splitPos( keypos );
DiskLoc rLoc = addBucket(idx);
- BtreeBucket *r = rLoc.btreemod();
+ BtreeBucket *r = rLoc.btreemod<V>();
if ( split_debug )
- out() << " split:" << split << ' ' << keyNode(split).key.toString() << " n:" << n << endl;
- for ( int i = split+1; i < n; i++ ) {
+ out() << " split:" << split << ' ' << keyNode(split).key.toString() << " this->n:" << this->n << endl;
+ for ( int i = split+1; i < this->n; i++ ) {
KeyNode kn = keyNode(i);
r->pushBack(kn.recordLoc, kn.key, order, kn.prevChildBucket);
}
- r->nextChild = nextChild;
+ r->nextChild = this->nextChild;
r->assertValid( order );
if ( split_debug )
- out() << " new rLoc:" << rLoc.toString() << endl;
+ out() << " this->new rLoc:" << rLoc.toString() << endl;
r = 0;
- rLoc.btree()->fixParentPtrs(rLoc);
+ rLoc.btree<V>()->fixParentPtrs(rLoc);
{
KeyNode splitkey = keyNode(split);
- nextChild = splitkey.prevChildBucket; // splitkey key gets promoted, its children will be thisLoc (l) and rLoc (r)
+ this->nextChild = splitkey.prevChildBucket; // splitkey key gets promoted, its children will be thisLoc (l) and rLoc (r)
if ( split_debug ) {
out() << " splitkey key:" << splitkey.key.toString() << endl;
}
- // promote splitkey to a parent node
- if ( parent.isNull() ) {
- // make a new parent if we were the root
+ // Because thisLoc is a descendant of parent, updating parent will
+ // not affect packing or keys of thisLoc, and splitkey will be stable
+ // during the following:
+
+ // promote splitkey to a parent node
+ if ( this->parent.isNull() ) {
+ // make a new parent if we were the root
DiskLoc L = addBucket(idx);
- BtreeBucket *p = L.btreemod();
+ BtreeBucket *p = L.btreemod<V>();
p->pushBack(splitkey.recordLoc, splitkey.key, order, thisLoc);
p->nextChild = rLoc;
p->assertValid( order );
- parent = idx.head.writing() = L;
+ this->parent = idx.head.writing() = L;
if ( split_debug )
- out() << " we were root, making new root:" << hex << parent.getOfs() << dec << endl;
- rLoc.btree()->parent.writing() = parent;
+ out() << " we were root, making this->new root:" << hex << this->parent.getOfs() << dec << endl;
+ rLoc.btree<V>()->parent.writing() = this->parent;
}
else {
// set this before calling _insert - if it splits it will do fixParent() logic and change the value.
- rLoc.btree()->parent.writing() = parent;
+ rLoc.btree<V>()->parent.writing() = this->parent;
if ( split_debug )
out() << " promoting splitkey key " << splitkey.key.toString() << endl;
- parent.btree()->_insert(parent, splitkey.recordLoc, splitkey.key, order, /*dupsallowed*/true, thisLoc, rLoc, idx);
+ BTREE(this->parent)->_insert(this->parent, splitkey.recordLoc, splitkey.key, order, /*dupsallowed*/true, thisLoc, rLoc, idx);
}
}
int newpos = keypos;
// note this may trash splitkey.key. thus we had to promote it before finishing up here.
- truncateTo(split, order, newpos); // note this may trash splitkey.key. thus we had to promote it before finishing up here.
+ this->truncateTo(split, order, newpos);
- // add our new key, there is room now
+ // add our new key, there is room now
{
if ( keypos <= split ) {
if ( split_debug )
- out() << " keypos<split, insertHere() the new key" << endl;
+ out() << " keypos<split, insertHere() the this->new key" << endl;
insertHere(thisLoc, newpos, recordLoc, key, order, lchild, rchild, idx);
}
else {
int kp = keypos-split-1;
assert(kp>=0);
- rLoc.btree()->insertHere(rLoc, kp, recordLoc, key, order, lchild, rchild, idx);
+ BTREE(rLoc)->insertHere(rLoc, kp, recordLoc, key, order, lchild, rchild, idx);
}
}
@@ -1308,41 +1422,44 @@ namespace mongo {
out() << " split end " << hex << thisLoc.getOfs() << dec << endl;
}
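
To make the split arithmetic concrete, in particular kp = keypos - split - 1 for a key that lands in the new right bucket, here is a self-contained toy version over plain vectors (not the real bucket classes):

    #include <cassert>
    #include <vector>

    int main() {
        std::vector<int> left;
        for (int i = 0; i < 7; ++i) left.push_back(i * 10); // keys 0..60, n == 7
        int keypos = 6;                 // where the new key 55 would be inserted
        int split = left.size() / 2;    // splitPos() is roughly the median
        int promoted = left[split];     // this key is promoted to the parent

        std::vector<int> right(left.begin() + split + 1, left.end());
        left.resize(split);             // truncateTo(split, ...)

        int key = 55;
        if (keypos <= split) {
            left.insert(left.begin() + keypos, key);   // stays in left bucket
        } else {
            int kp = keypos - split - 1;               // position in right bucket
            assert(kp >= 0);
            right.insert(right.begin() + kp, key);
        }
        assert(promoted == 30 && right.size() == 4 && right[2] == 55);
    }
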
- /** start a new index off, empty */
- DiskLoc BtreeBucket::addBucket(const IndexDetails& id) {
+ /** start a new index off, empty */
+ template< class V >
+ DiskLoc BtreeBucket<V>::addBucket(const IndexDetails& id) {
string ns = id.indexNamespace();
- DiskLoc loc = theDataFileMgr.insert(ns.c_str(), 0, BucketSize, true);
- BtreeBucket *b = loc.btreemod();
+ DiskLoc loc = theDataFileMgr.insert(ns.c_str(), 0, V::BucketSize, true);
+ BtreeBucket *b = BTREEMOD(loc);
b->init();
return loc;
}
- void BtreeBucket::renameIndexNamespace(const char *oldNs, const char *newNs) {
+ void renameIndexNamespace(const char *oldNs, const char *newNs) {
renameNamespace( oldNs, newNs );
}
- const DiskLoc BtreeBucket::getHead(const DiskLoc& thisLoc) const {
+ template< class V >
+ const DiskLoc BtreeBucket<V>::getHead(const DiskLoc& thisLoc) const {
DiskLoc p = thisLoc;
- while ( !p.btree()->isHead() )
- p = p.btree()->parent;
+ while ( !BTREE(p)->isHead() )
+ p = BTREE(p)->parent;
return p;
}
- DiskLoc BtreeBucket::advance(const DiskLoc& thisLoc, int& keyOfs, int direction, const char *caller) const {
- if ( keyOfs < 0 || keyOfs >= n ) {
- out() << "ASSERT failure BtreeBucket::advance, caller: " << caller << endl;
+ template< class V >
+ DiskLoc BtreeBucket<V>::advance(const DiskLoc& thisLoc, int& keyOfs, int direction, const char *caller) const {
+ if ( keyOfs < 0 || keyOfs >= this->n ) {
+ out() << "ASSERT failure BtreeBucket<V>::advance, caller: " << caller << endl;
out() << " thisLoc: " << thisLoc.toString() << endl;
- out() << " keyOfs: " << keyOfs << " n:" << n << " direction: " << direction << endl;
+ out() << " keyOfs: " << keyOfs << " this->n:" << this->n << " direction: " << direction << endl;
out() << bucketSummary() << endl;
assert(false);
}
int adj = direction < 0 ? 1 : 0;
int ko = keyOfs + direction;
- DiskLoc nextDown = childForPos(ko+adj);
+ DiskLoc nextDown = this->childForPos(ko+adj);
if ( !nextDown.isNull() ) {
while ( 1 ) {
- keyOfs = direction>0 ? 0 : nextDown.btree()->n - 1;
- DiskLoc loc = nextDown.btree()->childForPos(keyOfs + adj);
+ keyOfs = direction>0 ? 0 : BTREE(nextDown)->n - 1;
+ DiskLoc loc = BTREE(nextDown)->childForPos(keyOfs + adj);
if ( loc.isNull() )
break;
nextDown = loc;
@@ -1350,18 +1467,18 @@ namespace mongo {
return nextDown;
}
- if ( ko < n && ko >= 0 ) {
+ if ( ko < this->n && ko >= 0 ) {
keyOfs = ko;
return thisLoc;
}
// end of bucket. traverse back up.
DiskLoc childLoc = thisLoc;
- DiskLoc ancestor = parent;
+ DiskLoc ancestor = this->parent;
while ( 1 ) {
if ( ancestor.isNull() )
break;
- const BtreeBucket *an = ancestor.btree();
+ const BtreeBucket *an = BTREE(ancestor);
for ( int i = 0; i < an->n; i++ ) {
if ( an->childForPos(i+adj) == childLoc ) {
keyOfs = i;
@@ -1369,7 +1486,7 @@ namespace mongo {
}
}
assert( direction<0 || an->nextChild == childLoc );
- // parent exhausted also, keep going up
+ // parent exhausted also, keep going up
childLoc = ancestor;
ancestor = an->parent;
}
@@ -1377,7 +1494,14 @@ namespace mongo {
return DiskLoc();
}
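
A toy model of the traversal above for the forward direction, using an ordinary pointer-based multiway tree instead of DiskLoc buckets; the shape of the logic (descend to the leftmost key of the right subtree, else step within the node, else climb while we were a rightmost child) is the same:

    #include <cassert>
    #include <vector>

    struct Node {
        std::vector<int> keys;
        std::vector<Node*> children; // empty for a leaf, else keys.size() + 1
        Node* parent;
        Node(Node* p = 0) : parent(p) {}
    };

    // Returns the node holding the next key and sets keyOfs, or null at the end.
    Node* advance(Node* loc, int& keyOfs) {
        if (!loc->children.empty()) {             // descend: leftmost of right subtree
            Node* down = loc->children[keyOfs + 1];
            while (!down->children.empty()) down = down->children[0];
            keyOfs = 0;
            return down;
        }
        if (keyOfs + 1 < (int)loc->keys.size()) { // next key in this bucket
            ++keyOfs;
            return loc;
        }
        Node* child = loc;                        // climb until we were a left child
        for (Node* anc = loc->parent; anc; child = anc, anc = anc->parent)
            for (int i = 0; i < (int)anc->keys.size(); ++i)
                if (anc->children[i] == child) { keyOfs = i; return anc; }
        return 0;                                 // traversed past the last key
    }

    int main() {
        Node root, l(&root), r(&root);
        l.keys.push_back(1); l.keys.push_back(2);
        root.keys.push_back(3);
        r.keys.push_back(4);
        root.children.push_back(&l); root.children.push_back(&r);
        int ofs = 1;                       // at key 2 in the left leaf
        Node* n = advance(&l, ofs);        // climbs: next key is 3 in the root
        assert(n == &root && n->keys[ofs] == 3);
        n = advance(n, ofs);               // descends: next key is 4 in the right leaf
        assert(n == &r && n->keys[ofs] == 4);
    }
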
- DiskLoc BtreeBucket::locate(const IndexDetails& idx, const DiskLoc& thisLoc, const BSONObj& key, const Ordering &order, int& pos, bool& found, const DiskLoc &recordLoc, int direction) const {
+ template< class V >
+ DiskLoc BtreeBucket<V>::locate(const IndexDetails& idx, const DiskLoc& thisLoc, const BSONObj& key, const Ordering &order, int& pos, bool& found, const DiskLoc &recordLoc, int direction) const {
+ KeyOwned k(key);
+ return locate(idx, thisLoc, k, order, pos, found, recordLoc, direction);
+ }
+
+ template< class V >
+ DiskLoc BtreeBucket<V>::locate(const IndexDetails& idx, const DiskLoc& thisLoc, const Key& key, const Ordering &order, int& pos, bool& found, const DiskLoc &recordLoc, int direction) const {
int p;
found = find(idx, key, recordLoc, order, p, /*assertIfDup*/ false);
if ( found ) {
@@ -1385,10 +1509,10 @@ namespace mongo {
return thisLoc;
}
- DiskLoc child = childForPos(p);
+ DiskLoc child = this->childForPos(p);
if ( !child.isNull() ) {
- DiskLoc l = child.btree()->locate(idx, child, key, order, pos, found, recordLoc, direction);
+ DiskLoc l = BTREE(child)->locate(idx, child, key, order, pos, found, recordLoc, direction);
if ( !l.isNull() )
return l;
}
@@ -1397,14 +1521,15 @@ namespace mongo {
if ( direction < 0 )
return --pos == -1 ? DiskLoc() /*theend*/ : thisLoc;
else
- return pos == n ? DiskLoc() /*theend*/ : thisLoc;
+ return pos == this->n ? DiskLoc() /*theend*/ : thisLoc;
}
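
The descent above can be sketched with a plain in-memory multiway tree: binary-search the bucket, recurse into the covering child, and fall back to this bucket's position when the subtree is exhausted (pos == n signals "past this bucket"). A minimal stand-in:

    #include <algorithm>
    #include <cassert>
    #include <vector>

    struct Node {
        std::vector<int> keys;
        std::vector<Node*> children; // empty for a leaf, else keys.size() + 1
    };

    // Find the first key >= target; returns its node or null, and sets pos.
    Node* locate(Node* loc, int target, int& pos) {
        int p = std::lower_bound(loc->keys.begin(), loc->keys.end(), target)
                - loc->keys.begin();
        if (p < (int)loc->keys.size() && loc->keys[p] == target) { pos = p; return loc; }
        if (!loc->children.empty()) {              // descend into the covering child
            Node* l = locate(loc->children[p], target, pos);
            if (l) return l;
        }
        if (p == (int)loc->keys.size()) return 0;  // past the end of this bucket
        pos = p;                                   // next larger key is right here
        return loc;
    }

    int main() {
        Node root, l, r;
        l.keys.push_back(1); l.keys.push_back(2);
        root.keys.push_back(5);
        r.keys.push_back(7);
        root.children.push_back(&l); root.children.push_back(&r);
        int pos = -1;
        Node* n = locate(&root, 3, pos);          // no 3: first key >= 3 is 5
        assert(n == &root && n->keys[pos] == 5);
        n = locate(&root, 7, pos);                // exact match in the right leaf
        assert(n == &r && pos == 0);
    }
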
- bool BtreeBucket::customFind( int l, int h, const BSONObj &keyBegin, int keyBeginLen, bool afterKey, const vector< const BSONElement * > &keyEnd, const vector< bool > &keyEndInclusive, const Ordering &order, int direction, DiskLoc &thisLoc, int &keyOfs, pair< DiskLoc, int > &bestParent ) const {
+ template< class V >
+ bool BtreeBucket<V>::customFind( int l, int h, const BSONObj &keyBegin, int keyBeginLen, bool afterKey, const vector< const BSONElement * > &keyEnd, const vector< bool > &keyEndInclusive, const Ordering &order, int direction, DiskLoc &thisLoc, int &keyOfs, pair< DiskLoc, int > &bestParent ) const {
while( 1 ) {
if ( l + 1 == h ) {
keyOfs = ( direction > 0 ) ? h : l;
- DiskLoc next = thisLoc.btree()->k( h ).prevChildBucket;
+ DiskLoc next = BTREE(thisLoc)->k( h ).prevChildBucket;
if ( !next.isNull() ) {
bestParent = make_pair( thisLoc, keyOfs );
thisLoc = next;
@@ -1415,7 +1540,7 @@ namespace mongo {
}
}
int m = l + ( h - l ) / 2;
- int cmp = customBSONCmp( thisLoc.btree()->keyNode( m ).key, keyBegin, keyBeginLen, afterKey, keyEnd, keyEndInclusive, order, direction );
+ int cmp = customBSONCmp( BTREE(thisLoc)->keyNode( m ).key.toBson(), keyBegin, keyBeginLen, afterKey, keyEnd, keyEndInclusive, order, direction );
if ( cmp < 0 ) {
l = m;
}
@@ -1438,18 +1563,19 @@ namespace mongo {
* starting thisLoc + keyOfs will be strictly less than/strictly greater than keyBegin/keyBeginLen/keyEnd
* All the direction checks below allowed me to refactor the code, but possibly separate forward and reverse implementations would be more efficient
*/
- void BtreeBucket::advanceTo(DiskLoc &thisLoc, int &keyOfs, const BSONObj &keyBegin, int keyBeginLen, bool afterKey, const vector< const BSONElement * > &keyEnd, const vector< bool > &keyEndInclusive, const Ordering &order, int direction ) const {
+ template< class V >
+ void BtreeBucket<V>::advanceTo(DiskLoc &thisLoc, int &keyOfs, const BSONObj &keyBegin, int keyBeginLen, bool afterKey, const vector< const BSONElement * > &keyEnd, const vector< bool > &keyEndInclusive, const Ordering &order, int direction ) const {
int l,h;
bool dontGoUp;
if ( direction > 0 ) {
l = keyOfs;
- h = n - 1;
- dontGoUp = ( customBSONCmp( keyNode( h ).key, keyBegin, keyBeginLen, afterKey, keyEnd, keyEndInclusive, order, direction ) >= 0 );
+ h = this->n - 1;
+ dontGoUp = ( customBSONCmp( keyNode( h ).key.toBson(), keyBegin, keyBeginLen, afterKey, keyEnd, keyEndInclusive, order, direction ) >= 0 );
}
else {
l = 0;
h = keyOfs;
- dontGoUp = ( customBSONCmp( keyNode( l ).key, keyBegin, keyBeginLen, afterKey, keyEnd, keyEndInclusive, order, direction ) <= 0 );
+ dontGoUp = ( customBSONCmp( keyNode( l ).key.toBson(), keyBegin, keyBeginLen, afterKey, keyEnd, keyEndInclusive, order, direction ) <= 0 );
}
pair< DiskLoc, int > bestParent;
if ( dontGoUp ) {
@@ -1459,16 +1585,16 @@ namespace mongo {
}
}
else {
- // go up parents until rightmost/leftmost node is >=/<= target or at top
- while( !thisLoc.btree()->parent.isNull() ) {
- thisLoc = thisLoc.btree()->parent;
+ // go up parents until rightmost/leftmost node is >=/<= target or at top
+ while( !BTREE(thisLoc)->parent.isNull() ) {
+ thisLoc = BTREE(thisLoc)->parent;
if ( direction > 0 ) {
- if ( customBSONCmp( thisLoc.btree()->keyNode( thisLoc.btree()->n - 1 ).key, keyBegin, keyBeginLen, afterKey, keyEnd, keyEndInclusive, order, direction ) >= 0 ) {
+ if ( customBSONCmp( BTREE(thisLoc)->keyNode( BTREE(thisLoc)->n - 1 ).key.toBson(), keyBegin, keyBeginLen, afterKey, keyEnd, keyEndInclusive, order, direction ) >= 0 ) {
break;
}
}
else {
- if ( customBSONCmp( thisLoc.btree()->keyNode( 0 ).key, keyBegin, keyBeginLen, afterKey, keyEnd, keyEndInclusive, order, direction ) <= 0 ) {
+ if ( customBSONCmp( BTREE(thisLoc)->keyNode( 0 ).key.toBson(), keyBegin, keyBeginLen, afterKey, keyEnd, keyEndInclusive, order, direction ) <= 0 ) {
break;
}
}
@@ -1477,31 +1603,32 @@ namespace mongo {
customLocate( thisLoc, keyOfs, keyBegin, keyBeginLen, afterKey, keyEnd, keyEndInclusive, order, direction, bestParent );
}
- void BtreeBucket::customLocate(DiskLoc &thisLoc, int &keyOfs, const BSONObj &keyBegin, int keyBeginLen, bool afterKey, const vector< const BSONElement * > &keyEnd, const vector< bool > &keyEndInclusive, const Ordering &order, int direction, pair< DiskLoc, int > &bestParent ) const {
- if ( thisLoc.btree()->n == 0 ) {
+ template< class V >
+ void BtreeBucket<V>::customLocate(DiskLoc &thisLoc, int &keyOfs, const BSONObj &keyBegin, int keyBeginLen, bool afterKey, const vector< const BSONElement * > &keyEnd, const vector< bool > &keyEndInclusive, const Ordering &order, int direction, pair< DiskLoc, int > &bestParent ) const {
+ if ( BTREE(thisLoc)->n == 0 ) {
thisLoc = DiskLoc();
return;
}
// go down until find smallest/biggest >=/<= target
while( 1 ) {
int l = 0;
- int h = thisLoc.btree()->n - 1;
+ int h = BTREE(thisLoc)->n - 1;
// leftmost/rightmost key may possibly be >=/<= search key
bool firstCheck;
if ( direction > 0 ) {
- firstCheck = ( customBSONCmp( thisLoc.btree()->keyNode( 0 ).key, keyBegin, keyBeginLen, afterKey, keyEnd, keyEndInclusive, order, direction ) >= 0 );
+ firstCheck = ( customBSONCmp( BTREE(thisLoc)->keyNode( 0 ).key.toBson(), keyBegin, keyBeginLen, afterKey, keyEnd, keyEndInclusive, order, direction ) >= 0 );
}
else {
- firstCheck = ( customBSONCmp( thisLoc.btree()->keyNode( h ).key, keyBegin, keyBeginLen, afterKey, keyEnd, keyEndInclusive, order, direction ) <= 0 );
+ firstCheck = ( customBSONCmp( BTREE(thisLoc)->keyNode( h ).key.toBson(), keyBegin, keyBeginLen, afterKey, keyEnd, keyEndInclusive, order, direction ) <= 0 );
}
if ( firstCheck ) {
DiskLoc next;
if ( direction > 0 ) {
- next = thisLoc.btree()->k( 0 ).prevChildBucket;
+ next = BTREE(thisLoc)->k( 0 ).prevChildBucket;
keyOfs = 0;
}
else {
- next = thisLoc.btree()->nextChild;
+ next = BTREE(thisLoc)->nextChild;
keyOfs = h;
}
if ( !next.isNull() ) {
@@ -1515,21 +1642,21 @@ namespace mongo {
}
bool secondCheck;
if ( direction > 0 ) {
- secondCheck = ( customBSONCmp( thisLoc.btree()->keyNode( h ).key, keyBegin, keyBeginLen, afterKey, keyEnd, keyEndInclusive, order, direction ) < 0 );
+ secondCheck = ( customBSONCmp( BTREE(thisLoc)->keyNode( h ).key.toBson(), keyBegin, keyBeginLen, afterKey, keyEnd, keyEndInclusive, order, direction ) < 0 );
}
else {
- secondCheck = ( customBSONCmp( thisLoc.btree()->keyNode( 0 ).key, keyBegin, keyBeginLen, afterKey, keyEnd, keyEndInclusive, order, direction ) > 0 );
+ secondCheck = ( customBSONCmp( BTREE(thisLoc)->keyNode( 0 ).key.toBson(), keyBegin, keyBeginLen, afterKey, keyEnd, keyEndInclusive, order, direction ) > 0 );
}
if ( secondCheck ) {
DiskLoc next;
if ( direction > 0 ) {
- next = thisLoc.btree()->nextChild;
+ next = BTREE(thisLoc)->nextChild;
}
else {
- next = thisLoc.btree()->k( 0 ).prevChildBucket;
+ next = BTREE(thisLoc)->k( 0 ).prevChildBucket;
}
if ( next.isNull() ) {
- // if bestParent is null, we've hit the end and thisLoc gets set to DiskLoc()
+ // if bestParent is null, we've hit the end and thisLoc gets set to DiskLoc()
thisLoc = bestParent.first;
keyOfs = bestParent.second;
return;
@@ -1547,14 +1674,15 @@ namespace mongo {
/** @thisLoc disk location of *this */
- int BtreeBucket::_insert(const DiskLoc thisLoc, const DiskLoc recordLoc,
- const BSONObj& key, const Ordering &order, bool dupsAllowed,
+ template< class V >
+ int BtreeBucket<V>::_insert(const DiskLoc thisLoc, const DiskLoc recordLoc,
+ const Key& key, const Ordering &order, bool dupsAllowed,
const DiskLoc lChild, const DiskLoc rChild, IndexDetails& idx) const {
- if ( key.objsize() > KeyMax ) {
- problem() << "ERROR: key too large len:" << key.objsize() << " max:" << KeyMax << ' ' << key.objsize() << ' ' << idx.indexNamespace() << endl;
+ if ( key.dataSize() > this->KeyMax ) {
+ problem() << "ERROR: key too large len:" << key.dataSize() << " max:" << this->KeyMax << ' ' << key.dataSize() << ' ' << idx.indexNamespace() << endl;
return 2;
}
- assert( key.objsize() > 0 );
+ assert( key.dataSize() > 0 );
int pos;
bool found = find(idx, key, recordLoc, order, pos, !dupsAllowed);
@@ -1562,15 +1690,15 @@ namespace mongo {
out() << " " << thisLoc.toString() << '.' << "_insert " <<
key.toString() << '/' << recordLoc.toString() <<
" l:" << lChild.toString() << " r:" << rChild.toString() << endl;
- out() << " found:" << found << " pos:" << pos << " n:" << n << endl;
+ out() << " found:" << found << " pos:" << pos << " this->n:" << this->n << endl;
}
if ( found ) {
const _KeyNode& kn = k(pos);
if ( kn.isUnused() ) {
log(4) << "btree _insert: reusing unused key" << endl;
- massert( 10285 , "_insert: reuse key but lchild is not null", lChild.isNull());
- massert( 10286 , "_insert: reuse key but rchild is not null", rChild.isNull());
+ massert( 10285 , "_insert: reuse key but lchild is not null", lChild.isNull());
+ massert( 10286 , "_insert: reuse key but rchild is not null", rChild.isNull());
kn.writing().setUsed();
return 0;
}
@@ -1580,78 +1708,89 @@ namespace mongo {
log() << " " << idx.indexNamespace() << " thisLoc:" << thisLoc.toString() << '\n';
log() << " " << key.toString() << '\n';
log() << " " << "recordLoc:" << recordLoc.toString() << " pos:" << pos << endl;
- log() << " old l r: " << childForPos(pos).toString() << ' ' << childForPos(pos+1).toString() << endl;
- log() << " new l r: " << lChild.toString() << ' ' << rChild.toString() << endl;
+ log() << " old l r: " << this->childForPos(pos).toString() << ' ' << this->childForPos(pos+1).toString() << endl;
+ log() << " this->new l r: " << lChild.toString() << ' ' << rChild.toString() << endl;
}
alreadyInIndex();
}
DEBUGGING out() << "TEMP: key: " << key.toString() << endl;
- DiskLoc child = childForPos(pos);
+ Loc ch = this->childForPos(pos);
+ DiskLoc child = ch;
if ( insert_debug )
out() << " getChild(" << pos << "): " << child.toString() << endl;
- if ( child.isNull() || !rChild.isNull() /* means an 'internal' insert */ ) {
+ // In current usage, rChild.isNull() is true for a new key and false when we are
+ // promoting a split key. These are the only two cases where _insert()
+ // is called currently.
+ if ( child.isNull() || !rChild.isNull() ) {
+ // A new key will be inserted at the same tree height as an adjacent existing key.
insertHere(thisLoc, pos, recordLoc, key, order, lChild, rChild, idx);
return 0;
}
- return child.btree()->bt_insert(child, recordLoc, key, order, dupsAllowed, idx, /*toplevel*/false);
+ return child.btree<V>()->_insert(child, recordLoc, key, order, dupsAllowed, /*lchild*/DiskLoc(), /*rchild*/DiskLoc(), idx);
}
- void BtreeBucket::dump() const {
- out() << "DUMP btreebucket n:" << n;
- out() << " parent:" << hex << parent.getOfs() << dec;
- for ( int i = 0; i < n; i++ ) {
- out() << '\n';
+ template< class V >
+ void BtreeBucket<V>::dump(unsigned depth) const {
+ string indent = string(depth, ' ');
+ _log() << "BUCKET n:" << this->n;
+ _log() << " parent:" << hex << this->parent.getOfs() << dec;
+ for ( int i = 0; i < this->n; i++ ) {
+ _log() << '\n' << indent;
KeyNode k = keyNode(i);
- out() << '\t' << i << '\t' << k.key.toString() << "\tleft:" << hex <<
- k.prevChildBucket.getOfs() << "\tRecLoc:" << k.recordLoc.toString() << dec;
+ string ks = k.key.toString();
+ _log() << " " << hex << k.prevChildBucket.getOfs() << '\n';
+ _log() << indent << " " << i << ' ' << ks.substr(0, 30) << " Loc:" << k.recordLoc.toString() << dec;
if ( this->k(i).isUnused() )
- out() << " UNUSED";
+ _log() << " UNUSED";
}
- out() << " right:" << hex << nextChild.getOfs() << dec << endl;
+ _log() << "\n" << indent << " " << hex << this->nextChild.getOfs() << dec << endl;
}
/** todo: meaning of return code unclear clean up */
- int BtreeBucket::bt_insert(const DiskLoc thisLoc, const DiskLoc recordLoc,
- const BSONObj& key, const Ordering &order, bool dupsAllowed,
- IndexDetails& idx, bool toplevel) const {
+ template< class V >
+ int BtreeBucket<V>::bt_insert(const DiskLoc thisLoc, const DiskLoc recordLoc,
+ const BSONObj& _key, const Ordering &order, bool dupsAllowed,
+ IndexDetails& idx, bool toplevel) const
+ {
+ KeyOwned key(_key);
+
if ( toplevel ) {
- if ( key.objsize() > KeyMax ) {
- problem() << "Btree::insert: key too large to index, skipping " << idx.indexNamespace() << ' ' << key.objsize() << ' ' << key.toString() << endl;
+ if ( key.dataSize() > this->KeyMax ) {
+ problem() << "Btree::insert: key too large to index, skipping " << idx.indexNamespace() << ' ' << key.dataSize() << ' ' << key.toString() << endl;
return 3;
}
}
int x = _insert(thisLoc, recordLoc, key, order, dupsAllowed, DiskLoc(), DiskLoc(), idx);
- assertValid( order );
+ this->assertValid( order );
return x;
}
- void BtreeBucket::shape(stringstream& ss) const {
- _shape(0, ss);
+ template< class V >
+ void BtreeBucket<V>::shape(stringstream& ss) const {
+ this->_shape(0, ss);
}
- int BtreeBucket::getLowWaterMark() {
- return lowWaterMark;
+ template< class V >
+ int BtreeBucket<V>::getKeyMax() {
+ return V::KeyMax;
}
- int BtreeBucket::getKeyMax() {
- return KeyMax;
- }
-
- DiskLoc BtreeBucket::findSingle( const IndexDetails& indexdetails , const DiskLoc& thisLoc, const BSONObj& key ) const {
- indexdetails.checkVersion();
+ template< class V >
+ DiskLoc BtreeBucket<V>::findSingle( const IndexDetails& indexdetails , const DiskLoc& thisLoc, const BSONObj& key ) const {
int pos;
bool found;
- // TODO: is it really ok here that the order is a default?
+ // TODO: is it really ok here that the order is a default?
+ // for findById() use, yes. for checkNoIndexConflicts, no?
Ordering o = Ordering::make(BSONObj());
DiskLoc bucket = locate( indexdetails , indexdetails.head , key , o , pos , found , minDiskLoc );
if ( bucket.isNull() )
return bucket;
- const BtreeBucket *b = bucket.btree();
+ const BtreeBucket<V> *b = bucket.btree<V>();
while ( 1 ) {
const _KeyNode& knraw = b->k(pos);
if ( knraw.isUsed() )
@@ -1659,23 +1798,24 @@ namespace mongo {
bucket = b->advance( bucket , pos , 1 , "findSingle" );
if ( bucket.isNull() )
return bucket;
- b = bucket.btree();
+ b = bucket.btree<V>();
}
KeyNode kn = b->keyNode( pos );
- if ( key.woCompare( kn.key ) != 0 )
+ if ( KeyOwned(key).woCompare( kn.key, o ) != 0 )
return DiskLoc();
return kn.recordLoc;
}
-} // namespace mongo
+} // namespace mongo
#include "db.h"
#include "dbhelpers.h"
namespace mongo {
- void BtreeBucket::a_test(IndexDetails& id) {
- BtreeBucket *b = id.head.btreemod();
+ template< class V >
+ void BtreeBucket<V>::a_test(IndexDetails& id) {
+ BtreeBucket *b = id.head.btreemod<V>();
// record locs for testing
DiskLoc A(1, 20);
@@ -1703,155 +1843,45 @@ namespace mongo {
b->dumpTree(id.head, orderObj);
- /* b->bt_insert(id.head, B, key, order, false, id);
+ /* b->bt_insert(id.head, B, key, order, false, id);
b->k(1).setUnused();
-
b->dumpTree(id.head, order);
-
b->bt_insert(id.head, A, key, order, false, id);
-
b->dumpTree(id.head, order);
*/
// this should assert. does it? (it might "accidentally" though, not asserting proves a problem, asserting proves nothing)
b->bt_insert(id.head, C, key, order, false, id);
-// b->dumpTree(id.head, order);
+ // b->dumpTree(id.head, order);
}
- /* --- BtreeBuilder --- */
+ template class BucketBasics<V0>;
+ template class BucketBasics<V1>;
+ template class BtreeBucket<V0>;
+ template class BtreeBucket<V1>;
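
These explicit instantiations are what allow the template member definitions to stay in btree.cpp: the compiler emits every member of BucketBasics/BtreeBucket for V0 and V1 right here, so other translation units need only the declarations in btree.h. A minimal sketch of the pattern, with a hypothetical Counter class standing in for the btree types:

    // counter.h -- declaration only; definitions live in one .cpp file
    template <class T>
    struct Counter {
        T total;
        void add(T x);        // defined out of line
    };

    // counter.cpp -- out-of-line definition plus explicit instantiations
    template <class T>
    void Counter<T>::add(T x) { total += x; }

    template struct Counter<int>;     // emit all members for T = int
    template struct Counter<double>;  // ...and for T = double
    // Other translation units may now call Counter<int>::add() etc. without
    // seeing the definition; an unlisted T would fail at link time.
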
- BtreeBuilder::BtreeBuilder(bool _dupsAllowed, IndexDetails& _idx) :
- dupsAllowed(_dupsAllowed),
- idx(_idx),
- n(0),
- order( idx.keyPattern() ),
- ordering( Ordering::make(idx.keyPattern()) ) {
- first = cur = BtreeBucket::addBucket(idx);
- b = cur.btreemod();
- committed = false;
- }
-
- void BtreeBuilder::newBucket() {
- DiskLoc L = BtreeBucket::addBucket(idx);
- b->tempNext() = L;
- cur = L;
- b = cur.btreemod();
- }
-
- void BtreeBuilder::mayCommitProgressDurably() {
- if ( getDur().commitIfNeeded() ) {
- b = cur.btreemod();
- }
- }
-
- void BtreeBuilder::addKey(BSONObj& key, DiskLoc loc) {
- if ( key.objsize() > KeyMax ) {
- problem() << "Btree::insert: key too large to index, skipping " << idx.indexNamespace()
- << ' ' << key.objsize() << ' ' << key.toString() << endl;
- return;
- }
-
- if( !dupsAllowed ) {
- if( n > 0 ) {
- int cmp = keyLast.woCompare(key, order);
- massert( 10288 , "bad key order in BtreeBuilder - server internal error", cmp <= 0 );
- if( cmp == 0 ) {
- //if( !dupsAllowed )
- uasserted( ASSERT_ID_DUPKEY , BtreeBucket::dupKeyError( idx , keyLast ) );
- }
+ struct BTUnitTest : public UnitTest {
+ void run() {
+ DiskLoc big(0xf12312, 0x70001234);
+ DiskLoc56Bit bigl;
+ {
+ bigl = big;
+ assert( big == bigl );
+ DiskLoc e = bigl;
+ assert( big == e );
}
- keyLast = key;
- }
-
- if ( ! b->_pushBack(loc, key, ordering, DiskLoc()) ) {
- // bucket was full
- newBucket();
- b->pushBack(loc, key, ordering, DiskLoc());
- }
- n++;
- mayCommitProgressDurably();
- }
-
- void BtreeBuilder::buildNextLevel(DiskLoc loc) {
- int levels = 1;
- while( 1 ) {
- if( loc.btree()->tempNext().isNull() ) {
- // only 1 bucket at this level. we are done.
- getDur().writingDiskLoc(idx.head) = loc;
- break;
- }
- levels++;
-
- DiskLoc upLoc = BtreeBucket::addBucket(idx);
- DiskLoc upStart = upLoc;
- BtreeBucket *up = upLoc.btreemod();
-
- DiskLoc xloc = loc;
- while( !xloc.isNull() ) {
- if ( getDur().commitIfNeeded() ) {
- b = cur.btreemod();
- up = upLoc.btreemod();
- }
-
- BtreeBucket *x = xloc.btreemod();
- BSONObj k;
- DiskLoc r;
- x->popBack(r,k);
- bool keepX = ( x->n != 0 );
- DiskLoc keepLoc = keepX ? xloc : x->nextChild;
-
- if ( ! up->_pushBack(r, k, ordering, keepLoc) ) {
- // current bucket full
- DiskLoc n = BtreeBucket::addBucket(idx);
- up->tempNext() = n;
- upLoc = n;
- up = upLoc.btreemod();
- up->pushBack(r, k, ordering, keepLoc);
- }
-
- DiskLoc nextLoc = x->tempNext(); // get next in chain at current level
- if ( keepX ) {
- x->parent = upLoc;
- }
- else {
- if ( !x->nextChild.isNull() )
- x->nextChild.btreemod()->parent = upLoc;
- x->deallocBucket( xloc, idx );
- }
- xloc = nextLoc;
+ {
+ DiskLoc d;
+ assert( d.isNull() );
+ DiskLoc56Bit l;
+ l = d;
+ assert( l.isNull() );
+ d = l;
+ assert( d.isNull() );
+ assert( l < bigl );
}
-
- loc = upStart;
- mayCommitProgressDurably();
}
-
- if( levels > 1 )
- log(2) << "btree levels: " << levels << endl;
- }
-
- /** when all addKeys are done, we then build the higher levels of the tree */
- void BtreeBuilder::commit() {
- buildNextLevel(first);
- committed = true;
- }
-
- BtreeBuilder::~BtreeBuilder() {
- DESTRUCTOR_GUARD(
- if( !committed ) {
- log(2) << "Rolling back partially built index space" << endl;
- DiskLoc x = first;
- while( !x.isNull() ) {
- DiskLoc next = x.btree()->tempNext();
- string ns = idx.indexNamespace();
- theDataFileMgr._deleteRecord(nsdetails(ns.c_str()), ns.c_str(), x.rec(), x);
- x = next;
- getDur().commitIfNeeded();
- }
- assert( idx.head.isNull() );
- log(2) << "done rollback" << endl;
- }
- )
- }
+ } btunittest;
}
diff --git a/db/btree.h b/db/btree.h
index bced95e..9ffa54c 100644
--- a/db/btree.h
+++ b/db/btree.h
@@ -22,37 +22,99 @@
#include "jsobj.h"
#include "diskloc.h"
#include "pdfile.h"
+#include "key.h"
namespace mongo {
- const int BucketSize = 8192;
+ /**
+ * Our btree implementation generally follows the standard btree algorithm,
+ * which is described in many places. The nodes of our btree are referred to
+ * as buckets below. These buckets are of size BucketSize and their body is
+ * an ordered array of <bson key, disk loc> pairs, where disk loc is the disk
+ * location of a document and bson key is a projection of this document into
+ * the schema of the index for this btree. Ordering is determined on the
+ * basis of bson key first and then disk loc in case of a tie. All bson keys
+ * for a btree have identical schemas with empty string field names and may
+ * not have an objsize() exceeding KeyMax. The btree's buckets are
+ * themselves organized into an ordered tree. Although there are exceptions,
+ * generally buckets with n keys have n+1 children and the body of a bucket is
+ * at least lowWaterMark bytes. A more strictly enforced requirement is that
+ * a non-root bucket must have at least one key except in certain transient
+ * states.
+ *
+ * Our btrees support the following primary read operations: finding a
+ * specified key; iterating from a starting key to the next or previous
+ * ordered key; and skipping from a starting key to another specified key
+ * without checking every intermediate key. The primary write operations
+ * are insertion and deletion of keys. Insertion may trigger a bucket split
+ * if necessary to avoid bucket overflow. In such a case, subsequent splits
+ * will occur recursively as necessary. Deletion may trigger a bucket
+ * rebalance, in which a size deficient bucket is filled with keys from an
+ * adjacent bucket. In this case, splitting may potentially occur in the
+ * parent. Deletion may alternatively trigger a merge, in which the keys
+ * from two buckets and a key from their shared parent are combined into the
+ * same bucket. In such a case, rebalancing or merging may proceed
+ * recursively from the parent.
+ *
+ * While the btree data format has been relatively constant over time, btrees
+ * initially created by versions of mongo earlier than the current version
+ * may embody different properties than freshly created btrees (while
+ * following the same data format). These older btrees are referred to
+ * below as legacy btrees.
+ */
+
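
The tie-breaking rule described above (bson key first, then disk loc) is easy to model with a std::pair, where the second member stands in for the record location:

    #include <algorithm>
    #include <cassert>
    #include <string>
    #include <utility>
    #include <vector>

    typedef std::pair<std::string, long> Entry; // <bson key, disk loc> stand-in

    int main() {
        std::vector<Entry> bucket;
        bucket.push_back(Entry("apple", 4096));  // duplicate keys are ordered
        bucket.push_back(Entry("apple", 1024));  // by their record location
        bucket.push_back(Entry("pear", 2048));
        std::sort(bucket.begin(), bucket.end()); // pair<> compares key, then loc
        assert(bucket[0] == Entry("apple", 1024));
        assert(bucket[1] == Entry("apple", 4096));
        // <key, recordLoc> is unique even when keys repeat, which is what lets
        // the btree address a specific document's index entry.
    }
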
+ const int OldBucketSize = 8192;
#pragma pack(1)
- struct _KeyNode {
+ template< class Version > class BucketBasics;
+
+ /**
+ * This is the fixed width data component for storage of a key within a
+ * bucket. It contains an offset pointer to the variable width bson
+ * data component. A _KeyNode may be 'unused', please see below.
+ */
+ template< class Loc >
+ struct __KeyNode {
/** Signals that we are writing this _KeyNode and casts away const */
- _KeyNode& writing() const;
- DiskLoc prevChildBucket; // the lchild
- DiskLoc recordLoc; // location of the record associated with the key
- short keyDataOfs() const {
- return (short) _kdo;
- }
+ __KeyNode<Loc> & writing() const;
+ /**
+ * The 'left' child bucket of this key. If this is the i-th key, it
+ * points to the i index child bucket.
+ */
+ Loc prevChildBucket;
+ /** The location of the record associated with this key. */
+ Loc recordLoc;
+ short keyDataOfs() const { return (short) _kdo; }
+
+ /** Offset within current bucket of the variable width bson key for this _KeyNode. */
unsigned short _kdo;
void setKeyDataOfs(short s) {
_kdo = s;
assert(s>=0);
}
+ /** Seems to be redundant. */
void setKeyDataOfsSavingUse(short s) {
_kdo = s;
assert(s>=0);
}
- void setUsed() { recordLoc.GETOFS() &= ~1; }
+ /**
+ * Unused keys are not returned by read operations. Keys may be marked
+ * as unused in cases where it is difficult to delete them while
+ * maintaining the constraints required of a btree.
+ *
+ * Setting ofs to odd is the sentinel for unused, as real recordLoc's
+ * are always even numbers. Note we need to keep its value basically
+ * the same as we use the recordLoc as part of the key in the index
+ * (to handle duplicate keys efficiently).
+ *
+ * Flagging keys as unused is a feature that is being phased out in favor
+ * of deleting the keys outright. The current btree implementation is
+ * not expected to mark a key as unused in a non-legacy btree.
+ */
void setUnused() {
- // Setting ofs to odd is the sentinel for unused, as real recordLoc's are always
- // even numbers.
- // Note we need to keep its value basically the same as we use the recordLoc
- // as part of the key in the index (to handle duplicate keys efficiently).
recordLoc.GETOFS() |= 1;
}
+ void setUsed() { recordLoc.GETOFS() &= ~1; }
int isUnused() const {
return recordLoc.getOfs() & 1;
}
@@ -60,44 +122,175 @@ namespace mongo {
return !isUnused();
}
};
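
The even/odd sentinel described above can be shown in isolation; this sketch uses a plain int offset in place of the real recordLoc:

    #include <cassert>

    // Record offsets are always even (records are aligned), so the low bit
    // of the stored offset is free to act as the 'unused' flag.
    struct ToyKeyNode {
        int recordOfs;                                  // always even when used
        void setUnused()      { recordOfs |= 1; }       // set the spare low bit
        void setUsed()        { recordOfs &= ~1; }      // clear it again
        bool isUnused() const { return recordOfs & 1; }
    };

    int main() {
        ToyKeyNode kn;
        kn.recordOfs = 4096;          // an even, real offset
        assert(!kn.isUnused());
        kn.setUnused();               // logically deleted, key data left in place
        assert(kn.isUnused());
        kn.setUsed();                 // reclaimed, original offset intact
        assert(kn.recordOfs == 4096);
    }
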
-#pragma pack()
-
- class BucketBasics;
/**
- * wrapper - this is our in memory representation of the key.
- * _KeyNode is the disk representation.
+ * This structure represents header data for a btree bucket. An object of
+ * this type is typically allocated inside of a buffer of size BucketSize,
+ * resulting in a full bucket with an appropriate header.
*
- * This object and its bson key will become invalid if the key is moved.
+ * The body of a btree bucket contains an array of _KeyNode objects starting
+ * from its lowest indexed bytes and growing to higher indexed bytes. The
+ * body also contains variable width bson keys, which are allocated from the
+ * highest indexed bytes toward lower indexed bytes.
+ *
+ * |hhhh|kkkkkkk--------bbbbbbbbbbbuuubbbuubbb|
+ * h = header data
+ * k = KeyNode data
+ * - = empty space
+ * b = bson key data
+ * u = unused (old) bson key data, that may be garbage collected
*/
- class KeyNode {
- public:
- KeyNode(const BucketBasics& bb, const _KeyNode &k);
- const DiskLoc& prevChildBucket;
- const DiskLoc& recordLoc;
- BSONObj key;
- };
-
-#pragma pack(1)
- class BtreeData {
+ class BtreeData_V0 {
protected:
+ /** Parent bucket of this bucket, which isNull() for the root bucket. */
DiskLoc parent;
- DiskLoc nextChild; // child bucket off and to the right of the highest key.
- unsigned short _wasSize; // can be reused, value is 8192 in current pdfile version Apr2010
- unsigned short _reserved1; // zero
+ /** Given that there are n keys, this is the n index child. */
+ DiskLoc nextChild;
+ /** can be reused, value is 8192 in current pdfile version Apr2010 */
+ unsigned short _wasSize;
+ /** zero */
+ unsigned short _reserved1;
int flags;
- // basicInsert() assumes these three are together and in this order:
- int emptySize; // size of the empty region
- int topSize; // size of the data at the top of the bucket (keys are at the beginning or 'bottom')
- int n; // # of keys so far.
+ void _init() {
+ _reserved1 = 0;
+ _wasSize = BucketSize;
+ reserved = 0;
+ }
+
+ /** basicInsert() assumes the next three members are consecutive and in this order: */
+
+ /** Size of the empty region. */
+ int emptySize;
+ /** Size used for bson storage, including storage of old keys. */
+ int topSize;
+ /* Number of keys in the bucket. */
+ int n;
int reserved;
+ /* Beginning of the bucket's body */
+ char data[4];
+
+ public:
+ typedef __KeyNode<DiskLoc> _KeyNode;
+ typedef DiskLoc Loc;
+ typedef KeyBson Key;
+ typedef KeyBson KeyOwned;
+ enum { BucketSize = 8192 };
+
+ // largest key size we allow. note we very much need to support bigger keys (somehow) in the future.
+ static const int KeyMax = OldBucketSize / 10;
+ };
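
A toy allocator illustrating the two-sided body layout in the diagram: fixed-width nodes grow from the low end, variable-width key data is allocated downward from the high end, and emptySize tracks the gap between them (a plain char buffer, not the real bucket):

    #include <cassert>
    #include <cstring>

    const int BodySize = 128;

    struct ToyBucket {
        int emptySize;          // gap between the two regions
        int topSize;            // bytes used by key data at the high end
        int n;                  // fixed-width nodes at the low end
        char data[BodySize];
        ToyBucket() : emptySize(BodySize), topSize(0), n(0) {}

        // Allocate 'bytes' of key data from the top; returns its offset.
        int alloc(int bytes) {
            assert(bytes <= emptySize);
            topSize += bytes;
            emptySize -= bytes;
            return BodySize - topSize;
        }
        // Append one fixed-width node (here: just a short offset) at the bottom.
        void pushNode(short keyDataOfs) {
            assert((int)sizeof(short) <= emptySize);
            memcpy(data + n * sizeof(short), &keyDataOfs, sizeof(short));
            emptySize -= sizeof(short);
            ++n;
        }
    };

    int main() {
        ToyBucket b;
        int ofs = b.alloc(10);          // key bytes land at the high end
        b.pushNode((short)ofs);         // node at the low end points at them
        assert(ofs == BodySize - 10);
        assert(b.emptySize == BodySize - 10 - (int)sizeof(short));
    }
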
+
+ // a a a ofs ofs ofs ofs
+ class DiskLoc56Bit {
+ int ofs;
+ unsigned char _a[3];
+ unsigned long long Z() const {
+ // endian
+ return *((unsigned long long*)this) & 0x00ffffffffffffffULL;
+ }
+ enum {
+ // The low bit of a _KeyNode offset is the 'unused' flag, so -1 (odd) can't be the null sentinel here.
+ OurNullOfs = -2
+ };
+ public:
+ template< class V >
+ const BtreeBucket<V> * btree() const {
+ return DiskLoc(*this).btree<V>();
+ }
+ template< class V >
+ BtreeBucket<V> * btreemod() const {
+ return DiskLoc(*this).btreemod<V>();
+ }
+ operator DiskLoc() const {
+ // endian
+ if( isNull() ) return DiskLoc();
+ unsigned a = *((unsigned *) (_a-1));
+ return DiskLoc(a >> 8, ofs);
+ }
+ int& GETOFS() { return ofs; }
+ int getOfs() const { return ofs; }
+ bool operator<(const DiskLoc56Bit& rhs) const {
+ // the ordering of dup keys in btrees isn't too critical, but we'd like to put items that are
+ // close together on disk close together in the tree, so we do want the file # to be the most significant
+ // bytes
+ return Z() < rhs.Z();
+ }
+ int compare(const DiskLoc56Bit& rhs) const {
+ unsigned long long a = Z();
+ unsigned long long b = rhs.Z();
+ if( a < b ) return -1;
+ return a == b ? 0 : 1;
+ }
+ bool operator==(const DiskLoc56Bit& rhs) const { return Z() == rhs.Z(); }
+ bool operator!=(const DiskLoc56Bit& rhs) const { return Z() != rhs.Z(); }
+ bool operator==(const DiskLoc& rhs) const {
+ return DiskLoc(*this) == rhs;
+ }
+ bool operator!=(const DiskLoc& rhs) const { return !(*this==rhs); }
+ bool isNull() const { return ofs < 0; }
+ void Null() {
+ ofs = OurNullOfs;
+ _a[0] = _a[1] = _a[2] = 0;
+ }
+ string toString() const { return DiskLoc(*this).toString(); }
+ void operator=(const DiskLoc& loc) {
+ ofs = loc.getOfs();
+ int la = loc.a();
+ assert( la <= 0xffffff ); // must fit in 3 bytes
+ if( la < 0 ) {
+ assert( la == -1 );
+ la = 0;
+ ofs = OurNullOfs;
+ }
+ memcpy(_a, &la, 3); // endian
+ dassert( ofs != 0 );
+ }
+ DiskLoc56Bit& writing() const {
+ return *((DiskLoc56Bit*) getDur().writingPtr((void*)this, 7));
+ }
+ };
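
The 7-byte encoding keeps the file number in the most significant 3 bytes, so comparing the packed 56-bit values groups same-file locations together, as operator< intends. A portable toy version using explicit shifts rather than the raw in-memory read (so no endianness caveats apply):

    #include <cassert>
    #include <stdint.h>

    // Pack a (fileNo, ofs) pair into 56 bits: file number in the high 24 bits,
    // offset in the low 32. Comparing packed values then orders entries by
    // file first, mirroring DiskLoc56Bit::operator<.
    uint64_t pack(uint32_t fileNo, uint32_t ofs) {
        assert(fileNo <= 0xffffffu);              // must fit in 3 bytes
        return ((uint64_t)fileNo << 32) | ofs;
    }
    uint32_t fileNoOf(uint64_t z) { return (uint32_t)(z >> 32); }
    uint32_t ofsOf(uint64_t z)    { return (uint32_t)(z & 0xffffffffu); }

    int main() {
        uint64_t a = pack(0xf12312u, 0x70001234u); // the value from the unit test
        assert(fileNoOf(a) == 0xf12312u && ofsOf(a) == 0x70001234u);
        // Same-file locations compare by offset; different files by file number.
        assert(pack(1, 999999) < pack(2, 0));
        assert(pack(2, 5) < pack(2, 6));
    }
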
+
+ class BtreeData_V1 {
+ public:
+ typedef DiskLoc56Bit Loc;
+ //typedef DiskLoc Loc;
+ typedef __KeyNode<Loc> _KeyNode;
+ typedef KeyV1 Key;
+ typedef KeyV1Owned KeyOwned;
+ enum { BucketSize = 8192-16 }; // leave room for Record header
+ // largest key size we allow. note we very much need to support bigger keys (somehow) in the future.
+ static const int KeyMax = 1024;
+ protected:
+ /** Parent bucket of this bucket, which isNull() for the root bucket. */
+ Loc parent;
+ /** Given that there are n keys, this is the n index child. */
+ Loc nextChild;
+
+ unsigned short flags;
+
+ /** basicInsert() assumes the next three members are consecutive and in this order: */
+
+ /** Size of the empty region. */
+ unsigned short emptySize;
+ /** Size used for bson storage, including storage of old keys. */
+ unsigned short topSize;
+ /* Number of keys in the bucket. */
+ unsigned short n;
+
+ /* Beginning of the bucket's body */
char data[4];
+
+ void _init() { }
};
+ typedef BtreeData_V0 V0;
+ typedef BtreeData_V1 V1;
+
/**
- * This class is all about the storage management
+ * This class adds functionality to BtreeData for managing a single bucket.
+ * The following policies are used in an attempt to encourage simplicity:
*
* Const member functions of this class are those which may be called on
* an object for which writing has not been signaled. Non const member
@@ -108,21 +301,47 @@ namespace mongo {
*
* DiskLoc parameters that may shadow references within the btree should
* be passed by value rather than by reference to non const member
- * functions or const member functions which may perform writes. This way
+ * functions or to const member functions which may perform writes. This way
* a callee need not worry that write operations will change or invalidate
* its arguments.
*
* The current policy for dealing with bson arguments is the opposite of
- * what is described above for DiskLoc arguments. We do
- * not want to want to copy bson into memory as an intermediate step for
- * btree changes, so if bson is to be moved it must be copied to the new
- * location before the old location is invalidated.
+ * what is described above for DiskLoc arguments. We do not want to copy
+ * bson into memory as an intermediate step for btree changes, and if bson
+ * is to be moved it must be copied to the new location before the old
+ * location is invalidated. Care should be taken in cases where that invalid
+ * memory may be implicitly referenced by function arguments.
+ *
+ * A number of functions below require a thisLoc argument, which must be the
+ * disk location of the bucket mapped to 'this'.
*/
- class BucketBasics : public BtreeData {
- friend class BtreeBuilder;
- friend class KeyNode;
+ template< class Version >
+ class BucketBasics : public Version {
public:
- /** assert write intent declared for this bucket already */
+ template <class U> friend class BtreeBuilder;
+ typedef typename Version::Key Key;
+ typedef typename Version::_KeyNode _KeyNode;
+ typedef typename Version::Loc Loc;
+
+ int getN() const { return this->n; }
+
+ /**
+ * This is an in memory wrapper for a _KeyNode, and not itself part of btree
+ * storage. This object and its BSONObj 'key' will become invalid if the
+ * _KeyNode data that generated it is moved within the btree. In general,
+ * a KeyNode should not be expected to be valid after a write.
+ */
+ class KeyNode {
+ public:
+ KeyNode(const BucketBasics<Version>& bb, const _KeyNode &k);
+ const Loc& prevChildBucket;
+ const Loc& recordLoc;
+ /* Points to the bson key storage for a _KeyNode */
+ Key key;
+ };
+ friend class KeyNode;
+
+ /** Assert write intent declared for this bucket already. */
void assertWritable();
void assertValid(const Ordering &order, bool force = false) const;
@@ -130,11 +349,12 @@ namespace mongo {
/**
* @return KeyNode for key at index i. The KeyNode will become invalid
- * if the key is moved or reassigned, or if the node is packed.
+ * if the key is moved or reassigned, or if the node is packed. In general
+ * a KeyNode should not be expected to be valid after a write.
*/
const KeyNode keyNode(int i) const {
- if ( i >= n ) {
- massert( 13000 , (string)"invalid keyNode: " + BSON( "i" << i << "n" << n ).jsonString() , i < n );
+ if ( i >= this->n ) {
+ massert( 13000 , (string)"invalid keyNode: " + BSON( "i" << i << "n" << this->n ).jsonString() , i < this->n );
}
return KeyNode(*this, k(i));
}
@@ -143,29 +363,50 @@ namespace mongo {
const BucketBasics *d = 0;
return (char*)&(d->data) - (char*)&(d->parent);
}
- static int bodySize() { return BucketSize - headerSize(); }
+ static int bodySize() { return Version::BucketSize - headerSize(); }
+ static int lowWaterMark() { return bodySize() / 2 - Version::KeyMax - sizeof( _KeyNode ) + 1; } // see comment in btree.cpp
// for testing
- int nKeys() const { return n; }
- const DiskLoc getNextChild() const { return nextChild; }
+ int nKeys() const { return this->n; }
+ const DiskLoc getNextChild() const { return this->nextChild; }
protected:
- char * dataAt(short ofs) { return data + ofs; }
+ char * dataAt(short ofs) { return this->data + ofs; }
- void init(); // initialize a new node
+ /** Initialize the header for a new node. */
+ void init();
/**
- * @return false if node is full and must be split
- * @keypos is where to insert -- inserted before that key #. so keypos=0 is the leftmost one.
- * keypos will be updated if keys are moved as a result of pack()
- * This function will modify the btree bucket memory representation even
- * though it is marked const.
+ * Preconditions:
+ * - 0 <= keypos <= n
+ * - If key is inserted at position keypos, the bucket's keys will still be
+ * in order.
+ * Postconditions:
+ * - If key can fit in the bucket, the bucket may be packed and keypos
+ * may be decreased to reflect deletion of earlier indexed keys during
+ * packing, the key will be inserted at the updated keypos index with
+ * a null prevChildBucket, the subsequent keys shifted to the right,
+ * and the function will return true.
+ * - If key cannot fit in the bucket, the bucket will be packed and
+ * the function will return false.
+ * Although this function is marked const, it modifies the underlying
+ * btree representation through an optimized write intent mechanism.
*/
- bool basicInsert(const DiskLoc thisLoc, int &keypos, const DiskLoc recordLoc, const BSONObj& key, const Ordering &order) const;
+ bool basicInsert(const DiskLoc thisLoc, int &keypos, const DiskLoc recordLoc, const Key& key, const Ordering &order) const;
- /** @return true if works, false if not enough space */
- bool _pushBack(const DiskLoc recordLoc, const BSONObj& key, const Ordering &order, const DiskLoc prevChild);
- void pushBack(const DiskLoc recordLoc, const BSONObj& key, const Ordering &order, const DiskLoc prevChild) {
+ /**
+ * Preconditions:
+ * - key / recordLoc are > all existing keys
+ * - The keys in prevChild and their descendants are between all existing
+ * keys and 'key'.
+ * Postconditions:
+ * - If there is space for key without packing, it is inserted as the
+ * last key with specified prevChild and true is returned.
+ * Importantly, nextChild is not updated!
+ * - Otherwise false is returned and there is no change.
+ */
+ bool _pushBack(const DiskLoc recordLoc, const Key& key, const Ordering &order, const DiskLoc prevChild);
+ void pushBack(const DiskLoc recordLoc, const Key& key, const Ordering &order, const DiskLoc prevChild) {
bool ok = _pushBack( recordLoc , key , order , prevChild );
assert(ok);
}
@@ -180,10 +421,30 @@ namespace mongo {
* returns the last key without deleting it and another which simply
* deletes the last key. Then the caller would have enough control to
* ensure proper memory integrity.
+ *
+ * Preconditions:
+ * - bucket is not empty
+ * - last key of bucket is used (not unused)
+ * - nextChild isNull()
+ * - _unalloc will work correctly as used - see code
+ * Postconditions:
+ * - The last key of the bucket is removed, and its key and recLoc are
+ * returned. As mentioned above, the key points to unallocated memory.
*/
- void popBack(DiskLoc& recLoc, BSONObj& key);
+ void popBack(DiskLoc& recLoc, Key &key);
- void _delKeyAtPos(int keypos, bool mayEmpty = false); // low level version that doesn't deal with child ptrs.
+ /**
+ * Preconditions:
+ * - 0 <= keypos < n
+ * - there is no child bucket at keypos
+ * - n > 0
+ * - if mayEmpty == false or nextChild.isNull(), n > 1
+ * Postconditions:
+ * - The key at keypos is removed, and remaining keys are shifted over.
+ * - The bucket becomes unpacked.
+ * - if mayEmpty is true and nextChild.isNull(), the bucket may have no keys.
+ */
+ void _delKeyAtPos(int keypos, bool mayEmpty = false);
/* !Packed means there is deleted fragment space within the bucket.
We "repack" when we run out of space before considering the node
@@ -191,64 +452,124 @@ namespace mongo {
*/
enum Flags { Packed=1 };
- const DiskLoc& childForPos(int p) const { return p == n ? nextChild : k(p).prevChildBucket; }
- DiskLoc& childForPos(int p) { return p == n ? nextChild : k(p).prevChildBucket; }
+ /** n == 0 is ok */
+ const Loc& childForPos(int p) const { return p == this->n ? this->nextChild : k(p).prevChildBucket; }
+ Loc& childForPos(int p) { return p == this->n ? this->nextChild : k(p).prevChildBucket; }
+ /** Same as bodySize(). */
int totalDataSize() const;
- /** @return true if the key may be dropped by pack() */
+ /**
+ * @return true when a key may be dropped by pack()
+ * @param index index of the key that may be dropped
+ * @param refPos index of a particular key of interest, which must not
+ * be dropped; = 0 to safely ignore
+ */
bool mayDropKey( int index, int refPos ) const;
/**
* Pack the bucket to reclaim space from invalidated memory.
- * @refPos is an index in the bucket which will may be updated if we
+ * @refPos is an index in the bucket which may be updated if we
* delete keys from the bucket
* This function may cast away const and perform a write.
+ * Preconditions: none
+ * Postconditions:
+ * - Bucket will be packed
+ * - Some unused nodes may be dropped, but not ones at index 0 or refPos
+ * - Some used nodes may be moved
+ * - If refPos is the index of an existing key, it will be updated to that
+ * key's new index if the key is moved.
*/
void _pack(const DiskLoc thisLoc, const Ordering &order, int &refPos) const;
/** Pack when already writable */
void _packReadyForMod(const Ordering &order, int &refPos);
+ /** @return the size the bucket's body would have if we were to call pack() */
+ int packedDataSize( int refPos ) const;
+ void setNotPacked() { this->flags &= ~Packed; }
+ void setPacked() { this->flags |= Packed; }
/**
- * @return the size of non header data in this bucket if we were to
- * call pack().
+ * Preconditions: 'bytes' is <= emptySize
+ * Postconditions: A buffer of size 'bytes' is allocated on the top side,
+ * and its offset is returned.
*/
- int packedDataSize( int refPos ) const;
- void setNotPacked() { flags &= ~Packed; }
- void setPacked() { flags |= Packed; }
int _alloc(int bytes);
+ /**
+ * This function can be used to deallocate the lowest byte index bson
+ * buffer in the top region, which in some but not all cases is for the
+ * n - 1 index key. This function only works correctly in certain
+ * special cases, please be careful.
+ * Preconditions: 'bytes' <= topSize
+ * Postconditions: The top region is decreased
+ */
void _unalloc(int bytes);
+ /**
+ * Preconditions: 'N' <= n
+ * Postconditions:
+ * - All keys after the N index key are dropped.
+ * - Then the bucket is packed, without dropping refPos if refPos < N.
+ */
void truncateTo(int N, const Ordering &order, int &refPos);
- /** drop specified number of keys from beginning of key array, and pack */
+ /**
+ * Preconditions:
+ * - 'nDrop' < n
+ * - for now, refPos should be zero.
+ * Postconditions:
+ * - All keys before the nDrop index key are dropped.
+ * - The bucket is packed.
+ */
void dropFront(int nDrop, const Ordering &order, int &refPos);
+ /**
+ * Preconditions: 0 <= keypos < n
+ * Postconditions: keypos indexed key is marked unused.
+ */
void markUnused(int keypos);
/**
* BtreeBuilder uses the parent var as a temp place to maintain a linked list chain.
* we use tempNext() when we do that to be less confusing. (one might have written a union in C)
*/
- const DiskLoc& tempNext() const { return parent; }
- DiskLoc& tempNext() { return parent; }
+ DiskLoc tempNext() const { return this->parent; }
+ void setTempNext(DiskLoc l) { this->parent = l; }
void _shape(int level, stringstream&) const;
int Size() const;
- const _KeyNode& k(int i) const { return ((const _KeyNode*)data)[i]; }
- _KeyNode& k(int i) { return ((_KeyNode*)data)[i]; }
+
+ /** @return i-indexed _KeyNode, without bounds checking */
+ public:
+ const _KeyNode& k(int i) const { return ((const _KeyNode*)this->data)[i]; }
+ _KeyNode& _k(int i) { return ((_KeyNode*)this->data)[i]; }
+ protected:
+ _KeyNode& k(int i) { return ((_KeyNode*)this->data)[i]; }
- /** @return the key position where a split should occur on insert */
+ /**
+ * Preconditions: 'this' is packed
+ * @return the key index to be promoted on split
+ * @param keypos The requested index of a key to insert, which may affect
+ * the choice of split position.
+ */
int splitPos( int keypos ) const;
/**
- * Adds new entries to beginning of key array, shifting existing
- * entries to the right. After this is called, setKey() must be called
- * on all the newly created entries in the key array.
+ * Preconditions: nAdd * sizeof( _KeyNode ) <= emptySize
+ * Postconditions:
+ * - Increases indexes of existing _KeyNode objects by nAdd, reserving
+ * space for additional _KeyNode objects at front.
+ * - Does not initialize ofs values for the bson data of these
+ * _KeyNode objects.
*/
void reserveKeysFront( int nAdd );
/**
- * Sets an existing key using the given parameters.
- * @i index of key to set
+ * Preconditions:
+ * - 0 <= i < n
+ * - The bson 'key' must fit in the bucket without packing.
+ * - If 'key' and 'prevChildBucket' are set at index i, the btree
+ * ordering properties will be maintained.
+ * Postconditions:
+ * - The specified key is set at index i, replacing the existing
+ * _KeyNode data and without shifting any other _KeyNode objects.
*/
- void setKey( int i, const DiskLoc recordLoc, const BSONObj &key, const DiskLoc prevChildBucket );
+ void setKey( int i, const DiskLoc recordLoc, const Key& key, const DiskLoc prevChildBucket );
};
/**
@@ -273,22 +594,35 @@ namespace mongo {
* standard usage. Right now the interface is for both a node and a tree,
* so assignment of const is sometimes nonideal.
*
- * TODO There are several cases in which the this pointer is invalidated
+ * TODO There are several cases in which the 'this' pointer is invalidated
+ * as a result of deallocation. A separate class representing a btree would
* alleviate some fragile cases where the implementation must currently
- * behave correctly if the this pointer is suddenly invalidated by a
+ * behave correctly if the 'this' pointer is suddenly invalidated by a
* callee.
*/
- class BtreeBucket : public BucketBasics {
+ template< class V >
+ class BtreeBucket : public BucketBasics<V> {
friend class BtreeCursor;
public:
- bool isHead() const { return parent.isNull(); }
+ // make compiler happy:
+ typedef typename V::Key Key;
+ typedef typename V::KeyOwned KeyOwned;
+ typedef typename BucketBasics<V>::KeyNode KeyNode;
+ typedef typename BucketBasics<V>::_KeyNode _KeyNode;
+ typedef typename BucketBasics<V>::Loc Loc;
+ const _KeyNode& k(int i) const { return static_cast< const BucketBasics<V> * >(this)->k(i); }
+ protected:
+ _KeyNode& k(int i) { return static_cast< BucketBasics<V> * >(this)->_k(i); }
+ public:
+ const KeyNode keyNode(int i) const { return static_cast< const BucketBasics<V> * >(this)->keyNode(i); }
+
+ bool isHead() const { return this->parent.isNull(); }
void dumpTree(const DiskLoc &thisLoc, const BSONObj &order) const;
- int fullValidate(const DiskLoc& thisLoc, const BSONObj &order, int *unusedCount = 0, bool strict = false) const; /* traverses everything */
+ long long fullValidate(const DiskLoc& thisLoc, const BSONObj &order, long long *unusedCount = 0, bool strict = false, unsigned depth=0) const; /* traverses everything */
- bool isUsed( int i ) const { return k(i).isUsed(); }
+ bool isUsed( int i ) const { return this->k(i).isUsed(); }
string bucketSummary() const;
- void dump() const;
+ void dump(unsigned depth=0) const;
/**
* @return true if key exists in index
@@ -297,25 +631,63 @@ namespace mongo {
* BSONObj order = ((IndexDetails&)idx).keyPattern();
* likewise below in bt_insert() etc.
*/
- bool exists(const IndexDetails& idx, const DiskLoc &thisLoc, const BSONObj& key, const Ordering& order) const;
+ private:
+ bool exists(const IndexDetails& idx, const DiskLoc &thisLoc, const Key& key, const Ordering& order) const;
+ public:
+ /**
+ * @param self - Don't complain when the duplicate found is 'self', i.e. we are already in the index.
+ * @return true = There is a duplicate used key.
+ */
bool wouldCreateDup(
const IndexDetails& idx, const DiskLoc &thisLoc,
- const BSONObj& key, const Ordering& order,
+ const Key& key, const Ordering& order,
const DiskLoc &self) const;
- static DiskLoc addBucket(const IndexDetails&); /* start a new index off, empty */
- /** invalidates 'this' and thisLoc */
- void deallocBucket(const DiskLoc thisLoc, const IndexDetails &id);
+ /**
+ * Preconditions: none
+ * Postconditions: @return a new bucket allocated from pdfile storage
+ * and init()-ed. This bucket is suitable for use as a new root
+ * or any other new node in the tree.
+ */
+ static DiskLoc addBucket(const IndexDetails&);
- static void renameIndexNamespace(const char *oldNs, const char *newNs);
+ /**
+ * Preconditions: none
+ * Postconditions:
+ * - Some header values in this bucket are cleared, and the bucket is
+ * deallocated from pdfile storage.
+ * - The memory at thisLoc is invalidated, and 'this' is invalidated.
+ */
+ void deallocBucket(const DiskLoc thisLoc, const IndexDetails &id);
- /** This function may change the btree root */
+ /**
+ * Preconditions:
+ * - 'key' has a valid schema for this index.
+ * - All other parameters are valid and consistent with this index if applicable.
+ * Postconditions:
+ * - If key is bigger than KeyMax, @return 2 or 3 and no change.
+ * - If key / recordLoc exist in the btree as an unused key, set them
+ * as used and @return 0
+ * - If key / recordLoc exist in the btree as a used key, @throw
+ * exception 10287 and no change.
+ * - If key / recordLoc do not exist in the btree, they are inserted
+ * and @return 0. The root of the btree may be changed, so
+ * 'this'/thisLoc may no longer be the root upon return.
+ */
int bt_insert(const DiskLoc thisLoc, const DiskLoc recordLoc,
const BSONObj& key, const Ordering &order, bool dupsAllowed,
IndexDetails& idx, bool toplevel = true) const;
- /** This function may change the btree root */
+ /**
+ * Preconditions:
+ * - 'key' has a valid schema for this index, and may have objsize() > KeyMax.
+ * Postconditions:
+ * - If key / recordLoc are in the btree, they are removed (possibly
+ * by being marked as an unused key), @return true, and potentially
+ * invalidate 'this' / thisLoc and change the head.
+ * - If key / recordLoc are not in the btree, @return false and do nothing.
+ */
bool unindex(const DiskLoc thisLoc, IndexDetails& id, const BSONObj& key, const DiskLoc recordLoc) const;
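A hypothetical illustration of the bt_insert()/unindex() contracts documented above (the V1 bucket type and the 'recordLoc'/'key' values are assumptions, not taken from the patch):

    Ordering ordering = Ordering::make( idx.keyPattern() );
    int rc = idx.head.btree<V1>()->bt_insert( idx.head, recordLoc, key, ordering,
                                              /*dupsAllowed*/ true, idx );
    // rc == 0: inserted, or an existing unused key was revived
    // rc == 2 or 3: key exceeded KeyMax and was skipped, index unchanged
    // re-read from idx.head afterwards -- bt_insert() may have changed the root
    bool removed = idx.head.btree<V1>()->unindex( idx.head, idx, key, recordLoc );
    // removed == false: key/recordLoc were not present in the btree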
/**
@@ -327,21 +699,31 @@ namespace mongo {
*/
DiskLoc locate(const IndexDetails &idx , const DiskLoc& thisLoc, const BSONObj& key, const Ordering &order,
int& pos, bool& found, const DiskLoc &recordLoc, int direction=1) const;
+ DiskLoc locate(const IndexDetails &idx , const DiskLoc& thisLoc, const Key& key, const Ordering &order,
+ int& pos, bool& found, const DiskLoc &recordLoc, int direction=1) const;
/**
* find the first instance of the key
* does not handle dups
- * returned DiskLoc isNull if can't find anything with that
+ * WARNING: findSingle may not be compound index safe. this may need to change. see notes in
+ * findSingle code.
* @return the record location of the first match
*/
DiskLoc findSingle( const IndexDetails &indexdetails , const DiskLoc& thisLoc, const BSONObj& key ) const;
- /** advance one key position in the index: */
+ /**
+ * Advance to next or previous key in the index.
+         * @param direction the direction in which to advance.
+ */
DiskLoc advance(const DiskLoc& thisLoc, int& keyOfs, int direction, const char *caller) const;
+ /** Advance in specified direction to the specified key */
void advanceTo(DiskLoc &thisLoc, int &keyOfs, const BSONObj &keyBegin, int keyBeginLen, bool afterKey, const vector< const BSONElement * > &keyEnd, const vector< bool > &keyEndInclusive, const Ordering &order, int direction ) const;
+
+ /** Locate a key with fields comprised of a combination of keyBegin fields and keyEnd fields. */
void customLocate(DiskLoc &thisLoc, int &keyOfs, const BSONObj &keyBegin, int keyBeginLen, bool afterKey, const vector< const BSONElement * > &keyEnd, const vector< bool > &keyEndInclusive, const Ordering &order, int direction, pair< DiskLoc, int > &bestParent ) const;
+ /** @return head of the btree by traversing from current bucket. */
const DiskLoc getHead(const DiskLoc& thisLoc) const;
/** get tree shape */
@@ -349,111 +731,275 @@ namespace mongo {
static void a_test(IndexDetails&);
- static int getLowWaterMark();
static int getKeyMax();
protected:
/**
- * Fix parent pointers for children
- * @firstIndex first index to modify
- * @lastIndex last index to modify (-1 means last index is n)
+ * Preconditions:
+ * - 0 <= firstIndex <= n
+ * - -1 <= lastIndex <= n ( -1 is equivalent to n )
+ * Postconditions:
+ * - Any children at indexes firstIndex through lastIndex (inclusive)
+ * will have their parent pointers set to thisLoc.
*/
void fixParentPtrs(const DiskLoc thisLoc, int firstIndex = 0, int lastIndex = -1) const;
- /** invalidates this and thisLoc */
+ /**
+ * Preconditions:
+ * - thisLoc is not the btree head.
+ * - n == 0 is ok
+ * Postconditions:
+ * - All cursors pointing to this bucket will be updated.
+ * - This bucket's parent's child pointer is set to null.
+ * - This bucket is deallocated from pdfile storage.
+ * - 'this' and thisLoc are invalidated.
+ */
void delBucket(const DiskLoc thisLoc, const IndexDetails&);
- /** may invalidate this and thisLoc */
+
+ /**
+ * Preconditions: 0 <= p < n
+ * Postconditions:
+ * - The key at index p is removed from the btree.
+ * - 'this' and thisLoc may be invalidated.
+ * - The tree head may change.
+ */
void delKeyAtPos(const DiskLoc thisLoc, IndexDetails& id, int p, const Ordering &order);
/**
- * May balance utilization of this bucket with a neighbor, either by
- * merging the buckets or shifting nodes.
- * @return true iff balancing was performed.
- * NOTE This function may invalidate thisLoc.
+ * Preconditions:
+ * - n == 0 is ok
+ * Postconditions:
+ * - If thisLoc is head, or if its body has at least lowWaterMark bytes,
+ * return false and do nothing.
+ * - Otherwise, if thisLoc has left or right neighbors, either balance
+ * or merge with them and return true. Also, 'this' and thisLoc may
+ * be invalidated and the tree head may change.
*/
bool mayBalanceWithNeighbors(const DiskLoc thisLoc, IndexDetails &id, const Ordering &order) const;
- /** @return true if balance succeeded */
+ /**
+ * Preconditions:
+ * - 0 <= leftIndex < n
+ * - The child at leftIndex or the child at leftIndex + 1 contains
+ * fewer than lowWaterMark bytes.
+ * Postconditions:
+         * - If the child bucket at leftIndex can merge with the child bucket
+         *   at leftIndex + 1, do nothing and return false.
+ * - Otherwise, balance keys between the leftIndex child and the
+ * leftIndex + 1 child, return true, and possibly change the tree head.
+ */
bool tryBalanceChildren( const DiskLoc thisLoc, int leftIndex, IndexDetails &id, const Ordering &order ) const;
+
+ /**
+ * Preconditions:
+ * - All preconditions of tryBalanceChildren.
+ * - The leftIndex child and leftIndex + 1 child cannot be merged.
+ * Postconditions:
+ * - Keys are moved between the leftIndex child and the leftIndex + 1
+ * child such that neither child has fewer than lowWaterMark bytes.
+ * The tree head may change.
+ */
void doBalanceChildren( const DiskLoc thisLoc, int leftIndex, IndexDetails &id, const Ordering &order );
+
+ /**
+ * Preconditions:
+ * - All preconditions of doBalanceChildren
+ * - The leftIndex and leftIndex + 1 children are packed.
+ * - The leftIndex + 1 child has fewer than lowWaterMark bytes.
+ * - split returned by rebalancedSeparatorPos()
+ * Postconditions:
+ * - The key in lchild at index split is set as thisLoc's key at index
+ * leftIndex, which may trigger a split and change the tree head.
+ * The previous key in thisLoc at index leftIndex and all keys with
+ * indexes greater than split in lchild are moved to rchild.
+ */
void doBalanceLeftToRight( const DiskLoc thisLoc, int leftIndex, int split,
- BtreeBucket *l, const DiskLoc lchild,
- BtreeBucket *r, const DiskLoc rchild,
+ BtreeBucket<V> *l, const DiskLoc lchild,
+ BtreeBucket<V> *r, const DiskLoc rchild,
IndexDetails &id, const Ordering &order );
+ /**
+ * Preconditions:
+ * - All preconditions of doBalanceChildren
+ * - The leftIndex and leftIndex + 1 children are packed.
+ * - The leftIndex child has fewer than lowWaterMark bytes.
+ * - split returned by rebalancedSeparatorPos()
+ * Postconditions:
+ * - The key in rchild at index split - l->n - 1 is set as thisLoc's key
+ * at index leftIndex, which may trigger a split and change the tree
+ * head. The previous key in thisLoc at index leftIndex and all keys
+ * with indexes less than split - l->n - 1 in rchild are moved to
+ * lchild.
+ */
void doBalanceRightToLeft( const DiskLoc thisLoc, int leftIndex, int split,
- BtreeBucket *l, const DiskLoc lchild,
- BtreeBucket *r, const DiskLoc rchild,
+ BtreeBucket<V> *l, const DiskLoc lchild,
+ BtreeBucket<V> *r, const DiskLoc rchild,
IndexDetails &id, const Ordering &order );
- /** may invalidate this and thisLoc */
+ /**
+ * Preconditions:
+ * - 0 <= leftIndex < n
+ * - this->canMergeChildren( thisLoc, leftIndex ) == true
+ * Postconditions:
+         * - The separator key at leftIndex and all keys of both child
+         *   buckets will be placed in the left child.
+ * - The tree may be updated recursively, resulting in 'this' and
+ * thisLoc being invalidated and the tree head being changed.
+ */
void doMergeChildren( const DiskLoc thisLoc, int leftIndex, IndexDetails &id, const Ordering &order);
- /** will invalidate this and thisLoc */
+ /**
+ * Preconditions:
+ * - n == 0
+ * - !nextChild.isNull()
+ * Postconditions:
+ * - 'this' and thisLoc are deallocated (and invalidated), any cursors
+ * to them are updated, and the tree head may change.
+ * - nextChild replaces thisLoc in the btree structure.
+ */
void replaceWithNextChild( const DiskLoc thisLoc, IndexDetails &id );
- /** @return true iff left and right child can be merged into one node */
+ /**
+         * @return true iff the leftIndex and leftIndex + 1 children both
+         * exist, and their packed body sizes plus the thisLoc key at
+         * leftIndex would fit in a single bucket body.
+ */
bool canMergeChildren( const DiskLoc &thisLoc, int leftIndex ) const;
/**
+ * Preconditions:
+ * - leftIndex and leftIndex + 1 children are packed
+ * - leftIndex or leftIndex + 1 child is below lowWaterMark
* @return index of the rebalanced separator; the index value is
- * determined as if we had an array
- * <left bucket keys array>.push( <old separator> ).concat( <right bucket keys array> )
- * This is only expected to be called if the left and right child
- * cannot be merged.
- * This function is expected to be called on packed buckets, see also
- * comments for splitPos().
+ * determined as if we had a bucket with body
+ * <left bucket keys array>.push( <old separator> ).concat( <right bucket keys array> )
+ * and called splitPos( 0 ) on it.
*/
int rebalancedSeparatorPos( const DiskLoc &thisLoc, int leftIndex ) const;
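A small worked illustration of the virtual body described above (key names hypothetical):

    // left child: [ a b c ]    parent separator: s    right child: [ d e ]
    // virtual body: [ a b c s d e ]
    // splitPos( 0 ) over that body yields the split point, e.g. index 3,
    // making 's' the separator again with children of roughly equal
    // packed size.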
- int indexInParent( const DiskLoc &thisLoc ) const;
- BSONObj keyAt(int keyOfs) const {
- return keyOfs >= n ? BSONObj() : keyNode(keyOfs).key;
+ /**
+ * Preconditions: thisLoc has a parent
+ * @return parent's index of thisLoc.
+ */
+ int indexInParent( const DiskLoc &thisLoc ) const;
+
+ public:
+ Key keyAt(int i) const {
+ if( i >= this->n )
+ return Key();
+ return Key(this->data + k(i).keyDataOfs());
}
- static BtreeBucket* allocTemp(); /* caller must release with free() */
+ protected:
+
+ /**
+ * Allocate a temporary btree bucket in ram rather than in memory mapped
+ * storage. The caller must release this bucket with free().
+ */
+ static BtreeBucket<V> * allocTemp();
- /** split bucket */
+ /**
+ * Preconditions:
+ * - This bucket is packed.
+ * - Cannot add a key of size KeyMax to this bucket.
+ * - 0 <= keypos <= n is the position of a new key that will be inserted
+ * - lchild is equal to the existing child at index keypos.
+ * Postconditions:
+ * - The thisLoc bucket is split into two packed buckets, possibly
+ * invalidating the initial position of keypos, with a split key
+         *    promoted to the parent. The new key/recordLoc will be inserted
+ * into one of the split buckets, and lchild/rchild set appropriately.
+ * Splitting may occur recursively, possibly changing the tree head.
+ */
void split(const DiskLoc thisLoc, int keypos,
- const DiskLoc recordLoc, const BSONObj& key,
+ const DiskLoc recordLoc, const Key& key,
const Ordering& order, const DiskLoc lchild, const DiskLoc rchild, IndexDetails& idx);
+ /**
+ * Preconditions:
+ * - 0 <= keypos <= n
+ * - If key / recordLoc are inserted at position keypos, with provided
+ * lchild and rchild, the btree ordering requirements will be
+ * maintained.
+ * - lchild is equal to the existing child at index keypos.
+ * - n == 0 is ok.
+ * Postconditions:
+ * - The key / recordLoc are inserted at position keypos, and the
+ * bucket is split if necessary, which may change the tree head.
+ * - The bucket may be packed or split, invalidating the specified value
+ * of keypos.
+ * This function will always modify thisLoc, but it's marked const because
+ * it commonly relies on the specialized write intent mechanism of basicInsert().
+ */
void insertHere(const DiskLoc thisLoc, int keypos,
- const DiskLoc recordLoc, const BSONObj& key, const Ordering &order,
+ const DiskLoc recordLoc, const Key& key, const Ordering &order,
const DiskLoc lchild, const DiskLoc rchild, IndexDetails &idx) const;
+ /** bt_insert() is basically just a wrapper around this. */
int _insert(const DiskLoc thisLoc, const DiskLoc recordLoc,
- const BSONObj& key, const Ordering &order, bool dupsAllowed,
+ const Key& key, const Ordering &order, bool dupsAllowed,
const DiskLoc lChild, const DiskLoc rChild, IndexDetails &idx) const;
- bool find(const IndexDetails& idx, const BSONObj& key, const DiskLoc &recordLoc, const Ordering &order, int& pos, bool assertIfDup) const;
+
+ bool find(const IndexDetails& idx, const Key& key, const DiskLoc &recordLoc, const Ordering &order, int& pos, bool assertIfDup) const;
bool customFind( int l, int h, const BSONObj &keyBegin, int keyBeginLen, bool afterKey, const vector< const BSONElement * > &keyEnd, const vector< bool > &keyEndInclusive, const Ordering &order, int direction, DiskLoc &thisLoc, int &keyOfs, pair< DiskLoc, int > &bestParent ) const;
static void findLargestKey(const DiskLoc& thisLoc, DiskLoc& largestLoc, int& largestKey);
static int customBSONCmp( const BSONObj &l, const BSONObj &rBegin, int rBeginLen, bool rSup, const vector< const BSONElement * > &rEnd, const vector< bool > &rEndInclusive, const Ordering &o, int direction );
+
+ /** If child is non null, set its parent to thisLoc */
static void fix(const DiskLoc thisLoc, const DiskLoc child);
- /** Replaces an existing key with the new specified key, splitting if necessary */
+ /**
+ * Preconditions:
+ * - 0 <= keypos < n
+ * - If the specified key and recordLoc are placed in keypos of thisLoc,
+ * and lchild and rchild are set, the btree ordering properties will
+ * be maintained.
+ * - rchild == childForPos( keypos + 1 )
+ * - childForPos( keypos ) is referenced elsewhere if nonnull.
+ * Postconditions:
+ * - The key at keypos will be replaced with the specified key and
+ * lchild, potentially splitting this bucket and changing the tree
+ * head.
+ * - childForPos( keypos ) will be orphaned.
+ */
void setInternalKey( const DiskLoc thisLoc, int keypos,
- const DiskLoc recordLoc, const BSONObj &key, const Ordering &order,
+ const DiskLoc recordLoc, const Key &key, const Ordering &order,
const DiskLoc lchild, const DiskLoc rchild, IndexDetails &idx);
/**
- * Deletes the specified key, replacing it with the key immediately
- * preceding or succeeding it in the btree. Either the left or right
- * child of the specified key must be non null.
+ * Preconditions:
+ * - 0 <= keypos < n
+ * - The keypos or keypos+1 indexed child is non null.
+ * Postconditions:
+ * - The specified key is deleted by replacing it with another key if
+ * possible. This replacement may cause a split and change the tree
+ * head. The replacement key will be deleted from its original
+ * location, potentially causing merges and splits that may invalidate
+ * 'this' and thisLoc and change the tree head.
+ * - If the key cannot be replaced, it will be marked as unused. This
+ * is only expected in legacy btrees.
*/
void deleteInternalKey( const DiskLoc thisLoc, int keypos, IndexDetails &id, const Ordering &order );
public:
/** simply builds and returns a dup key error message string */
- static string dupKeyError( const IndexDetails& idx , const BSONObj& key );
+ static string dupKeyError( const IndexDetails& idx , const Key& key );
};
#pragma pack()
+ class FieldRangeVector;
+ class FieldRangeVectorIterator;
+
class BtreeCursor : public Cursor {
- public:
+ protected:
BtreeCursor( NamespaceDetails *_d, int _idxNo, const IndexDetails&, const BSONObj &startKey, const BSONObj &endKey, bool endKeyInclusive, int direction );
BtreeCursor( NamespaceDetails *_d, int _idxNo, const IndexDetails& _id, const shared_ptr< FieldRangeVector > &_bounds, int _direction );
+ public:
+ virtual ~BtreeCursor();
+ /** makes an appropriate subclass depending on the index version */
+ static BtreeCursor* make( NamespaceDetails *_d, int _idxNo, const IndexDetails&, const BSONObj &startKey, const BSONObj &endKey, bool endKeyInclusive, int direction );
+ static BtreeCursor* make( NamespaceDetails *_d, int _idxNo, const IndexDetails& _id, const shared_ptr< FieldRangeVector > &_bounds, int _direction );
+
virtual bool ok() { return !bucket.isNull(); }
virtual bool advance();
virtual void noteLocation(); // updates keyAtKeyOfs...
- virtual void checkLocation();
+ virtual void checkLocation() = 0;
virtual bool supportGetMore() { return true; }
virtual bool supportYields() { return true; }
@@ -462,7 +1008,7 @@ namespace mongo {
* if a multikey index traversal:
* if loc has already been sent, returns true.
* otherwise, marks loc as sent.
- * @return true if the loc has not been seen
+ * @return false if the loc has not been seen
*/
virtual bool getsetdup(DiskLoc loc) {
if( _multikey ) {
@@ -475,18 +1021,17 @@ namespace mongo {
virtual bool modifiedKeys() const { return _multikey; }
virtual bool isMultiKey() const { return _multikey; }
- const _KeyNode& _currKeyNode() const {
+ /*const _KeyNode& _currKeyNode() const {
assert( !bucket.isNull() );
- const _KeyNode& kn = bucket.btree()->k(keyOfs);
+ const _KeyNode& kn = keyNode(keyOfs);
assert( kn.isUsed() );
return kn;
- }
- const KeyNode currKeyNode() const {
- assert( !bucket.isNull() );
- return bucket.btree()->keyNode(keyOfs);
- }
+ }*/
- virtual BSONObj currKey() const { return currKeyNode().key; }
+ /** returns BSONObj() if ofs is out of range */
+ virtual BSONObj keyAt(int ofs) const = 0;
+
+ virtual BSONObj currKey() const = 0;
virtual BSONObj indexKeyPattern() { return indexDetails.keyPattern(); }
virtual void aboutToDeleteBucket(const DiskLoc& b) {
@@ -494,33 +1039,22 @@ namespace mongo {
keyOfs = -1;
}
- virtual DiskLoc currLoc() { return !bucket.isNull() ? _currKeyNode().recordLoc : DiskLoc(); }
+ virtual DiskLoc currLoc() = 0; // { return !bucket.isNull() ? _currKeyNode().recordLoc : DiskLoc(); }
virtual DiskLoc refLoc() { return currLoc(); }
virtual Record* _current() { return currLoc().rec(); }
virtual BSONObj current() { return BSONObj(_current()); }
- virtual string toString() {
- string s = string("BtreeCursor ") + indexDetails.indexName();
- if ( _direction < 0 ) s += " reverse";
- if ( _bounds.get() && _bounds->size() > 1 ) s += " multi";
- return s;
- }
+ virtual string toString();
BSONObj prettyKey( const BSONObj &key ) const {
return key.replaceFieldNames( indexDetails.keyPattern() ).clientReadable();
}
- virtual BSONObj prettyIndexBounds() const {
- if ( !_independentFieldRanges ) {
- return BSON( "start" << prettyKey( startKey ) << "end" << prettyKey( endKey ) );
- }
- else {
- return _bounds->obj();
- }
- }
+ virtual BSONObj prettyIndexBounds() const;
void forgetEndKey() { endKey = BSONObj(); }
virtual CoveredIndexMatcher *matcher() const { return _matcher.get(); }
+ virtual shared_ptr< CoveredIndexMatcher > matcherPtr() const { return _matcher; }
virtual void setMatcher( shared_ptr< CoveredIndexMatcher > matcher ) { _matcher = matcher; }
@@ -529,12 +1063,16 @@ namespace mongo {
/** for debugging only */
const DiskLoc getBucket() const { return bucket; }
- private:
+ // just for unit tests
+ virtual bool curKeyHasChild() = 0;
+
+ protected:
/**
* Our btrees may (rarely) have "unused" keys when items are deleted.
* Skip past them.
*/
- bool skipUnusedKeys( bool mayJump );
+ virtual bool skipUnusedKeys() = 0;
+
bool skipOutOfRangeKeysAndCheckEnd();
void skipAndCheck();
void checkEnd();
@@ -542,14 +1080,17 @@ namespace mongo {
/** selective audits on construction */
void audit();
+ virtual void _audit() = 0;
+ virtual DiskLoc _locate(const BSONObj& key, const DiskLoc& loc) = 0;
+ virtual DiskLoc _advance(const DiskLoc& thisLoc, int& keyOfs, int direction, const char *caller) = 0;
+ virtual void _advanceTo(DiskLoc &thisLoc, int &keyOfs, const BSONObj &keyBegin, int keyBeginLen, bool afterKey, const vector< const BSONElement * > &keyEnd, const vector< bool > &keyEndInclusive, const Ordering &order, int direction ) = 0;
+
/** set initial bucket */
void init();
/** if afterKey is true, we want the first key with values of the keyBegin fields greater than keyBegin */
void advanceTo( const BSONObj &keyBegin, int keyBeginLen, bool afterKey, const vector< const BSONElement * > &keyEnd, const vector< bool > &keyEndInclusive );
- friend class BtreeBucket;
-
set<DiskLoc> _dups;
NamespaceDetails * const d;
const int idxNo;
@@ -566,56 +1107,31 @@ namespace mongo {
BSONObj keyAtKeyOfs; // so we can tell if things moved around on us between the query and the getMore call
DiskLoc locAtKeyOfs;
const shared_ptr< FieldRangeVector > _bounds;
- auto_ptr< FieldRangeVector::Iterator > _boundsIterator;
+ auto_ptr< FieldRangeVectorIterator > _boundsIterator;
const IndexSpec& _spec;
shared_ptr< CoveredIndexMatcher > _matcher;
bool _independentFieldRanges;
long long _nscanned;
};
-
- inline bool IndexDetails::hasKey(const BSONObj& key) {
- return head.btree()->exists(*this, head, key, Ordering::make(keyPattern()));
- }
- inline bool IndexDetails::wouldCreateDup(const BSONObj& key, DiskLoc self) {
- return head.btree()->wouldCreateDup(*this, head, key, Ordering::make(keyPattern()), self);
- }
+ /** Renames the index namespace for this btree's index. */
+ void renameIndexNamespace(const char *oldNs, const char *newNs);
/**
- * build btree from the bottom up
- * _ TODO dropDups
+ * give us a writable version of the btree bucket (declares write intent).
+ * note it is likely more efficient to declare write intent on something smaller when you can.
*/
- class BtreeBuilder {
- bool dupsAllowed;
- IndexDetails& idx;
- unsigned long long n;
- BSONObj keyLast;
- BSONObj order;
- Ordering ordering;
- bool committed;
-
- DiskLoc cur, first;
- BtreeBucket *b;
-
- void newBucket();
- void buildNextLevel(DiskLoc);
- void mayCommitProgressDurably();
-
- public:
- ~BtreeBuilder();
-
- BtreeBuilder(bool _dupsAllowed, IndexDetails& _idx);
-
- /** keys must be added in order */
- void addKey(BSONObj& key, DiskLoc loc);
-
- /**
- * commit work. if not called, destructor will clean up partially completed work
- * (in case exception has happened).
- */
- void commit();
+ template< class V >
+ BtreeBucket<V> * DiskLoc::btreemod() const {
+ assert( _a != -1 );
+ BtreeBucket<V> *b = const_cast< BtreeBucket<V> * >( btree<V>() );
+ return static_cast< BtreeBucket<V>* >( getDur().writingPtr( b, V::BucketSize ) );
+ }
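A minimal usage sketch of btreemod(), mirroring the builder code elsewhere in this patch (V1 assumed):

    DiskLoc loc = BtreeBucket<V1>::addBucket(idx);  // fresh, init()-ed bucket
    BtreeBucket<V1> *b = loc.btreemod<V1>();        // write intent on all BucketSize bytes
    b->setTempNext(DiskLoc());                      // mutation is now journaled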
- unsigned long long getn() { return n; }
- };
+ template< class V >
+ BucketBasics<V>::KeyNode::KeyNode(const BucketBasics<V>& bb, const _KeyNode &k) :
+ prevChildBucket(k.prevChildBucket),
+ recordLoc(k.recordLoc), key(bb.data+k.keyDataOfs())
+ { }
} // namespace mongo;
diff --git a/db/btreebuilder.cpp b/db/btreebuilder.cpp
new file mode 100644
index 0000000..0ec587a
--- /dev/null
+++ b/db/btreebuilder.cpp
@@ -0,0 +1,184 @@
+// btreebuilder.cpp
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+#include "db.h"
+#include "btree.h"
+#include "pdfile.h"
+#include "json.h"
+#include "clientcursor.h"
+#include "client.h"
+#include "dbhelpers.h"
+#include "curop-inl.h"
+#include "stats/counters.h"
+#include "dur_commitjob.h"
+#include "btreebuilder.h"
+
+namespace mongo {
+
+ /* --- BtreeBuilder --- */
+
+ template<class V>
+ BtreeBuilder<V>::BtreeBuilder(bool _dupsAllowed, IndexDetails& _idx) :
+ dupsAllowed(_dupsAllowed),
+ idx(_idx),
+ n(0),
+ order( idx.keyPattern() ),
+ ordering( Ordering::make(idx.keyPattern()) ) {
+ first = cur = BtreeBucket<V>::addBucket(idx);
+ b = cur.btreemod<V>();
+ committed = false;
+ }
+
+ template<class V>
+ void BtreeBuilder<V>::newBucket() {
+ DiskLoc L = BtreeBucket<V>::addBucket(idx);
+ b->setTempNext(L);
+ cur = L;
+ b = cur.btreemod<V>();
+ }
+
+ template<class V>
+ void BtreeBuilder<V>::mayCommitProgressDurably() {
+ if ( getDur().commitIfNeeded() ) {
+ b = cur.btreemod<V>();
+ }
+ }
+
+ template<class V>
+ void BtreeBuilder<V>::addKey(BSONObj& _key, DiskLoc loc) {
+
+ auto_ptr< KeyOwned > key( new KeyOwned(_key) );
+ if ( key->dataSize() > BtreeBucket<V>::KeyMax ) {
+ problem() << "Btree::insert: key too large to index, skipping " << idx.indexNamespace()
+ << ' ' << key->dataSize() << ' ' << key->toString() << endl;
+ return;
+ }
+
+ if( !dupsAllowed ) {
+ if( n > 0 ) {
+ int cmp = keyLast->woCompare(*key, ordering);
+ massert( 10288 , "bad key order in BtreeBuilder - server internal error", cmp <= 0 );
+ if( cmp == 0 ) {
+ //if( !dupsAllowed )
+ uasserted( ASSERT_ID_DUPKEY , BtreeBucket<V>::dupKeyError( idx , *keyLast ) );
+ }
+ }
+ }
+
+ if ( ! b->_pushBack(loc, *key, ordering, DiskLoc()) ) {
+ // bucket was full
+ newBucket();
+ b->pushBack(loc, *key, ordering, DiskLoc());
+ }
+ keyLast = key;
+ n++;
+ mayCommitProgressDurably();
+ }
+
+ template<class V>
+ void BtreeBuilder<V>::buildNextLevel(DiskLoc loc) {
+ int levels = 1;
+ while( 1 ) {
+ if( loc.btree<V>()->tempNext().isNull() ) {
+ // only 1 bucket at this level. we are done.
+ getDur().writingDiskLoc(idx.head) = loc;
+ break;
+ }
+ levels++;
+
+ DiskLoc upLoc = BtreeBucket<V>::addBucket(idx);
+ DiskLoc upStart = upLoc;
+ BtreeBucket<V> *up = upLoc.btreemod<V>();
+
+ DiskLoc xloc = loc;
+ while( !xloc.isNull() ) {
+ if ( getDur().commitIfNeeded() ) {
+ b = cur.btreemod<V>();
+ up = upLoc.btreemod<V>();
+ }
+
+ BtreeBucket<V> *x = xloc.btreemod<V>();
+ Key k;
+ DiskLoc r;
+ x->popBack(r,k);
+ bool keepX = ( x->n != 0 );
+ DiskLoc keepLoc = keepX ? xloc : x->nextChild;
+
+ if ( ! up->_pushBack(r, k, ordering, keepLoc) ) {
+ // current bucket full
+ DiskLoc n = BtreeBucket<V>::addBucket(idx);
+ up->setTempNext(n);
+ upLoc = n;
+ up = upLoc.btreemod<V>();
+ up->pushBack(r, k, ordering, keepLoc);
+ }
+
+ DiskLoc nextLoc = x->tempNext(); // get next in chain at current level
+ if ( keepX ) {
+ x->parent = upLoc;
+ }
+ else {
+ if ( !x->nextChild.isNull() ) {
+ DiskLoc ll = x->nextChild;
+ ll.btreemod<V>()->parent = upLoc;
+ //(x->nextChild.btreemod<V>())->parent = upLoc;
+ }
+ x->deallocBucket( xloc, idx );
+ }
+ xloc = nextLoc;
+ }
+
+ loc = upStart;
+ mayCommitProgressDurably();
+ }
+
+ if( levels > 1 )
+ log(2) << "btree levels: " << levels << endl;
+ }
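Sketching what the loop above does to the tempNext chain, one level per pass (keys hypothetical):

    // level 0 (leaf chain):  [ a b c ] -> [ d e f ] -> [ g h ]
    // popBack() promotes the last key of each bucket into a new chain:
    // level 1:               [ c f h ]
    // a bucket emptied by popBack() (n == 0) is deallocated and its
    // nextChild takes its place; when a pass leaves a single bucket,
    // that bucket becomes idx.head and the loop exits.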
+
+ /** when all addKeys are done, we then build the higher levels of the tree */
+ template<class V>
+ void BtreeBuilder<V>::commit() {
+ buildNextLevel(first);
+ committed = true;
+ }
+
+ template<class V>
+ BtreeBuilder<V>::~BtreeBuilder() {
+ DESTRUCTOR_GUARD(
+ if( !committed ) {
+ log(2) << "Rolling back partially built index space" << endl;
+ DiskLoc x = first;
+ while( !x.isNull() ) {
+ DiskLoc next = x.btree<V>()->tempNext();
+ string ns = idx.indexNamespace();
+ theDataFileMgr._deleteRecord(nsdetails(ns.c_str()), ns.c_str(), x.rec(), x);
+ x = next;
+ getDur().commitIfNeeded();
+ }
+ assert( idx.head.isNull() );
+ log(2) << "done rollback" << endl;
+ }
+ )
+ }
+
+ template class BtreeBuilder<V0>;
+ template class BtreeBuilder<V1>;
+
+}
diff --git a/db/btreebuilder.h b/db/btreebuilder.h
new file mode 100644
index 0000000..6de55d8
--- /dev/null
+++ b/db/btreebuilder.h
@@ -0,0 +1,53 @@
+#pragma once
+
+#include "btree.h"
+
+namespace mongo {
+
+ /**
+ * build btree from the bottom up
+ */
+ template< class V >
+ class BtreeBuilder {
+ typedef typename V::KeyOwned KeyOwned;
+ typedef typename V::Key Key;
+
+ bool dupsAllowed;
+ IndexDetails& idx;
+ /** Number of keys added to btree. */
+ unsigned long long n;
+ /** Last key passed to addKey(). */
+ auto_ptr< typename V::KeyOwned > keyLast;
+ BSONObj order;
+ Ordering ordering;
+ /** true iff commit() completed successfully. */
+ bool committed;
+
+ DiskLoc cur, first;
+ BtreeBucket<V> *b;
+
+ void newBucket();
+ void buildNextLevel(DiskLoc);
+ void mayCommitProgressDurably();
+
+ public:
+ ~BtreeBuilder();
+
+ BtreeBuilder(bool _dupsAllowed, IndexDetails& _idx);
+
+ /**
+ * Preconditions: 'key' is > or >= last key passed to this function (depends on _dupsAllowed)
+ * Postconditions: 'key' is added to intermediate storage.
+ */
+ void addKey(BSONObj& key, DiskLoc loc);
+
+ /**
+ * commit work. if not called, destructor will clean up partially completed work
+ * (in case exception has happened).
+ */
+ void commit();
+
+ unsigned long long getn() { return n; }
+ };
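A hypothetical driver for this class; 'idx' and the pre-sorted (key, loc) pairs are assumed to come from an external-sort phase:

    BtreeBuilder<V1> builder( /*dupsAllowed*/ false, idx );
    for ( vector< pair<BSONObj,DiskLoc> >::iterator it = sorted.begin();
          it != sorted.end(); ++it ) {
        builder.addKey( it->first, it->second );  // keys must arrive in order
    }
    builder.commit();  // without this, the destructor rolls the build back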
+
+}
diff --git a/db/btreecursor.cpp b/db/btreecursor.cpp
index ce841ce..f39d5bb 100644
--- a/db/btreecursor.cpp
+++ b/db/btreecursor.cpp
@@ -21,10 +21,254 @@
#include "pdfile.h"
#include "jsobj.h"
#include "curop-inl.h"
+#include "queryutil.h"
namespace mongo {
- extern int otherTraceLevel;
+ template< class V >
+ class BtreeCursorImpl : public BtreeCursor {
+ public:
+ typedef typename BucketBasics<V>::KeyNode KeyNode;
+ typedef typename V::Key Key;
+ typedef typename V::_KeyNode _KeyNode;
+
+ BtreeCursorImpl(NamespaceDetails *a, int b, const IndexDetails& c, const BSONObj &d, const BSONObj &e, bool f, int g) :
+ BtreeCursor(a,b,c,d,e,f,g) { }
+ BtreeCursorImpl(NamespaceDetails *_d, int _idxNo, const IndexDetails& _id, const shared_ptr< FieldRangeVector > &_bounds, int _direction) :
+ BtreeCursor(_d,_idxNo,_id,_bounds,_direction)
+ {
+ pair< DiskLoc, int > noBestParent;
+ indexDetails.head.btree<V>()->customLocate( bucket, keyOfs, startKey, 0, false, _boundsIterator->cmp(), _boundsIterator->inc(), _ordering, _direction, noBestParent );
+ skipAndCheck();
+ dassert( _dups.size() == 0 );
+ }
+
+ virtual DiskLoc currLoc() {
+ if( bucket.isNull() ) return DiskLoc();
+ return currKeyNode().recordLoc;
+ }
+
+ virtual BSONObj keyAt(int ofs) const {
+ assert( !bucket.isNull() );
+ const BtreeBucket<V> *b = bucket.btree<V>();
+ int n = b->getN();
+ if( n == 0xffff ) {
+ throw UserException(15850, "keyAt bucket deleted");
+ }
+ dassert( n >= 0 && n < 10000 );
+ return ofs >= n ? BSONObj() : b->keyNode(ofs).key.toBson();
+ }
+
+ virtual BSONObj currKey() const {
+ assert( !bucket.isNull() );
+ return bucket.btree<V>()->keyNode(keyOfs).key.toBson();
+ }
+
+ virtual bool curKeyHasChild() {
+ return !currKeyNode().prevChildBucket.isNull();
+ }
+
+ bool skipUnusedKeys() {
+ int u = 0;
+ while ( 1 ) {
+ if ( !ok() )
+ break;
+ const _KeyNode& kn = keyNode(keyOfs);
+ if ( kn.isUsed() )
+ break;
+ bucket = _advance(bucket, keyOfs, _direction, "skipUnusedKeys");
+ u++;
+ //don't include unused keys in nscanned
+ //++_nscanned;
+ }
+ if ( u > 10 )
+ OCCASIONALLY log() << "btree unused skipped:" << u << '\n';
+ return u;
+ }
+
+ /* Since the last noteLocation(), our key may have moved around, and that old cached
+ information may thus be stale and wrong (although often it is right). We check
+ that here; if we have moved, we have to search back for where we were at.
+
+ i.e., after operations on the index, the BtreeCursor's cached location info may
+ be invalid. This function ensures validity, so you should call it before using
+ the cursor if other writers have used the database since the last noteLocation
+ call.
+ */
+ void checkLocation() {
+ if ( eof() )
+ return;
+
+ _multikey = d->isMultikey(idxNo);
+
+ if ( keyOfs >= 0 ) {
+ assert( !keyAtKeyOfs.isEmpty() );
+
+ try {
+ // Note keyAt() returns an empty BSONObj if keyOfs is now out of range,
+ // which is possible as keys may have been deleted.
+ int x = 0;
+ while( 1 ) {
+ // if ( b->keyAt(keyOfs).woEqual(keyAtKeyOfs) &&
+ // b->k(keyOfs).recordLoc == locAtKeyOfs ) {
+ if ( keyAt(keyOfs).binaryEqual(keyAtKeyOfs) ) {
+ const _KeyNode& kn = keyNode(keyOfs);
+ if( kn.recordLoc == locAtKeyOfs ) {
+ if ( !kn.isUsed() ) {
+ // we were deleted but still exist as an unused
+ // marker key. advance.
+ skipUnusedKeys();
+ }
+ return;
+ }
+ }
+
+ // we check one key earlier too, in case a key was just deleted. this is
+ // important so that multi updates are reasonably fast.
+ if( keyOfs == 0 || x++ )
+ break;
+ keyOfs--;
+ }
+ }
+ catch(UserException& e) {
+ if( e.getCode() != 15850 )
+ throw;
+ // hack: fall through if bucket was just deleted. should only happen under deleteObjects()
+ DEV log() << "debug info: bucket was deleted" << endl;
+ }
+ }
+
+ /* normally we don't get to here. when we do, old position is no longer
+ valid and we must refind where we left off (which is expensive)
+ */
+
+ /* TODO: Switch to keep indexdetails and do idx.head! */
+ bucket = _locate(keyAtKeyOfs, locAtKeyOfs);
+ RARELY log() << "key seems to have moved in the index, refinding. " << bucket.toString() << endl;
+ if ( ! bucket.isNull() )
+ skipUnusedKeys();
+
+ }
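The intended pairing, sketched with hypothetical surrounding code (lock handling elided):

    cursor->noteLocation();   // cache the current key/recordLoc
    // ... lock yielded; other writers may insert or delete index keys ...
    cursor->checkLocation();  // revalidate, refinding the position if keys moved
    if ( cursor->ok() ) {
        BSONObj k = cursor->currKey();  // safe to resume iteration
    }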
+
+ protected:
+ virtual void _advanceTo(DiskLoc &thisLoc, int &keyOfs, const BSONObj &keyBegin, int keyBeginLen, bool afterKey, const vector< const BSONElement * > &keyEnd, const vector< bool > &keyEndInclusive, const Ordering &order, int direction ) {
+ thisLoc.btree<V>()->advanceTo(thisLoc, keyOfs, keyBegin, keyBeginLen, afterKey, keyEnd, keyEndInclusive, order, direction);
+ }
+ virtual DiskLoc _advance(const DiskLoc& thisLoc, int& keyOfs, int direction, const char *caller) {
+ return thisLoc.btree<V>()->advance(thisLoc, keyOfs, direction, caller);
+ }
+ virtual void _audit() {
+ out() << "BtreeCursor(). dumping head bucket" << endl;
+ indexDetails.head.btree<V>()->dump();
+ }
+ virtual DiskLoc _locate(const BSONObj& key, const DiskLoc& loc) {
+ bool found;
+ return indexDetails.head.btree<V>()->
+ locate(indexDetails, indexDetails.head, key, _ordering, keyOfs, found, loc, _direction);
+ }
+
+ const _KeyNode& keyNode(int keyOfs) const {
+ return bucket.btree<V>()->k(keyOfs);
+ }
+
+ private:
+ const KeyNode currKeyNode() const {
+ assert( !bucket.isNull() );
+ const BtreeBucket<V> *b = bucket.btree<V>();
+ return b->keyNode(keyOfs);
+ }
+ };
+
+ template class BtreeCursorImpl<V0>;
+ template class BtreeCursorImpl<V1>;
+
+ /*
+ class BtreeCursorV1 : public BtreeCursor {
+ public:
+ typedef BucketBasics<V1>::KeyNode KeyNode;
+ typedef V1::Key Key;
+
+ BtreeCursorV1(NamespaceDetails *a, int b, const IndexDetails& c, const BSONObj &d, const BSONObj &e, bool f, int g) :
+ BtreeCursor(a,b,c,d,e,f,g) { }
+ BtreeCursorV1(NamespaceDetails *_d, int _idxNo, const IndexDetails& _id, const shared_ptr< FieldRangeVector > &_bounds, int _direction) :
+ BtreeCursor(_d,_idxNo,_id,_bounds,_direction)
+ {
+ pair< DiskLoc, int > noBestParent;
+ indexDetails.head.btree<V1>()->customLocate( bucket, keyOfs, startKey, 0, false, _boundsIterator->cmp(), _boundsIterator->inc(), _ordering, _direction, noBestParent );
+ skipAndCheck();
+ dassert( _dups.size() == 0 );
+ }
+
+ virtual DiskLoc currLoc() {
+ if( bucket.isNull() ) return DiskLoc();
+ return currKeyNode().recordLoc;
+ }
+
+ virtual BSONObj currKey() const {
+ assert( !bucket.isNull() );
+ return bucket.btree<V1>()->keyNode(keyOfs).key.toBson();
+ }
+
+ protected:
+ virtual void _advanceTo(DiskLoc &thisLoc, int &keyOfs, const BSONObj &keyBegin, int keyBeginLen, bool afterKey, const vector< const BSONElement * > &keyEnd, const vector< bool > &keyEndInclusive, const Ordering &order, int direction ) {
+ thisLoc.btree<V1>()->advanceTo(thisLoc, keyOfs, keyBegin, keyBeginLen, afterKey, keyEnd, keyEndInclusive, order, direction);
+ }
+ virtual DiskLoc _advance(const DiskLoc& thisLoc, int& keyOfs, int direction, const char *caller) {
+ return thisLoc.btree<V1>()->advance(thisLoc, keyOfs, direction, caller);
+ }
+ virtual void _audit() {
+ out() << "BtreeCursor(). dumping head bucket" << endl;
+ indexDetails.head.btree<V1>()->dump();
+ }
+ virtual DiskLoc _locate(const BSONObj& key, const DiskLoc& loc);
+ virtual const _KeyNode& keyNode(int keyOfs) {
+ return bucket.btree<V1>()->k(keyOfs);
+ }
+
+ private:
+ const KeyNode currKeyNode() const {
+ assert( !bucket.isNull() );
+ const BtreeBucket<V1> *b = bucket.btree<V1>();
+ return b->keyNode(keyOfs);
+ }
+ };*/
+
+ BtreeCursor* BtreeCursor::make(
+ NamespaceDetails *_d, int _idxNo, const IndexDetails& _id,
+ const BSONObj &startKey, const BSONObj &endKey, bool endKeyInclusive, int direction)
+ {
+ int v = _id.version();
+ BtreeCursor *c = 0;
+ if( v == 1 ) {
+ c = new BtreeCursorImpl<V1>(_d,_idxNo,_id,startKey,endKey,endKeyInclusive,direction);
+ }
+ else if( v == 0 ) {
+ c = new BtreeCursorImpl<V0>(_d,_idxNo,_id,startKey,endKey,endKeyInclusive,direction);
+ }
+ else {
+ uasserted(14800, str::stream() << "unsupported index version " << v);
+ }
+ c->init();
+ dassert( c->_dups.size() == 0 );
+ return c;
+ }
+
+ BtreeCursor* BtreeCursor::make(
+ NamespaceDetails *_d, int _idxNo, const IndexDetails& _id,
+ const shared_ptr< FieldRangeVector > &_bounds, int _direction )
+ {
+ int v = _id.version();
+ if( v == 1 )
+ return new BtreeCursorImpl<V1>(_d,_idxNo,_id,_bounds,_direction);
+ if( v == 0 )
+ return new BtreeCursorImpl<V0>(_d,_idxNo,_id,_bounds,_direction);
+ uasserted(14801, str::stream() << "unsupported index version " << v);
+
+ // just check we are in sync with this method
+ dassert( IndexDetails::isASupportedIndexVersionNumber(v) );
+
+ return 0;
+ }
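A hedged sketch of consuming the factory above (setup of 'd', 'idxNo' and 'id' elided):

    scoped_ptr<BtreeCursor> c( BtreeCursor::make( d, idxNo, id, startKey, endKey,
                                                  /*endKeyInclusive*/ true,
                                                  /*direction*/ 1 ) );
    while ( c->ok() ) {
        BSONObj k = c->currKey();
        DiskLoc loc = c->currLoc();
        c->advance();
    }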
BtreeCursor::BtreeCursor( NamespaceDetails *_d, int _idxNo, const IndexDetails &_id,
const BSONObj &_startKey, const BSONObj &_endKey, bool endKeyInclusive, int _direction ) :
@@ -41,8 +285,6 @@ namespace mongo {
_independentFieldRanges( false ),
_nscanned( 0 ) {
audit();
- init();
- dassert( _dups.size() == 0 );
}
BtreeCursor::BtreeCursor( NamespaceDetails *_d, int _idxNo, const IndexDetails& _id, const shared_ptr< FieldRangeVector > &_bounds, int _direction )
@@ -55,7 +297,7 @@ namespace mongo {
_ordering( Ordering::make( _order ) ),
_direction( _direction ),
_bounds( ( assert( _bounds.get() ), _bounds ) ),
- _boundsIterator( new FieldRangeVector::Iterator( *_bounds ) ),
+ _boundsIterator( new FieldRangeVectorIterator( *_bounds ) ),
_spec( _id.getSpec() ),
_independentFieldRanges( true ),
_nscanned( 0 ) {
@@ -64,28 +306,15 @@ namespace mongo {
startKey = _bounds->startKey();
_boundsIterator->advance( startKey ); // handles initialization
_boundsIterator->prepDive();
- pair< DiskLoc, int > noBestParent;
bucket = indexDetails.head;
keyOfs = 0;
- indexDetails.head.btree()->customLocate( bucket, keyOfs, startKey, 0, false, _boundsIterator->cmp(), _boundsIterator->inc(), _ordering, _direction, noBestParent );
- skipAndCheck();
- dassert( _dups.size() == 0 );
}
+ /** Properly destroy forward declared class members. */
+ BtreeCursor::~BtreeCursor() {}
+
void BtreeCursor::audit() {
- indexDetails.checkVersion();
dassert( d->idxNo((IndexDetails&) indexDetails) == idxNo );
-
- if ( otherTraceLevel >= 12 ) {
- if ( otherTraceLevel >= 200 ) {
- out() << "::BtreeCursor() qtl>200. validating entire index." << endl;
- indexDetails.head.btree()->fullValidate(indexDetails.head, _order);
- }
- else {
- out() << "BTreeCursor(). dumping head bucket" << endl;
- indexDetails.head.btree()->dump();
- }
- }
}
void BtreeCursor::init() {
@@ -93,24 +322,28 @@ namespace mongo {
startKey = _spec.getType()->fixKey( startKey );
endKey = _spec.getType()->fixKey( endKey );
}
- bool found;
- bucket = indexDetails.head.btree()->
- locate(indexDetails, indexDetails.head, startKey, _ordering, keyOfs, found, _direction > 0 ? minDiskLoc : maxDiskLoc, _direction);
+ bucket = _locate(startKey, _direction > 0 ? minDiskLoc : maxDiskLoc);
if ( ok() ) {
_nscanned = 1;
}
- skipUnusedKeys( false );
+ skipUnusedKeys();
checkEnd();
}
void BtreeCursor::skipAndCheck() {
- skipUnusedKeys( true );
+ int startNscanned = _nscanned;
+ skipUnusedKeys();
while( 1 ) {
if ( !skipOutOfRangeKeysAndCheckEnd() ) {
break;
}
- while( skipOutOfRangeKeysAndCheckEnd() );
- if ( !skipUnusedKeys( true ) ) {
+ do {
+ if ( _nscanned > startNscanned + 20 ) {
+ skipUnusedKeys();
+ return;
+ }
+ } while( skipOutOfRangeKeysAndCheckEnd() );
+ if ( !skipUnusedKeys() ) {
break;
}
}
@@ -120,7 +353,7 @@ namespace mongo {
if ( !ok() ) {
return false;
}
- int ret = _boundsIterator->advance( currKeyNode().key );
+ int ret = _boundsIterator->advance( currKey() );
if ( ret == -2 ) {
bucket = DiskLoc();
return false;
@@ -130,33 +363,10 @@ namespace mongo {
return false;
}
++_nscanned;
- advanceTo( currKeyNode().key, ret, _boundsIterator->after(), _boundsIterator->cmp(), _boundsIterator->inc() );
+ advanceTo( currKey(), ret, _boundsIterator->after(), _boundsIterator->cmp(), _boundsIterator->inc() );
return true;
}
- /* skip unused keys. */
- bool BtreeCursor::skipUnusedKeys( bool mayJump ) {
- int u = 0;
- while ( 1 ) {
- if ( !ok() )
- break;
- const BtreeBucket *b = bucket.btree();
- const _KeyNode& kn = b->k(keyOfs);
- if ( kn.isUsed() )
- break;
- bucket = b->advance(bucket, keyOfs, _direction, "skipUnusedKeys");
- u++;
- //don't include unused keys in nscanned
- //++_nscanned;
- if ( mayJump && ( u % 10 == 0 ) ) {
- skipOutOfRangeKeysAndCheckEnd();
- }
- }
- if ( u > 10 )
- OCCASIONALLY log() << "btree unused skipped:" << u << '\n';
- return u;
- }
-
// Return a value in the set {-1, 0, 1} to represent the sign of parameter i.
int sgn( int i ) {
if ( i == 0 )
@@ -177,7 +387,7 @@ namespace mongo {
}
void BtreeCursor::advanceTo( const BSONObj &keyBegin, int keyBeginLen, bool afterKey, const vector< const BSONElement * > &keyEnd, const vector< bool > &keyEndInclusive) {
- bucket.btree()->advanceTo( bucket, keyOfs, keyBegin, keyBeginLen, afterKey, keyEnd, keyEndInclusive, _ordering, _direction );
+ _advanceTo( bucket, keyOfs, keyBegin, keyBeginLen, afterKey, keyEnd, keyEndInclusive, _ordering, _direction );
}
bool BtreeCursor::advance() {
@@ -185,10 +395,10 @@ namespace mongo {
if ( bucket.isNull() )
return false;
- bucket = bucket.btree()->advance(bucket, keyOfs, _direction, "BtreeCursor::advance");
+ bucket = _advance(bucket, keyOfs, _direction, "BtreeCursor::advance");
if ( !_independentFieldRanges ) {
- skipUnusedKeys( false );
+ skipUnusedKeys();
checkEnd();
if ( ok() ) {
++_nscanned;
@@ -202,69 +412,27 @@ namespace mongo {
void BtreeCursor::noteLocation() {
if ( !eof() ) {
- BSONObj o = bucket.btree()->keyAt(keyOfs).copy();
+ BSONObj o = currKey().getOwned();
keyAtKeyOfs = o;
- locAtKeyOfs = bucket.btree()->k(keyOfs).recordLoc;
+ locAtKeyOfs = currLoc();
}
}
- /* Since the last noteLocation(), our key may have moved around, and that old cached
- information may thus be stale and wrong (although often it is right). We check
- that here; if we have moved, we have to search back for where we were at.
-
- i.e., after operations on the index, the BtreeCursor's cached location info may
- be invalid. This function ensures validity, so you should call it before using
- the cursor if other writers have used the database since the last noteLocation
- call.
- */
- void BtreeCursor::checkLocation() {
- if ( eof() )
- return;
-
- _multikey = d->isMultikey(idxNo);
-
- if ( keyOfs >= 0 ) {
- const BtreeBucket *b = bucket.btree();
-
- assert( !keyAtKeyOfs.isEmpty() );
-
- // Note keyAt() returns an empty BSONObj if keyOfs is now out of range,
- // which is possible as keys may have been deleted.
- int x = 0;
- while( 1 ) {
- if ( b->keyAt(keyOfs).woEqual(keyAtKeyOfs) &&
- b->k(keyOfs).recordLoc == locAtKeyOfs ) {
- if ( !b->k(keyOfs).isUsed() ) {
- /* we were deleted but still exist as an unused
- marker key. advance.
- */
- skipUnusedKeys( false );
- }
- return;
- }
-
- /* we check one key earlier too, in case a key was just deleted. this is
- important so that multi updates are reasonably fast.
- */
- if( keyOfs == 0 || x++ )
- break;
- keyOfs--;
- }
- }
-
- /* normally we don't get to here. when we do, old position is no longer
- valid and we must refind where we left off (which is expensive)
- */
-
- bool found;
-
- /* TODO: Switch to keep indexdetails and do idx.head! */
- bucket = indexDetails.head.btree()->locate(indexDetails, indexDetails.head, keyAtKeyOfs, _ordering, keyOfs, found, locAtKeyOfs, _direction);
- RARELY log() << " key seems to have moved in the index, refinding. found:" << found << endl;
- if ( ! bucket.isNull() )
- skipUnusedKeys( false );
-
+ string BtreeCursor::toString() {
+ string s = string("BtreeCursor ") + indexDetails.indexName();
+ if ( _direction < 0 ) s += " reverse";
+ if ( _bounds.get() && _bounds->size() > 1 ) s += " multi";
+ return s;
}
+
+ BSONObj BtreeCursor::prettyIndexBounds() const {
+ if ( !_independentFieldRanges ) {
+ return BSON( "start" << prettyKey( startKey ) << "end" << prettyKey( endKey ) );
+ }
+ else {
+ return _bounds->obj();
+ }
+ }
/* ----------------------------------------------------------------------------- */
diff --git a/db/cap.cpp b/db/cap.cpp
index 260b311..a8be238 100644
--- a/db/cap.cpp
+++ b/db/cap.cpp
@@ -26,9 +26,8 @@
#include "btree.h"
#include <algorithm>
#include <list>
-#include "query.h"
-#include "queryutil.h"
#include "json.h"
+#include "clientcursor.h"
/*
capped collection layout
@@ -131,7 +130,12 @@ namespace mongo {
bool NamespaceDetails::inCapExtent( const DiskLoc &dl ) const {
assert( !dl.isNull() );
// We could have a rec or drec, doesn't matter.
- return dl.drec()->myExtent( dl ) == capExtent.ext();
+ bool res = dl.drec()->myExtentLoc(dl) == capExtent;
+ DEV {
+            // old implementation. this check is temporary, to verify that the new, slightly faster impl behaves the same.
+ assert( res == (dl.drec()->myExtent( dl ) == capExtent.ext()) );
+ }
+ return res;
}
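The same verify-then-replace pattern in isolation (computeFast/computeSlow are hypothetical placeholders):

    bool res = computeFast();            // new, cheaper code path
    DEV {
        assert( res == computeSlow() );  // debug-build parity check vs. old path
    }
    return res;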
bool NamespaceDetails::nextIsInCapExtent( const DiskLoc &dl ) const {
@@ -443,7 +447,7 @@ namespace mongo {
for( DiskLoc ext = firstExtent; !ext.isNull(); ext = ext.ext()->xnext ) {
DiskLoc prev = ext.ext()->xprev;
DiskLoc next = ext.ext()->xnext;
- DiskLoc empty = ext.ext()->reuse( ns );
+ DiskLoc empty = ext.ext()->reuse( ns, true );
ext.ext()->xprev.writing() = prev;
ext.ext()->xnext.writing() = next;
addDeletedRec( empty.drec(), empty );
diff --git a/db/client.cpp b/db/client.cpp
index e4fd4b9..c1a359c 100644
--- a/db/client.cpp
+++ b/db/client.cpp
@@ -32,6 +32,9 @@
#include "dbwebserver.h"
#include "../util/mongoutils/html.h"
#include "../util/mongoutils/checksum.h"
+#include "../util/file_allocator.h"
+#include "repl/rs.h"
+#include "../scripting/engine.h"
namespace mongo {
@@ -40,10 +43,50 @@ namespace mongo {
set<Client*> Client::clients; // always be in clientsMutex when manipulating this
boost::thread_specific_ptr<Client> currentClient;
+#if defined(_DEBUG)
+ struct StackChecker;
+ ThreadLocalValue<StackChecker *> checker;
+
+ struct StackChecker {
+ enum { SZ = 256 * 1024 };
+ char buf[SZ];
+ StackChecker() {
+ checker.set(this);
+ }
+ void init() {
+ memset(buf, 42, sizeof(buf));
+ }
+ static void check(const char *tname) {
+ static int max;
+ StackChecker *sc = checker.get();
+ const char *p = sc->buf;
+ int i = 0;
+ for( ; i < SZ; i++ ) {
+ if( p[i] != 42 )
+ break;
+ }
+ int z = SZ-i;
+ if( z > max ) {
+ max = z;
+ log() << "thread " << tname << " stack usage was " << z << " bytes" << endl;
+ }
+ wassert( i > 16000 );
+ }
+ };
+#endif
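The sentinel-fill technique above, reduced to a standalone sketch:

    char buf[SZ];
    memset( buf, 42, sizeof(buf) );       // init(): paint the region
    // ... the thread runs; deeper frames overwrite the paint ...
    int i = 0;
    while ( i < SZ && buf[i] == 42 )      // scan from the deep end
        i++;
    int used = SZ - i;                    // high-water stack usage in bytes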
+
/* each thread which does db operations has a Client object in TLS.
call this when your thread starts.
*/
- Client& Client::initThread(const char *desc, MessagingPort *mp) {
+ Client& Client::initThread(const char *desc, AbstractMessagingPort *mp) {
+#if defined(_DEBUG)
+ {
+ if( sizeof(void*) == 8 ) {
+ StackChecker sc;
+ sc.init();
+ }
+ }
+#endif
assert( currentClient.get() == 0 );
Client *c = new Client(desc, mp);
currentClient.reset(c);
@@ -51,7 +94,7 @@ namespace mongo {
return *c;
}
- Client::Client(const char *desc, MessagingPort *p) :
+ Client::Client(const char *desc, AbstractMessagingPort *p) :
_context(0),
_shutdown(false),
_desc(desc),
@@ -60,6 +103,11 @@ namespace mongo {
_mp(p) {
_connectionId = setThreadName(desc);
_curOp = new CurOp( this );
+#ifndef _WIN32
+ stringstream temp;
+ temp << hex << showbase << pthread_self();
+ _threadId = temp.str();
+#endif
scoped_lock bl(clientsMutex);
clients.insert(this);
}
@@ -74,13 +122,23 @@ namespace mongo {
error() << "Client::shutdown not called: " << _desc << endl;
}
- scoped_lock bl(clientsMutex);
- if ( ! _shutdown )
- clients.erase(this);
- delete _curOp;
+ if ( ! inShutdown() ) {
+ // we can't clean up safely once we're in shutdown
+ scoped_lock bl(clientsMutex);
+ if ( ! _shutdown )
+ clients.erase(this);
+ delete _curOp;
+ }
}
bool Client::shutdown() {
+#if defined(_DEBUG)
+ {
+ if( sizeof(void*) == 8 ) {
+ StackChecker::check( desc() );
+ }
+ }
+#endif
_shutdown = true;
if ( inShutdown() )
return false;
@@ -128,17 +186,21 @@ namespace mongo {
void Client::Context::_finishInit( bool doauth ) {
int lockState = dbMutex.getState();
assert( lockState );
+
+ if ( lockState > 0 && FileAllocator::get()->hasFailed() ) {
+ uassert(14031, "Can't take a write lock while out of disk space", false);
+ }
_db = dbHolder.get( _ns , _path );
if ( _db ) {
_justCreated = false;
}
- else if ( dbMutex.getState() > 0 ) {
+ else if ( lockState > 0 ) {
// already in a write lock
_db = dbHolder.getOrCreate( _ns , _path , _justCreated );
assert( _db );
}
- else if ( dbMutex.getState() < -1 ) {
+ else if ( lockState < -1 ) {
// nested read lock :(
assert( _lock );
_lock->releaseAndWriteLock();
@@ -176,7 +238,7 @@ namespace mongo {
break;
default: {
string errmsg;
- if ( ! shardVersionOk( _ns , lockState > 0 , errmsg ) ) {
+ if ( ! shardVersionOk( _ns , errmsg ) ) {
ostringstream os;
os << "[" << _ns << "] shard version not ok in Client::Context: " << errmsg;
msgassertedNoTrace( StaleConfigInContextCode , os.str().c_str() );
@@ -315,6 +377,19 @@ namespace mongo {
_client = 0;
}
+ void CurOp::enter( Client::Context * context ) {
+ ensureStarted();
+ setNS( context->ns() );
+ _dbprofile = context->_db ? context->_db->profile : 0;
+ }
+
+ void CurOp::leave( Client::Context * context ) {
+ unsigned long long now = curTimeMicros64();
+ Top::global.record( _ns , _op , _lockType , now - _checkpoint , _command );
+ _checkpoint = now;
+ }
+
+
BSONObj CurOp::infoNoauth() {
BSONObjBuilder b;
b.append("opid", _opNum);
@@ -339,20 +414,34 @@ namespace mongo {
clientStr << _remote.toString();
b.append("client", clientStr.str());
- if ( _client )
+ if ( _client ) {
b.append( "desc" , _client->desc() );
-
+ if ( _client->_threadId.size() )
+ b.append( "threadId" , _client->_threadId );
+ if ( _client->_connectionId )
+ b.appendNumber( "connectionId" , _client->_connectionId );
+ }
+
if ( ! _message.empty() ) {
if ( _progressMeter.isActive() ) {
StringBuilder buf(128);
buf << _message.toString() << " " << _progressMeter.toString();
b.append( "msg" , buf.str() );
+ BSONObjBuilder sub( b.subobjStart( "progress" ) );
+ sub.appendNumber( "done" , (long long)_progressMeter.done() );
+ sub.appendNumber( "total" , (long long)_progressMeter.total() );
+ sub.done();
}
else {
b.append( "msg" , _message.toString() );
}
}
+ if( killed() )
+ b.append("killed", true);
+
+ b.append( "numYields" , _numYields );
+
return b.obj();
}
@@ -368,7 +457,14 @@ namespace mongo {
BSONObjBuilder b;
while ( i.more() )
b.append( i.next() );
+
+ b.appendElementsUnique( _handshake );
+
_handshake = b.obj();
+
+ if (theReplSet && o.hasField("member")) {
+ theReplSet->ghost->associateSlave(_remoteId, o["member"].Int());
+ }
}
class HandshakeCmd : public Command {
@@ -378,7 +474,7 @@ namespace mongo {
virtual LockType locktype() const { return NONE; }
virtual bool slaveOk() const { return true; }
virtual bool adminOnly() const { return false; }
- virtual bool run(const string& , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ virtual bool run(const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
Client& c = cc();
c.gotHandshake( cmdObj );
return 1;
@@ -510,4 +606,125 @@ namespace mongo {
return writers + readers;
}
+
+ void OpDebug::reset() {
+ extra.reset();
+
+ op = 0;
+ iscommand = false;
+ ns = "";
+ query = BSONObj();
+ updateobj = BSONObj();
+
+ cursorid = 0;
+ ntoreturn = 0;
+ ntoskip = 0;
+ exhaust = false;
+
+ nscanned = 0;
+ idhack = false;
+ scanAndOrder = false;
+ moved = false;
+ fastmod = false;
+ fastmodinsert = false;
+ upsert = false;
+ keyUpdates = 0;
+
+ exceptionInfo.reset();
+
+ executionTime = 0;
+ nreturned = 0;
+ responseLength = 0;
+ }
+
+
+#define OPDEBUG_TOSTRING_HELP(x) if( x ) s << " " #x ":" << (x)
+ string OpDebug::toString() const {
+ StringBuilder s( ns.size() + 64 );
+ if ( iscommand )
+ s << "command ";
+ else
+ s << opToString( op ) << ' ';
+ s << ns.toString();
+
+ if ( ! query.isEmpty() ) {
+ if ( iscommand )
+ s << " command: ";
+ else
+ s << " query: ";
+ s << query.toString();
+ }
+
+ if ( ! updateobj.isEmpty() ) {
+ s << " update: ";
+ updateobj.toString( s );
+ }
+
+ OPDEBUG_TOSTRING_HELP( cursorid );
+ OPDEBUG_TOSTRING_HELP( ntoreturn );
+ OPDEBUG_TOSTRING_HELP( ntoskip );
+ OPDEBUG_TOSTRING_HELP( exhaust );
+
+ OPDEBUG_TOSTRING_HELP( nscanned );
+ OPDEBUG_TOSTRING_HELP( idhack );
+ OPDEBUG_TOSTRING_HELP( scanAndOrder );
+ OPDEBUG_TOSTRING_HELP( moved );
+ OPDEBUG_TOSTRING_HELP( fastmod );
+ OPDEBUG_TOSTRING_HELP( fastmodinsert );
+ OPDEBUG_TOSTRING_HELP( upsert );
+ OPDEBUG_TOSTRING_HELP( keyUpdates );
+
+ if ( extra.len() )
+ s << " " << extra.str();
+
+ if ( ! exceptionInfo.empty() ) {
+ s << " exception: " << exceptionInfo.msg;
+ if ( exceptionInfo.code )
+ s << " code:" << exceptionInfo.code;
+ }
+
+ OPDEBUG_TOSTRING_HELP( nreturned );
+ if ( responseLength )
+ s << " reslen:" << responseLength;
+ s << " " << executionTime << "ms";
+
+ return s.str();
+ }
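For reference, the helper macro expands mechanically; OPDEBUG_TOSTRING_HELP( nscanned ) becomes

    if( nscanned ) s << " " "nscanned" ":" << (nscanned);

so zero or false fields are simply omitted from the profiling line.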
+
+#define OPDEBUG_APPEND_NUMBER(x) if( x ) b.append( #x , (x) )
+#define OPDEBUG_APPEND_BOOL(x) if( x ) b.appendBool( #x , (x) )
+ void OpDebug::append( const CurOp& curop, BSONObjBuilder& b ) const {
+ b.append( "op" , iscommand ? "command" : opToString( op ) );
+ b.append( "ns" , ns.toString() );
+ if ( ! query.isEmpty() )
+ b.append( iscommand ? "command" : "query" , query );
+ else if ( ! iscommand && curop.haveQuery() )
+ curop.appendQuery( b , "query" );
+
+ if ( ! updateobj.isEmpty() )
+ b.append( "updateobj" , updateobj );
+
+ OPDEBUG_APPEND_NUMBER( cursorid );
+ OPDEBUG_APPEND_NUMBER( ntoreturn );
+ OPDEBUG_APPEND_NUMBER( ntoskip );
+ OPDEBUG_APPEND_BOOL( exhaust );
+
+ OPDEBUG_APPEND_NUMBER( nscanned );
+ OPDEBUG_APPEND_BOOL( idhack );
+ OPDEBUG_APPEND_BOOL( scanAndOrder );
+ OPDEBUG_APPEND_BOOL( moved );
+ OPDEBUG_APPEND_BOOL( fastmod );
+ OPDEBUG_APPEND_BOOL( fastmodinsert );
+ OPDEBUG_APPEND_BOOL( upsert );
+ OPDEBUG_APPEND_NUMBER( keyUpdates );
+
+ if ( ! exceptionInfo.empty() )
+ exceptionInfo.append( b , "exception" , "exceptionCode" );
+
+ OPDEBUG_APPEND_NUMBER( nreturned );
+ OPDEBUG_APPEND_NUMBER( responseLength );
+ b.append( "millis" , executionTime );
+
+ }
+
}
diff --git a/db/client.h b/db/client.h
index 4e8589e..a8e3138 100644
--- a/db/client.h
+++ b/db/client.h
@@ -38,12 +38,13 @@ namespace mongo {
class CurOp;
class Command;
class Client;
- class MessagingPort;
+ class AbstractMessagingPort;
extern boost::thread_specific_ptr<Client> currentClient;
typedef long long ConnectionId;
+ /** the database's concept of an outside "client" */
class Client : boost::noncopyable {
public:
class Context;
@@ -52,14 +53,14 @@ namespace mongo {
static set<Client*> clients; // always be in clientsMutex when manipulating this
static int recommendedYieldMicros( int * writers = 0 , int * readers = 0 );
static int getActiveClientCount( int& writers , int& readers );
-
static Client *syncThread;
-
/* each thread which does db operations has a Client object in TLS.
call this when your thread starts.
*/
- static Client& initThread(const char *desc, MessagingPort *mp = 0);
+ static Client& initThread(const char *desc, AbstractMessagingPort *mp = 0);
+
+ ~Client();
/*
this has to be called as the client goes away, but before thread termination
@@ -67,17 +68,16 @@ namespace mongo {
*/
bool shutdown();
-
- ~Client();
-
+ /** set so isSyncThread() works */
void iAmSyncThread() {
wassert( syncThread == 0 );
syncThread = this;
}
- bool isSyncThread() const { return this == syncThread; } // true if this client is the replication secondary pull thread
-
+ /** @return true if this client is the replication secondary pull thread. not used much, is used in create index sync code. */
+ bool isSyncThread() const { return this == syncThread; }
string clientAddress(bool includePort=false) const;
+ const AuthenticationInfo * getAuthenticationInfo() const { return &_ai; }
AuthenticationInfo * getAuthenticationInfo() { return &_ai; }
bool isAdmin() { return _ai.isAuthorized( "admin" ); }
CurOp* curop() const { return _curOp; }
@@ -96,13 +96,12 @@ namespace mongo {
void gotHandshake( const BSONObj& o );
BSONObj getRemoteID() const { return _remoteId; }
BSONObj getHandshake() const { return _handshake; }
-
- MessagingPort * port() const { return _mp; }
-
+ AbstractMessagingPort * port() const { return _mp; }
ConnectionId getConnectionId() const { return _connectionId; }
private:
ConnectionId _connectionId; // > 0 for things "conn", 0 otherwise
+ string _threadId; // "" on non support systems
CurOp * _curOp;
Context * _context;
bool _shutdown;
@@ -112,9 +111,9 @@ namespace mongo {
ReplTime _lastOp;
BSONObj _handshake;
BSONObj _remoteId;
- MessagingPort * const _mp;
+ AbstractMessagingPort * const _mp;
- Client(const char *desc, MessagingPort *p = 0);
+ Client(const char *desc, AbstractMessagingPort *p = 0);
friend class CurOp;
@@ -128,7 +127,6 @@ namespace mongo {
~GodScope();
};
-
    /* Set the database we want to use, then restore it when we finish (go out of scope).
       Note this is also helpful if an exception happens, as the state is fixed up.
*/
diff --git a/db/clientcursor.cpp b/db/clientcursor.cpp
index bc09457..e803afd 100644
--- a/db/clientcursor.cpp
+++ b/db/clientcursor.cpp
@@ -23,17 +23,18 @@
*/
#include "pch.h"
-#include "query.h"
+#include "clientcursor.h"
#include "introspect.h"
#include <time.h>
#include "db.h"
#include "commands.h"
#include "repl_block.h"
+#include "../util/processinfo.h"
namespace mongo {
CCById ClientCursor::clientCursorsById;
- boost::recursive_mutex ClientCursor::ccmutex;
+ boost::recursive_mutex& ClientCursor::ccmutex( *(new boost::recursive_mutex()) );
long long ClientCursor::numberTimedOut = 0;
void aboutToDeleteForSharding( const Database* db , const DiskLoc& dl ); // from s/d_logic.h
@@ -73,31 +74,39 @@ namespace mongo {
//void removedKey(const DiskLoc& btreeLoc, int keyPos) {
//}
- /* todo: this implementation is incomplete. we use it as a prefix for dropDatabase, which
- works fine as the prefix will end with '.'. however, when used with drop and
- dropIndexes, this could take out cursors that belong to something else -- if you
- drop "foo", currently, this will kill cursors for "foobar".
- */
- void ClientCursor::invalidate(const char *nsPrefix) {
- vector<ClientCursor*> toDelete;
+ // ns is either a full namespace or "dbname." when invalidating for a whole db
+ void ClientCursor::invalidate(const char *ns) {
+ dbMutex.assertWriteLocked();
+ int len = strlen(ns);
+ const char* dot = strchr(ns, '.');
+ assert( len > 0 && dot);
- int len = strlen(nsPrefix);
- assert( len > 0 && strchr(nsPrefix, '.') );
+ bool isDB = (dot == &ns[len-1]); // first (and only) dot is the last char
{
- //cout << "\nTEMP invalidate " << nsPrefix << endl;
+ //cout << "\nTEMP invalidate " << ns << endl;
recursive_scoped_lock lock(ccmutex);
Database *db = cc().database();
assert(db);
- assert( str::startsWith(nsPrefix, db->name) );
+ assert( str::startsWith(ns, db->name) );
- for( CCById::iterator i = clientCursorsById.begin(); i != clientCursorsById.end(); ++i ) {
+ for( CCById::iterator i = clientCursorsById.begin(); i != clientCursorsById.end(); /*++i*/ ) {
ClientCursor *cc = i->second;
+
+ ++i; // we may be removing this node
+
if( cc->_db != db )
continue;
- if ( strncmp(nsPrefix, cc->_ns.c_str(), len) == 0 ) {
- toDelete.push_back(i->second);
+
+ if (isDB) {
+ // already checked that db matched above
+ dassert( str::startsWith(cc->_ns.c_str(), ns) );
+ delete cc; //removes self from ccByID
+ }
+ else {
+ if ( str::equals(cc->_ns.c_str(), ns) )
+ delete cc; //removes self from ccByID
}
}
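The rewritten invalidate() deletes cursors while walking clientCursorsById, so it advances the iterator before any delete: the ClientCursor destructor removes its own entry from the map, which would otherwise invalidate the iterator in hand. The same advance-before-erase idiom, shown standalone on a plain std::map:

    // erase_while_iterating.cpp -- the advance-before-erase idiom used above
    #include <iostream>
    #include <map>
    #include <string>

    int main() {
        std::map<long long, std::string> byId;
        byId[1] = "test.foo"; byId[2] = "test.bar"; byId[3] = "other.baz";

        for ( std::map<long long, std::string>::iterator i = byId.begin();
              i != byId.end(); /* no ++i here */ ) {
            std::map<long long, std::string>::iterator j = i;
            ++i;                       // we may be removing node j
            if ( j->second.compare(0, 5, "test.") == 0 )
                byId.erase(j);         // safe: i no longer refers to j
        }

        std::cout << "remaining: " << byId.size() << std::endl; // prints 1
        return 0;
    }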
@@ -109,15 +118,12 @@ namespace mongo {
CCByLoc& bl = db->ccByLoc;
for ( CCByLoc::iterator i = bl.begin(); i != bl.end(); ++i ) {
ClientCursor *cc = i->second;
- if ( strncmp(nsPrefix, cc->ns.c_str(), len) == 0 ) {
+ if ( strncmp(ns, cc->ns.c_str(), len) == 0 ) {
assert( cc->_db == db );
toDelete.push_back(i->second);
}
}*/
- for ( vector<ClientCursor*>::iterator i = toDelete.begin(); i != toDelete.end(); ++i )
- delete (*i);
-
/*cout << "TEMP after invalidate " << endl;
for( auto i = clientCursorsById.begin(); i != clientCursorsById.end(); ++i ) {
cout << " " << i->second->ns << endl;
@@ -140,11 +146,19 @@ namespace mongo {
i++;
if( j->second->shouldTimeout( millis ) ) {
numberTimedOut++;
- log(1) << "killing old cursor " << j->second->_cursorid << ' ' << j->second->_ns
+ LOG(1) << "killing old cursor " << j->second->_cursorid << ' ' << j->second->_ns
<< " idle:" << j->second->idleTime() << "ms\n";
delete j->second;
}
}
+ unsigned sz = clientCursorsById.size();
+ static time_t last;
+ if( sz >= 100000 ) {
+ if( time(0) - last > 300 ) {
+ last = time(0);
+ log() << "warning number of open cursors is very large: " << sz << endl;
+ }
+ }
}
    /* must call when a btree bucket is going away.
@@ -157,6 +171,9 @@ namespace mongo {
RARELY if ( bl.size() > 70 ) {
log() << "perf warning: byLoc.size=" << bl.size() << " in aboutToDeleteBucket\n";
}
+ if( bl.size() == 0 ) {
+ DEV tlog() << "debug warning: no cursors found in informAboutToDeleteBucket()" << endl;
+ }
for ( CCByLoc::iterator i = bl.begin(); i != bl.end(); i++ )
i->second->_c->aboutToDeleteBucket(b);
}
@@ -225,10 +242,13 @@ namespace mongo {
c->checkLocation();
DiskLoc tmp1 = c->refLoc();
if ( tmp1 != dl ) {
- /* this might indicate a failure to call ClientCursor::updateLocation() */
+ // This might indicate a failure to call ClientCursor::updateLocation() but it can
+ // also happen during correct operation, see SERVER-2009.
problem() << "warning: cursor loc " << tmp1 << " does not match byLoc position " << dl << " !" << endl;
}
- c->advance();
+ else {
+ c->advance();
+ }
if ( c->eof() ) {
// advanced to end
// leave ClientCursor in place so next getMore doesn't fail
@@ -249,6 +269,9 @@ namespace mongo {
_query(query), _queryOptions(queryOptions),
_idleAgeMillis(0), _pinValue(0),
_doingDeletes(false), _yieldSometimesTracker(128,10) {
+
+ dbMutex.assertAtLeastReadLocked();
+
assert( _db );
assert( str::startsWith(_ns, _db->name) );
if( queryOptions & QueryOption_NoCursorTimeout )
@@ -277,7 +300,11 @@ namespace mongo {
ClientCursor::~ClientCursor() {
- assert( _pos != -2 );
+ if( _pos == -2 ) {
+ // defensive: destructor called twice
+ wassert(false);
+ return;
+ }
{
recursive_scoped_lock lock(ccmutex);
@@ -290,7 +317,7 @@ namespace mongo {
}
}
- bool ClientCursor::getFieldsDotted( const string& name, BSONElementSet &ret ) {
+ bool ClientCursor::getFieldsDotted( const string& name, BSONElementSet &ret, BSONObj& holder ) {
map<string,int>::const_iterator i = _indexedFields.find( name );
if ( i == _indexedFields.end() ) {
@@ -300,7 +327,8 @@ namespace mongo {
int x = i->second;
- BSONObjIterator it( currKey() );
+ holder = currKey();
+ BSONObjIterator it( holder );
while ( x && it.more() ) {
it.next();
x--;
@@ -310,18 +338,20 @@ namespace mongo {
return true;
}
- BSONElement ClientCursor::getFieldDotted( const string& name , bool * fromKey ) {
+ BSONElement ClientCursor::getFieldDotted( const string& name , BSONObj& holder , bool * fromKey ) {
map<string,int>::const_iterator i = _indexedFields.find( name );
if ( i == _indexedFields.end() ) {
if ( fromKey )
*fromKey = false;
- return current().getFieldDotted( name );
+ holder = current();
+ return holder.getFieldDotted( name );
}
int x = i->second;
- BSONObjIterator it( currKey() );
+ holder = currKey();
+ BSONObjIterator it( holder );
while ( x && it.more() ) {
it.next();
x--;
@@ -333,6 +363,29 @@ namespace mongo {
return it.next();
}
+ BSONObj ClientCursor::extractFields(const BSONObj &pattern , bool fillWithNull ) {
+ BSONObjBuilder b( pattern.objsize() * 2 );
+
+ BSONObj holder;
+
+ BSONObjIterator i( pattern );
+ while ( i.more() ) {
+ BSONElement key = i.next();
+ BSONElement value = getFieldDotted( key.fieldName() , holder );
+
+ if ( value.type() ) {
+ b.appendAs( value , key.fieldName() );
+ continue;
+ }
+
+ if ( fillWithNull )
+ b.appendNull( key.fieldName() );
+
+ }
+
+ return b.obj();
+ }
+
/* call when cursor's location changes so that we can update the
cursorsbylocation map. if you are locked and internally iterating, only
@@ -366,18 +419,66 @@ namespace mongo {
return micros;
}
+
+ Record* ClientCursor::_recordForYield( ClientCursor::RecordNeeds need ) {
+ if ( need == DontNeed ) {
+ return 0;
+ }
+ else if ( need == MaybeCovered ) {
+ // TODO
+ return 0;
+ }
+ else if ( need == WillNeed ) {
+ // no-op
+ }
+ else {
+ warning() << "don't understand RecordNeeds: " << (int)need << endl;
+ return 0;
+ }
+
+ DiskLoc l = currLoc();
+ if ( l.isNull() )
+ return 0;
+
+ Record * rec = l.rec();
+ if ( rec->likelyInPhysicalMemory() )
+ return 0;
+
+ return rec;
+ }
- bool ClientCursor::yieldSometimes() {
- if ( ! _yieldSometimesTracker.ping() )
+ bool ClientCursor::yieldSometimes( RecordNeeds need, bool *yielded ) {
+ if ( yielded ) {
+ *yielded = false;
+ }
+ if ( ! _yieldSometimesTracker.ping() ) {
+ Record* rec = _recordForYield( need );
+ if ( rec ) {
+ if ( yielded ) {
+ *yielded = true;
+ }
+ return yield( yieldSuggest() , rec );
+ }
return true;
+ }
int micros = yieldSuggest();
- return ( micros > 0 ) ? yield( micros ) : true;
+ if ( micros > 0 ) {
+ if ( yielded ) {
+ *yielded = true;
+ }
+ return yield( micros , _recordForYield( need ) );
+ }
+ return true;
}
- void ClientCursor::staticYield( int micros , const StringData& ns ) {
+ void ClientCursor::staticYield( int micros , const StringData& ns , Record * rec ) {
killCurrentOp.checkForInterrupt( false );
{
+ auto_ptr<RWLockRecursive::Shared> lk;
+ if ( rec )
+ lk.reset( new RWLockRecursive::Shared( MongoFile::mmmutex) );
+
dbtempreleasecond unlock;
if ( unlock.unlocked() ) {
if ( micros == -1 )
@@ -386,14 +487,28 @@ namespace mongo {
sleepmicros( micros );
}
else {
- warning() << "ClientCursor::yield can't unlock b/c of recursive lock ns: " << ns << endl;
+ CurOp * c = cc().curop();
+ while ( c->parent() )
+ c = c->parent();
+ warning() << "ClientCursor::yield can't unlock b/c of recursive lock"
+ << " ns: " << ns
+ << " top: " << c->info()
+ << endl;
}
+
+ if ( rec )
+ rec->touch();
+
+ lk.reset(0); // need to release this before dbtempreleasecond
}
}
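The record-aware yield above is a page-fault dodge: _recordForYield() returns a Record only when it is likely not resident, and staticYield() then touches that record while holding just MongoFile::mmmutex, so the fault is paid outside the database lock. A simplified sketch of the idea, with a plain mutex standing in for the global lock and a byte-per-page read standing in for Record::touch():

    // touch_outside_lock.cpp -- sketch: page data in while the big lock is released
    #include <boost/thread/mutex.hpp>

    struct Record {
        char data[4096];
        // reading one byte per page forces the OS to fault the page in
        void touch() const {
            volatile char c = 0;
            for ( unsigned i = 0; i < sizeof(data); i += 4096 )
                c += data[i];
            (void)c;
        }
    };

    boost::mutex dbLock; // stand-in for the global database lock

    void yieldAndPreload( Record* rec ) {
        dbLock.unlock();   // let other operations run
        if ( rec )
            rec->touch();  // pay the page fault while unlocked
        dbLock.lock();
    }

    int main() {
        static Record r;   // static: zero-initialized
        dbLock.lock();
        yieldAndPreload(&r);
        dbLock.unlock();
        return 0;
    }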
bool ClientCursor::prepareToYield( YieldData &data ) {
if ( ! _c->supportYields() )
return false;
+ if ( ! _c->prepareToYield() ) {
+ return false;
+ }
// need to store in case 'this' gets deleted
data._id = _cursorid;
@@ -434,40 +549,34 @@ namespace mongo {
}
cc->_doingDeletes = data._doingDeletes;
- cc->_c->checkLocation();
+ cc->_c->recoverFromYield();
return true;
}
- bool ClientCursor::yield( int micros ) {
+ bool ClientCursor::yield( int micros , Record * recordToLoad ) {
if ( ! _c->supportYields() )
return true;
+
YieldData data;
prepareToYield( data );
- staticYield( micros , _ns );
+ staticYield( micros , _ns , recordToLoad );
return ClientCursor::recoverFromYield( data );
}
- int ctmLast = 0; // so we don't have to do find() which is a little slow very often.
+    long long ctmLast = 0; // so we rarely have to call find(), which is a little slow.
long long ClientCursor::allocCursorId_inlock() {
- if( 0 ) {
- static long long z;
- ++z;
- cout << "TEMP alloccursorid " << z << endl;
- return z;
- }
-
+ long long ctm = curTimeMillis64();
+ dassert( ctm );
long long x;
- int ctm = (int) curTimeMillis();
while ( 1 ) {
x = (((long long)rand()) << 32);
- x = x | ctm | 0x80000000; // OR to make sure not zero
+ x = x ^ ctm;
if ( ctm != ctmLast || ClientCursor::find_inlock(x, false) == 0 )
break;
}
ctmLast = ctm;
- //DEV tlog() << " alloccursorid " << x << endl;
return x;
}
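allocCursorId_inlock() now builds an id from a random value in the high 32 bits XORed with the millisecond clock, re-checking via find_inlock() only when the clock has not advanced since the last allocation; the old version ORed in the clock and 0x80000000, which guaranteed a nonzero id but constrained the bit pattern. A standalone sketch of the scheme (curTimeMillis64 and the live-id set here are stand-ins):

    // cursorid.cpp -- sketch of the id allocation scheme above
    #include <cstdlib>
    #include <ctime>
    #include <iostream>
    #include <set>

    static std::set<long long> liveIds;  // stand-in for find_inlock()
    static long long curTimeMillis64() { return (long long)time(0) * 1000; }

    long long allocCursorId() {
        static long long ctmLast = 0;
        long long ctm = curTimeMillis64();
        long long x;
        while ( true ) {
            x = ((long long)rand()) << 32;
            x = x ^ ctm;                 // mix the clock into the low bits
            // same millisecond as last time? make sure we didn't collide
            if ( ctm != ctmLast || liveIds.count(x) == 0 )
                break;
        }
        ctmLast = ctm;
        liveIds.insert(x);
        return x;
    }

    int main() {
        for ( int i = 0; i < 3; i++ )
            std::cout << allocCursorId() << std::endl;
        return 0;
    }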
@@ -495,6 +604,19 @@ namespace mongo {
result.appendNumber("totalOpen", clientCursorsById.size() );
result.appendNumber("clientCursors_size", (int) numCursors());
result.appendNumber("timedOut" , numberTimedOut);
+ unsigned pinned = 0;
+ unsigned notimeout = 0;
+ for ( CCById::iterator i = clientCursorsById.begin(); i != clientCursorsById.end(); i++ ) {
+ unsigned p = i->second->_pinValue;
+ if( p >= 100 )
+ pinned++;
+ else if( p > 0 )
+ notimeout++;
+ }
+ if( pinned )
+ result.append("pinned", pinned);
+ if( notimeout )
+ result.append("totalNoTimeout", notimeout);
}
// QUESTION: Restrict to the namespace from which this command was issued?
@@ -507,25 +629,64 @@ namespace mongo {
help << " example: { cursorInfo : 1 }";
}
virtual LockType locktype() const { return NONE; }
- bool run(const string& dbname, BSONObj& jsobj, string& errmsg, BSONObjBuilder& result, bool fromRepl ) {
+ bool run(const string& dbname, BSONObj& jsobj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl ) {
ClientCursor::appendStats( result );
return true;
}
} cmdCursorInfo;
+ struct Mem {
+ Mem() { res = virt = mapped = 0; }
+ int res;
+ int virt;
+ int mapped;
+ bool grew(const Mem& r) {
+ return (r.res && (((double)res)/r.res)>1.1 ) ||
+ (r.virt && (((double)virt)/r.virt)>1.1 ) ||
+ (r.mapped && (((double)mapped)/r.mapped)>1.1 );
+ }
+ };
+
+ /** called once a minute from killcursors thread */
+ void sayMemoryStatus() {
+ static time_t last;
+ static Mem mlast;
+ try {
+ ProcessInfo p;
+ if ( !cmdLine.quiet && p.supported() ) {
+ Mem m;
+ m.res = p.getResidentSize();
+ m.virt = p.getVirtualMemorySize();
+ m.mapped = (int) (MemoryMappedFile::totalMappedLength() / ( 1024 * 1024 ));
+ if( time(0)-last >= 300 || m.grew(mlast) ) {
+ log() << "mem (MB) res:" << m.res << " virt:" << m.virt << " mapped:" << m.mapped << endl;
+ if( m.virt - (cmdLine.dur?2:1)*m.mapped > 5000 ) {
+ ONCE log() << "warning virtual/mapped memory differential is large. journaling:" << cmdLine.dur << endl;
+ }
+ last = time(0);
+ mlast = m;
+ }
+ }
+ }
+ catch(...) {
+ log() << "ProcessInfo exception" << endl;
+ }
+ }
+
+ /** thread for timing out old cursors */
void ClientCursorMonitor::run() {
Client::initThread("clientcursormon");
Client& client = cc();
-
- unsigned old = curTimeMillis();
-
+ Timer t;
+ const int Secs = 4;
+ unsigned n = 0;
while ( ! inShutdown() ) {
- unsigned now = curTimeMillis();
- ClientCursor::idleTimeReport( now - old );
- old = now;
- sleepsecs(4);
+ ClientCursor::idleTimeReport( t.millisReset() );
+ sleepsecs(Secs);
+ if( ++n % (60/4) == 0 /*once a minute*/ ) {
+ sayMemoryStatus();
+ }
}
-
client.shutdown();
}
@@ -551,7 +712,6 @@ namespace mongo {
}
-
ClientCursorMonitor clientCursorMonitor;
} // namespace mongo
diff --git a/db/clientcursor.h b/db/clientcursor.h
index f1d107f..75c7da8 100644
--- a/db/clientcursor.h
+++ b/db/clientcursor.h
@@ -27,13 +27,15 @@
#include "../pch.h"
#include "cursor.h"
#include "jsobj.h"
-#include "../util/message.h"
+#include "../util/net/message.h"
+#include "../util/net/listen.h"
#include "../util/background.h"
#include "diskloc.h"
#include "dbhelpers.h"
#include "matcher.h"
#include "../client/dbclient.h"
#include "projection.h"
+#include "s/d_chunk_manager.h"
namespace mongo {
@@ -158,14 +160,15 @@ namespace mongo {
DiskLoc lastLoc() const { return _lastLoc; }
- /* Get rid of cursors for namespaces that begin with nsprefix.
+    /* Get rid of cursors for the namespace 'ns'. When dropping a db, ns is "dbname."
Used by drop, dropIndexes, dropDatabase.
*/
- static void invalidate(const char *nsPrefix);
+ static void invalidate(const char *ns);
/**
* @param microsToSleep -1 : ask client
* >=0 : sleep for that amount
+     * @param recordToLoad after yielding the lock, load this record while holding only mmmutex
* do a dbtemprelease
     * note: caller should check matcher.docMatcher().atomic() first and not yield if atomic -
     *       we don't do that here, as this->matcher (above) is only initialized for true queries/getmores.
@@ -174,15 +177,22 @@ namespace mongo {
* if false is returned, then this ClientCursor should be considered deleted -
* in fact, the whole database could be gone.
*/
- bool yield( int microsToSleep = -1 );
+ bool yield( int microsToSleep = -1 , Record * recordToLoad = 0 );
+ enum RecordNeeds {
+ DontNeed = -1 , MaybeCovered = 0 , WillNeed = 100
+ };
+
/**
+         * @param need whether or not the next record definitely has to be read from disk;
+         *        if so, this will yield when the next record isn't already in memory
+         * @param yielded set to true if a yield was attempted; it may be set even when no yield actually occurred
* @return same as yield()
*/
- bool yieldSometimes();
+ bool yieldSometimes( RecordNeeds need, bool *yielded = 0 );
static int yieldSuggest();
- static void staticYield( int micros , const StringData& ns );
+ static void staticYield( int micros , const StringData& ns , Record * rec );
struct YieldData { CursorId _id; bool _doingDeletes; };
bool prepareToYield( YieldData &data );
@@ -235,21 +245,30 @@ namespace mongo {
DiskLoc currLoc() { return _c->currLoc(); }
BSONObj currKey() const { return _c->currKey(); }
-
/**
* same as BSONObj::getFieldsDotted
* if it can be retrieved from key, it is
+ * @param holder keeps the currKey in scope by keeping a reference to it here. generally you'll want
+ * holder and ret to destruct about the same time.
* @return if this was retrieved from key
*/
- bool getFieldsDotted( const string& name, BSONElementSet &ret );
+ bool getFieldsDotted( const string& name, BSONElementSet &ret, BSONObj& holder );
/**
* same as BSONObj::getFieldDotted
* if it can be retrieved from key, it is
* @return if this was retrieved from key
*/
- BSONElement getFieldDotted( const string& name , bool * fromKey = 0 );
-
+ BSONElement getFieldDotted( const string& name , BSONObj& holder , bool * fromKey = 0 ) ;
+
+ /** extract items from object which match a pattern object.
+ * e.g., if pattern is { x : 1, y : 1 }, builds an object with
+ * x and y elements of this object, if they are present.
+ * returns elements with original field names
+ * NOTE: copied from BSONObj::extractFields
+ */
+ BSONObj extractFields(const BSONObj &pattern , bool fillWithNull = false) ;
+
bool currentIsDup() { return _c->getsetdup( _c->currLoc() ); }
bool currentMatches() {
@@ -258,6 +277,9 @@ namespace mongo {
return _c->matcher()->matchesCurrent( _c.get() );
}
+ void setChunkManager( ShardChunkManagerPtr manager ){ _chunkManager = manager; }
+ ShardChunkManagerPtr getChunkManager(){ return _chunkManager; }
+
private:
void setLastLoc_inlock(DiskLoc);
@@ -342,6 +364,8 @@ namespace mongo {
void noTimeout() { _pinValue++; }
CCByLoc& byLoc() { return _db->ccByLoc; }
+
+ Record* _recordForYield( RecordNeeds need );
private:
@@ -371,6 +395,8 @@ namespace mongo {
bool _doingDeletes;
ElapsedTracker _yieldSometimesTracker;
+ ShardChunkManagerPtr _chunkManager;
+
public:
shared_ptr<ParsedQuery> pq;
shared_ptr<Projection> fields; // which fields query wants returned
@@ -382,7 +408,7 @@ namespace mongo {
static CCById clientCursorsById;
static long long numberTimedOut;
- static boost::recursive_mutex ccmutex; // must use this for all statics above!
+ static boost::recursive_mutex& ccmutex; // must use this for all statics above!
static CursorId allocCursorId_inlock();
};
@@ -396,3 +422,11 @@ namespace mongo {
extern ClientCursorMonitor clientCursorMonitor;
} // namespace mongo
+
+// ClientCursor should only be used with auto_ptr because it needs to be
+// release()ed after a yield if stillOk() returns false, and these pointer types
+// do not support releasing. These empty specializations prevent them from being
+// used accidentally.
+namespace boost{
+ template<> class scoped_ptr<mongo::ClientCursor> {};
+ template<> class shared_ptr<mongo::ClientCursor> {};
+}
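The two empty specializations above are a compile-time poison pill: any attempt to instantiate boost::scoped_ptr<ClientCursor> or boost::shared_ptr<ClientCursor> now names a class with no usable members, so the code fails to build and the author is pushed toward auto_ptr, which supports release(). A small self-contained demonstration of the trick (boostlike::scoped_ptr is a stand-in; auto_ptr is the pre-C++11 idiom the comment refers to, so compile as C++03):

    // poison_ptr.cpp -- compile-time ban on a smart-pointer/type combination
    #include <memory>

    namespace lib {
        struct Cursor { void use() {} };
    }

    namespace boostlike {
        template<class T> class scoped_ptr {
        public:
            explicit scoped_ptr(T* p) : _p(p) {}
            ~scoped_ptr() { delete _p; }
            T* operator->() const { return _p; }
        private:
            T* _p;
        };
        // empty explicit specialization: the type exists but has no members,
        // so scoped_ptr-style usage no longer compiles for lib::Cursor
        template<> class scoped_ptr<lib::Cursor> {};
    }

    int main() {
        // boostlike::scoped_ptr<lib::Cursor> c(new lib::Cursor); // error: poisoned
        std::auto_ptr<lib::Cursor> c(new lib::Cursor);            // ok: has release()
        c->use();
        lib::Cursor* raw = c.release();  // needed after a failed yield
        delete raw;
        return 0;
    }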
diff --git a/db/cloner.cpp b/db/cloner.cpp
index ec5ba99..8956133 100644
--- a/db/cloner.cpp
+++ b/db/cloner.cpp
@@ -17,11 +17,12 @@
*/
#include "pch.h"
+#include "cloner.h"
#include "pdfile.h"
#include "../client/dbclient.h"
#include "../bson/util/builder.h"
#include "jsobj.h"
-#include "query.h"
+#include "ops/query.h"
#include "commands.h"
#include "db.h"
#include "instance.h"
@@ -29,14 +30,30 @@
namespace mongo {
+ BSONElement getErrField(const BSONObj& o);
+
void ensureHaveIdIndex(const char *ns);
bool replAuthenticate(DBClientBase *);
+ /** Selectively release the mutex based on a parameter. */
+ class dbtempreleaseif {
+ public:
+ dbtempreleaseif( bool release ) : _impl( release ? new dbtemprelease() : 0 ) {}
+ private:
+ shared_ptr< dbtemprelease > _impl;
+ };
+
+ void mayInterrupt( bool mayBeInterrupted ) {
+ if ( mayBeInterrupted ) {
+ killCurrentOp.checkForInterrupt( false );
+ }
+ }
+
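dbtempreleaseif above makes lock-yielding opt-in: it constructs the real dbtemprelease RAII object only when its flag is true, so one code path serves both yielding and non-yielding callers. A generic standalone sketch of that conditional-RAII shape (TempRelease and the mutex are stand-ins for dbtemprelease and the global lock):

    // conditional_raii.cpp -- release a lock only if the caller allows it
    #include <boost/shared_ptr.hpp>
    #include <boost/thread/mutex.hpp>
    #include <iostream>

    boost::mutex bigLock;

    class TempRelease {                 // stand-in for dbtemprelease
    public:
        TempRelease()  { bigLock.unlock(); std::cout << "released" << std::endl; }
        ~TempRelease() { bigLock.lock();   std::cout << "reacquired" << std::endl; }
    };

    // stand-in for dbtempreleaseif: construct the releaser only on demand
    class TempReleaseIf {
    public:
        explicit TempReleaseIf( bool release )
            : _impl( release ? new TempRelease() : 0 ) {}
    private:
        boost::shared_ptr<TempRelease> _impl;
    };

    void copyBatch( bool mayYield ) {
        TempReleaseIf t( mayYield );    // yields here iff mayYield
        // ... fetch the next batch from the remote side ...
    }

    int main() {
        bigLock.lock();
        copyBatch( true );              // prints released / reacquired
        copyBatch( false );             // prints nothing
        bigLock.unlock();
        return 0;
    }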
class Cloner: boost::noncopyable {
auto_ptr< DBClientWithCommands > conn;
void copy(const char *from_ns, const char *to_ns, bool isindex, bool logForRepl,
- bool masterSameProcess, bool slaveOk, Query q = Query());
+ bool masterSameProcess, bool slaveOk, bool mayYield, bool mayBeInterrupted, Query q = Query());
struct Fun;
public:
Cloner() { }
@@ -47,9 +64,11 @@ namespace mongo {
for example repairDatabase need not use it.
*/
void setConnection( DBClientWithCommands *c ) { conn.reset( c ); }
- bool go(const char *masterHost, string& errmsg, const string& fromdb, bool logForRepl, bool slaveOk, bool useReplAuth, bool snapshot);
- bool copyCollection( const string& from , const string& ns , const BSONObj& query , string& errmsg , bool copyIndexes = true, bool logForRepl = true );
+ /** copy the entire database */
+ bool go(const char *masterHost, string& errmsg, const string& fromdb, bool logForRepl, bool slaveOk, bool useReplAuth, bool snapshot, bool mayYield, bool mayBeInterrupted, int *errCode = 0);
+
+ bool copyCollection( const string& from , const string& ns , const BSONObj& query , string& errmsg , bool mayYield, bool mayBeInterrupted, bool copyIndexes = true, bool logForRepl = true );
};
/* for index info object:
@@ -87,6 +106,8 @@ namespace mongo {
}
struct Cloner::Fun {
+ Fun() : lastLog(0) { }
+ time_t lastLog;
void operator()( DBClientCursorBatchIterator &i ) {
mongolock l( true );
if ( context ) {
@@ -95,7 +116,15 @@ namespace mongo {
while( i.moreInCurrentBatch() ) {
if ( n % 128 == 127 /*yield some*/ ) {
- dbtemprelease t;
+ time_t now = time(0);
+ if( now - lastLog >= 60 ) {
+ // report progress
+ if( lastLog )
+ log() << "clone " << to_collection << ' ' << n << endl;
+ lastLog = now;
+ }
+ mayInterrupt( _mayBeInterrupted );
+ dbtempreleaseif t( _mayYield );
}
BSONObj tmp = i.nextSafe();
@@ -151,12 +180,14 @@ namespace mongo {
list<BSONObj> *storedForLater;
bool logForRepl;
Client::Context *context;
+ bool _mayYield;
+ bool _mayBeInterrupted;
};
/* copy the specified collection
isindex - if true, this is system.indexes collection, in which we do some transformation when copying.
*/
- void Cloner::copy(const char *from_collection, const char *to_collection, bool isindex, bool logForRepl, bool masterSameProcess, bool slaveOk, Query query) {
+ void Cloner::copy(const char *from_collection, const char *to_collection, bool isindex, bool logForRepl, bool masterSameProcess, bool slaveOk, bool mayYield, bool mayBeInterrupted, Query query) {
list<BSONObj> storedForLater;
Fun f;
@@ -167,11 +198,14 @@ namespace mongo {
f.saveLast = time( 0 );
f.storedForLater = &storedForLater;
f.logForRepl = logForRepl;
+ f._mayYield = mayYield;
+ f._mayBeInterrupted = mayBeInterrupted;
int options = QueryOption_NoCursorTimeout | ( slaveOk ? QueryOption_SlaveOk : 0 );
{
- dbtemprelease r;
- f.context = r._context;
+ f.context = cc().getContext();
+ mayInterrupt( mayBeInterrupted );
+ dbtempreleaseif r( mayYield );
DBClientConnection *remote = dynamic_cast< DBClientConnection* >( conn.get() );
if ( remote ) {
remote->query( boost::function<void(DBClientCursorBatchIterator &)>( f ), from_collection, query, 0, options );
@@ -204,12 +238,12 @@ namespace mongo {
}
}
- bool copyCollectionFromRemote(const string& host, const string& ns, const BSONObj& query, string& errmsg, bool logForRepl) {
+ bool copyCollectionFromRemote(const string& host, const string& ns, const BSONObj& query, string& errmsg, bool logForRepl, bool mayYield, bool mayBeInterrupted) {
Cloner c;
- return c.copyCollection(host, ns, query, errmsg , /*copyIndexes*/ true, logForRepl);
+ return c.copyCollection(host, ns, query, errmsg, mayYield, mayBeInterrupted, /*copyIndexes*/ true, logForRepl);
}
- bool Cloner::copyCollection( const string& from , const string& ns , const BSONObj& query , string& errmsg , bool copyIndexes, bool logForRepl ) {
+ bool Cloner::copyCollection( const string& from , const string& ns , const BSONObj& query , string& errmsg , bool mayYield, bool mayBeInterrupted, bool copyIndexes, bool logForRepl ) {
auto_ptr<DBClientConnection> myconn;
myconn.reset( new DBClientConnection() );
if ( ! myconn->connect( from , errmsg ) )
@@ -231,7 +265,7 @@ namespace mongo {
{
// main data
- copy( ns.c_str() , ns.c_str() , /*isindex*/false , logForRepl , false , true , Query(query).snapshot() );
+ copy( ns.c_str() , ns.c_str() , /*isindex*/false , logForRepl , false , true , mayYield, mayBeInterrupted, Query(query).snapshot() );
}
/* TODO : copyIndexes bool does not seem to be implemented! */
@@ -242,7 +276,7 @@ namespace mongo {
{
// indexes
string temp = ctx.db()->name + ".system.indexes";
- copy( temp.c_str() , temp.c_str() , /*isindex*/true , logForRepl , false , true , BSON( "ns" << ns ) );
+ copy( temp.c_str() , temp.c_str() , /*isindex*/true , logForRepl , false , true , mayYield, mayBeInterrupted, BSON( "ns" << ns ) );
}
getDur().commitIfNeeded();
return true;
@@ -251,8 +285,10 @@ namespace mongo {
extern bool inDBRepair;
void ensureIdIndexForNewNs(const char *ns);
- bool Cloner::go(const char *masterHost, string& errmsg, const string& fromdb, bool logForRepl, bool slaveOk, bool useReplAuth, bool snapshot) {
-
+ bool Cloner::go(const char *masterHost, string& errmsg, const string& fromdb, bool logForRepl, bool slaveOk, bool useReplAuth, bool snapshot, bool mayYield, bool mayBeInterrupted, int *errCode) {
+ if ( errCode ) {
+ *errCode = 0;
+ }
massert( 10289 , "useReplAuth is not written to replication log", !useReplAuth || !logForRepl );
string todb = cc().database()->name;
@@ -274,7 +310,8 @@ namespace mongo {
string ns = fromdb + ".system.namespaces";
list<BSONObj> toClone;
{
- dbtemprelease r;
+ mayInterrupt( mayBeInterrupted );
+ dbtempreleaseif r( mayYield );
// just using exhaust for collection copying right now
auto_ptr<DBClientCursor> c;
@@ -302,6 +339,18 @@ namespace mongo {
errmsg = "query failed " + ns;
return false;
}
+
+ if ( c->more() ) {
+ BSONObj first = c->next();
+ if( !getErrField(first).eoo() ) {
+ if ( errCode ) {
+ *errCode = first.getIntField("code");
+ }
+ errmsg = "query failed " + ns;
+ return false;
+ }
+ c->putBack( first );
+ }
while ( c->more() ) {
BSONObj collection = c->next();
@@ -325,7 +374,7 @@ namespace mongo {
continue;
}
}
- if( ! isANormalNSName( from_name ) ) {
+ if( ! NamespaceString::normal( from_name ) ) {
log(2) << "\t\t not cloning because has $ " << endl;
continue;
}
@@ -335,7 +384,8 @@ namespace mongo {
for ( list<BSONObj>::iterator i=toClone.begin(); i != toClone.end(); i++ ) {
{
- dbtemprelease r;
+ mayInterrupt( mayBeInterrupted );
+ dbtempreleaseif r( mayYield );
}
BSONObj collection = *i;
log(2) << " really will clone: " << collection << endl;
@@ -358,7 +408,7 @@ namespace mongo {
Query q;
if( snapshot )
q.snapshot();
- copy(from_name, to_name.c_str(), false, logForRepl, masterSameProcess, slaveOk, q);
+ copy(from_name, to_name.c_str(), false, logForRepl, masterSameProcess, slaveOk, mayYield, mayBeInterrupted, q);
if( wantIdIndex ) {
/* we need dropDups to be true as we didn't do a true snapshot and this is before applying oplog operations
@@ -385,20 +435,15 @@ namespace mongo {
rather than this exact value. we should standardize. OR, remove names - which is in the bugdb. Anyway, this
is dubious here at the moment.
*/
- copy(system_indexes_from.c_str(), system_indexes_to.c_str(), true, logForRepl, masterSameProcess, slaveOk, BSON( "name" << NE << "_id_" ) );
+ copy(system_indexes_from.c_str(), system_indexes_to.c_str(), true, logForRepl, masterSameProcess, slaveOk, mayYield, mayBeInterrupted, BSON( "name" << NE << "_id_" ) );
return true;
}
- /* slaveOk - if true it is ok if the source of the data is !ismaster.
- useReplAuth - use the credentials we normally use as a replication slave for the cloning
- snapshot - use $snapshot mode for copying collections. note this should not be used when it isn't required, as it will be slower.
- for example repairDatabase need not use it.
- */
bool cloneFrom(const char *masterHost, string& errmsg, const string& fromdb, bool logForReplication,
- bool slaveOk, bool useReplAuth, bool snapshot) {
+ bool slaveOk, bool useReplAuth, bool snapshot, bool mayYield, bool mayBeInterrupted, int *errCode) {
Cloner c;
- return c.go(masterHost, errmsg, fromdb, logForReplication, slaveOk, useReplAuth, snapshot);
+ return c.go(masterHost, errmsg, fromdb, logForReplication, slaveOk, useReplAuth, snapshot, mayYield, mayBeInterrupted, errCode);
}
/* Usage:
@@ -415,7 +460,7 @@ namespace mongo {
help << "{ clone : \"host13\" }";
}
CmdClone() : Command("clone") { }
- virtual bool run(const string& dbname , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ virtual bool run(const string& dbname , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
string from = cmdObj.getStringField("clone");
if ( from.empty() )
return false;
@@ -423,7 +468,7 @@ namespace mongo {
were to clone it would get a different point-in-time and not match.
*/
return cloneFrom(from.c_str(), errmsg, dbname,
- /*logForReplication=*/!fromRepl, /*slaveok*/false, /*usereplauth*/false, /*snapshot*/true);
+ /*logForReplication=*/!fromRepl, /*slaveok*/false, /*usereplauth*/false, /*snapshot*/true, /*mayYield*/true, /*mayBeInterrupted*/false);
}
} cmdclone;
@@ -441,7 +486,7 @@ namespace mongo {
"Warning: the local copy of 'ns' is emptied before the copying begins. Any existing data will be lost there."
;
}
- virtual bool run(const string& dbname , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ virtual bool run(const string& dbname , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
string fromhost = cmdObj.getStringField("from");
if ( fromhost.empty() ) {
errmsg = "missing 'from' parameter";
@@ -470,7 +515,7 @@ namespace mongo {
<< " query: " << query << " " << ( copyIndexes ? "" : ", not copying indexes" ) << endl;
Cloner c;
- return c.copyCollection( fromhost , collection , query, errmsg , copyIndexes );
+ return c.copyCollection( fromhost , collection , query, errmsg , true, false, copyIndexes );
}
} cmdclonecollection;
@@ -493,7 +538,7 @@ namespace mongo {
help << "get a nonce for subsequent copy db request from secure server\n";
help << "usage: {copydbgetnonce: 1, fromhost: <hostname>}";
}
- virtual bool run(const string& , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ virtual bool run(const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
string fromhost = cmdObj.getStringField("fromhost");
if ( fromhost.empty() ) {
/* copy from self */
@@ -532,9 +577,10 @@ namespace mongo {
virtual LockType locktype() const { return WRITE; }
virtual void help( stringstream &help ) const {
help << "copy a database from another host to this host\n";
- help << "usage: {copydb: 1, fromhost: <hostname>, fromdb: <db>, todb: <db>[, username: <username>, nonce: <nonce>, key: <key>]}";
+ help << "usage: {copydb: 1, fromhost: <hostname>, fromdb: <db>, todb: <db>[, slaveOk: <bool>, username: <username>, nonce: <nonce>, key: <key>]}";
}
- virtual bool run(const string& dbname, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ virtual bool run(const string& dbname, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ bool slaveOk = cmdObj["slaveOk"].trueValue();
string fromhost = cmdObj.getStringField("fromhost");
if ( fromhost.empty() ) {
/* copy from self */
@@ -565,7 +611,7 @@ namespace mongo {
c.setConnection( authConn_.release() );
}
Client::Context ctx(todb);
- bool res = c.go(fromhost.c_str(), errmsg, fromdb, /*logForReplication=*/!fromRepl, /*slaveok*/false, /*replauth*/false, /*snapshot*/true);
+ bool res = c.go(fromhost.c_str(), errmsg, fromdb, /*logForReplication=*/!fromRepl, slaveOk, /*replauth*/false, /*snapshot*/true, /*mayYield*/true, /*mayBeInterrupted*/ false);
return res;
}
} cmdcopydb;
@@ -576,6 +622,7 @@ namespace mongo {
virtual bool adminOnly() const {
return true;
}
+ virtual bool requiresAuth() { return false; } // do our own auth
virtual bool slaveOk() const {
return false;
}
@@ -586,7 +633,7 @@ namespace mongo {
virtual void help( stringstream &help ) const {
help << " example: { renameCollection: foo.a, to: bar.b }";
}
- virtual bool run(const string& dbname, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ virtual bool run(const string& dbname, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
string source = cmdObj.getStringField( name.c_str() );
string target = cmdObj.getStringField( "to" );
if ( source.empty() || target.empty() ) {
@@ -597,7 +644,7 @@ namespace mongo {
bool capped = false;
long long size = 0;
{
- Client::Context ctx( source );
+ Client::Context ctx( source ); // auths against source
NamespaceDetails *nsd = nsdetails( source.c_str() );
uassert( 10026 , "source namespace does not exist", nsd );
capped = nsd->capped;
@@ -606,7 +653,7 @@ namespace mongo {
size += i.ext()->length;
}
- Client::Context ctx( target );
+        Client::Context ctx( target ); // auths against target
if ( nsdetails( target.c_str() ) ) {
uassert( 10027 , "target namespace exists", cmdObj["dropTarget"].trueValue() );
diff --git a/db/cloner.h b/db/cloner.h
new file mode 100644
index 0000000..94264f8
--- /dev/null
+++ b/db/cloner.h
@@ -0,0 +1,39 @@
+// cloner.h - copy a database (export/import basically)
+
+/**
+ * Copyright (C) 2011 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "jsobj.h"
+
+namespace mongo {
+
+ /**
+ * @param slaveOk - if true it is ok if the source of the data is !ismaster.
+ * @param useReplAuth - use the credentials we normally use as a replication slave for the cloning
+ * @param snapshot - use $snapshot mode for copying collections. note this should not be used when it isn't required, as it will be slower.
+ * for example repairDatabase need not use it.
+ * @param errCode - If provided, this will be set on error to the server's error code. Currently
+ * this will only be set if there is an error in the initial system.namespaces query.
+ */
+ bool cloneFrom(const char *masterHost, string& errmsg, const string& fromdb, bool logForReplication,
+ bool slaveOk, bool useReplAuth, bool snapshot, bool mayYield,
+ bool mayBeInterrupted, int *errCode = 0);
+
+ bool copyCollectionFromRemote(const string& host, const string& ns, const BSONObj& query, string& errmsg, bool logForRepl, bool mayYield, bool mayBeInterrupted);
+
+} // namespace mongo
diff --git a/db/cmdline.cpp b/db/cmdline.cpp
index 2a10fb5..fd759a7 100644
--- a/db/cmdline.cpp
+++ b/db/cmdline.cpp
@@ -19,14 +19,17 @@
#include "pch.h"
#include "cmdline.h"
#include "commands.h"
+#include "../util/password.h"
#include "../util/processinfo.h"
-#include "../util/message.h"
-#include "security_key.h"
+#include "../util/net/listen.h"
+#include "security_common.h"
#ifdef _WIN32
#include <direct.h>
#endif
+#define MAX_LINE_LENGTH 256
+
namespace po = boost::program_options;
namespace fs = boost::filesystem;
@@ -34,7 +37,8 @@ namespace mongo {
void setupSignals( bool inFork );
string getHostNameCached();
- BSONArray argvArray;
+ static BSONArray argvArray;
+ static BSONObj parsedOpts;
void CmdLine::addGlobalOptions( boost::program_options::options_description& general ,
boost::program_options::options_description& hidden ) {
@@ -52,15 +56,25 @@ namespace mongo {
("port", po::value<int>(&cmdLine.port), "specify port number")
("bind_ip", po::value<string>(&cmdLine.bind_ip), "comma separated list of ip addresses to listen on - all local ips by default")
("maxConns",po::value<int>(), "max number of simultaneous connections")
+ ("objcheck", "inspect client data for validity on receipt")
("logpath", po::value<string>() , "log file to send write to instead of stdout - has to be a file, not directory" )
("logappend" , "append to logpath instead of over-writing" )
("pidfilepath", po::value<string>(), "full path to pidfile (if not set, no pidfile is created)")
("keyFile", po::value<string>(), "private key for cluster authentication (only for replica sets)")
#ifndef _WIN32
+ ("nounixsocket", "disable listening on unix sockets")
("unixSocketPrefix", po::value<string>(), "alternative directory for UNIX domain sockets (defaults to /tmp)")
("fork" , "fork server process" )
#endif
;
+
+ hidden.add_options()
+#ifdef MONGO_SSL
+ ("sslOnNormalPorts" , "use ssl on configured ports" )
+ ("sslPEMKeyFile" , po::value<string>(&cmdLine.sslPEMKeyFile), "PEM file for ssl" )
+ ("sslPEMKeyPassword" , new PasswordValue(&cmdLine.sslPEMKeyPassword) , "PEM file password" )
+#endif
+ ;
}
@@ -82,6 +96,32 @@ namespace mongo {
}
#endif
+ void CmdLine::parseConfigFile( istream &f, stringstream &ss ) {
+ string s;
+ char line[MAX_LINE_LENGTH];
+
+ while ( f ) {
+ f.getline(line, MAX_LINE_LENGTH);
+ s = line;
+ std::remove(s.begin(), s.end(), ' ');
+ std::remove(s.begin(), s.end(), '\t');
+ boost::to_upper(s);
+
+ if ( s.find( "FASTSYNC" ) != string::npos )
+ cout << "warning \"fastsync\" should not be put in your configuration file" << endl;
+
+ if ( s.c_str()[0] == '#' ) {
+ // skipping commented line
+ } else if ( s.find( "=FALSE" ) == string::npos ) {
+ ss << line << endl;
+ } else {
+ cout << "warning: remove or comment out this line by starting it with \'#\', skipping now : " << line << endl;
+ }
+ }
+ return;
+ }
+
+
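One caveat in parseConfigFile() as shown: std::remove only shifts the unmatched characters forward and returns the new logical end; it does not shrink the string, so after the two calls above s keeps its original length with residue at the tail. That is usually harmless for the simple substring checks that follow, though the residue can in principle cause surprises. The usual erase-remove idiom, shown standalone rather than as a change to the patch:

    // erase_remove.cpp -- stripping characters from a std::string correctly
    #include <algorithm>
    #include <iostream>
    #include <string>

    int main() {
        std::string s = "fast sync = false";

        // std::remove alone: size unchanged, tail now holds leftover characters
        std::remove(s.begin(), s.end(), ' ');
        std::cout << "[" << s << "]" << std::endl;  // e.g. [fastsync=falselse]

        s = "fast sync = false";
        // erase-remove idiom: actually shortens the string
        s.erase(std::remove(s.begin(), s.end(), ' '), s.end());
        std::cout << "[" << s << "]" << std::endl;  // [fastsync=false]
        return 0;
    }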
bool CmdLine::store( int argc , char ** argv ,
boost::program_options::options_description& visible,
@@ -138,7 +178,9 @@ namespace mongo {
return false;
}
- po::store( po::parse_config_file( f , all ) , params );
+ stringstream ss;
+ CmdLine::parseConfigFile( f, ss );
+ po::store( po::parse_config_file( ss , all ) , params );
f.close();
}
@@ -178,6 +220,10 @@ namespace mongo {
connTicketHolder.resize( newSize );
}
+ if (params.count("objcheck")) {
+ cmdLine.objcheck = true;
+ }
+
string logpath;
#ifndef _WIN32
@@ -188,7 +234,11 @@ namespace mongo {
::exit(-1);
}
}
-
+
+ if (params.count("nounixsocket")) {
+ cmdLine.noUnixSocket = true;
+ }
+
if (params.count("fork")) {
if ( ! params.count( "logpath" ) ) {
cout << "--fork has to be used with --logpath" << endl;
@@ -252,6 +302,7 @@ namespace mongo {
setupCoreSignals();
setupSignals( true );
}
+
#endif
if (params.count("logpath")) {
if ( logpath.size() == 0 )
@@ -272,9 +323,66 @@ namespace mongo {
dbexit(EXIT_BADOPTIONS);
}
+ cmdLine.keyFile = true;
noauth = false;
}
+ else {
+ cmdLine.keyFile = false;
+ }
+#ifdef MONGO_SSL
+ if (params.count("sslOnNormalPorts") ) {
+ cmdLine.sslOnNormalPorts = true;
+
+ if ( cmdLine.sslPEMKeyPassword.size() == 0 ) {
+ log() << "need sslPEMKeyPassword" << endl;
+ dbexit(EXIT_BADOPTIONS);
+ }
+
+ if ( cmdLine.sslPEMKeyFile.size() == 0 ) {
+ log() << "need sslPEMKeyFile" << endl;
+ dbexit(EXIT_BADOPTIONS);
+ }
+
+ cmdLine.sslServerManager = new SSLManager( false );
+ cmdLine.sslServerManager->setupPEM( cmdLine.sslPEMKeyFile , cmdLine.sslPEMKeyPassword );
+ }
+#endif
+
+ {
+ BSONObjBuilder b;
+ for (po::variables_map::const_iterator it(params.begin()), end(params.end()); it != end; it++){
+ if (!it->second.defaulted()){
+ const string& key = it->first;
+ const po::variable_value& value = it->second;
+ const type_info& type = value.value().type();
+
+ if (type == typeid(string)){
+ if (value.as<string>().empty())
+ b.appendBool(key, true); // boost po uses empty string for flags like --quiet
+ else
+ b.append(key, value.as<string>());
+ }
+ else if (type == typeid(int))
+ b.append(key, value.as<int>());
+ else if (type == typeid(double))
+ b.append(key, value.as<double>());
+ else if (type == typeid(bool))
+ b.appendBool(key, value.as<bool>());
+ else if (type == typeid(long))
+ b.appendNumber(key, (long long)value.as<long>());
+ else if (type == typeid(unsigned))
+ b.appendNumber(key, (long long)value.as<unsigned>());
+ else if (type == typeid(unsigned long long))
+ b.appendNumber(key, (long long)value.as<unsigned long long>());
+ else if (type == typeid(vector<string>))
+ b.append(key, value.as<vector<string> >());
+ else
+ b.append(key, "UNKNOWN TYPE: " + demangleName(type));
+ }
+ }
+ parsedOpts = b.obj();
+ }
{
BSONArrayBuilder b;
@@ -286,6 +394,10 @@ namespace mongo {
return true;
}
+ void printCommandLineOpts() {
+ log() << "options: " << parsedOpts << endl;
+ }
+
void ignoreSignal( int sig ) {}
void setupCoreSignals() {
@@ -303,8 +415,9 @@ namespace mongo {
virtual bool adminOnly() const { return true; }
virtual bool slaveOk() const { return true; }
- virtual bool run(const string&, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ virtual bool run(const string&, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
result.append("argv", argvArray);
+ result.append("parsed", parsedOpts);
return true;
}
diff --git a/db/cmdline.h b/db/cmdline.h
index 4c8c7c4..fdf3f56 100644
--- a/db/cmdline.h
+++ b/db/cmdline.h
@@ -21,26 +21,25 @@
namespace mongo {
+#ifdef MONGO_SSL
+ class SSLManager;
+#endif
+
+
+
/* command line options
*/
/* concurrency: OK/READ */
struct CmdLine {
- CmdLine() :
- port(DefaultDBPort), rest(false), jsonp(false), quiet(false), noTableScan(false), prealloc(true), smallfiles(sizeof(int*) == 4),
- quota(false), quotaFiles(8), cpu(false), durOptions(0), oplogSize(0), defaultProfile(0), slowMS(100), pretouch(0), moveParanoia( true ),
- syncdelay(60), socket("/tmp") {
- // default may change for this later.
-#if defined(_DURABLEDEFAULTON)
- dur = true;
-#else
- dur = false;
-#endif
- }
+ CmdLine();
string binaryName; // mongod or mongos
string cwd; // cwd of when process started
+ // this is suboptimal as someone could rename a binary. todo...
+ bool isMongos() const { return binaryName == "mongos"; }
+
int port; // --port
enum {
DefaultDBPort = 27017,
@@ -70,13 +69,17 @@ namespace mongo {
bool quiet; // --quiet
bool noTableScan; // --notablescan no table scans allowed
bool prealloc; // --noprealloc no preallocation of data files
+ bool preallocj; // --nopreallocj no preallocation of journal files
bool smallfiles; // --smallfiles allocate smaller data files
+ bool configsvr; // --configsvr
+
bool quota; // --quota
int quotaFiles; // --quotaFiles
bool cpu; // --cpu show cpu time periodically
- bool dur; // --dur durability
+ bool dur; // --dur durability (now --journal)
+ unsigned journalCommitInterval; // group/batch commit interval ms
/** --durOptions 7 dump journal and terminate without doing anything further
--durOptions 4 recover and terminate without listening
@@ -86,10 +89,13 @@ namespace mongo {
DurScanOnly = 2, // don't do any real work, just scan and dump if dump specified
DurRecoverOnly = 4, // terminate after recovery step
DurParanoid = 8, // paranoid mode enables extra checks
- DurAlwaysCommit = 16 // do a group commit every time the writelock is released
+ DurAlwaysCommit = 16, // do a group commit every time the writelock is released
+ DurAlwaysRemap = 32 // remap the private view after every group commit (may lag to the next write lock acquisition, but will do all files then)
};
int durOptions; // --durOptions <n> for debugging
+ bool objcheck; // --objcheck
+
long long oplogSize; // --oplogSize
int defaultProfile; // --profile
int slowMS; // --time in ms that is "slow"
@@ -98,8 +104,19 @@ namespace mongo {
bool moveParanoia; // for move chunk paranoia
double syncdelay; // seconds between fsyncs
+ bool noUnixSocket; // --nounixsocket
string socket; // UNIX domain socket directory
+ bool keyFile;
+
+#ifdef MONGO_SSL
+ bool sslOnNormalPorts; // --sslOnNormalPorts
+ string sslPEMKeyFile; // --sslPEMKeyFile
+ string sslPEMKeyPassword; // --sslPEMKeyPassword
+
+ SSLManager* sslServerManager; // currently leaks on close
+#endif
+
static void addGlobalOptions( boost::program_options::options_description& general ,
boost::program_options::options_description& hidden );
@@ -107,6 +124,7 @@ namespace mongo {
boost::program_options::options_description& hidden );
+ static void parseConfigFile( istream &f, stringstream &ss);
/**
* @return true if should run program, false if should exit
*/
@@ -117,12 +135,37 @@ namespace mongo {
boost::program_options::variables_map &output );
};
+ // todo move to cmdline.cpp?
+ inline CmdLine::CmdLine() :
+ port(DefaultDBPort), rest(false), jsonp(false), quiet(false), noTableScan(false), prealloc(true), preallocj(true), smallfiles(sizeof(int*) == 4),
+ configsvr(false),
+ quota(false), quotaFiles(8), cpu(false), durOptions(0), objcheck(false), oplogSize(0), defaultProfile(0), slowMS(100), pretouch(0), moveParanoia( true ),
+ syncdelay(60), noUnixSocket(false), socket("/tmp")
+ {
+ journalCommitInterval = 0; // 0 means use default
+ dur = false;
+#if defined(_DURABLEDEFAULTON)
+ dur = true;
+#endif
+ if( sizeof(void*) == 8 )
+ dur = true;
+#if defined(_DURABLEDEFAULTOFF)
+ dur = false;
+#endif
+
+#ifdef MONGO_SSL
+ sslOnNormalPorts = false;
+ sslServerManager = 0;
+#endif
+ }
+
extern CmdLine cmdLine;
void setupCoreSignals();
string prettyHostName();
+ void printCommandLineOpts();
/**
* used for setParameter
diff --git a/db/commands.cpp b/db/commands.cpp
index 30bdc54..b6c1526 100644
--- a/db/commands.cpp
+++ b/db/commands.cpp
@@ -21,7 +21,7 @@
#include "jsobj.h"
#include "commands.h"
#include "client.h"
-#include "replpair.h"
+#include "replutil.h"
namespace mongo {
@@ -121,55 +121,6 @@ namespace mongo {
help << "no help defined";
}
- bool Command::runAgainstRegistered(const char *ns, BSONObj& jsobj, BSONObjBuilder& anObjBuilder) {
- const char *p = strchr(ns, '.');
- if ( !p ) return false;
- if ( strcmp(p, ".$cmd") != 0 ) return false;
-
- bool ok = false;
-
- BSONElement e = jsobj.firstElement();
- map<string,Command*>::iterator i;
-
- if ( e.eoo() )
- ;
- /* check for properly registered command objects. Note that all the commands below should be
- migrated over to the command object format.
- */
- else if ( (i = _commands->find(e.fieldName())) != _commands->end() ) {
- string errmsg;
- Command *c = i->second;
- if ( c->adminOnly() && !startsWith(ns, "admin.") ) {
- ok = false;
- errmsg = "access denied - use admin db";
- }
- else if ( jsobj.getBoolField( "help" ) ) {
- stringstream help;
- help << "help for: " << e.fieldName() << " ";
- c->help( help );
- anObjBuilder.append( "help" , help.str() );
- }
- else {
- ok = c->run( nsToDatabase( ns ) , jsobj, errmsg, anObjBuilder, false);
- }
-
- BSONObj tmp = anObjBuilder.asTempObj();
- bool have_ok = tmp.hasField("ok");
- bool have_errmsg = tmp.hasField("errmsg");
-
- if (!have_ok)
- anObjBuilder.append( "ok" , ok ? 1.0 : 0.0 );
-
- if ( !ok && !have_errmsg) {
- anObjBuilder.append("errmsg", errmsg);
- uassert_nothrow(errmsg.c_str());
- }
- return true;
- }
-
- return false;
- }
-
Command* Command::findCommand( const string& name ) {
map<string,Command*>::iterator i = _commands->find( name );
if ( i == _commands->end() )
diff --git a/db/commands.h b/db/commands.h
index 42e46a0..c186218 100644
--- a/db/commands.h
+++ b/db/commands.h
@@ -18,15 +18,14 @@
#pragma once
#include "../pch.h"
-
#include "jsobj.h"
#include "../util/timer.h"
+#include "../client/dbclient.h"
namespace mongo {
class BSONObj;
class BSONObjBuilder;
- class BufBuilder;
class Client;
/** mongodb "commands" (sent via db.$cmd.findOne(...))
@@ -47,7 +46,7 @@ namespace mongo {
return value is true if succeeded. if false, set errmsg text.
*/
- virtual bool run(const string& db, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) = 0;
+ virtual bool run(const string& db, BSONObj& cmdObj, int options, string& errmsg, BSONObjBuilder& result, bool fromRepl = false ) = 0;
/*
note: logTheTop() MUST be false if READ
@@ -70,7 +69,7 @@ namespace mongo {
*/
virtual bool localHostOnlyIfNoAuth(const BSONObj& cmdObj) { return false; }
- /* Return true if slaves of a replication pair are allowed to execute the command
+ /* Return true if slaves are allowed to execute the command
(the command directly from a client -- if fromRepl, always allowed).
*/
virtual bool slaveOk() const = 0;
@@ -96,6 +95,11 @@ namespace mongo {
*/
virtual bool requiresAuth() { return true; }
+ /* Return true if a replica set secondary should go into "recovering"
+ (unreadable) state while running this command.
+ */
+ virtual bool maintenanceMode() const { return false; }
+
/** @param webUI expose the command in the web ui as localhost:28017/<name>
@param oldName an optional old, deprecated name for the command
*/
@@ -122,12 +126,30 @@ namespace mongo {
static const map<string,Command*>* commandsByBestName() { return _commandsByBestName; }
static const map<string,Command*>* webCommands() { return _webCommands; }
/** @return if command was found and executed */
- static bool runAgainstRegistered(const char *ns, BSONObj& jsobj, BSONObjBuilder& anObjBuilder);
+ static bool runAgainstRegistered(const char *ns, BSONObj& jsobj, BSONObjBuilder& anObjBuilder, int queryOptions = 0);
static LockType locktype( const string& name );
static Command * findCommand( const string& name );
};
- bool _runCommands(const char *ns, BSONObj& jsobj, BufBuilder &b, BSONObjBuilder& anObjBuilder, bool fromRepl, int queryOptions);
+ class CmdShutdown : public Command {
+ public:
+ virtual bool requiresAuth() { return true; }
+ virtual bool adminOnly() const { return true; }
+ virtual bool localHostOnlyIfNoAuth(const BSONObj& cmdObj) { return true; }
+ virtual bool logTheOp() {
+ return false;
+ }
+ virtual bool slaveOk() const {
+ return true;
+ }
+ virtual LockType locktype() const { return NONE; }
+ virtual void help( stringstream& help ) const;
+ CmdShutdown() : Command("shutdown") {}
+ bool run(const string& dbname, BSONObj& cmdObj, int options, string& errmsg, BSONObjBuilder& result, bool fromRepl);
+ private:
+ bool shutdownHelper();
+ };
+ bool _runCommands(const char *ns, BSONObj& jsobj, BufBuilder &b, BSONObjBuilder& anObjBuilder, bool fromRepl, int queryOptions);
} // namespace mongo
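The signature change that recurs throughout this patch is visible here: Command::run() gains an int options parameter (the wire-protocol query options) between cmdObj and errmsg, and fromRepl gets a default. A minimal sketch of a command written against the new shape (the BSON types are simplified stand-ins; the real base class is the one above):

    // command_shape.cpp -- shape of a Command subclass under the new signature
    #include <iostream>
    #include <sstream>
    #include <string>

    typedef std::string BSONObj;              // simplified stand-ins
    typedef std::ostringstream BSONObjBuilder;

    class Command {
    public:
        explicit Command(const std::string& name) : _name(name) {}
        virtual ~Command() {}
        virtual bool slaveOk() const = 0;
        // new in this patch: the int 'options' parameter, and fromRepl defaulted
        virtual bool run(const std::string& db, BSONObj& cmdObj, int options,
                         std::string& errmsg, BSONObjBuilder& result,
                         bool fromRepl = false) = 0;
    protected:
        std::string _name;
    };

    class CmdPing : public Command {
    public:
        CmdPing() : Command("ping") {}
        virtual bool slaveOk() const { return true; }
        virtual bool run(const std::string& db, BSONObj& cmdObj, int options,
                         std::string& errmsg, BSONObjBuilder& result, bool fromRepl) {
            result << "{ ok: 1 }";
            return true;
        }
    } cmdPing;

    int main() {
        BSONObj obj("{ ping: 1 }");
        BSONObjBuilder b;
        std::string errmsg;
        cmdPing.run("admin", obj, 0, errmsg, b, false);
        std::cout << b.str() << std::endl;
        return 0;
    }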
diff --git a/db/commands/distinct.cpp b/db/commands/distinct.cpp
index 7b2f6a8..48f4405 100644
--- a/db/commands/distinct.cpp
+++ b/db/commands/distinct.cpp
@@ -32,7 +32,7 @@ namespace mongo {
help << "{ distinct : 'collection name' , key : 'a.b' , query : {} }";
}
- bool run(const string& dbname, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl ) {
+ bool run(const string& dbname, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl ) {
Timer t;
string ns = dbname + '.' + cmdObj.firstElement().valuestr();
@@ -63,7 +63,7 @@ namespace mongo {
shared_ptr<Cursor> cursor;
if ( ! query.isEmpty() ) {
- cursor = bestGuessCursor(ns.c_str() , query , BSONObj() );
+ cursor = NamespaceDetailsTransient::getCursor(ns.c_str() , query , BSONObj() );
}
else {
@@ -78,29 +78,33 @@ namespace mongo {
if ( idx.inKeyPattern( key ) ) {
cursor = bestGuessCursor( ns.c_str() , BSONObj() , idx.keyPattern() );
- break;
+ if( cursor.get() ) break;
}
}
if ( ! cursor.get() )
- cursor = bestGuessCursor(ns.c_str() , query , BSONObj() );
+ cursor = NamespaceDetailsTransient::getCursor(ns.c_str() , query , BSONObj() );
}
-
-
- scoped_ptr<ClientCursor> cc (new ClientCursor(QueryOption_NoCursorTimeout, cursor, ns));
+
+ assert( cursor );
+ string cursorName = cursor->toString();
+
+ auto_ptr<ClientCursor> cc (new ClientCursor(QueryOption_NoCursorTimeout, cursor, ns));
while ( cursor->ok() ) {
nscanned++;
bool loadedObject = false;
- if ( !cursor->matcher() || cursor->matcher()->matchesCurrent( cursor.get() , &md ) ) {
+ if ( ( !cursor->matcher() || cursor->matcher()->matchesCurrent( cursor.get() , &md ) ) &&
+ !cursor->getsetdup( cursor->currLoc() ) ) {
n++;
+ BSONObj holder;
BSONElementSet temp;
- loadedObject = ! cc->getFieldsDotted( key , temp );
+ loadedObject = ! cc->getFieldsDotted( key , temp, holder );
for ( BSONElementSet::iterator i=temp.begin(); i!=temp.end(); ++i ) {
BSONElement e = *i;
@@ -118,13 +122,15 @@ namespace mongo {
}
}
- if ( loadedObject || md.loadedObject )
+ if ( loadedObject || md._loadedObject )
nscannedObjects++;
cursor->advance();
- if (!cc->yieldSometimes())
+ if (!cc->yieldSometimes( ClientCursor::MaybeCovered )) {
+ cc.release();
break;
+ }
RARELY killCurrentOp.checkForInterrupt();
}
@@ -139,6 +145,7 @@ namespace mongo {
b.appendNumber( "nscanned" , nscanned );
b.appendNumber( "nscannedObjects" , nscannedObjects );
b.appendNumber( "timems" , t.millis() );
+ b.append( "cursor" , cursorName );
result.append( "stats" , b.obj() );
}
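The distinct changes above also show the ownership rule that motivates the auto_ptr switch and the scoped_ptr/shared_ptr poisoning in clientcursor.h: when yieldSometimes() returns false, the ClientCursor has already been deleted out from under the caller, so the smart pointer must release() rather than delete again. A distilled standalone sketch of the rule (the simulated invalidation stands in for, e.g., a collection drop during the yield):

    // release_after_failed_yield.cpp -- sketch of the auto_ptr ownership rule
    #include <iostream>
    #include <memory>

    struct ClientCursor {
        // returns false when the cursor was invalidated (and deleted) during a yield
        bool yieldSometimes( bool simulateInvalidation ) {
            if ( simulateInvalidation ) {
                delete this;     // in the real code: e.g. a drop during the yield
                return false;
            }
            return true;
        }
    };

    void scan( bool invalidated ) {
        std::auto_ptr<ClientCursor> cc( new ClientCursor() );
        while ( true ) {
            if ( !cc->yieldSometimes( invalidated ) ) {
                cc.release();    // already deleted: forget it, don't double-delete
                break;
            }
            break;               // a real loop would advance the cursor here
        }
        std::cout << "scan done, cursor " << (invalidated ? "lost" : "intact") << std::endl;
    }

    int main() {
        scan(false);
        scan(true);
        return 0;
    }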
diff --git a/db/commands/find_and_modify.cpp b/db/commands/find_and_modify.cpp
new file mode 100644
index 0000000..0cf766f
--- /dev/null
+++ b/db/commands/find_and_modify.cpp
@@ -0,0 +1,153 @@
+// find_and_modify.cpp
+
+/**
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+#include "../commands.h"
+#include "../instance.h"
+#include "../clientcursor.h"
+
+namespace mongo {
+
+    /* Find and modify an object, returning either the old (default) or the new value. */
+ class CmdFindAndModify : public Command {
+ public:
+ virtual void help( stringstream &help ) const {
+ help <<
+ "{ findAndModify: \"collection\", query: {processed:false}, update: {$set: {processed:true}}, new: true}\n"
+ "{ findAndModify: \"collection\", query: {processed:false}, remove: true, sort: {priority:-1}}\n"
+ "Either update or remove is required, all other fields have default values.\n"
+ "Output is in the \"value\" field\n";
+ }
+
+ CmdFindAndModify() : Command("findAndModify", false, "findandmodify") { }
+ virtual bool logTheOp() { return false; } // the modifications will be logged directly
+ virtual bool slaveOk() const { return false; }
+ virtual LockType locktype() const { return WRITE; }
+ virtual bool run(const string& dbname, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool) {
+ static DBDirectClient db;
+
+ string ns = dbname + '.' + cmdObj.firstElement().valuestr();
+
+ BSONObj origQuery = cmdObj.getObjectField("query"); // defaults to {}
+ Query q (origQuery);
+ BSONElement sort = cmdObj["sort"];
+ if (!sort.eoo())
+ q.sort(sort.embeddedObjectUserCheck());
+
+ bool upsert = cmdObj["upsert"].trueValue();
+
+ BSONObj fieldsHolder (cmdObj.getObjectField("fields"));
+ const BSONObj* fields = (fieldsHolder.isEmpty() ? NULL : &fieldsHolder);
+
+ Projection projection;
+ if (fields) {
+ projection.init(fieldsHolder);
+ if (!projection.includeID())
+ fields = NULL; // do projection in post-processing
+ }
+
+ BSONObj out = db.findOne(ns, q, fields);
+ if (out.isEmpty()) {
+ if (!upsert) {
+ result.appendNull("value");
+ return true;
+ }
+
+ BSONElement update = cmdObj["update"];
+ uassert(13329, "upsert mode requires update field", !update.eoo());
+ uassert(13330, "upsert mode requires query field", !origQuery.isEmpty());
+ db.update(ns, origQuery, update.embeddedObjectUserCheck(), true);
+
+ BSONObj gle = db.getLastErrorDetailed();
+ result.append("lastErrorObject", gle);
+ if (gle["err"].type() == String) {
+ errmsg = gle["err"].String();
+ return false;
+ }
+
+ if (cmdObj["new"].trueValue()) {
+ BSONElement _id = gle["upserted"];
+ if (_id.eoo())
+ _id = origQuery["_id"];
+
+ out = db.findOne(ns, QUERY("_id" << _id), fields);
+ }
+
+ }
+ else {
+
+ if (cmdObj["remove"].trueValue()) {
+ uassert(12515, "can't remove and update", cmdObj["update"].eoo());
+ db.remove(ns, QUERY("_id" << out["_id"]), 1);
+
+ BSONObj gle = db.getLastErrorDetailed();
+ result.append("lastErrorObject", gle);
+ if (gle["err"].type() == String) {
+ errmsg = gle["err"].String();
+ return false;
+ }
+
+ }
+ else { // update
+
+ BSONElement queryId = origQuery["_id"];
+ if (queryId.eoo() || getGtLtOp(queryId) != BSONObj::Equality) {
+ // need to include original query for $ positional operator
+
+ BSONObjBuilder b;
+ b.append(out["_id"]);
+ BSONObjIterator it(origQuery);
+ while (it.more()) {
+ BSONElement e = it.next();
+ if (strcmp(e.fieldName(), "_id"))
+ b.append(e);
+ }
+ q = Query(b.obj());
+ }
+
+ if (q.isComplex()) // update doesn't work with complex queries
+ q = Query(q.getFilter().getOwned());
+
+ BSONElement update = cmdObj["update"];
+ uassert(12516, "must specify remove or update", !update.eoo());
+ db.update(ns, q, update.embeddedObjectUserCheck());
+
+ BSONObj gle = db.getLastErrorDetailed();
+ result.append("lastErrorObject", gle);
+ if (gle["err"].type() == String) {
+ errmsg = gle["err"].String();
+ return false;
+ }
+
+ if (cmdObj["new"].trueValue())
+ out = db.findOne(ns, QUERY("_id" << out["_id"]), fields);
+ }
+ }
+
+ if (!fieldsHolder.isEmpty() && !fields){
+ // we need to run projection but haven't yet
+ out = projection.transform(out);
+ }
+
+ result.append("value", out);
+
+ return true;
+ }
+ } cmdFindAndModify;
+
+
+}
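
For reference, the command implemented above can be exercised end to end through the C++ client; a minimal sketch, assuming a connected DBClientConnection `c` (db/collection names and field values are illustrative, not part of this patch):

    BSONObj info;
    bool ok = c.runCommand( "test" ,
                            BSON( "findAndModify" << "jobs"
                                  << "query" << BSON( "processed" << false )
                                  << "update" << BSON( "$set" << BSON( "processed" << true ) )
                                  << "new" << true ) ,
                            info );
    if ( ok ) {
        BSONObj doc = info.getObjectField( "value" ); // the post-update document, since new:true
    }
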
diff --git a/db/commands/group.cpp b/db/commands/group.cpp
index 0cc6ab3..d3e5839 100644
--- a/db/commands/group.cpp
+++ b/db/commands/group.cpp
@@ -19,6 +19,8 @@
#include "../commands.h"
#include "../instance.h"
#include "../queryoptimizer.h"
+#include "../../scripting/engine.h"
+#include "../clientcursor.h"
namespace mongo {
@@ -36,13 +38,14 @@ namespace mongo {
if ( func ) {
BSONObjBuilder b( obj.objsize() + 32 );
b.append( "0" , obj );
- int res = s->invoke( func , b.obj() );
+ const BSONObj& key = b.obj();
+ int res = s->invoke( func , &key, 0 );
uassert( 10041 , (string)"invoke failed in $keyf: " + s->getError() , res == 0 );
int type = s->type("return");
uassert( 10042 , "return of $key has to be an object" , type == Object );
return s->getObject( "return" );
}
- return obj.extractFields( keyPattern , true );
+ return obj.extractFields( keyPattern , true ).getOwned();
}
bool group( string realdbname , const string& ns , const BSONObj& query ,
@@ -85,14 +88,28 @@ namespace mongo {
map<BSONObj,int,BSONObjCmp> map;
list<BSONObj> blah;
- shared_ptr<Cursor> cursor = bestGuessCursor(ns.c_str() , query , BSONObj() );
+ shared_ptr<Cursor> cursor = NamespaceDetailsTransient::getCursor(ns.c_str() , query);
+ ClientCursor::CleanupPointer ccPointer;
+ ccPointer.reset( new ClientCursor( QueryOption_NoCursorTimeout, cursor, ns ) );
while ( cursor->ok() ) {
- if ( cursor->matcher() && ! cursor->matcher()->matchesCurrent( cursor.get() ) ) {
+
+ if ( !ccPointer->yieldSometimes( ClientCursor::MaybeCovered ) ||
+ !cursor->ok() ) {
+ break;
+ }
+
+ if ( ( cursor->matcher() && !cursor->matcher()->matchesCurrent( cursor.get() ) ) ||
+ cursor->getsetdup( cursor->currLoc() ) ) {
cursor->advance();
continue;
}
+ if ( !ccPointer->yieldSometimes( ClientCursor::WillNeed ) ||
+ !cursor->ok() ) {
+ break;
+ }
+
BSONObj obj = cursor->current();
cursor->advance();
@@ -110,10 +127,11 @@ namespace mongo {
s->setObject( "obj" , obj , true );
s->setNumber( "n" , n - 1 );
- if ( s->invoke( f , BSONObj() , 0 , true ) ) {
+ if ( s->invoke( f , 0, 0 , 0 , true ) ) {
throw UserException( 9010 , (string)"reduce invoke failed: " + s->getError() );
}
}
+ ccPointer.reset();
if (!finalize.empty()) {
s->exec( "$finalize = " + finalize , "finalize define" , false , true , true , 100 );
@@ -125,7 +143,7 @@ namespace mongo {
" $arr[i] = ret; "
" } "
"}" );
- s->invoke( g , BSONObj() , 0 , true );
+ s->invoke( g , 0, 0 , 0 , true );
}
result.appendArray( "retval" , s->getObject( "$arr" ) );
@@ -137,8 +155,13 @@ namespace mongo {
return true;
}
- bool run(const string& dbname, BSONObj& jsobj, string& errmsg, BSONObjBuilder& result, bool fromRepl ) {
+ bool run(const string& dbname, BSONObj& jsobj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl ) {
+ if ( !globalScriptEngine ) {
+ errmsg = "server-side JavaScript execution is disabled";
+ return false;
+ }
+
/* db.$cmd.findOne( { group : <p> } ) */
const BSONObj& p = jsobj.firstElement().embeddedObjectUserCheck();
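
The group changes also track the scripting API revision visible throughout this patch: Scope::invoke now takes its argument object and its `this` object by pointer. A hedged sketch of the new call shape — the signature below is inferred from the call sites here and in mr.cpp; the authoritative declaration lives in scripting/engine.h:

    // inferred: int Scope::invoke( ScriptingFunction func , const BSONObj* args ,
    //                              const BSONObj* recv , int timeoutMs = 0 ,
    //                              bool ignoreReturn = false , ... );
    const BSONObj& key = b.obj();             // keep the args object alive across the call
    int res = s->invoke( func , &key , 0 );   // args by pointer, no `this` object
    uassert( 10041 , (string)"invoke failed: " + s->getError() , res == 0 );
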
diff --git a/db/commands/isself.cpp b/db/commands/isself.cpp
index b97f51e..5a868de 100644
--- a/db/commands/isself.cpp
+++ b/db/commands/isself.cpp
@@ -1,7 +1,7 @@
// isself.cpp
#include "pch.h"
-#include "../../util/message.h"
+#include "../../util/net/listen.h"
#include "../commands.h"
#include "../../client/dbclient.h"
@@ -11,6 +11,20 @@
# endif
# include <sys/resource.h>
# include <sys/stat.h>
+
+#include <sys/socket.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <sys/un.h>
+#include <netinet/in.h>
+#include <netinet/tcp.h>
+#include <arpa/inet.h>
+#include <errno.h>
+#include <netdb.h>
+#ifdef __openbsd__
+# include <sys/uio.h>
+#endif
+
#endif
@@ -116,7 +130,7 @@ namespace mongo {
help << "{ _isSelf : 1 } INTERNAL ONLY";
}
- bool run(const string& dbname, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl ) {
+ bool run(const string& dbname, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl ) {
init();
result.append( "id" , _id );
return true;
diff --git a/db/commands/mr.cpp b/db/commands/mr.cpp
index b9f5b59..56e9770 100644
--- a/db/commands/mr.cpp
+++ b/db/commands/mr.cpp
@@ -26,7 +26,7 @@
#include "../queryoptimizer.h"
#include "../matcher.h"
#include "../clientcursor.h"
-#include "../replpair.h"
+#include "../replutil.h"
#include "../../s/d_chunk_manager.h"
#include "../../s/d_logic.h"
@@ -53,6 +53,9 @@ namespace mongo {
_func = _scope->createFunction( _code.c_str() );
uassert( 13598 , str::stream() << "couldn't compile code for: " << _type , _func );
+
+ // install in JS scope so that it can be called in JS mode
+ _scope->setFunction(_type.c_str(), _code.c_str());
}
void JSMapper::init( State * state ) {
@@ -66,8 +69,7 @@ namespace mongo {
void JSMapper::map( const BSONObj& o ) {
Scope * s = _func.scope();
assert( s );
- s->setThis( &o );
- if ( s->invoke( _func.func() , _params , 0 , true ) )
+ if ( s->invoke( _func.func() , &_params, &o , 0 , true, false, true ) )
throw UserException( 9014, str::stream() << "map invoke failed: " + s->getError() );
}
@@ -79,7 +81,7 @@ namespace mongo {
Scope * s = _func.scope();
Scope::NoDBAccess no = s->disableDBAccess( "can't access db inside finalize" );
- s->invokeSafe( _func.func() , o );
+ s->invokeSafe( _func.func() , &o, 0 );
// don't want to use o.objsize() to size b
// since there are many cases where the point of finalize
@@ -90,6 +92,10 @@ namespace mongo {
return b.obj();
}
+ void JSReducer::init( State * state ) {
+ _func.init( state );
+ }
+
/**
* Reduces a list of tuple objects (key, value) to a single tuple {"0": key, "1": value}
*/
@@ -183,7 +189,8 @@ namespace mongo {
Scope * s = _func.scope();
- s->invokeSafe( _func.func() , args );
+ s->invokeSafe( _func.func() , &args, 0 );
+ ++numReduces;
if ( s->type( "return" ) == Array ) {
uasserted( 10075 , "reduce -> multiple not supported yet");
@@ -214,6 +221,11 @@ namespace mongo {
ns = dbname + "." + cmdObj.firstElement().valuestr();
verbose = cmdObj["verbose"].trueValue();
+ jsMode = cmdObj["jsMode"].trueValue();
+
+ jsMaxKeys = 500000;
+ reduceTriggerRatio = 2.0;
+ maxInMemSize = 5 * 1024 * 1024;
uassert( 13602 , "outType is no longer a valid option" , cmdObj["outType"].eoo() );
@@ -255,7 +267,7 @@ namespace mongo {
}
if ( outType != INMEMORY ) { // setup names
- tempLong = str::stream() << (outDB.empty() ? dbname : outDB) << ".tmp.mr." << cmdObj.firstElement().String() << "_" << finalShort << "_" << JOB_NUMBER++;
+ tempLong = str::stream() << (outDB.empty() ? dbname : outDB) << ".tmp.mr." << cmdObj.firstElement().String() << "_" << JOB_NUMBER++;
incLong = tempLong + "_inc";
@@ -308,10 +320,25 @@ namespace mongo {
if ( ! _onDisk )
return;
- _db.dropCollection( _config.tempLong );
+ if (_config.incLong != _config.tempLong) {
+ // create the inc collection and make sure we have index on "0" key
+ _db.dropCollection( _config.incLong );
+ {
+ writelock l( _config.incLong );
+ Client::Context ctx( _config.incLong );
+ string err;
+ if ( ! userCreateNS( _config.incLong.c_str() , BSON( "autoIndexId" << 0 ) , err , false ) ) {
+ uasserted( 13631 , str::stream() << "userCreateNS failed for mr incLong ns: " << _config.incLong << " err: " << err );
+ }
+ }
+ BSONObj sortKey = BSON( "0" << 1 );
+ _db.ensureIndex( _config.incLong , sortKey );
+ }
+
+ // create temp collection
+ _db.dropCollection( _config.tempLong );
{
- // create
writelock lock( _config.tempLong.c_str() );
Client::Context ctx( _config.tempLong.c_str() );
string errmsg;
@@ -320,7 +347,6 @@ namespace mongo {
}
}
-
{
// copy indexes
auto_ptr<DBClientCursor> idx = _db.getIndexes( _config.finalLong );
@@ -355,6 +381,14 @@ namespace mongo {
if ( _onDisk )
return;
+ if (_jsMode) {
+ ScriptingFunction getResult = _scope->createFunction("var map = _mrMap; var result = []; for (key in map) { result.push({_id: key, value: map[key]}) } return result;");
+ _scope->invoke(getResult, 0, 0, 0, false);
+ BSONObj obj = _scope->getObject("return");
+ final.append("results", BSONArray(obj));
+ return;
+ }
+
uassert( 13604 , "too much data for in memory map/reduce" , _size < ( BSONObjMaxUserSize / 2 ) );
BSONArrayBuilder b( (int)(_size * 1.2) ); // _size is data size, doesn't count overhead and keys
@@ -397,8 +431,10 @@ namespace mongo {
// replace: just rename from temp to final collection name, dropping previous collection
_db.dropCollection( _config.finalLong );
BSONObj info;
- uassert( 10076 , "rename failed" ,
- _db.runCommand( "admin" , BSON( "renameCollection" << _config.tempLong << "to" << _config.finalLong ) , info ) );
+ if ( ! _db.runCommand( "admin" , BSON( "renameCollection" << _config.tempLong << "to" << _config.finalLong ) , info ) ) {
+ uasserted( 10076 , str::stream() << "rename failed: " << info );
+ }
+
_db.dropCollection( _config.tempLong );
}
else if ( _config.outType == Config::MERGE ) {
@@ -447,7 +483,7 @@ namespace mongo {
/**
* Insert doc in collection
*/
- void State::insert( const string& ns , BSONObj& o ) {
+ void State::insert( const string& ns , const BSONObj& o ) {
assert( _onDisk );
writelock l( ns );
@@ -457,6 +493,15 @@ namespace mongo {
}
/**
+ * Insert doc into the inc collection, taking proper lock
+ */
+ void State::insertToInc( BSONObj& o ) {
+ writelock l(_config.incLong);
+ Client::Context ctx(_config.incLong);
+ _insertToInc(o);
+ }
+
+ /**
* Insert doc into the inc collection
*/
void State::_insertToInc( BSONObj& o ) {
@@ -465,7 +510,7 @@ namespace mongo {
getDur().commitIfNeeded();
}
- State::State( const Config& c ) : _config( c ), _size(0), _numEmits(0) {
+ State::State( const Config& c ) : _config( c ), _size(0), _dupCount(0), _numEmits(0) {
_temp.reset( new InMemory() );
_onDisk = _config.outType != Config::INMEMORY;
}
@@ -488,6 +533,12 @@ namespace mongo {
error() << "couldn't cleanup after map reduce: " << e.what() << endl;
}
}
+
+ if (_scope) {
+ // cleanup js objects
+ ScriptingFunction cleanup = _scope->createFunction("delete _emitCt; delete _keyCt; delete _mrMap;");
+ _scope->invoke(cleanup, 0, 0, 0, true);
+ }
}
/**
@@ -505,29 +556,50 @@ namespace mongo {
_config.reducer->init( this );
if ( _config.finalizer )
_config.finalizer->init( this );
+ _scope->setBoolean("_doFinal", _config.finalizer);
+
+ // by default start in JS mode, will be faster for small jobs
+ _jsMode = _config.jsMode;
+// _jsMode = true;
+ switchMode(_jsMode);
+
+ // global JS map/reduce hashmap
+ // we use a standard JS object which means keys are only simple types
+// we could also add a real hashmap from a library, though we would still need to add object comparison methods
+// _scope->setObject("_mrMap", BSONObj(), false);
+ ScriptingFunction init = _scope->createFunction("_emitCt = 0; _keyCt = 0; _dupCt = 0; _redCt = 0; if (typeof(_mrMap) === 'undefined') { _mrMap = {}; }");
+ _scope->invoke(init, 0, 0, 0, true);
+
+ // js function to run reduce on all keys
+// redfunc = _scope->createFunction("for (var key in hashmap) { print('Key is ' + key); list = hashmap[key]; ret = reduce(key, list); print('Value is ' + ret); };");
+ _reduceAll = _scope->createFunction("var map = _mrMap; var list, ret; for (var key in map) { list = map[key]; if (list.length != 1) { ret = _reduce(key, list); map[key] = [ret]; ++_redCt; } } _dupCt = 0;");
+ _reduceAndEmit = _scope->createFunction("var map = _mrMap; var list, ret; for (var key in map) { list = map[key]; if (list.length == 1) { ret = list[0]; } else { ret = _reduce(key, list); ++_redCt; } emit(key, ret); }; delete _mrMap;");
+ _reduceAndFinalize = _scope->createFunction("var map = _mrMap; var list, ret; for (var key in map) { list = map[key]; if (list.length == 1) { if (!_doFinal) {continue;} ret = list[0]; } else { ret = _reduce(key, list); ++_redCt; }; if (_doFinal){ ret = _finalize(ret); } map[key] = ret; }");
+ _reduceAndFinalizeAndInsert = _scope->createFunction("var map = _mrMap; var list, ret; for (var key in map) { list = map[key]; if (list.length == 1) { ret = list[0]; } else { ret = _reduce(key, list); ++_redCt; }; if (_doFinal){ ret = _finalize(ret); } _nativeToTemp({_id: key, value: ret}); }");
- _scope->injectNative( "emit" , fast_emit );
-
- if ( _onDisk ) {
- // clear temp collections
- _db.dropCollection( _config.tempLong );
- _db.dropCollection( _config.incLong );
-
- // create the inc collection and make sure we have index on "0" key
- {
- writelock l( _config.incLong );
- Client::Context ctx( _config.incLong );
- string err;
- if ( ! userCreateNS( _config.incLong.c_str() , BSON( "autoIndexId" << 0 ) , err , false ) ) {
- uasserted( 13631 , str::stream() << "userCreateNS failed for mr incLong ns: " << _config.incLong << " err: " << err );
- }
- }
-
- BSONObj sortKey = BSON( "0" << 1 );
- _db.ensureIndex( _config.incLong , sortKey );
+ }
+ void State::switchMode(bool jsMode) {
+ _jsMode = jsMode;
+ if (jsMode) {
+ // emit function that stays in JS
+ _scope->setFunction("emit", "function(key, value) { if (typeof(key) === 'object') { _bailFromJS(key, value); return; }; ++_emitCt; var map = _mrMap; var list = map[key]; if (!list) { ++_keyCt; list = []; map[key] = list; } else { ++_dupCt; } list.push(value); }");
+ _scope->injectNative("_bailFromJS", _bailFromJS, this);
+ } else {
+ // emit now populates C++ map
+ _scope->injectNative( "emit" , fast_emit, this );
}
+ }
+
+ void State::bailFromJS() {
+ log(1) << "M/R: Switching from JS mode to mixed mode" << endl;
+ // reduce and reemit into c++
+ switchMode(false);
+ _scope->invoke(_reduceAndEmit, 0, 0, 0, true);
+ // need to get the real number emitted so far
+ _numEmits = _scope->getNumberInt("_emitCt");
+ _config.reducer->numReduces = _scope->getNumberInt("_redCt");
}
/**
@@ -542,12 +614,40 @@ namespace mongo {
insert( _config.tempLong , res );
}
+ BSONObj _nativeToTemp( const BSONObj& args, void* data ) {
+ State* state = (State*) data;
+ BSONObjIterator it(args);
+ state->insert(state->_config.tempLong, it.next().Obj());
+ return BSONObj();
+ }
+
+// BSONObj _nativeToInc( const BSONObj& args, void* data ) {
+// State* state = (State*) data;
+// BSONObjIterator it(args);
+// const BSONObj& obj = it.next().Obj();
+// state->_insertToInc(const_cast<BSONObj&>(obj));
+// return BSONObj();
+// }
+
/**
* Applies last reduce and finalize.
* After calling this method, the temp collection will be completed.
* If inline, the results will be in the in memory map
*/
void State::finalReduce( CurOp * op , ProgressMeterHolder& pm ) {
+
+ if (_jsMode) {
+ // apply the reduce within JS
+ if (_onDisk) {
+ _scope->injectNative("_nativeToTemp", _nativeToTemp, this);
+ _scope->invoke(_reduceAndFinalizeAndInsert, 0, 0, 0, true);
+ return;
+ } else {
+ _scope->invoke(_reduceAndFinalize, 0, 0, 0, true);
+ return;
+ }
+ }
+
if ( ! _onDisk ) {
// all data has already been reduced, just finalize
if ( _config.finalizer ) {
@@ -619,8 +719,16 @@ namespace mongo {
}
ClientCursor::YieldLock yield (cursor.get());
- // reduce an finalize array
- finalReduce( all );
+
+ try {
+ // reduce and finalize array
+ finalReduce( all );
+ }
+ catch (...) {
+ yield.relock();
+ cursor.release();
+ throw;
+ }
all.clear();
prev = o;
@@ -656,9 +764,14 @@ namespace mongo {
*/
void State::reduceInMemory() {
+ if (_jsMode) {
+ // in js mode the reduce is applied when writing to collection
+ return;
+ }
+
auto_ptr<InMemory> n( new InMemory() ); // for new data
long nSize = 0;
- long dupCount = 0;
+ _dupCount = 0;
for ( InMemory::iterator i=_temp->begin(); i!=_temp->end(); ++i ) {
BSONObj key = i->first;
@@ -674,20 +787,19 @@ namespace mongo {
}
else {
// add to new map
- _add( n.get() , all[0] , nSize, dupCount );
+ _add( n.get() , all[0] , nSize );
}
}
else if ( all.size() > 1 ) {
// several values, reduce and add to map
BSONObj res = _config.reducer->reduce( all );
- _add( n.get() , res , nSize, dupCount );
+ _add( n.get() , res , nSize );
}
}
// swap maps
_temp.reset( n.release() );
_size = nSize;
- _dupCount = dupCount;
}
/**
@@ -718,57 +830,87 @@ namespace mongo {
*/
void State::emit( const BSONObj& a ) {
_numEmits++;
- _add( _temp.get() , a , _size, _dupCount );
+ _add( _temp.get() , a , _size );
}
- void State::_add( InMemory* im, const BSONObj& a , long& size, long& dupCount ) {
+ void State::_add( InMemory* im, const BSONObj& a , long& size ) {
BSONList& all = (*im)[a];
all.push_back( a );
size += a.objsize() + 16;
if (all.size() > 1)
- ++dupCount;
+ ++_dupCount;
}
/**
* this method checks the size of in memory map and potentially flushes to disk
*/
void State::checkSize() {
- if ( _size < 1024 * 50 )
+ if (_jsMode) {
+ // try to reduce if it is beneficial
+ int dupCt = _scope->getNumberInt("_dupCt");
+ int keyCt = _scope->getNumberInt("_keyCt");
+
+ if (keyCt > _config.jsMaxKeys) {
+ // too many keys for JS, switch to mixed
+ _bailFromJS(BSONObj(), this);
+ // then fall through to check map size
+ } else if (dupCt > (keyCt * _config.reduceTriggerRatio)) {
+ // reduce now to lower mem usage
+ _scope->invoke(_reduceAll, 0, 0, 0, true);
+ return;
+ }
+ }
+
+ if (_jsMode)
return;
+ bool dump = _onDisk && _size > _config.maxInMemSize;
// attempt to reduce in memory map, if we've seen duplicates
- if ( _dupCount > 0) {
+ if ( dump || _dupCount > (_temp->size() * _config.reduceTriggerRatio)) {
long before = _size;
reduceInMemory();
log(1) << " mr: did reduceInMemory " << before << " -->> " << _size << endl;
}
- if ( ! _onDisk || _size < 1024 * 100 )
- return;
-
- dumpToInc();
- log(1) << " mr: dumping to db" << endl;
+ // reevaluate size and potentially dump
+ if ( dump && _size > _config.maxInMemSize) {
+ dumpToInc();
+ log(1) << " mr: dumping to db" << endl;
+ }
}
- boost::thread_specific_ptr<State*> _tl;
-
/**
* emit that will be called by js function
*/
- BSONObj fast_emit( const BSONObj& args ) {
+ BSONObj fast_emit( const BSONObj& args, void* data ) {
uassert( 10077 , "fast_emit takes 2 args" , args.nFields() == 2 );
uassert( 13069 , "an emit can't be more than half max bson size" , args.objsize() < ( BSONObjMaxUserSize / 2 ) );
+ State* state = (State*) data;
if ( args.firstElement().type() == Undefined ) {
BSONObjBuilder b( args.objsize() );
b.appendNull( "" );
BSONObjIterator i( args );
i.next();
b.append( i.next() );
- (*_tl)->emit( b.obj() );
+ state->emit( b.obj() );
}
else {
- (*_tl)->emit( args );
+ state->emit( args );
+ }
+ return BSONObj();
+ }
+
+ /**
+ * function is called when we realize we can't use js mode for m/r on the 1st key
+ */
+ BSONObj _bailFromJS( const BSONObj& args, void* data ) {
+ State* state = (State*) data;
+ state->bailFromJS();
+
+ // emit this particular key if there is one
+ if (!args.isEmpty()) {
+ fast_emit(args, data);
}
return BSONObj();
}
@@ -788,7 +930,7 @@ namespace mongo {
help << "http://www.mongodb.org/display/DOCS/MapReduce";
}
virtual LockType locktype() const { return NONE; }
- bool run(const string& dbname , BSONObj& cmd, string& errmsg, BSONObjBuilder& result, bool fromRepl ) {
+ bool run(const string& dbname , BSONObj& cmd, int, string& errmsg, BSONObjBuilder& result, bool fromRepl ) {
Timer t;
Client::GodScope cg;
Client& client = cc();
@@ -806,7 +948,6 @@ namespace mongo {
BSONObjBuilder countsBuilder;
BSONObjBuilder timingBuilder;
State state( config );
-
if ( ! state.sourceExists() ) {
errmsg = "ns doesn't exist";
return false;
@@ -823,12 +964,7 @@ namespace mongo {
try {
state.init();
-
- {
- State** s = new State*();
- s[0] = &state;
- _tl.reset( s );
- }
+ state.prepTempCollection();
wassert( config.limit < 0x4000000 ); // see case on next line to 32 bit unsigned
ProgressMeterHolder pm( op->setMessage( "m/r: (1/3) emit phase" , state.incomingDocuments() ) );
@@ -843,23 +979,26 @@ namespace mongo {
}
// obtain cursor on data to apply mr to, sorted
- shared_ptr<Cursor> temp = bestGuessCursor( config.ns.c_str(), config.filter, config.sort );
+ shared_ptr<Cursor> temp = NamespaceDetailsTransient::getCursor( config.ns.c_str(), config.filter, config.sort );
+ uassert( 15876, str::stream() << "could not create cursor over " << config.ns << " for query : " << config.filter << " sort : " << config.sort, temp.get() );
auto_ptr<ClientCursor> cursor( new ClientCursor( QueryOption_NoCursorTimeout , temp , config.ns.c_str() ) );
+ uassert( 15877, str::stream() << "could not create client cursor over " << config.ns << " for query : " << config.filter << " sort : " << config.sort, cursor.get() );
Timer mt;
// go through each doc
while ( cursor->ok() ) {
- // make sure we dont process duplicates in case data gets moved around during map
- if ( cursor->currentIsDup() ) {
+ if ( ! cursor->currentMatches() ) {
cursor->advance();
continue;
}
- if ( ! cursor->currentMatches() ) {
+ // make sure we don't process duplicates in case data gets moved around during map
+ // TODO: This won't actually help when data gets moved; it's to handle multikeys.
+ if ( cursor->currentIsDup() ) {
cursor->advance();
continue;
}
-
+
BSONObj o = cursor->current();
cursor->advance();
@@ -874,7 +1013,7 @@ namespace mongo {
if ( config.verbose ) mapTime += mt.micros();
num++;
- if ( num % 100 == 0 ) {
+ if ( num % 1000 == 0 ) {
// try to yield lock regularly
ClientCursor::YieldLock yield (cursor.get());
Timer t;
@@ -908,19 +1047,31 @@ namespace mongo {
timingBuilder.append( "emitLoop" , t.millis() );
op->setMessage( "m/r: (2/3) final reduce in memory" );
+ Timer t;
// do reduce in memory
// this will be the last reduce needed for inline mode
state.reduceInMemory();
// if not inline: dump the in memory map to inc collection, all data is on disk
state.dumpToInc();
- state.prepTempCollection();
// final reduce
state.finalReduce( op , pm );
-
- _tl.reset();
+ inReduce += t.micros();
+ countsBuilder.appendNumber( "reduce" , state.numReduces() );
+ timingBuilder.append( "reduceTime" , inReduce / 1000 );
+ timingBuilder.append( "mode" , state.jsMode() ? "js" : "mixed" );
+ }
+ // TODO: The error handling code for queries is v. fragile,
+ // *requires* rethrowing AssertionExceptions - should probably fix.
+ catch ( AssertionException& e ){
+ log() << "mr failed, removing collection" << causedBy(e) << endl;
+ throw e;
+ }
+ catch ( std::exception& e ){
+ log() << "mr failed, removing collection" << causedBy(e) << endl;
+ throw e;
}
catch ( ... ) {
- log() << "mr failed, removing collection" << endl;
+ log() << "mr failed for unknown reason, removing collection" << endl;
throw;
}
@@ -967,113 +1118,127 @@ namespace mongo {
virtual bool slaveOverrideOk() { return true; }
virtual LockType locktype() const { return NONE; }
- bool run(const string& dbname , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool) {
+ bool run(const string& dbname , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool) {
string shardedOutputCollection = cmdObj["shardedOutputCollection"].valuestrsafe();
+ string postProcessCollection = cmdObj["postProcessCollection"].valuestrsafe();
+ bool postProcessOnly = !(postProcessCollection.empty());
Config config( dbname , cmdObj.firstElement().embeddedObjectUserCheck() );
+ State state(config);
+ state.init();
+ if (postProcessOnly) {
+ // the temp collection has been decided by mongos
+ config.tempLong = dbname + "." + postProcessCollection;
+ }
+ // no need for incremental collection because records are already sorted
config.incLong = config.tempLong;
- set<ServerAndQuery> servers;
-
- BSONObjBuilder shardCounts;
- map<string,long long> counts;
-
BSONObj shards = cmdObj["shards"].embeddedObjectUserCheck();
- vector< auto_ptr<DBClientCursor> > shardCursors;
-
- {
- // parse per shard results
- BSONObjIterator i( shards );
- while ( i.more() ) {
- BSONElement e = i.next();
- string shard = e.fieldName();
-
- BSONObj res = e.embeddedObjectUserCheck();
-
- uassert( 10078 , "something bad happened" , shardedOutputCollection == res["result"].valuestrsafe() );
- servers.insert( shard );
- shardCounts.appendAs( res["counts"] , shard );
-
- BSONObjIterator j( res["counts"].embeddedObjectUserCheck() );
- while ( j.more() ) {
- BSONElement temp = j.next();
- counts[temp.fieldName()] += temp.numberLong();
- }
+ BSONObj shardCounts = cmdObj["shardCounts"].embeddedObjectUserCheck();
+ BSONObj counts = cmdObj["counts"].embeddedObjectUserCheck();
+ if (postProcessOnly) {
+ if (!state._db.exists(config.tempLong)) {
+ // nothing to do
+ return 1;
}
+ } else {
+ set<ServerAndQuery> servers;
+ vector< auto_ptr<DBClientCursor> > shardCursors;
- }
-
- State state(config);
- state.prepTempCollection();
+ {
+ // parse per shard results
+ BSONObjIterator i( shards );
+ while ( i.more() ) {
+ BSONElement e = i.next();
+ string shard = e.fieldName();
- {
- // reduce from each stream
+ BSONObj res = e.embeddedObjectUserCheck();
- BSONObj sortKey = BSON( "_id" << 1 );
+ uassert( 10078 , "something bad happened" , shardedOutputCollection == res["result"].valuestrsafe() );
+ servers.insert( shard );
- ParallelSortClusteredCursor cursor( servers , dbname + "." + shardedOutputCollection ,
- Query().sort( sortKey ) );
- cursor.init();
- state.init();
+ }
- BSONList values;
- if (!config.outDB.empty()) {
- BSONObjBuilder loc;
- if ( !config.outDB.empty())
- loc.append( "db" , config.outDB );
- if ( !config.finalShort.empty() )
- loc.append( "collection" , config.finalShort );
- result.append("result", loc.obj());
- }
- else {
- if ( !config.finalShort.empty() )
- result.append( "result" , config.finalShort );
}
- while ( cursor.more() ) {
- BSONObj t = cursor.next().getOwned();
+ state.prepTempCollection();
- if ( values.size() == 0 ) {
- values.push_back( t );
- continue;
+ {
+ // reduce from each stream
+
+ BSONObj sortKey = BSON( "_id" << 1 );
+
+ ParallelSortClusteredCursor cursor( servers , dbname + "." + shardedOutputCollection ,
+ Query().sort( sortKey ) );
+ cursor.init();
+
+ BSONList values;
+ if (!config.outDB.empty()) {
+ BSONObjBuilder loc;
+ if ( !config.outDB.empty())
+ loc.append( "db" , config.outDB );
+ if ( !config.finalShort.empty() )
+ loc.append( "collection" , config.finalShort );
+ result.append("result", loc.obj());
}
-
- if ( t.woSortOrder( *(values.begin()) , sortKey ) == 0 ) {
- values.push_back( t );
- continue;
+ else {
+ if ( !config.finalShort.empty() )
+ result.append( "result" , config.finalShort );
}
+ while ( cursor.more() || !values.empty() ) {
+ BSONObj t;
+ if (cursor.more()) {
+ t = cursor.next().getOwned();
- state.emit( config.reducer->finalReduce( values , config.finalizer.get() ) );
- values.clear();
- values.push_back( t );
- }
+ if ( values.size() == 0 ) {
+ values.push_back( t );
+ continue;
+ }
- if ( values.size() )
- state.emit( config.reducer->finalReduce( values , config.finalizer.get() ) );
- }
+ if ( t.woSortOrder( *(values.begin()) , sortKey ) == 0 ) {
+ values.push_back( t );
+ continue;
+ }
+ }
+ BSONObj res = config.reducer->finalReduce( values , config.finalizer.get());
+ if (state.isOnDisk())
+ state.insertToInc(res);
+ else
+ state.emit(res);
+ values.clear();
+ if (!t.isEmpty())
+ values.push_back( t );
+ }
+ }
- state.dumpToInc();
- state.postProcessCollection();
- state.appendResults( result );
+ for ( set<ServerAndQuery>::iterator i=servers.begin(); i!=servers.end(); i++ ) {
+ ScopedDbConnection conn( i->_server );
+ conn->dropCollection( dbname + "." + shardedOutputCollection );
+ conn.done();
+ }
- for ( set<ServerAndQuery>::iterator i=servers.begin(); i!=servers.end(); i++ ) {
- ScopedDbConnection conn( i->_server );
- conn->dropCollection( dbname + "." + shardedOutputCollection );
- conn.done();
+ result.append( "shardCounts" , shardCounts );
}
- result.append( "shardCounts" , shardCounts.obj() );
+ long long finalCount = state.postProcessCollection();
+ state.appendResults( result );
- {
- BSONObjBuilder c;
- for ( map<string,long long>::iterator i=counts.begin(); i!=counts.end(); i++ ) {
- c.append( i->first , i->second );
- }
- result.append( "counts" , c.obj() );
+ // fix the global counts
+ BSONObjBuilder countsB(32);
+ BSONObjIterator j(counts);
+ while (j.more()) {
+ BSONElement elmt = j.next();
+ if (!strcmp(elmt.fieldName(), "reduce"))
+ countsB.append("reduce", elmt.numberLong() + state.numReduces());
+ else if (!strcmp(elmt.fieldName(), "output"))
+ countsB.append("output", finalCount);
+ else
+ countsB.append(elmt);
}
+ result.append( "counts" , countsB.obj() );
return 1;
}
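
Taken together, the mr.cpp changes let a client opt into the new pure-JS execution path via the jsMode flag; a minimal sketch of issuing such a job from the C++ client (collection name and function bodies are illustrative):

    BSONObj out;
    c.runCommand( "test" ,
                  BSON( "mapreduce" << "events"
                        << "map" << "function() { emit( this.k , 1 ); }"
                        << "reduce" << "function( k , vals ) { var s = 0; for ( var i = 0; i < vals.length; i++ ) s += vals[i]; return s; }"
                        << "out" << "events_counts"
                        << "jsMode" << true ) ,
                  out );
    // out["counts"] now carries the new "reduce" tally; with verbose:true the
    // timing section reports mode "js" or "mixed", per the code above.
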
diff --git a/db/commands/mr.h b/db/commands/mr.h
index f505a45..3fa8146 100644
--- a/db/commands/mr.h
+++ b/db/commands/mr.h
@@ -50,12 +50,15 @@ namespace mongo {
class Reducer : boost::noncopyable {
public:
+ Reducer() : numReduces(0) {}
virtual ~Reducer() {}
virtual void init( State * state ) = 0;
virtual BSONObj reduce( const BSONList& tuples ) = 0;
/** this means it's a final reduce, even if there is no finalizer */
virtual BSONObj finalReduce( const BSONList& tuples , Finalizer * finalizer ) = 0;
+
+ long long numReduces;
};
// ------------ js function implementations -----------
@@ -88,7 +91,7 @@ namespace mongo {
class JSMapper : public Mapper {
public:
- JSMapper( const BSONElement & code ) : _func( "map" , code ) {}
+ JSMapper( const BSONElement & code ) : _func( "_map" , code ) {}
virtual void map( const BSONObj& o );
virtual void init( State * state );
@@ -99,8 +102,8 @@ namespace mongo {
class JSReducer : public Reducer {
public:
- JSReducer( const BSONElement& code ) : _func( "reduce" , code ) {}
- virtual void init( State * state ) { _func.init( state ); }
+ JSReducer( const BSONElement& code ) : _func( "_reduce" , code ) {}
+ virtual void init( State * state );
virtual BSONObj reduce( const BSONList& tuples );
virtual BSONObj finalReduce( const BSONList& tuples , Finalizer * finalizer );
@@ -115,12 +118,11 @@ namespace mongo {
void _reduce( const BSONList& values , BSONObj& key , int& endSizeEstimate );
JSFunction _func;
-
};
class JSFinalizer : public Finalizer {
public:
- JSFinalizer( const BSONElement& code ) : _func( "finalize" , code ) {}
+ JSFinalizer( const BSONElement& code ) : _func( "_finalize" , code ) {}
virtual BSONObj finalize( const BSONObj& o );
virtual void init( State * state ) { _func.init( state ); }
private:
@@ -153,6 +155,7 @@ namespace mongo {
// options
bool verbose;
+ bool jsMode;
// query options
@@ -178,6 +181,13 @@ namespace mongo {
string outDB;
+ // max number of keys allowed in JS map before switching mode
+ long jsMaxKeys;
+ // ratio of duplicates vs unique keys before reduce is triggered in js mode
+ float reduceTriggerRatio;
+ // maximum size of map before it gets dumped to disk
+ long maxInMemSize;
+
enum { REPLACE , // atomically replace the collection
MERGE , // merge keys, override dups
REDUCE , // merge keys, reduce dups
@@ -225,6 +235,8 @@ namespace mongo {
* transfers in memory storage to temp collection
*/
void dumpToInc();
+ void insertToInc( BSONObj& o );
+ void _insertToInc( BSONObj& o );
// ------ reduce stage -----------
@@ -252,7 +264,7 @@ namespace mongo {
/**
* inserts with correct replication semantics
*/
- void insert( const string& ns , BSONObj& o );
+ void insert( const string& ns , const BSONObj& o );
// ------ simple accessors -----
@@ -263,27 +275,38 @@ namespace mongo {
const bool isOnDisk() { return _onDisk; }
- long long numEmits() const { return _numEmits; }
+ long long numEmits() const { if (_jsMode) return _scope->getNumberLongLong("_emitCt"); return _numEmits; }
+ long long numReduces() const { if (_jsMode) return _scope->getNumberLongLong("_redCt"); return _config.reducer->numReduces; }
+
+ bool jsMode() {return _jsMode;}
+ void switchMode(bool jsMode);
+ void bailFromJS();
+
+ const Config& _config;
+ DBDirectClient _db;
protected:
- void _insertToInc( BSONObj& o );
- static void _add( InMemory* im , const BSONObj& a , long& size, long& dupCount );
+ void _add( InMemory* im , const BSONObj& a , long& size );
scoped_ptr<Scope> _scope;
- const Config& _config;
bool _onDisk; // if the end result of this map reduce is disk or not
- DBDirectClient _db;
-
scoped_ptr<InMemory> _temp;
long _size; // bytes in _temp
long _dupCount; // number of duplicate key entries
long long _numEmits;
+
+ bool _jsMode;
+ ScriptingFunction _reduceAll;
+ ScriptingFunction _reduceAndEmit;
+ ScriptingFunction _reduceAndFinalize;
+ ScriptingFunction _reduceAndFinalizeAndInsert;
};
- BSONObj fast_emit( const BSONObj& args );
+ BSONObj fast_emit( const BSONObj& args, void* data );
+ BSONObj _bailFromJS( const BSONObj& args, void* data );
} // end mr namespace
}
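
The defaults these three knobs receive in Config (see the mr.cpp hunk above) and the mixed-mode triggers they feed, restated as a sketch for quick reference:

    jsMaxKeys          = 500000;           // > 500k distinct keys: _bailFromJS into mixed mode
    reduceTriggerRatio = 2.0;              // dupCt > keyCt * ratio: run _reduceAll in place
    maxInMemSize       = 5 * 1024 * 1024;  // _size past ~5MB: reduce, then dumpToInc()
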
diff --git a/db/common.cpp b/db/common.cpp
index 44bc54d..0f82bef 100644
--- a/db/common.cpp
+++ b/db/common.cpp
@@ -1,4 +1,5 @@
-// common.cpp
+// @file common.cpp
+
/*
* Copyright (C) 2010 10gen Inc.
*
@@ -17,17 +18,51 @@
#include "pch.h"
#include "concurrency.h"
+#include "jsobjmanipulator.h"
/**
* this just has globals
*/
namespace mongo {
+ /** called by mongos, mongod, test. do not call from clients and such.
+ invoked before just about everything except global var construction.
+ */
+ void doPreServerStatupInits() {
+ }
+
/* we use new here so we don't have to worry about destructor orders at program shutdown */
- MongoMutex &dbMutex( *(new MongoMutex("rw:dbMutex")) );
+ MongoMutex &dbMutex( *(new MongoMutex("dbMutex")) );
MongoMutex::MongoMutex(const char *name) : _m(name) {
+ static int n = 0;
+ assert( ++n == 1 ); // releasingWriteLock below assumes MongoMutex is a singleton and uses the dbMutex ref above
_remapPrivateViewRequested = false;
}
+ // OpTime::now() uses dbMutex, thus it is in this file, not in the cpp files used by drivers and such
+ void BSONElementManipulator::initTimestamp() {
+ massert( 10332 , "Expected CurrentTime type", _element.type() == Timestamp );
+ unsigned long long &timestamp = *( reinterpret_cast< unsigned long long* >( value() ) );
+ if ( timestamp == 0 )
+ timestamp = OpTime::now().asDate();
+ }
+
+ NOINLINE_DECL OpTime OpTime::skewed() {
+ bool toLog = false;
+ ONCE toLog = true;
+ RARELY toLog = true;
+ last.i++;
+ if ( last.i & 0x80000000 )
+ toLog = true;
+ if ( toLog ) {
+ log() << "clock skew detected prev: " << last.secs << " now: " << (unsigned) time(0) << endl;
+ }
+ if ( last.i & 0x80000000 ) {
+ log() << "error large clock skew detected, shutting down" << endl;
+ throw ClockSkewException();
+ }
+ return last;
+ }
+
}
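
ONCE and RARELY above are the usual log-throttling macros (run the statement once, or only every so many passes), so the guard arithmetic in skewed() reduces to a sketch like this:

    last.i++;                          // same stale second, next increment
    if ( last.i & 0x80000000 )         // ~2^31 ops issued against a skewed clock
        throw ClockSkewException();    // refuse to wrap the increment; shut down
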
diff --git a/db/compact.cpp b/db/compact.cpp
index 6bafd91..c6e5f77 100644
--- a/db/compact.cpp
+++ b/db/compact.cpp
@@ -1,4 +1,4 @@
-/* @file compact.cpp
+/** @file compact.cpp
compaction of deleted space in pdfiles (datafiles)
*/
@@ -25,174 +25,273 @@
#include "concurrency.h"
#include "commands.h"
#include "curop-inl.h"
+#include "background.h"
+#include "extsort.h"
+#include "compact.h"
#include "../util/concurrency/task.h"
namespace mongo {
- class CompactJob : public task::Task {
- public:
- CompactJob(string ns) : _ns(ns) { }
- private:
- virtual string name() const { return "compact"; }
- virtual void doWork();
- NamespaceDetails * beginBlock();
- void doBatch();
- void prep();
- const string _ns;
- unsigned long long _nrecords;
- unsigned long long _ncompacted;
- DiskLoc _firstExtent;
- };
+ char faux; // sink for the page-in reads below so the compiler can't optimize them away
- // lock & set context first. this checks that collection still exists, and that it hasn't
- // morphed into a capped collection between locks (which is possible)
- NamespaceDetails * CompactJob::beginBlock() {
- NamespaceDetails *nsd = nsdetails(_ns.c_str());
- if( nsd == 0 ) throw "ns no longer present";
- if( nsd->firstExtent.isNull() )
- throw "no first extent";
- if( nsd->capped )
- throw "capped collection";
- return nsd;
- }
+ void addRecordToRecListInExtent(Record *r, DiskLoc loc);
+ DiskLoc allocateSpaceForANewRecord(const char *ns, NamespaceDetails *d, int lenWHdr, bool god);
+ void freeExtents(DiskLoc firstExt, DiskLoc lastExt);
+
+ /** @return number of skipped (invalid) documents */
+ unsigned compactExtent(const char *ns, NamespaceDetails *d, const DiskLoc ext, int n,
+ const scoped_array<IndexSpec> &indexSpecs,
+ scoped_array<SortPhaseOne>& phase1, int nidx, bool validate)
+ {
+ log() << "compact extent #" << n << endl;
+
+ Extent *e = ext.ext();
+ e->assertOk();
+ assert( e->validates() );
+ unsigned skipped = 0;
- void CompactJob::doBatch() {
- unsigned n = 0;
{
- /* pre-touch records in a read lock so that paging happens in read not write lock.
- note we are only touching the records though; if indexes aren't in RAM, they will
- page later. So the concept is only partial.
- */
- readlock lk;
+ // the next/prev pointers within the extent might not be in order, so we first page the whole thing in
+ // sequentially
+ log() << "compact paging in len=" << e->length/1000000.0 << "MB" << endl;
Timer t;
- Client::Context ctx(_ns);
- NamespaceDetails *nsd = beginBlock();
- if( nsd->firstExtent != _firstExtent ) {
- // TEMP DEV - stop after 1st extent
- throw "change of first extent";
- }
- DiskLoc loc = nsd->firstExtent.ext()->firstRecord;
- while( !loc.isNull() ) {
- Record *r = loc.rec();
- loc = r->getNext(loc);
- if( ++n >= 100 || (n % 8 == 0 && t.millis() > 50) )
- break;
+ MAdvise adv(e, e->length, MAdvise::Sequential);
+ const char *p = (const char *) e;
+ for( int i = 0; i < e->length; i += 4096 ) {
+ faux += p[i]; // touch one byte per 4KB page so it is actually read in
}
+ int ms = t.millis();
+ if( ms > 1000 )
+ log() << "compact end paging in " << ms << "ms " << e->length/1000000.0/ms << "MB/sec" << endl;
}
+
{
- writelock lk;
- Client::Context ctx(_ns);
- NamespaceDetails *nsd = beginBlock();
- for( unsigned i = 0; i < n; i++ ) {
- if( nsd->firstExtent != _firstExtent ) {
- // TEMP DEV - stop after 1st extent
- throw "change of first extent (or it is now null)";
+ log() << "compact copying records" << endl;
+ unsigned totalSize = 0;
+ int nrecs = 0;
+ DiskLoc L = e->firstRecord;
+ if( !L.isNull() )
+ while( 1 ) {
+ Record *recOld = L.rec();
+ L = recOld->nextInExtent(L);
+ nrecs++;
+ BSONObj objOld(recOld);
+
+ if( !validate || objOld.valid() ) {
+ unsigned sz = objOld.objsize();
+ unsigned lenWHdr = sz + Record::HeaderSize;
+ totalSize += lenWHdr;
+ DiskLoc extentLoc;
+ DiskLoc loc = allocateSpaceForANewRecord(ns, d, lenWHdr, false);
+ uassert(14024, "compact error out of space during compaction", !loc.isNull());
+ Record *recNew = loc.rec();
+ recNew = (Record *) getDur().writingPtr(recNew, lenWHdr);
+ addRecordToRecListInExtent(recNew, loc);
+ memcpy(recNew->data, objOld.objdata(), sz);
+
+ {
+ // extract keys for all indexes we will be rebuilding
+ for( int x = 0; x < nidx; x++ ) {
+ phase1[x].addKeys(indexSpecs[x], objOld, loc);
+ }
+ }
}
- DiskLoc loc = nsd->firstExtent.ext()->firstRecord;
- Record *rec = loc.rec();
- BSONObj o = loc.obj().getOwned(); // todo: inefficient, double mem copy...
- try {
- theDataFileMgr.deleteRecord(_ns.c_str(), rec, loc, false);
+ else {
+ if( ++skipped <= 10 )
+ log() << "compact skipping invalid object" << endl;
}
- catch(DBException&) { throw "error deleting record"; }
- try {
- theDataFileMgr.insertNoReturnVal(_ns.c_str(), o);
+
+ if( L.isNull() ) {
+ // we just did the very last record from the old extent. it's still pointed to
+ // by the old extent ext, but that will be fixed below after this loop
+ break;
}
- catch(DBException&) {
- /* todo: save the record somehow??? try again with 'avoid' logic? */
- log() << "compact: error re-inserting record ns:" << _ns << " n:" << _nrecords << " _id:" << o["_id"].toString() << endl;
- throw "error re-inserting record";
+
+ // remove the old records (orphan them) periodically so our commit block doesn't get too large
+ bool stopping = false;
+ RARELY stopping = *killCurrentOp.checkForInterruptNoAssert(false) != 0;
+ if( stopping || getDur().aCommitIsNeeded() ) {
+ e->firstRecord.writing() = L;
+ Record *r = L.rec();
+ getDur().writingInt(r->prevOfs) = DiskLoc::NullOfs;
+ getDur().commitIfNeeded();
+ killCurrentOp.checkForInterrupt(false);
}
- ++_ncompacted;
- if( killCurrentOp.globalInterruptCheck() )
- throw "interrupted";
}
+
+ assert( d->firstExtent == ext );
+ assert( d->lastExtent != ext );
+ DiskLoc newFirst = e->xnext;
+ d->firstExtent.writing() = newFirst;
+ newFirst.ext()->xprev.writing().Null();
+ getDur().writing(e)->markEmpty();
+ freeExtents(ext,ext);
+ getDur().commitIfNeeded();
+
+ log() << "compact " << nrecs << " documents " << totalSize/1000000.0 << "MB" << endl;
}
- }
- void CompactJob::prep() {
- readlock lk;
- Client::Context ctx(_ns);
- NamespaceDetails *nsd = beginBlock();
- DiskLoc L = nsd->firstExtent;
- assert( !L.isNull() );
- _firstExtent = L;
- _nrecords = nsd->stats.nrecords;
- _ncompacted = 0;
+ return skipped;
}
- static mutex m("compact");
- static volatile bool running;
-
- void CompactJob::doWork() {
- Client::initThread("compact");
- cc().curop()->reset();
- cc().curop()->setNS(_ns.c_str());
- cc().curop()->markCommand();
- sleepsecs(60);
- try {
- prep();
- while( _ncompacted < _nrecords )
- doBatch();
+ extern SortPhaseOne *precalced;
+
+ bool _compact(const char *ns, NamespaceDetails *d, string& errmsg, bool validate, BSONObjBuilder& result) {
+ //int les = d->lastExtentSize;
+
+ // this is a big job, so might as well make things tidy before we start just to be nice.
+ getDur().commitNow();
+
+ list<DiskLoc> extents;
+ for( DiskLoc L = d->firstExtent; !L.isNull(); L = L.ext()->xnext )
+ extents.push_back(L);
+ log() << "compact " << extents.size() << " extents" << endl;
+
+ ProgressMeterHolder pm( cc().curop()->setMessage( "compact extent" , extents.size() ) );
+
+ // same data, but might perform a little differently after compact?
+ NamespaceDetailsTransient::get_w(ns).clearQueryCache();
+
+ int nidx = d->nIndexes;
+ scoped_array<IndexSpec> indexSpecs( new IndexSpec[nidx] );
+ scoped_array<SortPhaseOne> phase1( new SortPhaseOne[nidx] );
+ {
+ NamespaceDetails::IndexIterator ii = d->ii();
+ int x = 0;
+ while( ii.more() ) {
+ BSONObjBuilder b;
+ IndexDetails& idx = ii.next();
+ BSONObj::iterator i(idx.info.obj());
+ while( i.more() ) {
+ BSONElement e = i.next();
+ if( !str::equals(e.fieldName(), "v") && !str::equals(e.fieldName(), "background") ) {
+ b.append(e);
+ }
+ }
+ BSONObj o = b.obj().getOwned();
+ phase1[x].sorter.reset( new BSONObjExternalSorter( idx.idxInterface(), o.getObjectField("key") ) );
+ phase1[x].sorter->hintNumObjects( d->stats.nrecords );
+ indexSpecs[x++].reset(o);
+ }
}
- catch(const char *p) {
- log() << "info: exception compact " << p << endl;
+
+ log() << "compact orphan deleted lists" << endl;
+ for( int i = 0; i < Buckets; i++ ) {
+ d->deletedList[i].writing().Null();
}
- catch(...) {
- log() << "info: exception compact" << endl;
+
+ // before dropping indexes, at least make sure we can allocate one extent!
+ uassert(14025, "compact error no space available to allocate", !allocateSpaceForANewRecord(ns, d, Record::HeaderSize+1, false).isNull());
+
+ // note that the drop indexes call also invalidates all clientcursors for the namespace, which is important and wanted here
+ log() << "compact dropping indexes" << endl;
+ BSONObjBuilder b;
+ if( !dropIndexes(d, ns, "*", errmsg, b, true) ) {
+ errmsg = "compact drop indexes failed";
+ log() << errmsg << endl;
+ return false;
}
- mongo::running = false;
- cc().shutdown();
- }
- /* --- CompactCmd --- */
+ getDur().commitNow();
- class CompactCmd : public Command {
- public:
- virtual bool run(const string& db, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
- string coll = cmdObj.firstElement().valuestr();
- if( coll.empty() || db.empty() ) {
- errmsg = "no collection name specified";
- return false;
+ long long skipped = 0;
+ int n = 0;
+ for( list<DiskLoc>::iterator i = extents.begin(); i != extents.end(); i++ ) {
+ skipped += compactExtent(ns, d, *i, n++, indexSpecs, phase1, nidx, validate);
+ pm.hit();
+ }
+
+ if( skipped ) {
+ result.append("invalidObjects", skipped);
+ }
+
+ assert( d->firstExtent.ext()->xprev.isNull() );
+
+ // indexes will do their own progress meter?
+ pm.finished();
+
+ // build indexes
+ NamespaceString s(ns);
+ string si = s.db + ".system.indexes";
+ for( int i = 0; i < nidx; i++ ) {
+ killCurrentOp.checkForInterrupt(false);
+ BSONObj info = indexSpecs[i].info;
+ log() << "compact create index " << info["key"].Obj().toString() << endl;
+ try {
+ precalced = &phase1[i];
+ theDataFileMgr.insert(si.c_str(), info.objdata(), info.objsize());
}
- string ns = db + '.' + coll;
- assert( isANormalNSName(ns.c_str()) );
- {
- readlock lk;
- Client::Context ctx(ns);
- if( nsdetails(ns.c_str()) == 0 ) {
- errmsg = "namespace " + ns + " does not exist";
- return false;
- }
+ catch(...) {
+ precalced = 0;
+ throw;
}
- {
- scoped_lock lk(m);
- if( running ) {
- errmsg = "a compaction is already running";
- return false;
- }
- running = true;
- task::fork( new CompactJob(ns) );
- return true;
+ precalced = 0;
+ }
+
+ return true;
+ }
+
+ bool compact(const string& ns, string &errmsg, bool validate, BSONObjBuilder& result) {
+ massert( 14028, "bad ns", NamespaceString::normal(ns.c_str()) );
+ massert( 14027, "can't compact a system namespace", !str::contains(ns, ".system.") ); // items in system.indexes cannot be moved; there are pointers to those disklocs in NamespaceDetails
+
+ bool ok;
+ {
+ writelock lk;
+ BackgroundOperation::assertNoBgOpInProgForNs(ns.c_str());
+ Client::Context ctx(ns);
+ NamespaceDetails *d = nsdetails(ns.c_str());
+ massert( 13660, str::stream() << "namespace " << ns << " does not exist", d );
+ massert( 13661, "cannot compact capped collection", !d->capped );
+ log() << "compact " << ns << " begin" << endl;
+ try {
+ ok = _compact(ns.c_str(), d, errmsg, validate, result);
}
- errmsg = "not done";
- return false;
+ catch(...) {
+ log() << "compact " << ns << " end (with error)" << endl;
+ throw;
+ }
+ log() << "compact " << ns << " end" << endl;
}
+ return ok;
+ }
+
+ bool isCurrentlyAReplSetPrimary();
+ class CompactCmd : public Command {
+ public:
virtual LockType locktype() const { return NONE; }
virtual bool adminOnly() const { return false; }
virtual bool slaveOk() const { return true; }
+ virtual bool maintenanceMode() const { return true; }
virtual bool logTheOp() { return false; }
virtual void help( stringstream& help ) const {
- help << "compact / defragment a collection in the background, slowly, attempting to minimize disruptions to other operations\n"
- "{ compact : <collection> }";
+ help << "compact collection\n"
+ "warning: this operation blocks the server and is slow. you can cancel with cancelOp()\n"
+ "{ compact : <collection_name>, [force:true], [validate:true] }\n"
+ " force - allows to run on a replica set primary\n"
+ " validate - check records are noncorrupt before adding to newly compacting extents. slower but safer (default is true in this version)\n";
}
virtual bool requiresAuth() { return true; }
-
- /** @param webUI expose the command in the web ui as localhost:28017/<name>
- @param oldName an optional old, deprecated name for the command
- */
CompactCmd() : Command("compact") { }
+
+ virtual bool run(const string& db, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ string coll = cmdObj.firstElement().valuestr();
+ if( coll.empty() || db.empty() ) {
+ errmsg = "no collection name specified";
+ return false;
+ }
+
+ if( isCurrentlyAReplSetPrimary() && !cmdObj["force"].trueValue() ) {
+ errmsg = "will not run compact on an active replica set primary as this is a slow blocking operation. use force:true to force";
+ return false;
+ }
+
+ string ns = db + '.' + coll;
+ bool validate = !cmdObj.hasElement("validate") || cmdObj["validate"].trueValue(); // default is true at the moment
+ bool ok = compact(ns, errmsg, validate, result);
+ return ok;
+ }
};
static CompactCmd compactCmd;
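
A minimal sketch of invoking the new synchronous command (names illustrative; per the help text, force:true is only needed on a replica set primary):

    BSONObj res;
    bool ok = c.runCommand( "test" ,
                            BSON( "compact" << "mycoll" << "validate" << true ) ,
                            res );
    // on success, res carries "invalidObjects" if corrupt records were skipped
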
diff --git a/db/compact.h b/db/compact.h
new file mode 100644
index 0000000..7bf49c8
--- /dev/null
+++ b/db/compact.h
@@ -0,0 +1,50 @@
+// compact.h
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+namespace mongo {
+
+ /** for bottom-up fastBuildIndex (where we presort keys) */
+ struct SortPhaseOne {
+ SortPhaseOne() {
+ n = 0;
+ nkeys = 0;
+ multi = false;
+ }
+ shared_ptr<BSONObjExternalSorter> sorter;
+ unsigned long long n; // # of records
+ unsigned long long nkeys;
+ bool multi; // multikey index
+
+ void addKeys(const IndexSpec& spec, const BSONObj& o, DiskLoc loc) {
+ BSONObjSet keys;
+ spec.getKeys(o, keys);
+ int k = 0;
+ for ( BSONObjSet::iterator i=keys.begin(); i != keys.end(); i++ ) {
+ if( ++k == 2 ) {
+ multi = true;
+ }
+ sorter->add(*i, loc);
+ nkeys++;
+ }
+ n++;
+ }
+ };
+
+}
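
How the compact path above drives this structure, in outline (sketch; idx, keyPattern, spec, obj and loc stand in for the locals used in compact.cpp):

    SortPhaseOne phase;
    phase.sorter.reset( new BSONObjExternalSorter( idx.idxInterface() , keyPattern ) );
    phase.sorter->hintNumObjects( d->stats.nrecords );
    phase.addKeys( spec , obj , loc );   // once per surviving record during the copy pass
    precalced = &phase;                  // handed to the bottom-up index build, then cleared
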
diff --git a/db/concurrency.h b/db/concurrency.h
index 39cd853..3d6d02d 100644
--- a/db/concurrency.h
+++ b/db/concurrency.h
@@ -78,15 +78,11 @@ namespace mongo {
namespace mongo {
- inline void dbunlocking_write() { }
- inline void dbunlocking_read() { }
-
struct writelock {
writelock() { dbMutex.lock(); }
writelock(const string& ns) { dbMutex.lock(); }
~writelock() {
DESTRUCTOR_GUARD(
- dbunlocking_write();
dbMutex.unlock();
);
}
@@ -99,7 +95,6 @@ namespace mongo {
readlock() { dbMutex.lock_shared(); }
~readlock() {
DESTRUCTOR_GUARD(
- dbunlocking_read();
dbMutex.unlock_shared();
);
}
@@ -111,7 +106,6 @@ namespace mongo {
}
~readlocktry() {
if ( _got ) {
- dbunlocking_read();
dbMutex.unlock_shared();
}
}
@@ -126,7 +120,6 @@ namespace mongo {
}
~writelocktry() {
if ( _got ) {
- dbunlocking_read();
dbMutex.unlock();
}
}
@@ -175,11 +168,9 @@ namespace mongo {
~mongolock() {
DESTRUCTOR_GUARD(
if( _writelock ) {
- dbunlocking_write();
dbMutex.unlock();
}
else {
- dbunlocking_read();
dbMutex.unlock_shared();
}
);
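
With the empty dbunlocking_* hooks gone, these scopes are plain RAII around dbMutex; the usage pattern seen throughout this patch, in isolation (sketch):

    {
        writelock lk( ns );        // dbMutex taken exclusively; released by the destructor
        Client::Context ctx( ns );
        // ... write path ...
    }                              // DESTRUCTOR_GUARD'd unlock runs even on exception
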
diff --git a/db/curop.h b/db/curop.h
index c6e949b..2717d78 100644
--- a/db/curop.h
+++ b/db/curop.h
@@ -24,16 +24,56 @@
#include "../bson/util/atomic_int.h"
#include "../util/concurrency/spin_lock.h"
#include "../util/time_support.h"
-#include "db.h"
-#include "../scripting/engine.h"
+#include "../util/net/hostandport.h"
namespace mongo {
+ class CurOp;
+
/* lifespan is different than CurOp because of recursives with DBDirectClient */
class OpDebug {
public:
- StringBuilder str;
- void reset() { str.reset(); }
+ OpDebug() : ns(""){ reset(); }
+
+ void reset();
+
+ string toString() const;
+ void append( const CurOp& curop, BSONObjBuilder& b ) const;
+
+ // -------------------
+
+ StringBuilder extra; // weird things we need to fix later
+
+ // basic options
+ int op;
+ bool iscommand;
+ Namespace ns;
+ BSONObj query;
+ BSONObj updateobj;
+
+ // detailed options
+ long long cursorid;
+ int ntoreturn;
+ int ntoskip;
+ bool exhaust;
+
+ // debugging/profile info
+ int nscanned;
+ bool idhack;
+ bool scanAndOrder;
+ bool moved;
+ bool fastmod;
+ bool fastmodinsert;
+ bool upsert;
+ unsigned keyUpdates;
+
+ // error handling
+ ExceptionInfo exceptionInfo;
+
+ // response info
+ int executionTime;
+ int nreturned;
+ int responseLength;
};
/**
@@ -81,7 +121,7 @@ namespace mongo {
int size() const { return *_size; }
bool have() const { return size() > 0; }
- BSONObj get() {
+ BSONObj get() const {
_lock.lock();
BSONObj o;
try {
@@ -95,22 +135,15 @@ namespace mongo {
return o;
}
- void append( BSONObjBuilder& b , const StringData& name ) {
- _lock.lock();
- try {
- BSONObj temp = _get();
- b.append( name , temp );
- _lock.unlock();
- }
- catch ( ... ) {
- _lock.unlock();
- throw;
- }
+ void append( BSONObjBuilder& b , const StringData& name ) const {
+ scoped_spinlock lk(_lock);
+ BSONObj temp = _get();
+ b.append( name , temp );
}
private:
/** you have to be locked when you call this */
- BSONObj _get() {
+ BSONObj _get() const {
int sz = size();
if ( sz == 0 )
return BSONObj();
@@ -122,7 +155,7 @@ namespace mongo {
/** you have to be locked when you call this */
void _reset( int sz ) { _size[0] = sz; }
- SpinLock _lock;
+ mutable SpinLock _lock;
int * _size;
char _buf[512];
};
@@ -137,35 +170,29 @@ namespace mongo {
bool haveQuery() const { return _query.have(); }
BSONObj query() { return _query.get(); }
-
+ void appendQuery( BSONObjBuilder& b , const StringData& name ) const { _query.append( b , name ); }
+
void ensureStarted() {
if ( _start == 0 )
_start = _checkpoint = curTimeMicros64();
}
- void enter( Client::Context * context ) {
- ensureStarted();
- setNS( context->ns() );
- if ( context->_db && context->_db->profile > _dbprofile )
- _dbprofile = context->_db->profile;
- }
+ bool isStarted() const { return _start > 0; }
- void leave( Client::Context * context ) {
- unsigned long long now = curTimeMicros64();
- Top::global.record( _ns , _op , _lockType , now - _checkpoint , _command );
- _checkpoint = now;
- }
+ void enter( Client::Context * context );
+
+ void leave( Client::Context * context );
void reset() {
_reset();
_start = _checkpoint = 0;
- _active = true;
_opNum = _nextOpNum++;
- _ns[0] = '?'; // just in case not set later
+ _ns[0] = 0;
_debug.reset();
_query.reset();
+ _active = true; // this should be last for ui clarity
}
- void reset( const SockAddr & remote, int op ) {
+ void reset( const HostAndPort& remote, int op ) {
reset();
_remote = remote;
_op = op;
@@ -265,6 +292,7 @@ namespace mongo {
CurOp *parent() const { return _wrapped; }
void kill() { _killed = true; }
bool killed() const { return _killed; }
+ void yielded() { _numYields++; }
void setNS(const char *ns) {
strncpy(_ns, ns, Namespace::MaxNsLen);
_ns[Namespace::MaxNsLen] = 0;
@@ -286,12 +314,13 @@ namespace mongo {
int _dbprofile; // 0=off, 1=slow, 2=all
AtomicUInt _opNum;
char _ns[Namespace::MaxNsLen+2];
- struct SockAddr _remote;
+ HostAndPort _remote;
CachedBSONObj _query;
OpDebug _debug;
ThreadSafeString _message;
ProgressMeter _progressMeter;
volatile bool _killed;
+ int _numYields;
void _reset() {
_command = false;
@@ -302,6 +331,7 @@ namespace mongo {
_message = "";
_progressMeter.finished();
_killed = false;
+ _numYields = 0;
}
};
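
The CachedBSONObj::append() rewrite above is the same RAII shift: scoped_spinlock replaces the hand-rolled lock/unlock pairs and their catch-all rethrow. Its shape in isolation (sketch):

    void append( BSONObjBuilder& b , const StringData& name ) const {
        scoped_spinlock lk( _lock );   // released on every exit path, including throws
        b.append( name , _get() );
    }
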
diff --git a/db/cursor.h b/db/cursor.h
index d17b698..9639b26 100644
--- a/db/cursor.h
+++ b/db/cursor.h
@@ -70,6 +70,8 @@ namespace mongo {
return BSONObj();
}
+ virtual bool supportGetMore() = 0;
+
/* called after every query block is iterated -- i.e. between getMore() blocks
so you can note where we are, if necessary.
*/
@@ -78,18 +80,20 @@ namespace mongo {
/* called before query getmore block is iterated */
virtual void checkLocation() { }
- virtual bool supportGetMore() = 0;
virtual bool supportYields() = 0;
+ /** Called before a ClientCursor yield. */
+ virtual bool prepareToYield() { noteLocation(); return supportYields(); }
+
+ /** Called after a ClientCursor yield. */
+ virtual void recoverFromYield() { checkLocation(); }
+
virtual string toString() { return "abstract?"; }
/* used for multikey index traversal to avoid sending back dups. see Matcher::matches().
if a multikey index traversal:
if loc has already been sent, returns true.
otherwise, marks loc as sent.
- @param deep - match was against an array, so we know it is multikey. this is legacy and kept
- for backwards datafile compatibility. 'deep' can be eliminated next time we
- force a data file conversion. 7Jul09
*/
virtual bool getsetdup(DiskLoc loc) = 0;
@@ -115,7 +119,12 @@ namespace mongo {
// matcher() should be checked each time advance() is called.
// Implementations which generate their own matcher should return this
// to avoid a matcher being set manually.
+ // Note that the return values differ subtly here
+
+ // Used when we want fast matcher lookup
virtual CoveredIndexMatcher *matcher() const { return 0; }
+ // Used when we need to share this matcher with someone else
+ virtual shared_ptr< CoveredIndexMatcher > matcherPtr() const { return shared_ptr< CoveredIndexMatcher >(); }
// A convenience function for setting the value of matcher() manually
// so it may accessed later. Implementations which must generate
@@ -123,6 +132,8 @@ namespace mongo {
virtual void setMatcher( shared_ptr< CoveredIndexMatcher > matcher ) {
massert( 13285, "manual matcher config not allowed", false );
}
+
+ virtual void explainDetails( BSONObjBuilder& b ) { return; }
};
// strategy object implementing direction of traversal.
@@ -170,6 +181,7 @@ namespace mongo {
virtual bool supportGetMore() { return true; }
virtual bool supportYields() { return true; }
virtual CoveredIndexMatcher *matcher() const { return _matcher.get(); }
+ virtual shared_ptr< CoveredIndexMatcher > matcherPtr() const { return _matcher; }
virtual void setMatcher( shared_ptr< CoveredIndexMatcher > matcher ) { _matcher = matcher; }
virtual long long nscanned() { return _nscanned; }
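
The prepareToYield()/recoverFromYield() pair added above gives ClientCursor a uniform protocol; the caller-side shape (sketch — the real driver of this sequence lives in clientcursor.cpp):

    if ( c->prepareToYield() ) {     // default: noteLocation(), honoring supportYields()
        // ... release dbMutex, let other operations run, reacquire ...
        c->recoverFromYield();       // default: checkLocation()
    }
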
diff --git a/db/database.cpp b/db/database.cpp
index d164ba5..97b3fa0 100644
--- a/db/database.cpp
+++ b/db/database.cpp
@@ -52,26 +52,9 @@ namespace mongo {
}
newDb = namespaceIndex.exists();
- profile = 0;
-
- {
- vector<string> others;
- getDatabaseNames( others , path );
-
- for ( unsigned i=0; i<others.size(); i++ ) {
-
- if ( strcasecmp( others[i].c_str() , nm ) )
- continue;
-
- if ( strcmp( others[i].c_str() , nm ) == 0 )
- continue;
-
- stringstream ss;
- ss << "db already exists with different case other: [" << others[i] << "] me [" << nm << "]";
- uasserted( DatabaseDifferCaseCode , ss.str() );
- }
- }
+ profile = cmdLine.defaultProfile;
+ checkDuplicateUncasedNames();
// If already exists, open. Otherwise behave as if empty until
// there's a write, then open.
@@ -91,7 +74,49 @@ namespace mongo {
throw;
}
}
+
+ void Database::checkDuplicateUncasedNames() const {
+ string duplicate = duplicateUncasedName( name, path );
+ if ( !duplicate.empty() ) {
+ stringstream ss;
+ ss << "db already exists with different case other: [" << duplicate << "] me [" << name << "]";
+ uasserted( DatabaseDifferCaseCode , ss.str() );
+ }
+ }
+ string Database::duplicateUncasedName( const string &name, const string &path, set< string > *duplicates ) {
+ if ( duplicates ) {
+ duplicates->clear();
+ }
+
+ vector<string> others;
+ getDatabaseNames( others , path );
+
+ set<string> allShortNames;
+ dbHolder.getAllShortNames( allShortNames );
+
+ others.insert( others.end(), allShortNames.begin(), allShortNames.end() );
+
+ for ( unsigned i=0; i<others.size(); i++ ) {
+
+ if ( strcasecmp( others[i].c_str() , name.c_str() ) )
+ continue;
+
+ if ( strcmp( others[i].c_str() , name.c_str() ) == 0 )
+ continue;
+
+ if ( duplicates ) {
+ duplicates->insert( others[i] );
+ } else {
+ return others[i];
+ }
+ }
+ if ( duplicates ) {
+ return duplicates->empty() ? "" : *duplicates->begin();
+ }
+ return "";
+ }
+
boost::filesystem::path Database::fileName( int n ) const {
stringstream ss;
ss << name << '.' << n;
@@ -167,15 +192,33 @@ namespace mongo {
return ret;
}
- MongoDataFile* Database::suitableFile( int sizeNeeded, bool preallocate ) {
+ bool fileIndexExceedsQuota( const char *ns, int fileIndex, bool enforceQuota ) {
+ return
+ cmdLine.quota &&
+ enforceQuota &&
+ fileIndex >= cmdLine.quotaFiles &&
+ // we don't enforce the quota on "special" namespaces as that could lead to problems -- e.g.
+ // rejecting an index insert after inserting the main record.
+ !NamespaceString::special( ns ) &&
+ NamespaceString( ns ).db != "local";
+ }
+
+ MongoDataFile* Database::suitableFile( const char *ns, int sizeNeeded, bool preallocate, bool enforceQuota ) {
// check existing files
for ( int i=numFiles()-1; i>=0; i-- ) {
MongoDataFile* f = getFile( i );
- if ( f->getHeader()->unusedLength >= sizeNeeded )
- return f;
+ if ( f->getHeader()->unusedLength >= sizeNeeded ) {
+ if ( fileIndexExceedsQuota( ns, i-1, enforceQuota ) ) // NOTE i-1 is the value used historically for this check.
+ ;
+ else
+ return f;
+ }
}
+ if ( fileIndexExceedsQuota( ns, numFiles(), enforceQuota ) )
+ uasserted(12501, "quota exceeded");
+
// allocate files until we either get one big enough or hit maxSize
for ( int i = 0; i < 8; i++ ) {
MongoDataFile* f = addAFile( sizeNeeded, preallocate );
@@ -187,6 +230,7 @@ namespace mongo {
return f;
}
+ uasserted(14810, "couldn't allocate space (suitableFile)"); // callers don't check for null return code
return 0;
}
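
Review note: fileIndexExceedsQuota() above fires only when quotas are both enabled and enforced, and deliberately exempts special namespaces and the local db so that, for example, an index insert cannot fail after its main record went in. A standalone sketch of the predicate with example values; isSpecialNs() and dbOf() are crude stand-ins for the NamespaceString checks:

    #include <iostream>
    #include <string>

    // Illustrative stand-ins; the real checks live in NamespaceString.
    bool isSpecialNs(const std::string& ns) { return ns.find(".$") != std::string::npos; }
    std::string dbOf(const std::string& ns) { return ns.substr(0, ns.find('.')); }

    bool exceedsQuota(const std::string& ns, int fileIndex,
                      bool quotaEnabled, int quotaFiles, bool enforce) {
        return quotaEnabled &&
               enforce &&
               fileIndex >= quotaFiles &&
               !isSpecialNs(ns) &&        // e.g. index inserts shouldn't fail mid-write
               dbOf(ns) != "local";
    }

    int main() {
        // With a 4-file quota, the 5th file (index 4) trips the check...
        std::cout << exceedsQuota("test.foo", 4, true, 4, true) << "\n";    // 1
        // ...but the "local" database is always exempt.
        std::cout << exceedsQuota("local.oplog", 4, true, 4, true) << "\n"; // 0
        return 0;
    }
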
@@ -198,11 +242,11 @@ namespace mongo {
}
- Extent* Database::allocExtent( const char *ns, int size, bool capped ) {
+ Extent* Database::allocExtent( const char *ns, int size, bool capped, bool enforceQuota ) {
Extent *e = DataFileMgr::allocFromFreeList( ns, size, capped );
if( e )
return e;
- return suitableFile( size, !capped )->createExtent( ns, size, capped );
+ return suitableFile( ns, size, !capped, enforceQuota )->createExtent( ns, size, capped );
}
@@ -223,11 +267,11 @@ namespace mongo {
assert( cc().database() == this );
if ( ! namespaceIndex.details( profileName.c_str() ) ) {
- log(1) << "creating profile ns: " << profileName << endl;
+ log() << "creating profile collection: " << profileName << endl;
BSONObjBuilder spec;
spec.appendBool( "capped", true );
- spec.append( "size", 131072.0 );
- if ( ! userCreateNS( profileName.c_str(), spec.done(), errmsg , true ) ) {
+ spec.append( "size", 1024*1024 );
+            if ( ! userCreateNS( profileName.c_str(), spec.done(), errmsg , false /* we don't replicate profile messages */ ) ) {
return false;
}
}
@@ -235,14 +279,6 @@ namespace mongo {
return true;
}
- void Database::finishInit() {
- if ( cmdLine.defaultProfile == profile )
- return;
-
- string errmsg;
- massert( 12506 , errmsg , setProfilingLevel( cmdLine.defaultProfile , errmsg ) );
- }
-
bool Database::validDBName( const string& ns ) {
if ( ns.size() == 0 || ns.size() > 64 )
return false;
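
Review note on the database.cpp hunk above: duplicateUncasedName() reports a name that matches case-insensitively without being byte-identical, i.e. strcasecmp says equal but strcmp says different. The same comparison extracted into a tiny self-contained sketch (POSIX strcasecmp):

    #include <iostream>
    #include <string>
    #include <vector>
    #include <cstddef>
    #include <cstring>     // strcmp
    #include <strings.h>   // strcasecmp (POSIX)

    // Return the first existing name that collides case-insensitively with
    // 'name' without being an exact match; "" if none. Mirrors the logic above.
    std::string duplicateUncased(const std::string& name,
                                 const std::vector<std::string>& others) {
        for (std::size_t i = 0; i < others.size(); i++) {
            if (strcasecmp(others[i].c_str(), name.c_str()) != 0)
                continue;   // not even a case-insensitive match
            if (strcmp(others[i].c_str(), name.c_str()) == 0)
                continue;   // exact match is the database itself, not a duplicate
            return others[i];
        }
        return "";
    }

    int main() {
        std::vector<std::string> dbs;
        dbs.push_back("Test");
        dbs.push_back("admin");
        std::cout << duplicateUncased("test", dbs) << "\n"; // prints "Test"
        std::cout << duplicateUncased("Test", dbs) << "\n"; // prints "" (exact match)
        return 0;
    }
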
diff --git a/db/database.h b/db/database.h
index 6e72ba8..3522f52 100644
--- a/db/database.h
+++ b/db/database.h
@@ -46,8 +46,6 @@ namespace mongo {
void openAllFiles();
- void finishInit();
-
/**
* tries to make sure that this hasn't been deleted
*/
@@ -82,9 +80,9 @@ namespace mongo {
*/
void preallocateAFile() { getFile( numFiles() , 0, true ); }
- MongoDataFile* suitableFile( int sizeNeeded, bool preallocate );
+ MongoDataFile* suitableFile( const char *ns, int sizeNeeded, bool preallocate, bool enforceQuota );
- Extent* allocExtent( const char *ns, int size, bool capped );
+ Extent* allocExtent( const char *ns, int size, bool capped, bool enforceQuota );
MongoDataFile* newestFile();
@@ -93,7 +91,6 @@ namespace mongo {
*/
bool setProfilingLevel( int newLevel , string& errmsg );
-
void flushFiles( bool sync ) const;
/**
@@ -107,7 +104,20 @@ namespace mongo {
}
static bool validDBName( const string& ns );
+
+ /**
+ * @throws DatabaseDifferCaseCode if the name is a duplicate based on
+     * case-insensitive matching.
+ */
+ void checkDuplicateUncasedNames() const;
+ /**
+ * @return name of an existing database with same text name but different
+ * casing, if one exists. Otherwise the empty string is returned. If
+ * 'duplicates' is specified, it is filled with all duplicate names.
+ */
+ static string duplicateUncasedName( const string &name, const string &path, set< string > *duplicates = 0 );
+
public: // this should be private later
vector<MongoDataFile*> files;
diff --git a/db/db.cpp b/db/db.cpp
index 4f4575c..e6281d7 100644
--- a/db/db.cpp
+++ b/db/db.cpp
@@ -18,12 +18,12 @@
#include "pch.h"
#include "db.h"
-#include "query.h"
#include "introspect.h"
#include "repl.h"
#include "../util/unittest.h"
#include "../util/file_allocator.h"
#include "../util/background.h"
+#include "../util/text.h"
#include "dbmessage.h"
#include "instance.h"
#include "clientcursor.h"
@@ -36,28 +36,33 @@
#include "stats/snapshots.h"
#include "../util/concurrency/task.h"
#include "../util/version.h"
+#include "../util/ramlog.h"
+#include "../util/net/message_server.h"
#include "client.h"
#include "restapi.h"
#include "dbwebserver.h"
#include "dur.h"
#include "concurrency.h"
+#include "../s/d_writeback.h"
#if defined(_WIN32)
# include "../util/ntservice.h"
#else
# include <sys/file.h>
-# include <sys/resource.h>
#endif
namespace mongo {
+ namespace dur {
+ extern unsigned long long DataLimitPerJournalFile;
+ }
+
/* only off if --nocursors which is for debugging. */
extern bool useCursors;
/* only off if --nohints */
extern bool useHints;
- extern char *appsrvPath;
extern int diagLogging;
extern unsigned lenForNewNsFiles;
extern int lockFile;
@@ -65,9 +70,7 @@ namespace mongo {
extern string repairpath;
void setupSignals( bool inFork );
- void startReplSets(ReplSetCmdline*);
void startReplication();
- void pairWith(const char *remoteEnd, const char *arb);
void exitCleanly( ExitCode code );
CmdLine cmdLine;
@@ -93,65 +96,6 @@ namespace mongo {
QueryResult* emptyMoreResult(long long);
- void connThread( MessagingPort * p );
-
- class OurListener : public Listener {
- public:
- OurListener(const string &ip, int p) : Listener(ip, p) { }
- virtual void accepted(MessagingPort *mp) {
-
- if ( ! connTicketHolder.tryAcquire() ) {
- log() << "connection refused because too many open connections: " << connTicketHolder.used() << " of " << connTicketHolder.outof() << endl;
- // TODO: would be nice if we notified them...
- mp->shutdown();
- delete mp;
- return;
- }
-
- try {
-#ifndef __linux__ // TODO: consider making this ifdef _WIN32
- boost::thread thr(boost::bind(&connThread,mp));
-#else
- pthread_attr_t attrs;
- pthread_attr_init(&attrs);
- pthread_attr_setdetachstate(&attrs, PTHREAD_CREATE_DETACHED);
-
- static const size_t STACK_SIZE = 4*1024*1024;
-
- struct rlimit limits;
- assert(getrlimit(RLIMIT_STACK, &limits) == 0);
- if (limits.rlim_cur > STACK_SIZE) {
- pthread_attr_setstacksize(&attrs, (DEBUG_BUILD
- ? (STACK_SIZE / 2)
- : STACK_SIZE));
- }
- else if (limits.rlim_cur < 1024*1024) {
- warning() << "Stack size set to " << (limits.rlim_cur/1024) << "KB. We suggest at least 1MB" << endl;
- }
-
- pthread_t thread;
- int failed = pthread_create(&thread, &attrs, (void*(*)(void*)) &connThread, mp);
-
- pthread_attr_destroy(&attrs);
-
- if (failed) {
- log() << "pthread_create failed: " << errnoWithDescription(failed) << endl;
- throw boost::thread_resource_error(); // for consistency with boost::thread
- }
-#endif
- }
- catch ( boost::thread_resource_error& ) {
- log() << "can't create new thread, closing connection" << endl;
- mp->shutdown();
- delete mp;
- }
- catch ( ... ) {
- log() << "unkonwn exception starting connThread" << endl;
- mp->shutdown();
- delete mp;
- }
- }
- };
/* todo: make this a real test. the stuff in dbtests/ seem to do all dbdirectclient which exhaust doesn't support yet. */
// QueryOption_Exhaust
@@ -193,23 +137,8 @@ namespace mongo {
};
#endif
- void listen(int port) {
- //testTheDb();
- log() << "waiting for connections on port " << port << endl;
- OurListener l(cmdLine.bind_ip, port);
- l.setAsTimeTracker();
- startReplication();
- if ( !noHttpInterface )
- boost::thread web( boost::bind(&webServerThread, new RestAdminAccess() /* takes ownership */));
-
-#if(TESTEXHAUST)
- boost::thread thr(testExhaust);
-#endif
- l.initAndListen();
- }
-
void sysRuntimeInfo() {
- out() << "sysinfo:\n";
+ out() << "sysinfo:" << endl;
#if defined(_SC_PAGE_SIZE)
out() << " page size: " << (int) sysconf(_SC_PAGE_SIZE) << endl;
#endif
@@ -226,36 +155,15 @@ namespace mongo {
sleepmicros( Client::recommendedYieldMicros() );
}
- /* we create one thread for each connection from an app server database.
- app server will open a pool of threads.
- todo: one day, asio...
- */
- void connThread( MessagingPort * inPort ) {
- TicketHolderReleaser connTicketReleaser( &connTicketHolder );
-
- /* todo: move to Client object */
- LastError *le = new LastError();
- lastError.reset(le);
-
- inPort->_logLevel = 1;
- auto_ptr<MessagingPort> dbMsgPort( inPort );
- Client& c = Client::initThread("conn", inPort);
-
- try {
-
- c.getAuthenticationInfo()->isLocalHost = dbMsgPort->farEnd.isLocalHost();
-
- Message m;
- while ( 1 ) {
- inPort->clearCounters();
+ class MyMessageHandler : public MessageHandler {
+ public:
+ virtual void connected( AbstractMessagingPort* p ) {
+ Client& c = Client::initThread("conn", p);
+ c.getAuthenticationInfo()->isLocalHost = p->remote().isLocalHost();
+ }
- if ( !dbMsgPort->recv(m) ) {
- if( !cmdLine.quiet )
- log() << "end connection " << dbMsgPort->farEnd.toString() << endl;
- dbMsgPort->shutdown();
- break;
- }
-sendmore:
+ virtual void process( Message& m , AbstractMessagingPort* port , LastError * le) {
+ while ( true ) {
if ( inShutdown() ) {
log() << "got request after shutdown()" << endl;
break;
@@ -264,10 +172,10 @@ sendmore:
lastError.startRequest( m , le );
DbResponse dbresponse;
- assembleResponse( m, dbresponse, dbMsgPort->farEnd );
+ assembleResponse( m, dbresponse, port->remote() );
if ( dbresponse.response ) {
- dbMsgPort->reply(m, *dbresponse.response, dbresponse.responseTo);
+ port->reply(m, *dbresponse.response, dbresponse.responseTo);
if( dbresponse.exhaust ) {
MsgData *header = dbresponse.response->header();
QueryResult *qr = (QueryResult *) header;
@@ -289,46 +197,42 @@ sendmore:
b.decouple();
DEV log() << "exhaust=true sending more" << endl;
beNice();
- goto sendmore;
+                        continue; // go back to the top of the loop
}
}
}
-
- networkCounter.hit( inPort->getBytesIn() , inPort->getBytesOut() );
-
- m.reset();
+ break;
}
-
- }
- catch ( AssertionException& e ) {
- log() << "AssertionException in connThread, closing client connection" << endl;
- log() << ' ' << e.what() << endl;
- dbMsgPort->shutdown();
- }
- catch ( SocketException& ) {
- problem() << "SocketException in connThread, closing client connection" << endl;
- dbMsgPort->shutdown();
- }
- catch ( const ClockSkewException & ) {
- exitCleanly( EXIT_CLOCK_SKEW );
- }
- catch ( std::exception &e ) {
- problem() << "Uncaught std::exception: " << e.what() << ", terminating" << endl;
- dbexit( EXIT_UNCAUGHT );
- }
- catch ( ... ) {
- problem() << "Uncaught exception, terminating" << endl;
- dbexit( EXIT_UNCAUGHT );
}
- // thread ending...
- {
+ virtual void disconnected( AbstractMessagingPort* p ) {
Client * c = currentClient.get();
if( c ) c->shutdown();
+ globalScriptEngine->threadDone();
}
- globalScriptEngine->threadDone();
+
+ };
+
+ void listen(int port) {
+ //testTheDb();
+ MessageServer::Options options;
+ options.port = port;
+ options.ipList = cmdLine.bind_ip;
+
+ MessageServer * server = createServer( options , new MyMessageHandler() );
+ server->setAsTimeTracker();
+
+ startReplication();
+ if ( !noHttpInterface )
+ boost::thread web( boost::bind(&webServerThread, new RestAdminAccess() /* takes ownership */));
+
+#if(TESTEXHAUST)
+ boost::thread thr(testExhaust);
+#endif
+ server->run();
}
+
bool doDBUpgrade( const string& dbName , string errmsg , DataFileHeader * h ) {
static DBDirectClient db;
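
Review note: the big db.cpp hunk above retires the hand-rolled OurListener/connThread pair in favour of a MessageHandler callback interface (connected/process/disconnected) driven by a generic MessageServer. A toy sketch of that inversion of control; Handler, runServer and EchoHandler are simplified, hypothetical analogues of the real types, which take AbstractMessagingPort/Message/LastError:

    #include <iostream>
    #include <string>
    #include <vector>

    // Hypothetical, simplified analogue of the MessageHandler interface.
    struct Handler {
        virtual ~Handler() {}
        virtual void connected(int connId) = 0;
        virtual void process(int connId, const std::string& msg) = 0;
        virtual void disconnected(int connId) = 0;
    };

    // The "server" owns the accept/read loop; the handler only supplies policy.
    void runServer(Handler& h, const std::vector<std::string>& fakeMessages) {
        h.connected(1);
        for (std::size_t i = 0; i < fakeMessages.size(); i++)
            h.process(1, fakeMessages[i]);
        h.disconnected(1);
    }

    struct EchoHandler : Handler {
        void connected(int id)    { std::cout << "conn " << id << " up\n"; }
        void process(int id, const std::string& m) { std::cout << "got: " << m << "\n"; }
        void disconnected(int id) { std::cout << "conn " << id << " down\n"; }
    };

    int main() {
        EchoHandler h;
        std::vector<std::string> msgs;
        msgs.push_back("query");
        msgs.push_back("getmore");
        runServer(h, msgs);
        return 0;
    }
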
@@ -378,7 +282,9 @@ sendmore:
if ( !h->isCurrentVersion() || forceRepair ) {
if( h->version <= 0 ) {
- uasserted(10000, str::stream() << "db " << dbName << " appears corrupt pdfile version: " << h->version << " info: " << h->versionMinor << ' ' << h->fileLength);
+ uasserted(14026,
+ str::stream() << "db " << dbName << " appears corrupt pdfile version: " << h->version
+ << " info: " << h->versionMinor << ' ' << h->fileLength);
}
log() << "****" << endl;
@@ -494,10 +400,12 @@ sendmore:
return cc().curop()->opNum();
}
- void _initAndListen(int listenPort, const char *appserverLoc = NULL) {
+ void _initAndListen(int listenPort ) {
Client::initThread("initandlisten");
+ Logstream::get().addGlobalTee( new RamLog("global") );
+
bool is32bit = sizeof(int*) == 4;
{
@@ -510,13 +418,14 @@ sendmore:
l << "MongoDB starting : pid=" << pid << " port=" << cmdLine.port << " dbpath=" << dbpath;
if( replSettings.master ) l << " master=" << replSettings.master;
if( replSettings.slave ) l << " slave=" << (int) replSettings.slave;
- l << ( is32bit ? " 32" : " 64" ) << "-bit " << endl;
+ l << ( is32bit ? " 32" : " 64" ) << "-bit host=" << getHostNameCached() << endl;
}
DEV log() << "_DEBUG build (which is slower)" << endl;
show_warnings();
log() << mongodVersion() << endl;
printGitVersion();
printSysInfo();
+ printCommandLineOpts();
{
stringstream ss;
@@ -529,12 +438,12 @@ sendmore:
uassert( 12590 , ss.str().c_str(), boost::filesystem::exists( repairpath ) );
}
- acquirePathLock();
+ acquirePathLock(forceRepair);
remove_all( dbpath + "/_tmp/" );
FileAllocator::get()->start();
- BOOST_CHECK_EXCEPTION( clearTmpFiles() );
+ MONGO_BOOST_CHECK_EXCEPTION_WITH_MSG( clearTmpFiles(), "clear tmp files" );
_diaglog.init();
@@ -556,7 +465,7 @@ sendmore:
repairDatabasesAndCheckVersion();
- /* we didn't want to pre-open all fiels for the repair check above. for regular
+ /* we didn't want to pre-open all files for the repair check above. for regular
operation we do for read/write lock concurrency reasons.
*/
Database::_openAllFiles = true;
@@ -569,12 +478,7 @@ sendmore:
snapshotThread.go();
clientCursorMonitor.go();
-
- if( !cmdLine._replSet.empty() ) {
- replSet = true;
- ReplSetCmdline *replSetCmdline = new ReplSetCmdline(cmdLine._replSet);
- boost::thread t( boost::bind( &startReplSets, replSetCmdline) );
- }
+ PeriodicTask::theRunner->go();
listen(listenPort);
@@ -584,8 +488,14 @@ sendmore:
void testPretouch();
- void initAndListen(int listenPort, const char *appserverLoc = NULL) {
- try { _initAndListen(listenPort, appserverLoc); }
+ void initAndListen(int listenPort) {
+ try {
+ _initAndListen(listenPort);
+ }
+ catch ( DBException &e ) {
+ log() << "exception in initAndListen: " << e.toString() << ", terminating" << endl;
+ dbexit( EXIT_UNCAUGHT );
+ }
catch ( std::exception &e ) {
log() << "exception in initAndListen std::exception: " << e.what() << ", terminating" << endl;
dbexit( EXIT_UNCAUGHT );
@@ -603,7 +513,7 @@ sendmore:
#if defined(_WIN32)
bool initService() {
ServiceController::reportStatus( SERVICE_RUNNING );
- initAndListen( cmdLine.port, appsrvPath );
+ initAndListen( cmdLine.port );
return true;
}
#endif
@@ -625,23 +535,12 @@ void show_help_text(po::options_description options) {
/* Return error string or "" if no errors. */
string arg_error_check(int argc, char* argv[]) {
- for (int i = 1; i < argc; i++) {
- string s = argv[i];
- /* check for inclusion of old-style arbiter setting. */
- if (s == "--pairwith") {
- if (argc > i + 2) {
- string old_arbiter = argv[i + 2];
- if (old_arbiter == "-" || old_arbiter.substr(0, 1) != "-") {
- return "Specifying arbiter using --pairwith is no longer supported, please use --arbiter";
- }
- }
- }
- }
return "";
}
int main(int argc, char* argv[]) {
static StaticObserver staticObserver;
+ doPreServerStatupInits();
getcurns = ourgetns;
po::options_description general_options("General options");
@@ -667,18 +566,16 @@ int main(int argc, char* argv[]) {
("directoryperdb", "each database will be stored in a separate directory")
("journal", "enable journaling")
("journalOptions", po::value<int>(), "journal diagnostic options")
+ ("journalCommitInterval", po::value<unsigned>(), "how often to group/batch commit (ms)")
("ipv6", "enable IPv6 support (disabled by default)")
("jsonp","allow JSONP access via http (has security implications)")
("noauth", "run without security")
("nohttpinterface", "disable http interface")
+ ("nojournal", "disable journaling (journaling is on by default for 64 bit)")
("noprealloc", "disable data file preallocation - will often hurt performance")
("noscripting", "disable scripting engine")
("notablescan", "do not allow table scans")
-#if !defined(_WIN32)
- ("nounixsocket", "disable listening on unix sockets")
-#endif
("nssize", po::value<int>()->default_value(16), ".ns file size (in MB) for new databases")
- ("objcheck", "inspect client data for validity on receipt")
("profile",po::value<int>(), "0=off 1=slow, 2=all")
("quota", "limits each database to a certain number of files (8 default)")
("quotaFiles", po::value<int>(), "number of files allower per db, requires --quota")
@@ -687,6 +584,9 @@ int main(int argc, char* argv[]) {
("repairpath", po::value<string>() , "root directory for repair files - defaults to dbpath" )
("slowms",po::value<int>(&cmdLine.slowMS)->default_value(100), "value of slow for profile and console log" )
("smallfiles", "use a smaller default file size")
+#if defined(__linux__)
+ ("shutdown", "kill a running server (for init scripts)")
+#endif
("syncdelay",po::value<double>(&cmdLine.syncdelay)->default_value(60), "seconds between disk syncs (0=never, but not recommended)")
("sysinfo", "print some diagnostic system information")
("upgrade", "upgrade db if needed")
@@ -698,7 +598,6 @@ int main(int argc, char* argv[]) {
replication_options.add_options()
("fastsync", "indicate that this instance is starting from a dbpath snapshot of the repl peer")
- ("autoresync", "automatically resync if slave data is stale")
("oplogSize", po::value<int>(), "size limit (in MB) for op log")
;
@@ -708,6 +607,7 @@ int main(int argc, char* argv[]) {
("source", po::value<string>(), "when slave: specify master as <server:port>")
("only", po::value<string>(), "when slave: specify a single database to replicate")
("slavedelay", po::value<int>(), "specify delay (in seconds) to be used when applying master ops to slave")
+ ("autoresync", "automatically resync if slave data is stale")
;
rs_options.add_options()
@@ -724,17 +624,17 @@ int main(int argc, char* argv[]) {
("pretouch", po::value<int>(), "n pretouch threads for applying replicationed operations")
("command", po::value< vector<string> >(), "command")
("cacheSize", po::value<long>(), "cache size (in MB) for rec store")
- // these move to unhidden later:
- ("opIdMem", po::value<long>(), "size limit (in bytes) for in memory storage of op ids for replica pairs DEPRECATED")
- ("pairwith", po::value<string>(), "address of server to pair with DEPRECATED")
- ("arbiter", po::value<string>(), "address of replica pair arbiter server DEPRECATED")
("nodur", "disable journaling (currently the default)")
- ("nojournal", "disable journaling (currently the default)")
- ("appsrvpath", po::value<string>(), "root directory for the babble app server")
+ // things we don't want people to use
("nocursors", "diagnostic/debugging option that turns off cursors DO NOT USE IN PRODUCTION")
("nohints", "ignore query hints")
+ ("nopreallocj", "don't preallocate journal files")
("dur", "enable journaling") // deprecated version
("durOptions", po::value<int>(), "durability diagnostic options") // deprecated version
+ // deprecated pairing command line options
+ ("pairwith", "DEPRECATED")
+ ("arbiter", "DEPRECATED")
+ ("opIdMem", "DEPRECATED")
;
@@ -828,44 +728,46 @@ int main(int argc, char* argv[]) {
cmdLine.quota = true;
cmdLine.quotaFiles = params["quotaFiles"].as<int>() - 1;
}
- if( params.count("nodur") ) {
- cmdLine.dur = false;
- }
- if( params.count("nojournal") ) {
+ bool journalExplicit = false;
+ if( params.count("nodur") || params.count( "nojournal" ) ) {
+ journalExplicit = true;
cmdLine.dur = false;
}
if( params.count("dur") || params.count( "journal" ) ) {
+ journalExplicit = true;
cmdLine.dur = true;
}
if (params.count("durOptions")) {
cmdLine.durOptions = params["durOptions"].as<int>();
}
+ if( params.count("journalCommitInterval") ) {
+            // don't check whether dur is false here: many users just take the default, which is off on win32,
+            // so there's no point complicating a dev environment by raising an error.
+ cmdLine.journalCommitInterval = params["journalCommitInterval"].as<unsigned>();
+ if( cmdLine.journalCommitInterval <= 1 || cmdLine.journalCommitInterval > 300 ) {
+                out() << "--journalCommitInterval out of allowed range (2-300ms)" << endl;
+ dbexit( EXIT_BADOPTIONS );
+ }
+ }
if (params.count("journalOptions")) {
cmdLine.durOptions = params["journalOptions"].as<int>();
}
- if (params.count("objcheck")) {
- objcheck = true;
- }
- if (params.count("appsrvpath")) {
- /* casting away the const-ness here */
- appsrvPath = (char*)(params["appsrvpath"].as<string>().c_str());
- }
if (params.count("repairpath")) {
repairpath = params["repairpath"].as<string>();
if (!repairpath.size()) {
- out() << "repairpath has to be non-zero" << endl;
+ out() << "repairpath is empty" << endl;
dbexit( EXIT_BADOPTIONS );
}
}
- else {
- repairpath = dbpath;
- }
if (params.count("nocursors")) {
useCursors = false;
}
if (params.count("nohints")) {
useHints = false;
}
+ if (params.count("nopreallocj")) {
+ cmdLine.preallocj = false;
+ }
if (params.count("nohttpinterface")) {
noHttpInterface = true;
}
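
Review note: the --journalCommitInterval handling earlier in this hunk accepts the option even when journaling is off (it defaults to off on win32) but rejects values outside 2-300 ms. A tiny standalone sketch of the same parse-and-validate step, using boost::program_options as db.cpp itself does:

    #include <iostream>
    #include <boost/program_options.hpp>

    namespace po = boost::program_options;

    int main(int argc, char* argv[]) {
        po::options_description opts("options");
        opts.add_options()
            ("journalCommitInterval", po::value<unsigned>(),
             "how often to group/batch commit (ms)");

        po::variables_map params;
        po::store(po::parse_command_line(argc, argv, opts), params);
        po::notify(params);

        if (params.count("journalCommitInterval")) {
            unsigned ms = params["journalCommitInterval"].as<unsigned>();
            if (ms <= 1 || ms > 300) {   // same bounds as the hunk above
                std::cout << "--journalCommitInterval out of allowed range (2-300ms)" << std::endl;
                return 1;
            }
            std::cout << "commit interval: " << ms << "ms" << std::endl;
        }
        return 0;
    }
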
@@ -884,6 +786,8 @@ int main(int argc, char* argv[]) {
}
if (params.count("smallfiles")) {
cmdLine.smallfiles = true;
+ assert( dur::DataLimitPerJournalFile >= 128 * 1024 * 1024 );
+ dur::DataLimitPerJournalFile = 128 * 1024 * 1024;
}
if (params.count("diaglog")) {
int x = params["diaglog"].as<int>();
@@ -898,10 +802,12 @@ int main(int argc, char* argv[]) {
return 0;
}
if (params.count("repair")) {
+ Record::MemoryTrackingEnabled = false;
shouldRepairDatabases = 1;
forceRepair = 1;
}
if (params.count("upgrade")) {
+ Record::MemoryTrackingEnabled = false;
shouldRepairDatabases = 1;
}
if (params.count("notablescan")) {
@@ -921,6 +827,11 @@ int main(int argc, char* argv[]) {
}
if (params.count("autoresync")) {
replSettings.autoresync = true;
+ if( params.count("replSet") ) {
+ out() << "--autoresync is not used with --replSet" << endl;
+ out() << "see http://www.mongodb.org/display/DOCS/Resyncing+a+Very+Stale+Replica+Set+Member" << endl;
+ dbexit( EXIT_BADOPTIONS );
+ }
}
if (params.count("source")) {
/* specifies what the source in local.sources should be */
@@ -944,25 +855,6 @@ int main(int argc, char* argv[]) {
if (params.count("only")) {
cmdLine.only = params["only"].as<string>().c_str();
}
- if (params.count("pairwith")) {
- cout << "***********************************\n"
- << "WARNING WARNING WARNING\n"
- << " replica pairs are deprecated\n"
- << " see: http://www.mongodb.org/display/DOCS/Replica+Pairs \n"
- << "***********************************" << endl;
- string paired = params["pairwith"].as<string>();
- if (params.count("arbiter")) {
- string arbiter = params["arbiter"].as<string>();
- pairWith(paired.c_str(), arbiter.c_str());
- }
- else {
- pairWith(paired.c_str(), "-");
- }
- }
- else if (params.count("arbiter")) {
- out() << "specifying --arbiter without --pairwith" << endl;
- dbexit( EXIT_BADOPTIONS );
- }
if( params.count("nssize") ) {
int x = params["nssize"].as<int>();
if (x <= 0 || x > (0x7fffffff/1024/1024)) {
@@ -986,15 +878,6 @@ int main(int argc, char* argv[]) {
cmdLine.oplogSize = x * 1024 * 1024;
assert(cmdLine.oplogSize > 0);
}
- if (params.count("opIdMem")) {
- long x = params["opIdMem"].as<long>();
- if (x <= 0) {
- out() << "bad --opIdMem arg" << endl;
- dbexit( EXIT_BADOPTIONS );
- }
- replSettings.opIdMem = x;
- assert(replSettings.opIdMem > 0);
- }
if (params.count("cacheSize")) {
long x = params["cacheSize"].as<long>();
if (x <= 0) {
@@ -1007,8 +890,13 @@ int main(int argc, char* argv[]) {
if( params.count("configsvr") ) {
cmdLine.port = CmdLine::ConfigServerPort;
}
- if( params.count("shardsvr") )
+ if( params.count("shardsvr") ) {
+ if( params.count("configsvr") ) {
+ log() << "can't do --shardsvr and --configsvr at the same time" << endl;
+ dbexit( EXIT_BADOPTIONS );
+ }
cmdLine.port = CmdLine::ShardServerPort;
+ }
}
else {
if ( cmdLine.port <= 0 || cmdLine.port > 65535 ) {
@@ -1017,27 +905,36 @@ int main(int argc, char* argv[]) {
}
}
if ( params.count("configsvr" ) ) {
+ cmdLine.configsvr = true;
if (cmdLine.usingReplSets() || replSettings.master || replSettings.slave) {
log() << "replication should not be enabled on a config server" << endl;
::exit(-1);
}
- if ( params.count( "diaglog" ) == 0 )
- _diaglog.level = 1;
+ if ( params.count( "nodur" ) == 0 && params.count( "nojournal" ) == 0 )
+ cmdLine.dur = true;
if ( params.count( "dbpath" ) == 0 )
dbpath = "/data/configdb";
}
if ( params.count( "profile" ) ) {
cmdLine.defaultProfile = params["profile"].as<int>();
}
- if (params.count("nounixsocket")) {
- noUnixSocket = true;
- }
if (params.count("ipv6")) {
enableIPv6();
}
if (params.count("noMoveParanoia")) {
cmdLine.moveParanoia = false;
}
+ if (params.count("pairwith") || params.count("arbiter") || params.count("opIdMem")) {
+ out() << "****" << endl;
+ out() << "Replica Pairs have been deprecated." << endl;
+ out() << "<http://www.mongodb.org/display/DOCS/Replica+Pairs>" << endl;
+ out() << "****" << endl;
+ dbexit( EXIT_BADOPTIONS );
+ }
+
+ // needs to be after things like --configsvr parsing, thus here.
+ if( repairpath.empty() )
+ repairpath = dbpath;
Module::configAll( params );
dataFileSync.go();
@@ -1069,15 +966,85 @@ int main(int argc, char* argv[]) {
if( cmdLine.pretouch )
log() << "--pretouch " << cmdLine.pretouch << endl;
+#ifdef __linux__
+ if (params.count("shutdown")){
+ bool failed = false;
+
+ string name = ( boost::filesystem::path( dbpath ) / "mongod.lock" ).native_file_string();
+ if ( !boost::filesystem::exists( name ) || boost::filesystem::file_size( name ) == 0 )
+ failed = true;
+
+ pid_t pid;
+ string procPath;
+ if (!failed){
+ try {
+ ifstream f (name.c_str());
+ f >> pid;
+ procPath = (str::stream() << "/proc/" << pid);
+ if (!boost::filesystem::exists(procPath))
+ failed = true;
+
+ string exePath = procPath + "/exe";
+ if (boost::filesystem::exists(exePath)){
+ char buf[256];
+                        int ret = readlink(exePath.c_str(), buf, sizeof(buf)-1);
+                        if (ret == -1) {
+                            int e = errno;
+                            cerr << "Error resolving " << exePath << ": " << errnoWithDescription(e) << endl;
+                            failed = true;
+                        }
+                        else {
+                            buf[ret] = '\0'; // readlink doesn't null-terminate; only safe after the -1 check
+                            if (!endsWith(buf, "mongod")){
+                                cerr << "Process " << pid << " is running " << buf << " not mongod" << endl;
+                                ::exit(-1);
+                            }
+                        }
+ }
+ }
+ catch (const std::exception& e){
+ cerr << "Error reading pid from lock file [" << name << "]: " << e.what() << endl;
+ failed = true;
+ }
+ }
+
+ if (failed) {
+ cerr << "There doesn't seem to be a server running with dbpath: " << dbpath << endl;
+ ::exit(-1);
+ }
+
+ cout << "killing process with pid: " << pid << endl;
+ int ret = kill(pid, SIGTERM);
+ if (ret) {
+ int e = errno;
+ cerr << "failed to kill process: " << errnoWithDescription(e) << endl;
+ ::exit(-1);
+ }
+
+ while (boost::filesystem::exists(procPath)) {
+ sleepsecs(1);
+ }
+
+ ::exit(0);
+ }
+#endif
+
#if defined(_WIN32)
if (serviceParamsCheck( params, dbpath, argc, argv )) {
return 0;
}
#endif
+
+
+ if (sizeof(void*) == 4 && !journalExplicit){
+ // trying to make this stand out more like startup warnings
+ log() << endl;
+ warning() << "32-bit servers don't have journaling enabled by default. Please use --journal if you want durability." << endl;
+ log() << endl;
+ }
+
}
UnitTest::runTests();
- initAndListen(cmdLine.port, appsrvPath);
+ initAndListen(cmdLine.port);
dbexit(EXIT_CLEAN);
return 0;
}
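
Review note: the Linux-only --shutdown branch above reads the pid from <dbpath>/mongod.lock, sanity-checks /proc/<pid>, sends SIGTERM, and waits for the /proc entry to vanish. A stripped-down, self-contained sketch of the same sequence; the /data/db path is illustrative and most error handling is elided:

    // Linux-only sketch; mirrors the --shutdown flow above with minimal checks.
    #include <fstream>
    #include <iostream>
    #include <sstream>
    #include <string>
    #include <cstdio>
    #include <signal.h>
    #include <sys/stat.h>
    #include <sys/types.h>
    #include <unistd.h>

    static bool pathExists(const std::string& p) {
        struct stat st;
        return stat(p.c_str(), &st) == 0;
    }

    int main() {
        const std::string lockFile = "/data/db/mongod.lock"; // illustrative dbpath
        std::ifstream f(lockFile.c_str());
        pid_t pid = 0;
        if (!(f >> pid) || pid <= 0) {
            std::cerr << "no server seems to be running (missing or empty lock file)" << std::endl;
            return 1;
        }
        std::ostringstream procPath;
        procPath << "/proc/" << pid;
        if (!pathExists(procPath.str())) {
            std::cerr << "pid " << pid << " is not running" << std::endl;
            return 1;
        }
        std::cout << "killing process with pid: " << pid << std::endl;
        if (kill(pid, SIGTERM) != 0) {
            std::perror("kill");
            return 1;
        }
        while (pathExists(procPath.str()))  // wait for the server to exit cleanly
            sleep(1);
        return 0;
    }
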
@@ -1088,14 +1055,6 @@ namespace mongo {
#undef out
- void exitCleanly( ExitCode code ) {
- killCurrentOp.killAll();
- {
- dblock lk;
- log() << "now exiting" << endl;
- dbexit( code );
- }
- }
#if !defined(_WIN32)
@@ -1166,7 +1125,7 @@ namespace mongo {
void myterminate() {
rawOut( "terminate() called, printing stack:" );
printStackTrace();
- abort();
+ ::abort();
}
void setupSignals_ignoreHelper( int signal ) {}
@@ -1235,19 +1194,63 @@ namespace mongo {
}
}
+ LPTOP_LEVEL_EXCEPTION_FILTER filtLast = 0;
+ ::HANDLE standardOut = GetStdHandle(STD_OUTPUT_HANDLE);
+ LONG WINAPI exceptionFilter(struct _EXCEPTION_POINTERS *ExceptionInfo) {
+ {
+            // given the severity of the event we write to the console in addition to the --logFile
+            // (rawOut writes to the logfile, if a special one was specified)
+            DWORD written;
+            WriteFile(standardOut, "unhandled windows exception\n", 28, &written, 0); // 28 == strlen of the message
+ FlushFileBuffers(standardOut);
+ }
+
+ DWORD ec = ExceptionInfo->ExceptionRecord->ExceptionCode;
+ if( ec == EXCEPTION_ACCESS_VIOLATION ) {
+ rawOut("access violation");
+ }
+ else {
+ rawOut("unhandled windows exception");
+ char buf[64];
+ strcpy(buf, "ec=0x");
+ _ui64toa(ec, buf+5, 16);
+ rawOut(buf);
+ }
+ if( filtLast )
+ return filtLast(ExceptionInfo);
+ return EXCEPTION_EXECUTE_HANDLER;
+ }
+
+ // called by mongoAbort()
+ extern void (*reportEventToSystem)(const char *msg);
+ void reportEventToSystemImpl(const char *msg) {
+ static ::HANDLE hEventLog = RegisterEventSource( NULL, TEXT("mongod") );
+ if( hEventLog ) {
+ std::wstring s = toNativeString(msg);
+ LPCTSTR txt = s.c_str();
+ BOOL ok = ReportEvent(
+ hEventLog, EVENTLOG_ERROR_TYPE,
+ 0, 0, NULL,
+ 1,
+ 0,
+ &txt,
+ 0);
+ wassert(ok);
+ }
+ }
+
void myPurecallHandler() {
- rawOut( "pure virtual method called, printing stack:" );
printStackTrace();
- abort();
+ mongoAbort("pure virtual");
}
void setupSignals( bool inFork ) {
- if( SetConsoleCtrlHandler( (PHANDLER_ROUTINE) CtrlHandler, TRUE ) )
- ;
- else
- massert( 10297 , "Couldn't register Windows Ctrl-C handler", false);
+ reportEventToSystem = reportEventToSystemImpl;
+ filtLast = SetUnhandledExceptionFilter(exceptionFilter);
+ massert(10297 , "Couldn't register Windows Ctrl-C handler", SetConsoleCtrlHandler((PHANDLER_ROUTINE) CtrlHandler, TRUE));
_set_purecall_handler( myPurecallHandler );
}
+
#endif
} // namespace mongo
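
Review note: the Windows hunk above installs a top-level exception filter via SetUnhandledExceptionFilter() and chains to the previously installed filter (filtLast) when one exists. A minimal Windows-only sketch of that chaining pattern; the 0xE0000001 code raised at the end is an arbitrary demo value:

    // Windows-only sketch of a chained top-level exception filter.
    #include <windows.h>
    #include <cstdio>

    static LPTOP_LEVEL_EXCEPTION_FILTER gPrevFilter = 0;

    static LONG WINAPI myFilter(struct _EXCEPTION_POINTERS* info) {
        std::printf("unhandled exception, code=0x%08lx\n",
                    info->ExceptionRecord->ExceptionCode);
        // chain to whatever filter was installed before us, if any
        if (gPrevFilter)
            return gPrevFilter(info);
        return EXCEPTION_EXECUTE_HANDLER; // terminate the process
    }

    int main() {
        gPrevFilter = SetUnhandledExceptionFilter(myFilter);
        RaiseException(0xE0000001, 0, 0, NULL); // provoke an unhandled exception
        return 0;
    }
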
diff --git a/db/db.h b/db/db.h
index 7ef7d03..f3e6b05 100644
--- a/db/db.h
+++ b/db/db.h
@@ -17,9 +17,10 @@
#pragma once
#include "../pch.h"
-#include "../util/message.h"
+#include "../util/net/message.h"
#include "concurrency.h"
#include "pdfile.h"
+#include "curop.h"
#include "client.h"
namespace mongo {
@@ -142,7 +143,8 @@ namespace mongo {
int _locktype;
dbtemprelease() {
- _context = cc().getContext();
+ const Client& c = cc();
+ _context = c.getContext();
_locktype = dbMutex.getState();
assert( _locktype );
@@ -156,7 +158,10 @@ namespace mongo {
if ( _context ) _context->unlocked();
dbMutex.unlock_shared();
}
-
+
+ verify( 14814 , c.curop() );
+ c.curop()->yielded();
+
}
~dbtemprelease() {
if ( _locktype > 0 )
@@ -168,6 +173,33 @@ namespace mongo {
}
};
+    /** the caller must hold the write lock
+        no assert (and no release) if nested inside another write lock
+        much like dbtempreleasecond, but with no malloc, so it should be a tiny bit faster
+    */
+ struct dbtempreleasewritelock {
+ Client::Context * _context;
+ int _locktype;
+ dbtempreleasewritelock() {
+ const Client& c = cc();
+ _context = c.getContext();
+ _locktype = dbMutex.getState();
+ assert( _locktype >= 1 );
+ if( _locktype > 1 )
+ return; // nested
+ if ( _context )
+ _context->unlocked();
+ dbMutex.unlock();
+ verify( 14845 , c.curop() );
+ c.curop()->yielded();
+ }
+ ~dbtempreleasewritelock() {
+ if ( _locktype == 1 )
+ dbMutex.lock();
+ if ( _context )
+ _context->relocked();
+ }
+ };
/**
only does a temp release if we're not nested and have a lock
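
Review note on the db.h hunk above: dbtemprelease and the new dbtempreleasewritelock are RAII guards, i.e. the constructor drops the lock (and records a yield via curop()->yielded()) and the destructor reacquires it. The same shape with a plain pthread mutex, as a generic sketch:

    #include <iostream>
    #include <pthread.h>

    // Generic analogue of dbtemprelease: unlocks in the constructor,
    // relocks in the destructor, so the "yield" is scoped.
    class TempRelease {
        pthread_mutex_t& _m;
    public:
        explicit TempRelease(pthread_mutex_t& m) : _m(m) {
            pthread_mutex_unlock(&_m);   // give other threads a turn
            std::cout << "yielded\n";
        }
        ~TempRelease() {
            pthread_mutex_lock(&_m);     // reacquire before continuing
        }
    };

    int main() {
        pthread_mutex_t m = PTHREAD_MUTEX_INITIALIZER;
        pthread_mutex_lock(&m);
        {
            TempRelease t(m);            // lock is free inside this scope
            // ... slow work that shouldn't hold the lock ...
        }                                // lock reacquired here
        pthread_mutex_unlock(&m);
        return 0;
    }
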
diff --git a/db/db.vcxproj b/db/db.vcxproj
index ad9c6d2..b3bfcfb 100644..100755
--- a/db/db.vcxproj
+++ b/db/db.vcxproj
@@ -1,791 +1,838 @@
-<?xml version="1.0" encoding="utf-8"?>
-<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
- <ItemGroup Label="ProjectConfigurations">
- <ProjectConfiguration Include="Debug|Win32">
- <Configuration>Debug</Configuration>
- <Platform>Win32</Platform>
- </ProjectConfiguration>
- <ProjectConfiguration Include="Debug|x64">
- <Configuration>Debug</Configuration>
- <Platform>x64</Platform>
- </ProjectConfiguration>
- <ProjectConfiguration Include="Release|Win32">
- <Configuration>Release</Configuration>
- <Platform>Win32</Platform>
- </ProjectConfiguration>
- <ProjectConfiguration Include="Release|x64">
- <Configuration>Release</Configuration>
- <Platform>x64</Platform>
- </ProjectConfiguration>
- </ItemGroup>
- <PropertyGroup Label="Globals">
- <ProjectName>mongod</ProjectName>
- <ProjectGuid>{215B2D68-0A70-4D10-8E75-B31010C62A91}</ProjectGuid>
- <RootNamespace>db</RootNamespace>
- <Keyword>Win32Proj</Keyword>
- </PropertyGroup>
- <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
- <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
- <ConfigurationType>Application</ConfigurationType>
- <CharacterSet>Unicode</CharacterSet>
- <WholeProgramOptimization>true</WholeProgramOptimization>
- </PropertyGroup>
- <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
- <ConfigurationType>Application</ConfigurationType>
- <CharacterSet>Unicode</CharacterSet>
- <WholeProgramOptimization>true</WholeProgramOptimization>
- </PropertyGroup>
- <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
- <ConfigurationType>Application</ConfigurationType>
- <UseOfMfc>false</UseOfMfc>
- <UseOfAtl>false</UseOfAtl>
- <CharacterSet>Unicode</CharacterSet>
- </PropertyGroup>
- <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
- <ConfigurationType>Application</ConfigurationType>
- <UseOfMfc>false</UseOfMfc>
- <UseOfAtl>false</UseOfAtl>
- <CharacterSet>Unicode</CharacterSet>
- </PropertyGroup>
- <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
- <ImportGroup Label="ExtensionSettings">
- </ImportGroup>
- <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="PropertySheets">
- <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
- </ImportGroup>
- <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="PropertySheets">
- <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
- </ImportGroup>
- <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="PropertySheets">
- <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
- </ImportGroup>
- <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="PropertySheets">
- <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
- </ImportGroup>
- <PropertyGroup Label="UserMacros" />
- <PropertyGroup>
- <_ProjectFileVersion>10.0.30319.1</_ProjectFileVersion>
- <OutDir Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(SolutionDir)$(Configuration)\</OutDir>
- <OutDir Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(SolutionDir)$(Configuration)\</OutDir>
- <IntDir Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(Configuration)\</IntDir>
- <IntDir Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(Configuration)\</IntDir>
- <LinkIncremental Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">true</LinkIncremental>
- <LinkIncremental Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</LinkIncremental>
- <OutDir Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(SolutionDir)$(Configuration)\</OutDir>
- <OutDir Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(SolutionDir)$(Configuration)\</OutDir>
- <IntDir Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(Configuration)\</IntDir>
- <IntDir Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(Configuration)\</IntDir>
- <LinkIncremental Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">false</LinkIncremental>
- <LinkIncremental Condition="'$(Configuration)|$(Platform)'=='Release|x64'">false</LinkIncremental>
- <CodeAnalysisRuleSet Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">AllRules.ruleset</CodeAnalysisRuleSet>
- <CodeAnalysisRuleSet Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">AllRules.ruleset</CodeAnalysisRuleSet>
- <CodeAnalysisRules Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" />
- <CodeAnalysisRules Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" />
- <CodeAnalysisRuleAssemblies Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" />
- <CodeAnalysisRuleAssemblies Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" />
- <CodeAnalysisRuleSet Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">AllRules.ruleset</CodeAnalysisRuleSet>
- <CodeAnalysisRuleSet Condition="'$(Configuration)|$(Platform)'=='Release|x64'">AllRules.ruleset</CodeAnalysisRuleSet>
- <CodeAnalysisRules Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" />
- <CodeAnalysisRules Condition="'$(Configuration)|$(Platform)'=='Release|x64'" />
- <CodeAnalysisRuleAssemblies Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" />
- <CodeAnalysisRuleAssemblies Condition="'$(Configuration)|$(Platform)'=='Release|x64'" />
- <IncludePath Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">.;..;$(IncludePath)</IncludePath>
- <IncludePath Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">..;$(IncludePath)</IncludePath>
- <IncludePath Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">..;$(IncludePath)</IncludePath>
- <IncludePath Condition="'$(Configuration)|$(Platform)'=='Release|x64'">..;$(IncludePath)</IncludePath>
- </PropertyGroup>
- <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
- <ClCompile>
- <Optimization>Disabled</Optimization>
- <AdditionalIncludeDirectories>..\..\js\src;..\pcre-7.4;c:\boost;\boost</AdditionalIncludeDirectories>
- <PreprocessorDefinitions>_UNICODE;UNICODE;SUPPORT_UCP;SUPPORT_UTF8;MONGO_EXPOSE_MACROS;OLDJS;STATIC_JS_API;XP_WIN;WIN32;_DEBUG;_CONSOLE;_CRT_SECURE_NO_WARNINGS;HAVE_CONFIG_H;PCRE_STATIC;%(PreprocessorDefinitions)</PreprocessorDefinitions>
- <MinimalRebuild>No</MinimalRebuild>
- <BasicRuntimeChecks>EnableFastChecks</BasicRuntimeChecks>
- <RuntimeLibrary>MultiThreadedDebugDLL</RuntimeLibrary>
- <PrecompiledHeader>Use</PrecompiledHeader>
- <PrecompiledHeaderFile>pch.h</PrecompiledHeaderFile>
- <WarningLevel>Level3</WarningLevel>
- <DebugInformationFormat>EditAndContinue</DebugInformationFormat>
- <DisableSpecificWarnings>4355;4800;%(DisableSpecificWarnings)</DisableSpecificWarnings>
- <MultiProcessorCompilation>true</MultiProcessorCompilation>
- </ClCompile>
- <Link>
- <AdditionalDependencies>ws2_32.lib;Psapi.lib;%(AdditionalDependencies)</AdditionalDependencies>
- <AdditionalLibraryDirectories>c:\boost\lib\vs2010_32;\boost\lib\vs2010_32;\boost\lib</AdditionalLibraryDirectories>
- <IgnoreAllDefaultLibraries>false</IgnoreAllDefaultLibraries>
- <IgnoreSpecificDefaultLibraries>%(IgnoreSpecificDefaultLibraries)</IgnoreSpecificDefaultLibraries>
- <GenerateDebugInformation>true</GenerateDebugInformation>
- <SubSystem>Console</SubSystem>
- <TargetMachine>MachineX86</TargetMachine>
- </Link>
- </ItemDefinitionGroup>
- <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
- <ClCompile>
- <Optimization>Disabled</Optimization>
- <AdditionalIncludeDirectories>..\..\js\src;..\pcre-7.4;c:\boost;\boost</AdditionalIncludeDirectories>
- <PreprocessorDefinitions>SUPPORT_UCP;SUPPORT_UTF8;MONGO_EXPOSE_MACROS;OLDJS;STATIC_JS_API;XP_WIN;WIN32;_DEBUG;_CONSOLE;_CRT_SECURE_NO_WARNINGS;HAVE_CONFIG_H;PCRE_STATIC;%(PreprocessorDefinitions)</PreprocessorDefinitions>
- <BasicRuntimeChecks>EnableFastChecks</BasicRuntimeChecks>
- <RuntimeLibrary>MultiThreadedDebugDLL</RuntimeLibrary>
- <PrecompiledHeader>Use</PrecompiledHeader>
- <PrecompiledHeaderFile>pch.h</PrecompiledHeaderFile>
- <WarningLevel>Level3</WarningLevel>
- <DebugInformationFormat>ProgramDatabase</DebugInformationFormat>
- <DisableSpecificWarnings>4355;4800;4267;4244;%(DisableSpecificWarnings)</DisableSpecificWarnings>
- <MultiProcessorCompilation>true</MultiProcessorCompilation>
- <MinimalRebuild>No</MinimalRebuild>
- </ClCompile>
- <Link>
- <AdditionalDependencies>ws2_32.lib;Psapi.lib;%(AdditionalDependencies)</AdditionalDependencies>
- <AdditionalLibraryDirectories>c:\boost\lib\vs2010_64;\boost\lib\vs2010_64;\boost\lib</AdditionalLibraryDirectories>
- <IgnoreAllDefaultLibraries>false</IgnoreAllDefaultLibraries>
- <IgnoreSpecificDefaultLibraries>%(IgnoreSpecificDefaultLibraries)</IgnoreSpecificDefaultLibraries>
- <GenerateDebugInformation>true</GenerateDebugInformation>
- <SubSystem>Console</SubSystem>
- </Link>
- </ItemDefinitionGroup>
- <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
- <ClCompile>
- <Optimization>MaxSpeed</Optimization>
- <IntrinsicFunctions>true</IntrinsicFunctions>
- <AdditionalIncludeDirectories>..\..\js\src;..\pcre-7.4;c:\boost;\boost</AdditionalIncludeDirectories>
- <PreprocessorDefinitions>_UNICODE;UNICODE;SUPPORT_UCP;SUPPORT_UTF8;MONGO_EXPOSE_MACROS;OLDJS;STATIC_JS_API;XP_WIN;WIN32;NDEBUG;_CONSOLE;_CRT_SECURE_NO_WARNINGS;HAVE_CONFIG_H;PCRE_STATIC;%(PreprocessorDefinitions)</PreprocessorDefinitions>
- <RuntimeLibrary>MultiThreaded</RuntimeLibrary>
- <FunctionLevelLinking>true</FunctionLevelLinking>
- <PrecompiledHeader>Use</PrecompiledHeader>
- <PrecompiledHeaderFile>pch.h</PrecompiledHeaderFile>
- <WarningLevel>Level3</WarningLevel>
- <DebugInformationFormat>ProgramDatabase</DebugInformationFormat>
- <DisableSpecificWarnings>4355;4800;%(DisableSpecificWarnings)</DisableSpecificWarnings>
- <MultiProcessorCompilation>true</MultiProcessorCompilation>
- <MinimalRebuild>No</MinimalRebuild>
- </ClCompile>
- <Link>
- <AdditionalDependencies>ws2_32.lib;psapi.lib;%(AdditionalDependencies)</AdditionalDependencies>
- <AdditionalLibraryDirectories>c:\boost\lib\vs2010_32;\boost\lib\vs2010_32;\boost\lib</AdditionalLibraryDirectories>
- <GenerateDebugInformation>true</GenerateDebugInformation>
- <SubSystem>Console</SubSystem>
- <OptimizeReferences>true</OptimizeReferences>
- <EnableCOMDATFolding>true</EnableCOMDATFolding>
- <TargetMachine>MachineX86</TargetMachine>
- <IgnoreAllDefaultLibraries>false</IgnoreAllDefaultLibraries>
- </Link>
- </ItemDefinitionGroup>
- <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
- <ClCompile>
- <Optimization>MaxSpeed</Optimization>
- <IntrinsicFunctions>true</IntrinsicFunctions>
- <AdditionalIncludeDirectories>..\..\js\src;..\pcre-7.4;c:\boost;\boost</AdditionalIncludeDirectories>
- <PreprocessorDefinitions>SUPPORT_UCP;SUPPORT_UTF8;MONGO_EXPOSE_MACROS;OLDJS;STATIC_JS_API;XP_WIN;WIN32;NDEBUG;_CONSOLE;_CRT_SECURE_NO_WARNINGS;HAVE_CONFIG_H;PCRE_STATIC;%(PreprocessorDefinitions)</PreprocessorDefinitions>
- <RuntimeLibrary>MultiThreaded</RuntimeLibrary>
- <FunctionLevelLinking>true</FunctionLevelLinking>
- <PrecompiledHeader>Use</PrecompiledHeader>
- <PrecompiledHeaderFile>pch.h</PrecompiledHeaderFile>
- <WarningLevel>Level3</WarningLevel>
- <DebugInformationFormat>ProgramDatabase</DebugInformationFormat>
- <DisableSpecificWarnings>4355;4800;4267;4244;%(DisableSpecificWarnings)</DisableSpecificWarnings>
- <MultiProcessorCompilation>true</MultiProcessorCompilation>
- <MinimalRebuild>No</MinimalRebuild>
- </ClCompile>
- <Link>
- <AdditionalDependencies>ws2_32.lib;psapi.lib;%(AdditionalDependencies)</AdditionalDependencies>
- <AdditionalLibraryDirectories>c:\boost\lib\vs2010_64;\boost\lib\vs2010_64;\boost\lib</AdditionalLibraryDirectories>
- <GenerateDebugInformation>true</GenerateDebugInformation>
- <SubSystem>Console</SubSystem>
- <OptimizeReferences>true</OptimizeReferences>
- <EnableCOMDATFolding>true</EnableCOMDATFolding>
- </Link>
- </ItemDefinitionGroup>
- <ItemGroup>
- <ClCompile Include="..\bson\oid.cpp" />
- <ClCompile Include="..\client\dbclientcursor.cpp" />
- <ClCompile Include="..\client\dbclient_rs.cpp" />
- <ClCompile Include="..\client\distlock.cpp" />
- <ClCompile Include="..\client\model.cpp" />
- <ClCompile Include="..\pcre-7.4\pcrecpp.cc">
- <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
- </PrecompiledHeader>
- <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
- </PrecompiledHeader>
- <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
- </PrecompiledHeader>
- <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
- </PrecompiledHeader>
- </ClCompile>
- <ClCompile Include="..\pcre-7.4\pcre_chartables.c">
- <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
- </PrecompiledHeader>
- <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
- </PrecompiledHeader>
- <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
- </PrecompiledHeader>
- <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
- </PrecompiledHeader>
- </ClCompile>
- <ClCompile Include="..\pcre-7.4\pcre_compile.c">
- <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
- </PrecompiledHeader>
- <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
- </PrecompiledHeader>
- <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
- </PrecompiledHeader>
- <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
- </PrecompiledHeader>
- </ClCompile>
- <ClCompile Include="..\pcre-7.4\pcre_config.c">
- <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
- </PrecompiledHeader>
- <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
- </PrecompiledHeader>
- <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
- </PrecompiledHeader>
- <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
- </PrecompiledHeader>
- </ClCompile>
- <ClCompile Include="..\pcre-7.4\pcre_dfa_exec.c">
- <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
- </PrecompiledHeader>
- <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
- </PrecompiledHeader>
- <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
- </PrecompiledHeader>
- <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
- </PrecompiledHeader>
- </ClCompile>
- <ClCompile Include="..\pcre-7.4\pcre_exec.c">
- <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
- </PrecompiledHeader>
- <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
- </PrecompiledHeader>
- <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
- </PrecompiledHeader>
- <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
- </PrecompiledHeader>
- </ClCompile>
- <ClCompile Include="..\pcre-7.4\pcre_fullinfo.c">
- <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
- </PrecompiledHeader>
- <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
- </PrecompiledHeader>
- <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
- </PrecompiledHeader>
- <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
- </PrecompiledHeader>
- </ClCompile>
- <ClCompile Include="..\pcre-7.4\pcre_get.c">
- <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
- </PrecompiledHeader>
- <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
- </PrecompiledHeader>
- <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
- </PrecompiledHeader>
- <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
- </PrecompiledHeader>
- </ClCompile>
- <ClCompile Include="..\pcre-7.4\pcre_globals.c">
- <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
- </PrecompiledHeader>
- <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
- </PrecompiledHeader>
- <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
- </PrecompiledHeader>
- <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
- </PrecompiledHeader>
- </ClCompile>
- <ClCompile Include="..\pcre-7.4\pcre_info.c">
- <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
- </PrecompiledHeader>
- <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
- </PrecompiledHeader>
- <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
- </PrecompiledHeader>
- <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
- </PrecompiledHeader>
- </ClCompile>
- <ClCompile Include="..\pcre-7.4\pcre_maketables.c">
- <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
- </PrecompiledHeader>
- <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
- </PrecompiledHeader>
- <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
- </PrecompiledHeader>
- <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
- </PrecompiledHeader>
- </ClCompile>
- <ClCompile Include="..\pcre-7.4\pcre_newline.c">
- <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
- </PrecompiledHeader>
- <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
- </PrecompiledHeader>
- <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
- </PrecompiledHeader>
- <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
- </PrecompiledHeader>
- </ClCompile>
- <ClCompile Include="..\pcre-7.4\pcre_ord2utf8.c">
- <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
- </PrecompiledHeader>
- <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
- </PrecompiledHeader>
- <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
- </PrecompiledHeader>
- <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
- </PrecompiledHeader>
- </ClCompile>
- <ClCompile Include="..\pcre-7.4\pcre_refcount.c">
- <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
- </PrecompiledHeader>
- <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
- </PrecompiledHeader>
- <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
- </PrecompiledHeader>
- <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
- </PrecompiledHeader>
- </ClCompile>
- <ClCompile Include="..\pcre-7.4\pcre_scanner.cc">
- <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
- </PrecompiledHeader>
- <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
- </PrecompiledHeader>
- <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
- </PrecompiledHeader>
- <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
- </PrecompiledHeader>
- </ClCompile>
- <ClCompile Include="..\pcre-7.4\pcre_stringpiece.cc">
- <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
- </PrecompiledHeader>
- <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
- </PrecompiledHeader>
- <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
- </PrecompiledHeader>
- <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
- </PrecompiledHeader>
- </ClCompile>
- <ClCompile Include="..\pcre-7.4\pcre_study.c">
- <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
- </PrecompiledHeader>
- <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
- </PrecompiledHeader>
- <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
- </PrecompiledHeader>
- <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
- </PrecompiledHeader>
- </ClCompile>
- <ClCompile Include="..\pcre-7.4\pcre_tables.c">
- <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
- </PrecompiledHeader>
- <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
- </PrecompiledHeader>
- <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
- </PrecompiledHeader>
- <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
- </PrecompiledHeader>
- </ClCompile>
- <ClCompile Include="..\pcre-7.4\pcre_try_flipped.c">
- <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
- </PrecompiledHeader>
- <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
- </PrecompiledHeader>
- <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
- </PrecompiledHeader>
- <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
- </PrecompiledHeader>
- </ClCompile>
- <ClCompile Include="..\pcre-7.4\pcre_ucp_searchfuncs.c">
- <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
- </PrecompiledHeader>
- <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
- </PrecompiledHeader>
- <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
- </PrecompiledHeader>
- <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
- </PrecompiledHeader>
- </ClCompile>
- <ClCompile Include="..\pcre-7.4\pcre_valid_utf8.c">
- <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
- </PrecompiledHeader>
- <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
- </PrecompiledHeader>
- <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
- </PrecompiledHeader>
- <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
- </PrecompiledHeader>
- </ClCompile>
- <ClCompile Include="..\pcre-7.4\pcre_version.c">
- <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
- </PrecompiledHeader>
- <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
- </PrecompiledHeader>
- <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
- </PrecompiledHeader>
- <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
- </PrecompiledHeader>
- </ClCompile>
- <ClCompile Include="..\pcre-7.4\pcre_xclass.c">
- <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
- </PrecompiledHeader>
- <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
- </PrecompiledHeader>
- <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
- </PrecompiledHeader>
- <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
- </PrecompiledHeader>
- </ClCompile>
- <ClCompile Include="..\pcre-7.4\pcreposix.c">
- <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
- </PrecompiledHeader>
- <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
- </PrecompiledHeader>
- <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
- </PrecompiledHeader>
- <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
- </PrecompiledHeader>
- </ClCompile>
- <ClCompile Include="..\scripting\bench.cpp" />
- <ClCompile Include="..\shell\mongo_vstudio.cpp">
- <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">NotUsing</PrecompiledHeader>
- <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">NotUsing</PrecompiledHeader>
- <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">NotUsing</PrecompiledHeader>
- <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">NotUsing</PrecompiledHeader>
- </ClCompile>
- <ClCompile Include="..\s\chunk.cpp" />
- <ClCompile Include="..\s\config.cpp" />
- <ClCompile Include="..\s\d_chunk_manager.cpp" />
- <ClCompile Include="..\s\d_migrate.cpp" />
- <ClCompile Include="..\s\d_split.cpp" />
- <ClCompile Include="..\s\d_state.cpp" />
- <ClCompile Include="..\s\d_writeback.cpp" />
- <ClCompile Include="..\s\grid.cpp" />
- <ClCompile Include="..\s\shard.cpp" />
- <ClCompile Include="..\s\shardconnection.cpp" />
- <ClCompile Include="..\s\shardkey.cpp" />
- <ClCompile Include="..\util\alignedbuilder.cpp">
- <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">NotUsing</PrecompiledHeader>
- </ClCompile>
- <ClCompile Include="..\util\concurrency\spin_lock.cpp" />
- <ClCompile Include="..\util\concurrency\synchronization.cpp" />
- <ClCompile Include="..\util\concurrency\task.cpp" />
- <ClCompile Include="..\util\concurrency\thread_pool.cpp" />
- <ClCompile Include="..\util\concurrency\vars.cpp" />
- <ClCompile Include="..\util\file_allocator.cpp" />
- <ClCompile Include="..\util\log.cpp" />
- <ClCompile Include="..\util\logfile.cpp" />
- <ClCompile Include="..\util\processinfo.cpp" />
- <ClCompile Include="..\util\stringutils.cpp" />
- <ClCompile Include="..\util\text.cpp" />
- <ClCompile Include="..\util\version.cpp" />
- <ClCompile Include="cap.cpp" />
- <ClCompile Include="commands\distinct.cpp" />
- <ClCompile Include="commands\group.cpp" />
- <ClCompile Include="commands\isself.cpp" />
- <ClCompile Include="commands\mr.cpp" />
- <ClCompile Include="compact.cpp" />
- <ClCompile Include="dbcommands_generic.cpp" />
- <ClCompile Include="dur.cpp" />
- <ClCompile Include="durop.cpp" />
- <ClCompile Include="dur_commitjob.cpp" />
- <ClCompile Include="dur_journal.cpp" />
- <ClCompile Include="dur_preplogbuffer.cpp" />
- <ClCompile Include="dur_recover.cpp" />
- <ClCompile Include="dur_writetodatafiles.cpp" />
- <ClCompile Include="geo\2d.cpp" />
- <ClCompile Include="geo\haystack.cpp" />
- <ClCompile Include="mongommf.cpp" />
- <ClCompile Include="oplog.cpp" />
- <ClCompile Include="projection.cpp" />
- <ClCompile Include="repl.cpp" />
- <ClCompile Include="repl\consensus.cpp" />
- <ClCompile Include="repl\heartbeat.cpp" />
- <ClCompile Include="repl\manager.cpp" />
- <ClCompile Include="repl\rs_initialsync.cpp" />
- <ClCompile Include="repl\rs_initiate.cpp" />
- <ClCompile Include="repl\rs_rollback.cpp" />
- <ClCompile Include="repl\rs_sync.cpp" />
- <ClCompile Include="repl_block.cpp" />
- <ClCompile Include="restapi.cpp" />
- <ClCompile Include="..\client\connpool.cpp" />
- <ClCompile Include="..\client\dbclient.cpp" />
- <ClCompile Include="..\client\syncclusterconnection.cpp" />
- <ClCompile Include="..\pch.cpp">
- <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Create</PrecompiledHeader>
- <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">Create</PrecompiledHeader>
- <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Create</PrecompiledHeader>
- <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">Create</PrecompiledHeader>
- </ClCompile>
- <ClCompile Include="client.cpp" />
- <ClCompile Include="clientcursor.cpp" />
- <ClCompile Include="cloner.cpp" />
- <ClCompile Include="commands.cpp" />
- <ClCompile Include="common.cpp" />
- <ClCompile Include="cursor.cpp" />
- <ClCompile Include="database.cpp" />
- <ClCompile Include="db.cpp" />
- <ClCompile Include="dbcommands.cpp" />
- <ClCompile Include="dbcommands_admin.cpp" />
- <ClCompile Include="dbeval.cpp" />
- <ClCompile Include="dbhelpers.cpp" />
- <ClCompile Include="dbwebserver.cpp" />
- <ClCompile Include="extsort.cpp" />
- <ClCompile Include="index.cpp" />
- <ClCompile Include="indexkey.cpp" />
- <ClCompile Include="instance.cpp" />
- <ClCompile Include="introspect.cpp" />
- <ClCompile Include="jsobj.cpp" />
- <ClCompile Include="json.cpp" />
- <ClCompile Include="lasterror.cpp" />
- <ClCompile Include="matcher.cpp" />
- <ClCompile Include="matcher_covered.cpp" />
- <ClCompile Include="..\util\mmap_win.cpp" />
- <ClCompile Include="modules\mms.cpp" />
- <ClCompile Include="module.cpp" />
- <ClCompile Include="namespace.cpp" />
- <ClCompile Include="nonce.cpp" />
- <ClCompile Include="..\client\parallel.cpp" />
- <ClCompile Include="pdfile.cpp" />
- <ClCompile Include="query.cpp" />
- <ClCompile Include="queryoptimizer.cpp" />
- <ClCompile Include="security.cpp" />
- <ClCompile Include="security_commands.cpp" />
- <ClCompile Include="security_key.cpp" />
- <ClCompile Include="tests.cpp" />
- <ClCompile Include="update.cpp" />
- <ClCompile Include="cmdline.cpp" />
- <ClCompile Include="queryutil.cpp" />
- <ClCompile Include="..\util\assert_util.cpp" />
- <ClCompile Include="..\util\background.cpp" />
- <ClCompile Include="..\util\base64.cpp" />
- <ClCompile Include="..\util\mmap.cpp" />
- <ClCompile Include="..\util\ntservice.cpp" />
- <ClCompile Include="..\util\processinfo_win32.cpp" />
- <ClCompile Include="..\util\util.cpp" />
- <ClCompile Include="..\util\httpclient.cpp" />
- <ClCompile Include="..\util\miniwebserver.cpp" />
- <ClCompile Include="..\util\md5.c">
- <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
- </PrecompiledHeader>
- <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
- </PrecompiledHeader>
- <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
- </PrecompiledHeader>
- <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
- </PrecompiledHeader>
- <PrecompiledHeaderFile Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
- </PrecompiledHeaderFile>
- <PrecompiledHeaderFile Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
- </PrecompiledHeaderFile>
- </ClCompile>
- <ClCompile Include="..\util\md5main.cpp">
- <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Use</PrecompiledHeader>
- <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">Use</PrecompiledHeader>
- </ClCompile>
- <ClCompile Include="..\util\message.cpp" />
- <ClCompile Include="..\util\message_server_port.cpp" />
- <ClCompile Include="..\util\sock.cpp" />
- <ClCompile Include="..\s\d_logic.cpp" />
- <ClCompile Include="..\scripting\engine.cpp" />
- <ClCompile Include="..\scripting\engine_spidermonkey.cpp" />
- <ClCompile Include="..\scripting\utils.cpp" />
- <ClCompile Include="stats\counters.cpp" />
- <ClCompile Include="stats\snapshots.cpp" />
- <ClCompile Include="stats\top.cpp" />
- <ClCompile Include="btree.cpp" />
- <ClCompile Include="btreecursor.cpp" />
- <ClCompile Include="repl\health.cpp" />
- <ClCompile Include="repl\rs.cpp" />
- <ClCompile Include="repl\replset_commands.cpp" />
- <ClCompile Include="repl\rs_config.cpp" />
- </ItemGroup>
- <ItemGroup>
- <None Include="..\jstests\dur\basic1.sh" />
- <None Include="..\jstests\dur\dur1.js" />
- <None Include="..\jstests\replsets\replset1.js" />
- <None Include="..\jstests\replsets\replset2.js" />
- <None Include="..\jstests\replsets\replset3.js" />
- <None Include="..\jstests\replsets\replset4.js" />
- <None Include="..\jstests\replsets\replset5.js" />
- <None Include="..\jstests\replsets\replsetadd.js" />
- <None Include="..\jstests\replsets\replsetarb1.js" />
- <None Include="..\jstests\replsets\replsetarb2.js" />
- <None Include="..\jstests\replsets\replsetprio1.js" />
- <None Include="..\jstests\replsets\replsetrestart1.js" />
- <None Include="..\jstests\replsets\replsetrestart2.js" />
- <None Include="..\jstests\replsets\replset_remove_node.js" />
- <None Include="..\jstests\replsets\rollback.js" />
- <None Include="..\jstests\replsets\rollback2.js" />
- <None Include="..\jstests\replsets\sync1.js" />
- <None Include="..\jstests\replsets\twosets.js" />
- <None Include="..\SConstruct" />
- <None Include="..\util\mongoutils\README" />
- <None Include="mongo.ico" />
- <None Include="repl\notes.txt" />
- </ItemGroup>
- <ItemGroup>
- <ClInclude Include="..\client\dbclientcursor.h" />
- <ClInclude Include="..\client\distlock.h" />
- <ClInclude Include="..\client\gridfs.h" />
- <ClInclude Include="..\client\parallel.h" />
- <ClInclude Include="..\s\d_logic.h" />
- <ClInclude Include="..\targetver.h" />
- <ClInclude Include="..\pcre-7.4\config.h" />
- <ClInclude Include="..\pcre-7.4\pcre.h" />
- <ClInclude Include="..\util\concurrency\race.h" />
- <ClInclude Include="..\util\concurrency\rwlock.h" />
- <ClInclude Include="..\util\concurrency\msg.h" />
- <ClInclude Include="..\util\concurrency\mutex.h" />
- <ClInclude Include="..\util\concurrency\mvar.h" />
- <ClInclude Include="..\util\concurrency\task.h" />
- <ClInclude Include="..\util\concurrency\thread_pool.h" />
- <ClInclude Include="..\util\logfile.h" />
- <ClInclude Include="..\util\mongoutils\checksum.h" />
- <ClInclude Include="..\util\mongoutils\html.h" />
- <ClInclude Include="..\util\mongoutils\str.h" />
- <ClInclude Include="..\util\paths.h" />
- <ClInclude Include="..\util\ramlog.h" />
- <ClInclude Include="..\util\text.h" />
- <ClInclude Include="..\util\time_support.h" />
- <ClInclude Include="durop.h" />
- <ClInclude Include="dur_commitjob.h" />
- <ClInclude Include="dur_journal.h" />
- <ClInclude Include="dur_journalformat.h" />
- <ClInclude Include="dur_journalimpl.h" />
- <ClInclude Include="dur_stats.h" />
- <ClInclude Include="geo\core.h" />
- <ClInclude Include="helpers\dblogger.h" />
- <ClInclude Include="instance.h" />
- <ClInclude Include="mongommf.h" />
- <ClInclude Include="mongomutex.h" />
- <ClInclude Include="namespace-inl.h" />
- <ClInclude Include="oplogreader.h" />
- <ClInclude Include="projection.h" />
- <ClInclude Include="repl.h" />
- <ClInclude Include="replpair.h" />
- <ClInclude Include="repl\connections.h" />
- <ClInclude Include="repl\multicmd.h" />
- <ClInclude Include="repl\rsmember.h" />
- <ClInclude Include="repl\rs_optime.h" />
- <ClInclude Include="stats\counters.h" />
- <ClInclude Include="stats\snapshots.h" />
- <ClInclude Include="stats\top.h" />
- <ClInclude Include="..\client\connpool.h" />
- <ClInclude Include="..\client\dbclient.h" />
- <ClInclude Include="..\client\model.h" />
- <ClInclude Include="..\client\redef_macros.h" />
- <ClInclude Include="..\client\syncclusterconnection.h" />
- <ClInclude Include="..\client\undef_macros.h" />
- <ClInclude Include="background.h" />
- <ClInclude Include="client.h" />
- <ClInclude Include="clientcursor.h" />
- <ClInclude Include="cmdline.h" />
- <ClInclude Include="commands.h" />
- <ClInclude Include="concurrency.h" />
- <ClInclude Include="curop.h" />
- <ClInclude Include="cursor.h" />
- <ClInclude Include="database.h" />
- <ClInclude Include="db.h" />
- <ClInclude Include="dbhelpers.h" />
- <ClInclude Include="dbinfo.h" />
- <ClInclude Include="dbmessage.h" />
- <ClInclude Include="diskloc.h" />
- <ClInclude Include="index.h" />
- <ClInclude Include="indexkey.h" />
- <ClInclude Include="introspect.h" />
- <ClInclude Include="json.h" />
- <ClInclude Include="matcher.h" />
- <ClInclude Include="namespace.h" />
- <ClInclude Include="..\pch.h" />
- <ClInclude Include="pdfile.h" />
- <ClInclude Include="..\grid\protocol.h" />
- <ClInclude Include="query.h" />
- <ClInclude Include="queryoptimizer.h" />
- <ClInclude Include="resource.h" />
- <ClInclude Include="scanandorder.h" />
- <ClInclude Include="security.h" />
- <ClInclude Include="update.h" />
- <ClInclude Include="..\util\allocator.h" />
- <ClInclude Include="..\util\array.h" />
- <ClInclude Include="..\util\assert_util.h" />
- <ClInclude Include="..\util\background.h" />
- <ClInclude Include="..\util\base64.h" />
- <ClInclude Include="..\util\builder.h" />
- <ClInclude Include="..\util\debug_util.h" />
- <ClInclude Include="..\util\embedded_builder.h" />
- <ClInclude Include="..\util\file.h" />
- <ClInclude Include="..\util\file_allocator.h" />
- <ClInclude Include="..\util\goodies.h" />
- <ClInclude Include="..\util\hashtab.h" />
- <ClInclude Include="..\util\hex.h" />
- <ClInclude Include="lasterror.h" />
- <ClInclude Include="..\util\log.h" />
- <ClInclude Include="..\util\lruishmap.h" />
- <ClInclude Include="..\util\mmap.h" />
- <ClInclude Include="..\util\ntservice.h" />
- <ClInclude Include="..\util\optime.h" />
- <ClInclude Include="..\util\processinfo.h" />
- <ClInclude Include="..\util\queue.h" />
- <ClInclude Include="..\util\ramstore.h" />
- <ClInclude Include="..\util\unittest.h" />
- <ClInclude Include="..\util\concurrency\list.h" />
- <ClInclude Include="..\util\concurrency\value.h" />
- <ClInclude Include="..\util\web\html.h" />
- <ClInclude Include="..\util\httpclient.h" />
- <ClInclude Include="..\util\miniwebserver.h" />
- <ClInclude Include="..\util\md5.h" />
- <ClInclude Include="..\util\md5.hpp" />
- <ClInclude Include="..\util\message.h" />
- <ClInclude Include="..\util\message_server.h" />
- <ClInclude Include="..\util\sock.h" />
- <ClInclude Include="..\scripting\engine.h" />
- <ClInclude Include="..\scripting\engine_spidermonkey.h" />
- <ClInclude Include="..\scripting\engine_v8.h" />
- <ClInclude Include="..\scripting\v8_db.h" />
- <ClInclude Include="..\scripting\v8_utils.h" />
- <ClInclude Include="..\scripting\v8_wrapper.h" />
- <ClInclude Include="btree.h" />
- <ClInclude Include="repl\health.h" />
- <ClInclude Include="..\util\hostandport.h" />
- <ClInclude Include="repl\rs.h" />
- <ClInclude Include="repl\rs_config.h" />
- <ClInclude Include="..\bson\bsonelement.h" />
- <ClInclude Include="..\bson\bsoninlines.h" />
- <ClInclude Include="..\bson\bsonmisc.h" />
- <ClInclude Include="..\bson\bsonobj.h" />
- <ClInclude Include="..\bson\bsonobjbuilder.h" />
- <ClInclude Include="..\bson\bsonobjiterator.h" />
- <ClInclude Include="..\bson\bsontypes.h" />
- <ClInclude Include="jsobj.h" />
- <ClInclude Include="..\bson\oid.h" />
- <ClInclude Include="..\bson\ordering.h" />
- </ItemGroup>
- <ItemGroup>
- <Library Include="..\..\js\js32d.lib">
- <FileType>Document</FileType>
- <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
- <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
- <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">true</ExcludedFromBuild>
- </Library>
- <Library Include="..\..\js\js32r.lib">
- <FileType>Document</FileType>
- <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">true</ExcludedFromBuild>
- <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
- <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
- </Library>
- <Library Include="..\..\js\js64d.lib">
- <FileType>Document</FileType>
- <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">true</ExcludedFromBuild>
- <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
- <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">true</ExcludedFromBuild>
- </Library>
- <Library Include="..\..\js\js64r.lib">
- <FileType>Document</FileType>
- <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">true</ExcludedFromBuild>
- <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
- <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">true</ExcludedFromBuild>
- </Library>
- </ItemGroup>
- <ItemGroup>
- <ResourceCompile Include="db.rc" />
- </ItemGroup>
- <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
- <ImportGroup Label="ExtensionTargets">
- </ImportGroup>
+<?xml version="1.0" encoding="utf-8"?>
+<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+ <ItemGroup Label="ProjectConfigurations">
+ <ProjectConfiguration Include="Debug|Win32">
+ <Configuration>Debug</Configuration>
+ <Platform>Win32</Platform>
+ </ProjectConfiguration>
+ <ProjectConfiguration Include="Debug|x64">
+ <Configuration>Debug</Configuration>
+ <Platform>x64</Platform>
+ </ProjectConfiguration>
+ <ProjectConfiguration Include="Release|Win32">
+ <Configuration>Release</Configuration>
+ <Platform>Win32</Platform>
+ </ProjectConfiguration>
+ <ProjectConfiguration Include="Release|x64">
+ <Configuration>Release</Configuration>
+ <Platform>x64</Platform>
+ </ProjectConfiguration>
+ </ItemGroup>
+ <PropertyGroup Label="Globals">
+ <ProjectName>mongod</ProjectName>
+ <ProjectGuid>{215B2D68-0A70-4D10-8E75-B31010C62A91}</ProjectGuid>
+ <RootNamespace>db</RootNamespace>
+ <Keyword>Win32Proj</Keyword>
+ </PropertyGroup>
+ <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
+ <ConfigurationType>Application</ConfigurationType>
+ <CharacterSet>Unicode</CharacterSet>
+ <WholeProgramOptimization>true</WholeProgramOptimization>
+ </PropertyGroup>
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
+ <ConfigurationType>Application</ConfigurationType>
+ <CharacterSet>Unicode</CharacterSet>
+ <WholeProgramOptimization>true</WholeProgramOptimization>
+ </PropertyGroup>
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
+ <ConfigurationType>Application</ConfigurationType>
+ <UseOfMfc>false</UseOfMfc>
+ <UseOfAtl>false</UseOfAtl>
+ <CharacterSet>Unicode</CharacterSet>
+ </PropertyGroup>
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
+ <ConfigurationType>Application</ConfigurationType>
+ <UseOfMfc>false</UseOfMfc>
+ <UseOfAtl>false</UseOfAtl>
+ <CharacterSet>Unicode</CharacterSet>
+ </PropertyGroup>
+ <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
+ <ImportGroup Label="ExtensionSettings">
+ </ImportGroup>
+ <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="PropertySheets">
+ <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+ </ImportGroup>
+ <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="PropertySheets">
+ <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+ </ImportGroup>
+ <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="PropertySheets">
+ <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+ </ImportGroup>
+ <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="PropertySheets">
+ <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+ </ImportGroup>
+ <PropertyGroup Label="UserMacros" />
+ <PropertyGroup>
+ <_ProjectFileVersion>10.0.30319.1</_ProjectFileVersion>
+ <OutDir Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(SolutionDir)$(Configuration)\</OutDir>
+ <OutDir Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(SolutionDir)$(Configuration)\</OutDir>
+ <IntDir Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(Configuration)\</IntDir>
+ <IntDir Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(Configuration)\</IntDir>
+ <LinkIncremental Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">true</LinkIncremental>
+ <LinkIncremental Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</LinkIncremental>
+ <OutDir Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(SolutionDir)$(Configuration)\</OutDir>
+ <OutDir Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(SolutionDir)$(Configuration)\</OutDir>
+ <IntDir Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(Configuration)\</IntDir>
+ <IntDir Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(Configuration)\</IntDir>
+ <LinkIncremental Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">false</LinkIncremental>
+ <LinkIncremental Condition="'$(Configuration)|$(Platform)'=='Release|x64'">false</LinkIncremental>
+ <CodeAnalysisRuleSet Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">AllRules.ruleset</CodeAnalysisRuleSet>
+ <CodeAnalysisRuleSet Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">AllRules.ruleset</CodeAnalysisRuleSet>
+ <CodeAnalysisRules Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" />
+ <CodeAnalysisRules Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" />
+ <CodeAnalysisRuleAssemblies Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" />
+ <CodeAnalysisRuleAssemblies Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" />
+ <CodeAnalysisRuleSet Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">AllRules.ruleset</CodeAnalysisRuleSet>
+ <CodeAnalysisRuleSet Condition="'$(Configuration)|$(Platform)'=='Release|x64'">AllRules.ruleset</CodeAnalysisRuleSet>
+ <CodeAnalysisRules Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" />
+ <CodeAnalysisRules Condition="'$(Configuration)|$(Platform)'=='Release|x64'" />
+ <CodeAnalysisRuleAssemblies Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" />
+ <CodeAnalysisRuleAssemblies Condition="'$(Configuration)|$(Platform)'=='Release|x64'" />
+ <IncludePath Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">.;..;$(IncludePath)</IncludePath>
+ <IncludePath Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">..;$(IncludePath)</IncludePath>
+ <IncludePath Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">..;$(IncludePath)</IncludePath>
+ <IncludePath Condition="'$(Configuration)|$(Platform)'=='Release|x64'">..;$(IncludePath)</IncludePath>
+ </PropertyGroup>
+ <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ <ClCompile>
+ <Optimization>Disabled</Optimization>
+ <AdditionalIncludeDirectories>..\..\js\src;..\third_party\pcre-7.4;c:\boost;\boost</AdditionalIncludeDirectories>
+ <PreprocessorDefinitions>_UNICODE;UNICODE;;;MONGO_EXPOSE_MACROS;OLDJS;STATIC_JS_API;XP_WIN;_DEBUG;_CONSOLE;_CRT_SECURE_NO_WARNINGS;HAVE_CONFIG_H;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+ <MinimalRebuild>No</MinimalRebuild>
+ <BasicRuntimeChecks>EnableFastChecks</BasicRuntimeChecks>
+ <RuntimeLibrary>MultiThreadedDebugDLL</RuntimeLibrary>
+ <PrecompiledHeader>Use</PrecompiledHeader>
+ <PrecompiledHeaderFile>pch.h</PrecompiledHeaderFile>
+ <WarningLevel>Level3</WarningLevel>
+ <DebugInformationFormat>EditAndContinue</DebugInformationFormat>
+ <DisableSpecificWarnings>4355;4800;%(DisableSpecificWarnings)</DisableSpecificWarnings>
+ <MultiProcessorCompilation>true</MultiProcessorCompilation>
+ </ClCompile>
+ <Link>
+ <AdditionalDependencies>ws2_32.lib;Psapi.lib;%(AdditionalDependencies)</AdditionalDependencies>
+ <AdditionalLibraryDirectories>c:\boost\lib\vs2010_32;\boost\lib\vs2010_32;\boost\lib</AdditionalLibraryDirectories>
+ <IgnoreAllDefaultLibraries>false</IgnoreAllDefaultLibraries>
+ <IgnoreSpecificDefaultLibraries>%(IgnoreSpecificDefaultLibraries)</IgnoreSpecificDefaultLibraries>
+ <GenerateDebugInformation>true</GenerateDebugInformation>
+ <SubSystem>Console</SubSystem>
+ <TargetMachine>MachineX86</TargetMachine>
+ </Link>
+ </ItemDefinitionGroup>
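The Debug|Win32 group that closes here shows the MSBuild layering used throughout this project file: an ItemDefinitionGroup guarded by a '$(Configuration)|$(Platform)' condition supplies compile and link defaults for every item built in that configuration, and individual <ClCompile> items further down override single metadata values. A minimal sketch of that layering, using the hypothetical sources example.cpp and legacy.c (not files from this project) and omitting the usual props/targets imports:

  <Project xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
    <!-- Configuration-wide defaults: every ClCompile item inherits these -->
    <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
      <ClCompile>
        <PrecompiledHeader>Use</PrecompiledHeader>
        <PrecompiledHeaderFile>pch.h</PrecompiledHeaderFile>
        <RuntimeLibrary>MultiThreadedDebugDLL</RuntimeLibrary>
      </ClCompile>
    </ItemDefinitionGroup>
    <ItemGroup>
      <!-- Inherits Use / pch.h from the group above -->
      <ClCompile Include="example.cpp" />
      <!-- Per-item metadata wins over the group default -->
      <ClCompile Include="legacy.c">
        <PrecompiledHeader>NotUsing</PrecompiledHeader>
      </ClCompile>
    </ItemGroup>
  </Project>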
+ <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ <ClCompile>
+ <Optimization>Disabled</Optimization>
+ <AdditionalIncludeDirectories>..\..\js\src;..\third_party\pcre-7.4;c:\boost;\boost</AdditionalIncludeDirectories>
+ <PreprocessorDefinitions>;;MONGO_EXPOSE_MACROS;OLDJS;STATIC_JS_API;XP_WIN;_DEBUG;_CONSOLE;_CRT_SECURE_NO_WARNINGS;HAVE_CONFIG_H;;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+ <BasicRuntimeChecks>EnableFastChecks</BasicRuntimeChecks>
+ <RuntimeLibrary>MultiThreadedDebugDLL</RuntimeLibrary>
+ <PrecompiledHeader>Use</PrecompiledHeader>
+ <PrecompiledHeaderFile>pch.h</PrecompiledHeaderFile>
+ <WarningLevel>Level3</WarningLevel>
+ <DebugInformationFormat>ProgramDatabase</DebugInformationFormat>
+ <DisableSpecificWarnings>4355;4800;4267;4244;%(DisableSpecificWarnings)</DisableSpecificWarnings>
+ <MultiProcessorCompilation>true</MultiProcessorCompilation>
+ <MinimalRebuild>No</MinimalRebuild>
+ </ClCompile>
+ <Link>
+ <AdditionalDependencies>ws2_32.lib;Psapi.lib;%(AdditionalDependencies)</AdditionalDependencies>
+ <AdditionalLibraryDirectories>c:\boost\lib\vs2010_64;\boost\lib\vs2010_64;\boost\lib</AdditionalLibraryDirectories>
+ <IgnoreAllDefaultLibraries>false</IgnoreAllDefaultLibraries>
+ <IgnoreSpecificDefaultLibraries>%(IgnoreSpecificDefaultLibraries)</IgnoreSpecificDefaultLibraries>
+ <GenerateDebugInformation>true</GenerateDebugInformation>
+ <SubSystem>Console</SubSystem>
+ </Link>
+ </ItemDefinitionGroup>
+ <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ <ClCompile>
+ <Optimization>MaxSpeed</Optimization>
+ <IntrinsicFunctions>true</IntrinsicFunctions>
+ <AdditionalIncludeDirectories>..\..\js\src;..\third_party\pcre-7.4;c:\boost;\boost</AdditionalIncludeDirectories>
+ <PreprocessorDefinitions>_UNICODE;UNICODE;;;MONGO_EXPOSE_MACROS;OLDJS;STATIC_JS_API;XP_WIN;NDEBUG;_CONSOLE;_CRT_SECURE_NO_WARNINGS;HAVE_CONFIG_H;;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+ <RuntimeLibrary>MultiThreaded</RuntimeLibrary>
+ <FunctionLevelLinking>true</FunctionLevelLinking>
+ <PrecompiledHeader>Use</PrecompiledHeader>
+ <PrecompiledHeaderFile>pch.h</PrecompiledHeaderFile>
+ <WarningLevel>Level3</WarningLevel>
+ <DebugInformationFormat>ProgramDatabase</DebugInformationFormat>
+ <DisableSpecificWarnings>4355;4800;%(DisableSpecificWarnings)</DisableSpecificWarnings>
+ <MultiProcessorCompilation>true</MultiProcessorCompilation>
+ <MinimalRebuild>No</MinimalRebuild>
+ </ClCompile>
+ <Link>
+ <AdditionalDependencies>ws2_32.lib;psapi.lib;%(AdditionalDependencies)</AdditionalDependencies>
+ <AdditionalLibraryDirectories>c:\boost\lib\vs2010_32;\boost\lib\vs2010_32;\boost\lib</AdditionalLibraryDirectories>
+ <GenerateDebugInformation>true</GenerateDebugInformation>
+ <SubSystem>Console</SubSystem>
+ <OptimizeReferences>true</OptimizeReferences>
+ <EnableCOMDATFolding>true</EnableCOMDATFolding>
+ <TargetMachine>MachineX86</TargetMachine>
+ <IgnoreAllDefaultLibraries>false</IgnoreAllDefaultLibraries>
+ </Link>
+ </ItemDefinitionGroup>
+ <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ <ClCompile>
+ <Optimization>MaxSpeed</Optimization>
+ <IntrinsicFunctions>true</IntrinsicFunctions>
+ <AdditionalIncludeDirectories>..\..\js\src;..\third_party\pcre-7.4;c:\boost;\boost</AdditionalIncludeDirectories>
+ <PreprocessorDefinitions>;;MONGO_EXPOSE_MACROS;OLDJS;STATIC_JS_API;XP_WIN;NDEBUG;_CONSOLE;_CRT_SECURE_NO_WARNINGS;HAVE_CONFIG_H;;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+ <RuntimeLibrary>MultiThreaded</RuntimeLibrary>
+ <FunctionLevelLinking>true</FunctionLevelLinking>
+ <PrecompiledHeader>Use</PrecompiledHeader>
+ <PrecompiledHeaderFile>pch.h</PrecompiledHeaderFile>
+ <WarningLevel>Level3</WarningLevel>
+ <DebugInformationFormat>ProgramDatabase</DebugInformationFormat>
+ <DisableSpecificWarnings>4355;4800;4267;4244;%(DisableSpecificWarnings)</DisableSpecificWarnings>
+ <MultiProcessorCompilation>true</MultiProcessorCompilation>
+ <MinimalRebuild>No</MinimalRebuild>
+ </ClCompile>
+ <Link>
+ <AdditionalDependencies>ws2_32.lib;psapi.lib;%(AdditionalDependencies)</AdditionalDependencies>
+ <AdditionalLibraryDirectories>c:\boost\lib\vs2010_64;\boost\lib\vs2010_64;\boost\lib</AdditionalLibraryDirectories>
+ <GenerateDebugInformation>true</GenerateDebugInformation>
+ <SubSystem>Console</SubSystem>
+ <OptimizeReferences>true</OptimizeReferences>
+ <EnableCOMDATFolding>true</EnableCOMDATFolding>
+ </Link>
+ </ItemDefinitionGroup>
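Relative to the Debug groups, the two Release groups above flip the CRT from MultiThreadedDebugDLL (cl /MDd) to the static MultiThreaded runtime (cl /MT) and enable linker dead-stripping. The settings map onto familiar command-line switches; a condensed sketch of just those switches (a fragment, not a complete configuration):

  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
    <ClCompile>
      <Optimization>MaxSpeed</Optimization>            <!-- cl /O2 -->
      <RuntimeLibrary>MultiThreaded</RuntimeLibrary>   <!-- cl /MT -->
    </ClCompile>
    <Link>
      <OptimizeReferences>true</OptimizeReferences>    <!-- link /OPT:REF -->
      <EnableCOMDATFolding>true</EnableCOMDATFolding>  <!-- link /OPT:ICF -->
    </Link>
  </ItemDefinitionGroup>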
+ <ItemGroup>
+ <ClCompile Include="..\bson\oid.cpp" />
+ <ClCompile Include="..\client\dbclientcursor.cpp" />
+ <ClCompile Include="..\client\dbclient_rs.cpp" />
+ <ClCompile Include="..\client\distlock.cpp" />
+ <ClCompile Include="..\client\model.cpp" />
+ <ClCompile Include="..\third_party\pcre-7.4\pcrecpp.cc">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ </PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_chartables.c">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ </PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_compile.c">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ </PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_config.c">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ </PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_dfa_exec.c">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ </PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_exec.c">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ </PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_fullinfo.c">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ </PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_get.c">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ </PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_globals.c">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ </PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_info.c">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ </PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_maketables.c">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ </PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_newline.c">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ </PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_ord2utf8.c">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ </PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_refcount.c">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ </PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_scanner.cc">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ </PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_stringpiece.cc">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ </PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_study.c">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ </PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_tables.c">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ </PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_try_flipped.c">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ </PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_ucp_searchfuncs.c">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ </PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_valid_utf8.c">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ </PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_version.c">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ </PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_xclass.c">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ </PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcreposix.c">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ </PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\scripting\bench.cpp" />
+ <ClCompile Include="..\shell\mongo_vstudio.cpp">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">NotUsing</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">NotUsing</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">NotUsing</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">NotUsing</PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\s\chunk.cpp" />
+ <ClCompile Include="..\s\config.cpp" />
+ <ClCompile Include="..\s\d_chunk_manager.cpp" />
+ <ClCompile Include="..\s\d_migrate.cpp" />
+ <ClCompile Include="..\s\d_split.cpp" />
+ <ClCompile Include="..\s\d_state.cpp" />
+ <ClCompile Include="..\s\d_writeback.cpp" />
+ <ClCompile Include="..\s\grid.cpp" />
+ <ClCompile Include="..\s\shard.cpp" />
+ <ClCompile Include="..\s\shardconnection.cpp" />
+ <ClCompile Include="..\s\shardkey.cpp" />
+ <ClCompile Include="..\third_party\snappy\snappy-sinksource.cc">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">NotUsing</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">NotUsing</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">NotUsing</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">NotUsing</PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\third_party\snappy\snappy.cc">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">NotUsing</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">NotUsing</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">NotUsing</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">NotUsing</PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\util\alignedbuilder.cpp">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">NotUsing</PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\util\compress.cpp">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">NotUsing</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">NotUsing</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">NotUsing</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">NotUsing</PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\util\concurrency\spin_lock.cpp" />
+ <ClCompile Include="..\util\concurrency\synchronization.cpp" />
+ <ClCompile Include="..\util\concurrency\task.cpp" />
+ <ClCompile Include="..\util\concurrency\thread_pool.cpp" />
+ <ClCompile Include="..\util\concurrency\vars.cpp" />
+ <ClCompile Include="..\util\file_allocator.cpp" />
+ <ClCompile Include="..\util\log.cpp" />
+ <ClCompile Include="..\util\logfile.cpp" />
+ <ClCompile Include="..\util\net\listen.cpp" />
+ <ClCompile Include="..\util\net\miniwebserver.cpp" />
+ <ClCompile Include="..\util\processinfo.cpp" />
+ <ClCompile Include="..\util\ramlog.cpp" />
+ <ClCompile Include="..\util\stringutils.cpp" />
+ <ClCompile Include="..\util\text.cpp" />
+ <ClCompile Include="..\util\version.cpp" />
+ <ClCompile Include="btreebuilder.cpp" />
+ <ClCompile Include="cap.cpp" />
+ <ClCompile Include="commands\distinct.cpp" />
+ <ClCompile Include="commands\find_and_modify.cpp" />
+ <ClCompile Include="commands\group.cpp" />
+ <ClCompile Include="commands\isself.cpp" />
+ <ClCompile Include="commands\mr.cpp" />
+ <ClCompile Include="compact.cpp" />
+ <ClCompile Include="dbcommands_generic.cpp" />
+ <ClCompile Include="dbmessage.cpp" />
+ <ClCompile Include="dur.cpp" />
+ <ClCompile Include="durop.cpp" />
+ <ClCompile Include="dur_commitjob.cpp" />
+ <ClCompile Include="dur_journal.cpp" />
+ <ClCompile Include="dur_preplogbuffer.cpp" />
+ <ClCompile Include="dur_recover.cpp" />
+ <ClCompile Include="dur_writetodatafiles.cpp" />
+ <ClCompile Include="geo\2d.cpp" />
+ <ClCompile Include="geo\haystack.cpp" />
+ <ClCompile Include="key.cpp" />
+ <ClCompile Include="mongommf.cpp" />
+ <ClCompile Include="oplog.cpp" />
+ <ClCompile Include="ops\delete.cpp" />
+ <ClCompile Include="ops\query.cpp" />
+ <ClCompile Include="ops\update.cpp" />
+ <ClCompile Include="projection.cpp" />
+ <ClCompile Include="queryoptimizercursor.cpp" />
+ <ClCompile Include="querypattern.cpp">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">NotUsing</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">NotUsing</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">NotUsing</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">NotUsing</PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="record.cpp" />
+ <ClCompile Include="repl.cpp" />
+ <ClCompile Include="repl\consensus.cpp" />
+ <ClCompile Include="repl\heartbeat.cpp" />
+ <ClCompile Include="repl\manager.cpp" />
+ <ClCompile Include="repl\rs_initialsync.cpp" />
+ <ClCompile Include="repl\rs_initiate.cpp" />
+ <ClCompile Include="repl\rs_rollback.cpp" />
+ <ClCompile Include="repl\rs_sync.cpp" />
+ <ClCompile Include="repl_block.cpp" />
+ <ClCompile Include="restapi.cpp" />
+ <ClCompile Include="..\client\connpool.cpp" />
+ <ClCompile Include="..\client\dbclient.cpp" />
+ <ClCompile Include="..\client\syncclusterconnection.cpp" />
+ <ClCompile Include="..\pch.cpp">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Create</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">Create</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Create</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">Create</PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="client.cpp" />
+ <ClCompile Include="clientcursor.cpp" />
+ <ClCompile Include="cloner.cpp" />
+ <ClCompile Include="commands.cpp" />
+ <ClCompile Include="common.cpp" />
+ <ClCompile Include="cursor.cpp" />
+ <ClCompile Include="database.cpp" />
+ <ClCompile Include="db.cpp" />
+ <ClCompile Include="dbcommands.cpp" />
+ <ClCompile Include="dbcommands_admin.cpp" />
+ <ClCompile Include="dbeval.cpp" />
+ <ClCompile Include="dbhelpers.cpp" />
+ <ClCompile Include="dbwebserver.cpp" />
+ <ClCompile Include="extsort.cpp" />
+ <ClCompile Include="index.cpp" />
+ <ClCompile Include="indexkey.cpp" />
+ <ClCompile Include="instance.cpp" />
+ <ClCompile Include="introspect.cpp" />
+ <ClCompile Include="jsobj.cpp" />
+ <ClCompile Include="json.cpp" />
+ <ClCompile Include="lasterror.cpp" />
+ <ClCompile Include="matcher.cpp" />
+ <ClCompile Include="matcher_covered.cpp" />
+ <ClCompile Include="..\util\mmap_win.cpp" />
+ <ClCompile Include="modules\mms.cpp" />
+ <ClCompile Include="module.cpp" />
+ <ClCompile Include="namespace.cpp" />
+ <ClCompile Include="nonce.cpp" />
+ <ClCompile Include="..\client\parallel.cpp" />
+ <ClCompile Include="pdfile.cpp" />
+ <ClCompile Include="queryoptimizer.cpp" />
+ <ClCompile Include="scanandorder.cpp" />
+ <ClCompile Include="security.cpp" />
+ <ClCompile Include="security_commands.cpp" />
+ <ClCompile Include="security_common.cpp" />
+ <ClCompile Include="tests.cpp" />
+ <ClCompile Include="cmdline.cpp" />
+ <ClCompile Include="queryutil.cpp" />
+ <ClCompile Include="..\util\assert_util.cpp" />
+ <ClCompile Include="..\util\background.cpp" />
+ <ClCompile Include="..\util\base64.cpp" />
+ <ClCompile Include="..\util\mmap.cpp" />
+ <ClCompile Include="..\util\ntservice.cpp" />
+ <ClCompile Include="..\util\processinfo_win32.cpp" />
+ <ClCompile Include="..\util\util.cpp" />
+ <ClCompile Include="..\util\net\httpclient.cpp" />
+ <ClCompile Include="..\util\md5.c">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ </PrecompiledHeader>
+ <PrecompiledHeaderFile Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeaderFile>
+ <PrecompiledHeaderFile Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ </PrecompiledHeaderFile>
+ </ClCompile>
+ <ClCompile Include="..\util\md5main.cpp">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Use</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">Use</PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\util\net\message.cpp" />
+ <ClCompile Include="..\util\net\message_port.cpp" />
+ <ClCompile Include="..\util\net\message_server_port.cpp" />
+ <ClCompile Include="..\util\net\sock.cpp" />
+ <ClCompile Include="..\s\d_logic.cpp" />
+ <ClCompile Include="..\scripting\engine.cpp" />
+ <ClCompile Include="..\scripting\engine_spidermonkey.cpp" />
+ <ClCompile Include="..\scripting\utils.cpp" />
+ <ClCompile Include="stats\counters.cpp" />
+ <ClCompile Include="stats\snapshots.cpp" />
+ <ClCompile Include="stats\top.cpp" />
+ <ClCompile Include="btree.cpp" />
+ <ClCompile Include="btreecursor.cpp" />
+ <ClCompile Include="repl\health.cpp" />
+ <ClCompile Include="repl\rs.cpp" />
+ <ClCompile Include="repl\replset_commands.cpp" />
+ <ClCompile Include="repl\rs_config.cpp" />
+ </ItemGroup>
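Three PrecompiledHeader values occur in the item group that closes here. Create on ..\pch.cpp compiles pch.h into the shared .pch consumed by every item left at the group default of Use; NotUsing (on ..\shell\mongo_vstudio.cpp, the snappy sources, and similar) opts a translation unit out explicitly; and the empty <PrecompiledHeader></PrecompiledHeader> elements on the pcre-7.4 and md5 C files blank out the inherited value, which effectively disables the precompiled header there as well, since a PCH created from C++ cannot be consumed from C. A minimal sketch of the split, with pch.cpp/pch.h as in this project and other.c hypothetical:

  <ItemGroup>
    <!-- Compiles pch.h and emits the .pch used by all 'Use' items -->
    <ClCompile Include="..\pch.cpp">
      <PrecompiledHeader>Create</PrecompiledHeader>
    </ClCompile>
    <!-- C source: clear the inherited 'Use' so cl compiles it stand-alone -->
    <ClCompile Include="other.c">
      <PrecompiledHeader>
      </PrecompiledHeader>
    </ClCompile>
  </ItemGroup>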
+ <ItemGroup>
+ <None Include="..\jstests\dur\basic1.sh" />
+ <None Include="..\jstests\dur\dur1.js" />
+ <None Include="..\jstests\replsets\replset1.js" />
+ <None Include="..\jstests\replsets\replset2.js" />
+ <None Include="..\jstests\replsets\replset3.js" />
+ <None Include="..\jstests\replsets\replset4.js" />
+ <None Include="..\jstests\replsets\replset5.js" />
+ <None Include="..\jstests\replsets\replsetadd.js" />
+ <None Include="..\jstests\replsets\replsetarb1.js" />
+ <None Include="..\jstests\replsets\replsetarb2.js" />
+ <None Include="..\jstests\replsets\replsetprio1.js" />
+ <None Include="..\jstests\replsets\replsetrestart1.js" />
+ <None Include="..\jstests\replsets\replsetrestart2.js" />
+ <None Include="..\jstests\replsets\replset_remove_node.js" />
+ <None Include="..\jstests\replsets\rollback.js" />
+ <None Include="..\jstests\replsets\rollback2.js" />
+ <None Include="..\jstests\replsets\sync1.js" />
+ <None Include="..\jstests\replsets\twosets.js" />
+ <None Include="..\SConstruct" />
+ <None Include="..\util\mongoutils\README" />
+ <None Include="mongo.ico" />
+ <None Include="repl\notes.txt" />
+ </ItemGroup>
+ <ItemGroup>
+ <ClInclude Include="..\bson\bson-inl.h" />
+ <ClInclude Include="..\bson\bson.h" />
+ <ClInclude Include="..\bson\bson_db.h" />
+ <ClInclude Include="..\bson\inline_decls.h" />
+ <ClInclude Include="..\bson\stringdata.h" />
+ <ClInclude Include="..\bson\util\atomic_int.h" />
+ <ClInclude Include="..\bson\util\builder.h" />
+ <ClInclude Include="..\bson\util\misc.h" />
+ <ClInclude Include="..\client\dbclientcursor.h" />
+ <ClInclude Include="..\client\distlock.h" />
+ <ClInclude Include="..\client\gridfs.h" />
+ <ClInclude Include="..\client\parallel.h" />
+ <ClInclude Include="..\s\d_logic.h" />
+ <ClInclude Include="..\targetver.h" />
+ <ClInclude Include="..\third_party\pcre-7.4\config.h" />
+ <ClInclude Include="..\third_party\pcre-7.4\pcre.h" />
+ <ClInclude Include="..\third_party\snappy\config.h" />
+ <ClInclude Include="..\third_party\snappy\snappy.h" />
+ <ClInclude Include="..\util\alignedbuilder.h" />
+ <ClInclude Include="..\util\concurrency\race.h" />
+ <ClInclude Include="..\util\concurrency\rwlock.h" />
+ <ClInclude Include="..\util\concurrency\msg.h" />
+ <ClInclude Include="..\util\concurrency\mutex.h" />
+ <ClInclude Include="..\util\concurrency\mvar.h" />
+ <ClInclude Include="..\util\concurrency\task.h" />
+ <ClInclude Include="..\util\concurrency\thread_pool.h" />
+ <ClInclude Include="..\util\logfile.h" />
+ <ClInclude Include="..\util\mongoutils\checksum.h" />
+ <ClInclude Include="..\util\mongoutils\html.h" />
+ <ClInclude Include="..\util\mongoutils\str.h" />
+ <ClInclude Include="..\util\paths.h" />
+ <ClInclude Include="..\util\ramlog.h" />
+ <ClInclude Include="..\util\text.h" />
+ <ClInclude Include="..\util\time_support.h" />
+ <ClInclude Include="durop.h" />
+ <ClInclude Include="dur_commitjob.h" />
+ <ClInclude Include="dur_journal.h" />
+ <ClInclude Include="dur_journalformat.h" />
+ <ClInclude Include="dur_journalimpl.h" />
+ <ClInclude Include="dur_stats.h" />
+ <ClInclude Include="geo\core.h" />
+ <ClInclude Include="helpers\dblogger.h" />
+ <ClInclude Include="instance.h" />
+ <ClInclude Include="mongommf.h" />
+ <ClInclude Include="mongomutex.h" />
+ <ClInclude Include="namespace-inl.h" />
+ <ClInclude Include="oplogreader.h" />
+ <ClInclude Include="ops\delete.h" />
+ <ClInclude Include="ops\update.h" />
+ <ClInclude Include="projection.h" />
+ <ClInclude Include="queryutil.h" />
+ <ClInclude Include="repl.h" />
+ <ClInclude Include="replpair.h" />
+ <ClInclude Include="repl\connections.h" />
+ <ClInclude Include="repl\multicmd.h" />
+ <ClInclude Include="repl\rsmember.h" />
+ <ClInclude Include="repl\rs_optime.h" />
+ <ClInclude Include="stats\counters.h" />
+ <ClInclude Include="stats\snapshots.h" />
+ <ClInclude Include="stats\top.h" />
+ <ClInclude Include="..\client\connpool.h" />
+ <ClInclude Include="..\client\dbclient.h" />
+ <ClInclude Include="..\client\model.h" />
+ <ClInclude Include="..\client\redef_macros.h" />
+ <ClInclude Include="..\client\syncclusterconnection.h" />
+ <ClInclude Include="..\client\undef_macros.h" />
+ <ClInclude Include="background.h" />
+ <ClInclude Include="client.h" />
+ <ClInclude Include="clientcursor.h" />
+ <ClInclude Include="cmdline.h" />
+ <ClInclude Include="commands.h" />
+ <ClInclude Include="concurrency.h" />
+ <ClInclude Include="curop.h" />
+ <ClInclude Include="cursor.h" />
+ <ClInclude Include="database.h" />
+ <ClInclude Include="db.h" />
+ <ClInclude Include="dbhelpers.h" />
+ <ClInclude Include="dbinfo.h" />
+ <ClInclude Include="dbmessage.h" />
+ <ClInclude Include="diskloc.h" />
+ <ClInclude Include="index.h" />
+ <ClInclude Include="indexkey.h" />
+ <ClInclude Include="introspect.h" />
+ <ClInclude Include="json.h" />
+ <ClInclude Include="matcher.h" />
+ <ClInclude Include="namespace.h" />
+ <ClInclude Include="..\pch.h" />
+ <ClInclude Include="pdfile.h" />
+ <ClInclude Include="..\grid\protocol.h" />
+ <ClInclude Include="query.h" />
+ <ClInclude Include="queryoptimizer.h" />
+ <ClInclude Include="resource.h" />
+ <ClInclude Include="scanandorder.h" />
+ <ClInclude Include="security.h" />
+ <ClInclude Include="..\util\allocator.h" />
+ <ClInclude Include="..\util\array.h" />
+ <ClInclude Include="..\util\assert_util.h" />
+ <ClInclude Include="..\util\background.h" />
+ <ClInclude Include="..\util\base64.h" />
+ <ClInclude Include="..\util\builder.h" />
+ <ClInclude Include="..\util\debug_util.h" />
+ <ClInclude Include="..\util\embedded_builder.h" />
+ <ClInclude Include="..\util\file.h" />
+ <ClInclude Include="..\util\file_allocator.h" />
+ <ClInclude Include="..\util\goodies.h" />
+ <ClInclude Include="..\util\hashtab.h" />
+ <ClInclude Include="..\util\hex.h" />
+ <ClInclude Include="lasterror.h" />
+ <ClInclude Include="..\util\log.h" />
+ <ClInclude Include="..\util\lruishmap.h" />
+ <ClInclude Include="..\util\mmap.h" />
+ <ClInclude Include="..\util\ntservice.h" />
+ <ClInclude Include="..\util\optime.h" />
+ <ClInclude Include="..\util\processinfo.h" />
+ <ClInclude Include="..\util\queue.h" />
+ <ClInclude Include="..\util\ramstore.h" />
+ <ClInclude Include="..\util\unittest.h" />
+ <ClInclude Include="..\util\concurrency\list.h" />
+ <ClInclude Include="..\util\concurrency\value.h" />
+ <ClInclude Include="..\util\web\html.h" />
+ <ClInclude Include="..\util\net\httpclient.h" />
+ <ClInclude Include="..\util\md5.h" />
+ <ClInclude Include="..\util\md5.hpp" />
+ <ClInclude Include="..\util\net\message.h" />
+ <ClInclude Include="..\util\net\message_server.h" />
+ <ClInclude Include="..\util\net\sock.h" />
+ <ClInclude Include="..\scripting\engine.h" />
+ <ClInclude Include="..\scripting\engine_spidermonkey.h" />
+ <ClInclude Include="..\scripting\engine_v8.h" />
+ <ClInclude Include="..\scripting\v8_db.h" />
+ <ClInclude Include="..\scripting\v8_utils.h" />
+ <ClInclude Include="..\scripting\v8_wrapper.h" />
+ <ClInclude Include="btree.h" />
+ <ClInclude Include="repl\health.h" />
+ <ClInclude Include="..\util\hostandport.h" />
+ <ClInclude Include="repl\rs.h" />
+ <ClInclude Include="repl\rs_config.h" />
+ <ClInclude Include="..\bson\bsonelement.h" />
+ <ClInclude Include="..\bson\bsoninlines.h" />
+ <ClInclude Include="..\bson\bsonmisc.h" />
+ <ClInclude Include="..\bson\bsonobj.h" />
+ <ClInclude Include="..\bson\bsonobjbuilder.h" />
+ <ClInclude Include="..\bson\bsonobjiterator.h" />
+ <ClInclude Include="..\bson\bsontypes.h" />
+ <ClInclude Include="jsobj.h" />
+ <ClInclude Include="..\bson\oid.h" />
+ <ClInclude Include="..\bson\ordering.h" />
+ </ItemGroup>
+ <ItemGroup>
+ <Library Include="..\..\js\js32d.lib">
+ <FileType>Document</FileType>
+ <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
+ <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
+ <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">true</ExcludedFromBuild>
+ </Library>
+ <Library Include="..\..\js\js32r.lib">
+ <FileType>Document</FileType>
+ <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">true</ExcludedFromBuild>
+ <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
+ <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
+ </Library>
+ <Library Include="..\..\js\js64d.lib">
+ <FileType>Document</FileType>
+ <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">true</ExcludedFromBuild>
+ <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
+ <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">true</ExcludedFromBuild>
+ </Library>
+ <Library Include="..\..\js\js64r.lib">
+ <FileType>Document</FileType>
+ <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">true</ExcludedFromBuild>
+ <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
+ <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">true</ExcludedFromBuild>
+ </Library>
+ </ItemGroup>
+ <ItemGroup>
+ <ResourceCompile Include="db.rc" />
+ </ItemGroup>
+ <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
+ <ImportGroup Label="ExtensionTargets">
+ </ImportGroup>
</Project> \ No newline at end of file
diff --git a/db/db.vcxproj.filters b/db/db.vcxproj.filters
index a2011df..5650b2c 100755
--- a/db/db.vcxproj.filters
+++ b/db/db.vcxproj.filters
@@ -6,30 +6,6 @@
<ClCompile Include="..\client\dbclient_rs.cpp" />
<ClCompile Include="..\client\distlock.cpp" />
<ClCompile Include="..\client\model.cpp" />
- <ClCompile Include="..\pcre-7.4\pcrecpp.cc" />
- <ClCompile Include="..\pcre-7.4\pcre_chartables.c" />
- <ClCompile Include="..\pcre-7.4\pcre_compile.c" />
- <ClCompile Include="..\pcre-7.4\pcre_config.c" />
- <ClCompile Include="..\pcre-7.4\pcre_dfa_exec.c" />
- <ClCompile Include="..\pcre-7.4\pcre_exec.c" />
- <ClCompile Include="..\pcre-7.4\pcre_fullinfo.c" />
- <ClCompile Include="..\pcre-7.4\pcre_get.c" />
- <ClCompile Include="..\pcre-7.4\pcre_globals.c" />
- <ClCompile Include="..\pcre-7.4\pcre_info.c" />
- <ClCompile Include="..\pcre-7.4\pcre_maketables.c" />
- <ClCompile Include="..\pcre-7.4\pcre_newline.c" />
- <ClCompile Include="..\pcre-7.4\pcre_ord2utf8.c" />
- <ClCompile Include="..\pcre-7.4\pcre_refcount.c" />
- <ClCompile Include="..\pcre-7.4\pcre_scanner.cc" />
- <ClCompile Include="..\pcre-7.4\pcre_stringpiece.cc" />
- <ClCompile Include="..\pcre-7.4\pcre_study.c" />
- <ClCompile Include="..\pcre-7.4\pcre_tables.c" />
- <ClCompile Include="..\pcre-7.4\pcre_try_flipped.c" />
- <ClCompile Include="..\pcre-7.4\pcre_ucp_searchfuncs.c" />
- <ClCompile Include="..\pcre-7.4\pcre_valid_utf8.c" />
- <ClCompile Include="..\pcre-7.4\pcre_version.c" />
- <ClCompile Include="..\pcre-7.4\pcre_xclass.c" />
- <ClCompile Include="..\pcre-7.4\pcreposix.c" />
<ClCompile Include="..\scripting\bench.cpp" />
<ClCompile Include="..\shell\mongo_vstudio.cpp" />
<ClCompile Include="..\s\chunk.cpp" />
@@ -118,12 +94,10 @@
<ClCompile Include="nonce.cpp" />
<ClCompile Include="..\client\parallel.cpp" />
<ClCompile Include="pdfile.cpp" />
- <ClCompile Include="query.cpp" />
<ClCompile Include="queryoptimizer.cpp" />
<ClCompile Include="security.cpp" />
<ClCompile Include="security_commands.cpp" />
<ClCompile Include="tests.cpp" />
- <ClCompile Include="update.cpp" />
<ClCompile Include="cmdline.cpp" />
<ClCompile Include="queryutil.cpp" />
<ClCompile Include="..\util\assert_util.cpp" />
@@ -133,13 +107,8 @@
<ClCompile Include="..\util\ntservice.cpp" />
<ClCompile Include="..\util\processinfo_win32.cpp" />
<ClCompile Include="..\util\util.cpp" />
- <ClCompile Include="..\util\httpclient.cpp" />
- <ClCompile Include="..\util\miniwebserver.cpp" />
<ClCompile Include="..\util\md5.c" />
<ClCompile Include="..\util\md5main.cpp" />
- <ClCompile Include="..\util\message.cpp" />
- <ClCompile Include="..\util\message_server_port.cpp" />
- <ClCompile Include="..\util\sock.cpp" />
<ClCompile Include="..\s\d_logic.cpp" />
<ClCompile Include="..\scripting\engine.cpp" />
<ClCompile Include="..\scripting\engine_spidermonkey.cpp" />
@@ -153,8 +122,36 @@
<ClCompile Include="repl\rs.cpp" />
<ClCompile Include="repl\replset_commands.cpp" />
<ClCompile Include="repl\rs_config.cpp" />
- <ClCompile Include="security_key.cpp" />
<ClCompile Include="..\util\file_allocator.cpp" />
+ <ClCompile Include="querypattern.cpp" />
+ <ClCompile Include="..\util\ramlog.cpp" />
+ <ClCompile Include="key.cpp" />
+ <ClCompile Include="btreebuilder.cpp" />
+ <ClCompile Include="queryoptimizercursor.cpp" />
+ <ClCompile Include="record.cpp" />
+ <ClCompile Include="ops\delete.cpp" />
+ <ClCompile Include="ops\update.cpp" />
+ <ClCompile Include="security_common.cpp" />
+ <ClCompile Include="ops\query.cpp" />
+ <ClCompile Include="..\util\net\httpclient.cpp" />
+ <ClCompile Include="..\util\net\message.cpp" />
+ <ClCompile Include="..\util\net\message_server_port.cpp" />
+ <ClCompile Include="..\util\net\sock.cpp" />
+ <ClCompile Include="..\util\net\miniwebserver.cpp" />
+ <ClCompile Include="..\util\net\listen.cpp" />
+ <ClCompile Include="..\util\net\message_port.cpp" />
+ <ClCompile Include="dbmessage.cpp" />
+ <ClCompile Include="commands\find_and_modify.cpp" />
+ <ClCompile Include="..\util\compress.cpp">
+ <Filter>snappy</Filter>
+ </ClCompile>
+ <ClCompile Include="..\third_party\snappy\snappy-sinksource.cc">
+ <Filter>snappy</Filter>
+ </ClCompile>
+ <ClCompile Include="..\third_party\snappy\snappy.cc">
+ <Filter>snappy</Filter>
+ </ClCompile>
+ <ClCompile Include="scanandorder.cpp" />
</ItemGroup>
<ItemGroup>
<ClInclude Include="..\client\dbclientcursor.h" />
@@ -163,8 +160,6 @@
<ClInclude Include="..\client\parallel.h" />
<ClInclude Include="..\s\d_logic.h" />
<ClInclude Include="..\targetver.h" />
- <ClInclude Include="..\pcre-7.4\config.h" />
- <ClInclude Include="..\pcre-7.4\pcre.h" />
<ClInclude Include="..\util\concurrency\rwlock.h" />
<ClInclude Include="..\util\concurrency\msg.h" />
<ClInclude Include="..\util\concurrency\mutex.h" />
@@ -235,7 +230,6 @@
<ClInclude Include="resource.h" />
<ClInclude Include="scanandorder.h" />
<ClInclude Include="security.h" />
- <ClInclude Include="update.h" />
<ClInclude Include="..\util\allocator.h" />
<ClInclude Include="..\util\array.h" />
<ClInclude Include="..\util\assert_util.h" />
@@ -262,13 +256,8 @@
<ClInclude Include="..\util\concurrency\list.h" />
<ClInclude Include="..\util\concurrency\value.h" />
<ClInclude Include="..\util\web\html.h" />
- <ClInclude Include="..\util\httpclient.h" />
- <ClInclude Include="..\util\miniwebserver.h" />
<ClInclude Include="..\util\md5.h" />
<ClInclude Include="..\util\md5.hpp" />
- <ClInclude Include="..\util\message.h" />
- <ClInclude Include="..\util\message_server.h" />
- <ClInclude Include="..\util\sock.h" />
<ClInclude Include="..\scripting\engine.h" />
<ClInclude Include="..\scripting\engine_spidermonkey.h" />
<ClInclude Include="..\scripting\engine_v8.h" />
@@ -292,6 +281,28 @@
<ClInclude Include="..\bson\ordering.h" />
<ClInclude Include="dur_journalimpl.h" />
<ClInclude Include="..\util\concurrency\race.h" />
+ <ClInclude Include="..\util\alignedbuilder.h" />
+ <ClInclude Include="queryutil.h" />
+ <ClInclude Include="..\bson\bson.h" />
+ <ClInclude Include="..\bson\bson_db.h" />
+ <ClInclude Include="..\bson\bson-inl.h" />
+ <ClInclude Include="..\bson\inline_decls.h" />
+ <ClInclude Include="..\bson\stringdata.h" />
+ <ClInclude Include="..\bson\util\atomic_int.h" />
+ <ClInclude Include="..\bson\util\builder.h" />
+ <ClInclude Include="..\bson\util\misc.h" />
+ <ClInclude Include="ops\delete.h" />
+ <ClInclude Include="ops\update.h" />
+ <ClInclude Include="..\util\net\httpclient.h" />
+ <ClInclude Include="..\util\net\message.h" />
+ <ClInclude Include="..\util\net\message_server.h" />
+ <ClInclude Include="..\util\net\sock.h" />
+ <ClInclude Include="..\third_party\snappy\config.h">
+ <Filter>snappy</Filter>
+ </ClInclude>
+ <ClInclude Include="..\third_party\snappy\snappy.h">
+ <Filter>snappy</Filter>
+ </ClInclude>
</ItemGroup>
<ItemGroup>
<ResourceCompile Include="db.rc" />
@@ -326,4 +337,9 @@
<Library Include="..\..\js\js64d.lib" />
<Library Include="..\..\js\js64r.lib" />
</ItemGroup>
+ <ItemGroup>
+ <Filter Include="snappy">
+ <UniqueIdentifier>{bb99c086-7926-4f50-838d-f5f0c18397c0}</UniqueIdentifier>
+ </Filter>
+ </ItemGroup>
</Project> \ No newline at end of file
diff --git a/db/db_10.sln b/db/db_10.sln
index f74ac3d..12d62a8 100755
--- a/db/db_10.sln
+++ b/db/db_10.sln
@@ -7,10 +7,6 @@ Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "examples", "examples", "{40
EndProjectSection
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "tools", "tools", "{2B262D59-9DC7-4BF1-A431-1BD4966899A5}"
- ProjectSection(SolutionItems) = preProject
- ..\tools\export.cpp = ..\tools\export.cpp
- ..\tools\sniffer.cpp = ..\tools\sniffer.cpp
- EndProjectSection
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "unix files", "unix files", "{2F760952-C71B-4865-998F-AABAE96D1373}"
ProjectSection(SolutionItems) = preProject
@@ -19,8 +15,6 @@ Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "unix files", "unix files",
..\util\processinfo_none.cpp = ..\util\processinfo_none.cpp
EndProjectSection
EndProject
-Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "shell", "shell", "{407B4B88-3451-433C-B74F-31B31FEB5791}"
-EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "other", "other", "{12B11474-2D74-48C3-BB3D-F03249BEA88F}"
EndProject
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "mongod", "db.vcxproj", "{215B2D68-0A70-4D10-8E75-B31010C62A91}"
@@ -33,12 +27,7 @@ Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "bsondemo", "..\bson\bsondem
EndProject
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "mongoutils test program", "..\util\mongoutils\mongoutils.vcxproj", "{7B84584E-92BC-4DB9-971B-A1A8F93E5053}"
EndProject
-Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "jstests", "jstests", "{F5ABFB2C-A34F-48C1-9B5F-01D456AF6C57}"
- ProjectSection(SolutionItems) = preProject
- ..\jstests\index_many.js = ..\jstests\index_many.js
- ..\jstests\indexapi.js = ..\jstests\indexapi.js
- ..\jstests\objid5.js = ..\jstests\objid5.js
- EndProjectSection
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "simple_client_demo", "..\client\examples\simple_client_demo.vcxproj", "{89C30BC3-2874-4F2C-B4DA-EB04E9782236}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
@@ -120,6 +109,18 @@ Global
{7B84584E-92BC-4DB9-971B-A1A8F93E5053}.Release|Win32.ActiveCfg = Release|Win32
{7B84584E-92BC-4DB9-971B-A1A8F93E5053}.Release|Win32.Build.0 = Release|Win32
{7B84584E-92BC-4DB9-971B-A1A8F93E5053}.Release|x64.ActiveCfg = Release|Win32
+ {89C30BC3-2874-4F2C-B4DA-EB04E9782236}.Debug|Any CPU.ActiveCfg = Debug|Win32
+ {89C30BC3-2874-4F2C-B4DA-EB04E9782236}.Debug|Mixed Platforms.ActiveCfg = Debug|Win32
+ {89C30BC3-2874-4F2C-B4DA-EB04E9782236}.Debug|Mixed Platforms.Build.0 = Debug|Win32
+ {89C30BC3-2874-4F2C-B4DA-EB04E9782236}.Debug|Win32.ActiveCfg = Debug|Win32
+ {89C30BC3-2874-4F2C-B4DA-EB04E9782236}.Debug|Win32.Build.0 = Debug|Win32
+ {89C30BC3-2874-4F2C-B4DA-EB04E9782236}.Debug|x64.ActiveCfg = Debug|Win32
+ {89C30BC3-2874-4F2C-B4DA-EB04E9782236}.Release|Any CPU.ActiveCfg = Release|Win32
+ {89C30BC3-2874-4F2C-B4DA-EB04E9782236}.Release|Mixed Platforms.ActiveCfg = Release|Win32
+ {89C30BC3-2874-4F2C-B4DA-EB04E9782236}.Release|Mixed Platforms.Build.0 = Release|Win32
+ {89C30BC3-2874-4F2C-B4DA-EB04E9782236}.Release|Win32.ActiveCfg = Release|Win32
+ {89C30BC3-2874-4F2C-B4DA-EB04E9782236}.Release|Win32.Build.0 = Release|Win32
+ {89C30BC3-2874-4F2C-B4DA-EB04E9782236}.Release|x64.ActiveCfg = Release|Win32
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
@@ -127,10 +128,9 @@ Global
GlobalSection(NestedProjects) = preSolution
{2B262D59-9DC7-4BF1-A431-1BD4966899A5} = {12B11474-2D74-48C3-BB3D-F03249BEA88F}
{2F760952-C71B-4865-998F-AABAE96D1373} = {12B11474-2D74-48C3-BB3D-F03249BEA88F}
- {407B4B88-3451-433C-B74F-31B31FEB5791} = {12B11474-2D74-48C3-BB3D-F03249BEA88F}
{4082881B-EB00-486F-906C-843B8EC06E18} = {12B11474-2D74-48C3-BB3D-F03249BEA88F}
{C9DB5EB7-81AA-4185-BAA1-DA035654402F} = {12B11474-2D74-48C3-BB3D-F03249BEA88F}
{7B84584E-92BC-4DB9-971B-A1A8F93E5053} = {12B11474-2D74-48C3-BB3D-F03249BEA88F}
- {F5ABFB2C-A34F-48C1-9B5F-01D456AF6C57} = {12B11474-2D74-48C3-BB3D-F03249BEA88F}
+ {89C30BC3-2874-4F2C-B4DA-EB04E9782236} = {12B11474-2D74-48C3-BB3D-F03249BEA88F}
EndGlobalSection
EndGlobal
diff --git a/db/dbcommands.cpp b/db/dbcommands.cpp
index 59dd78c..31f4b7f 100644
--- a/db/dbcommands.cpp
+++ b/db/dbcommands.cpp
@@ -15,8 +15,13 @@
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
+/* SHARDING:
+ I believe this file is for mongod only.
+ See s/commands_public.cpp for mongos.
+*/
+
#include "pch.h"
-#include "query.h"
+#include "ops/query.h"
#include "pdfile.h"
#include "jsobj.h"
#include "../bson/util/builder.h"
@@ -26,10 +31,11 @@
#include "../util/lruishmap.h"
#include "../util/md5.hpp"
#include "../util/processinfo.h"
+#include "../util/ramlog.h"
#include "json.h"
#include "repl.h"
#include "repl_block.h"
-#include "replpair.h"
+#include "replutil.h"
#include "commands.h"
#include "db.h"
#include "instance.h"
@@ -45,7 +51,21 @@
namespace mongo {
- extern int otherTraceLevel;
+ namespace dur {
+ void setAgeOutJournalFiles(bool rotate);
+ }
+ /** @return true if fields found */
+ bool setParmsMongodSpecific(const string& dbname, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl ) {
+ BSONElement e = cmdObj["ageOutJournalFiles"];
+ if( !e.eoo() ) {
+ bool r = e.trueValue();
+ log() << "ageOutJournalFiles " << r << endl;
+ dur::setAgeOutJournalFiles(r);
+ return true;
+ }
+ return false;
+ }
+
void flushDiagLog();
/* reset any errors so that getlasterror comes back clean.
@@ -68,7 +88,7 @@ namespace mongo {
help << "reset error state (used with getpreverror)";
}
CmdResetError() : Command("resetError", false, "reseterror") {}
- bool run(const string& db, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ bool run(const string& db, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
LastError *le = lastError.get();
assert( le );
le->reset();
@@ -99,7 +119,7 @@ namespace mongo {
<< " { w:n } - await replication to n servers (including self) before returning\n"
<< " { wtimeout:m} - timeout for w in m milliseconds";
}
- bool run(const string& dbname, BSONObj& _cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ bool run(const string& dbname, BSONObj& _cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
LastError *le = lastError.disableForCommand();
bool err = false;
@@ -112,7 +132,7 @@ namespace mongo {
Client& c = cc();
c.appendLastOp( result );
- result.appendNumber( "connectionId" , c.getConnectionId() );
+ result.appendNumber( "connectionId" , c.getConnectionId() ); // for sharding; also useful in general for debugging
BSONObj cmdObj = _cmdObj;
{
@@ -139,7 +159,7 @@ namespace mongo {
else if ( cmdObj["fsync"].trueValue() ) {
Timer t;
if( !getDur().awaitCommit() ) {
- // if get here, not running with --dur
+ // if we get here, we are not running with --journal
log() << "fsync from getlasterror" << endl;
result.append( "fsyncFiles" , MemoryMappedFile::flushAll( true ) );
}
@@ -156,12 +176,10 @@ namespace mongo {
}
BSONElement e = cmdObj["w"];
- if ( e.isNumber() ) {
+ if ( e.ok() ) {
int timeout = cmdObj["wtimeout"].numberInt();
Timer t;
- int w = e.numberInt();
-
long long passes = 0;
char buf[32];
while ( 1 ) {
@@ -171,7 +189,7 @@ namespace mongo {
if ( anyReplEnabled() ) {
result.append( "wnote" , "no write has been done on this connection" );
}
- else if ( w <= 1 ) {
+ else if ( e.isNumber() && e.numberInt() <= 1 ) {
// don't do anything
// w=1 and no repl, so this is fine
}
@@ -185,8 +203,9 @@ namespace mongo {
}
// check this first for w=0 or w=1
- if ( opReplicatedEnough( op, w ) )
+ if ( opReplicatedEnough( op, e ) ) {
break;
+ }
// if replication isn't enabled (e.g., config servers)
if ( ! anyReplEnabled() ) {
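Note on the hunk above: checking e.ok() instead of e.isNumber() and handing the raw element to opReplicatedEnough is what lets the w write concern carry non-numeric values. A hedged client-side sketch (the address and the w:"majority" value are assumptions, not taken from this diff):

    #include <iostream>
    #include "client/dbclient.h"
    int main() {
        mongo::DBClientConnection c;
        c.connect("localhost:27017");                  // assumed address
        mongo::BSONObj res;
        // wait up to 5s for the last write on this connection to replicate
        c.runCommand("test",
                     BSON("getlasterror" << 1 << "w" << "majority" << "wtimeout" << 5000),
                     res);
        std::cout << res["err"] << std::endl;          // null on success
        return 0;
    }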
@@ -230,7 +249,7 @@ namespace mongo {
return true;
}
CmdGetPrevError() : Command("getPrevError", false, "getpreverror") {}
- bool run(const string& dbname, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ bool run(const string& dbname, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
LastError *le = lastError.disableForCommand();
le->appendSelf( result );
if ( le->valid )
@@ -241,6 +260,65 @@ namespace mongo {
}
} cmdGetPrevError;
+ CmdShutdown cmdShutdown;
+
+ void CmdShutdown::help( stringstream& help ) const {
+ help << "shutdown the database. must be ran against admin db and "
+ << "either (1) ran from localhost or (2) authenticated. If "
+ << "this is a primary in a replica set and there is no member "
+ << "within 10 seconds of its optime, it will not shutdown "
+ << "without force : true. You can also specify timeoutSecs : "
+ << "N to wait N seconds for other members to catch up.";
+ }
+
+ bool CmdShutdown::run(const string& dbname, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ bool force = cmdObj.hasField("force") && cmdObj["force"].trueValue();
+
+ if (!force && theReplSet && theReplSet->isPrimary()) {
+ long long timeout, now, start;
+ timeout = now = start = curTimeMicros64()/1000000;
+ if (cmdObj.hasField("timeoutSecs")) {
+ timeout += cmdObj["timeoutSecs"].numberLong();
+ }
+
+ OpTime lastOp = theReplSet->lastOpTimeWritten;
+ OpTime closest = theReplSet->lastOtherOpTime();
+ long long int diff = lastOp.getSecs() - closest.getSecs();
+ while (now <= timeout && (diff < 0 || diff > 10)) {
+ sleepsecs(1);
+ now++;
+
+ lastOp = theReplSet->lastOpTimeWritten;
+ closest = theReplSet->lastOtherOpTime();
+ diff = lastOp.getSecs() - closest.getSecs();
+ }
+
+ if (diff < 0 || diff > 10) {
+ errmsg = "no secondaries within 10 seconds of my optime";
+ result.append("closest", closest.getSecs());
+ result.append("difference", diff);
+ return false;
+ }
+
+ // step down
+ theReplSet->stepDown(120);
+
+ log() << "waiting for secondaries to catch up" << endl;
+
+ lastOp = theReplSet->lastOpTimeWritten;
+ while (lastOp != closest && now - start < 60) {
+ closest = theReplSet->lastOtherOpTime();
+
+ now++;
+ sleepsecs(1);
+ }
+
+ // regardless of whether they caught up, we'll shut down
+ }
+
+ return shutdownHelper();
+ }
+
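A hedged sketch of exercising the new shutdown options from a client (fragment; reuses the connected DBClientConnection c from the getlasterror sketch above, and the call may throw as the server drops the connection):

    mongo::BSONObj res;
    // the primary waits up to 30s for a secondary to come within 10s of
    // its optime, steps down, then shuts down; force:true skips the wait
    c.runCommand("admin", BSON("shutdown" << 1 << "timeoutSecs" << 30), res);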
class CmdDropDatabase : public Command {
public:
virtual bool logTheOp() {
@@ -254,7 +332,7 @@ namespace mongo {
}
virtual LockType locktype() const { return WRITE; }
CmdDropDatabase() : Command("dropDatabase") {}
- bool run(const string& dbname, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ bool run(const string& dbname, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
BSONElement e = cmdObj.firstElement();
log() << "dropDatabase " << dbname << endl;
int p = (int) e.number();
@@ -274,17 +352,20 @@ namespace mongo {
virtual bool slaveOk() const {
return true;
}
+ virtual bool maintenanceMode() const { return true; }
virtual void help( stringstream& help ) const {
help << "repair database. also compacts. note: slow.";
}
virtual LockType locktype() const { return WRITE; }
CmdRepairDatabase() : Command("repairDatabase") {}
- bool run(const string& dbname , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ bool run(const string& dbname , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
BSONElement e = cmdObj.firstElement();
log() << "repairDatabase " << dbname << endl;
int p = (int) e.number();
- if ( p != 1 )
+ if ( p != 1 ) {
+ errmsg = "bad option";
return false;
+ }
e = cmdObj.getField( "preserveClonedFilesOnFailure" );
bool preserveClonedFilesOnFailure = e.isBoolean() && e.boolean();
e = cmdObj.getField( "backupOriginalFiles" );
@@ -311,7 +392,7 @@ namespace mongo {
}
virtual LockType locktype() const { return WRITE; }
CmdProfile() : Command("profile") {}
- bool run(const string& dbname, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ bool run(const string& dbname, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
BSONElement e = cmdObj.firstElement();
result.append("was", cc().database()->profile);
result.append("slowms", cmdLine.slowMS );
@@ -348,7 +429,7 @@ namespace mongo {
help << "returns lots of administrative server statistics";
}
- bool run(const string& dbname, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ bool run(const string& dbname, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
long long start = Listener::getElapsedTimeMillis();
BSONObjBuilder timeBuilder(128);
@@ -407,9 +488,11 @@ namespace mongo {
t.append("bits", ( sizeof(int*) == 4 ? 32 : 64 ) );
ProcessInfo p;
+ int v = 0;
if ( p.supported() ) {
t.appendNumber( "resident" , p.getResidentSize() );
- t.appendNumber( "virtual" , p.getVirtualMemorySize() );
+ v = p.getVirtualMemorySize();
+ t.appendNumber( "virtual" , v );
t.appendBool( "supported" , true );
}
else {
@@ -419,7 +502,18 @@ namespace mongo {
timeBuilder.appendNumber( "middle of mem" , Listener::getElapsedTimeMillis() - start );
- t.appendNumber( "mapped" , MemoryMappedFile::totalMappedLength() / ( 1024 * 1024 ) );
+ int m = (int) (MemoryMappedFile::totalMappedLength() / ( 1024 * 1024 ));
+ t.appendNumber( "mapped" , m );
+
+ if ( cmdLine.dur ) {
+ m *= 2;
+ t.appendNumber( "mappedWithJournal" , m );
+ }
+
+ if( v - m > 5000 ) {
+ t.append("note", "virtual minus mapped is large. could indicate a memory leak");
+ log() << "warning: virtual size (" << v << "MB) - mapped size (" << m << "MB) is large. could indicate a memory leak" << endl;
+ }
t.done();
@@ -504,9 +598,27 @@ namespace mongo {
result.append("dur", dur::stats.asObj());
}
+ timeBuilder.appendNumber( "after dur" , Listener::getElapsedTimeMillis() - start );
+
+ {
+ RamLog* rl = RamLog::get( "warnings" );
+ verify(15880, rl);
+
+ if (rl->lastWrite() >= time(0)-(10*60)){ // only show warnings from last 10 minutes
+ vector<const char*> lines;
+ rl->get( lines );
+
+ BSONArrayBuilder arr( result.subarrayStart( "warnings" ) );
+ for ( unsigned i=std::max(0,(int)lines.size()-10); i<lines.size(); i++ )
+ arr.append( lines[i] );
+ arr.done();
+ }
+ }
+
if ( ! authed )
result.append( "note" , "run against admin for more info" );
-
+
+ timeBuilder.appendNumber( "at end" , Listener::getElapsedTimeMillis() - start );
if ( Listener::getElapsedTimeMillis() - start > 1000 ) {
BSONObj t = timeBuilder.obj();
log() << "serverStatus was very slow: " << t << endl;
@@ -526,7 +638,7 @@ namespace mongo {
virtual void help( stringstream& help ) const { help << "internal"; }
virtual LockType locktype() const { return NONE; }
CmdGetOpTime() : Command("getoptime") { }
- bool run(const string& dbname, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ bool run(const string& dbname, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
writelock l( "" );
result.appendDate("optime", OpTime::now().asDate());
return true;
@@ -555,7 +667,7 @@ namespace mongo {
}
void help(stringstream& h) const { h << "http://www.mongodb.org/display/DOCS/Monitoring+and+Diagnostics#MonitoringandDiagnostics-DatabaseRecord%2FReplay"; }
virtual LockType locktype() const { return WRITE; }
- bool run(const string& dbname , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool) {
+ bool run(const string& dbname , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool) {
int was = _diaglog.setLevel( cmdObj.firstElement().numberInt() );
flushDiagLog();
if ( !cmdLine.quiet )
@@ -678,7 +790,7 @@ namespace mongo {
}
virtual void help( stringstream& help ) const { help << "drop a collection\n{drop : <collectionName>}"; }
virtual LockType locktype() const { return WRITE; }
- virtual bool run(const string& dbname , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool) {
+ virtual bool run(const string& dbname , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool) {
string nsToDrop = dbname + '.' + cmdObj.firstElement().valuestr();
NamespaceDetails *d = nsdetails(nsToDrop.c_str());
if ( !cmdLine.quiet )
@@ -702,7 +814,7 @@ namespace mongo {
return false;
}
virtual bool slaveOk() const {
- // ok on --slave setups, not ok for nonmaster of a repl pair (unless override)
+ // ok on --slave setups
return replSettings.slave == SimpleSlave;
}
virtual bool slaveOverrideOk() {
@@ -712,7 +824,7 @@ namespace mongo {
return false;
}
virtual void help( stringstream& help ) const { help << "count objects in collection"; }
- virtual bool run(const string& dbname, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool) {
+ virtual bool run(const string& dbname, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool) {
string ns = dbname + '.' + cmdObj.firstElement().valuestr();
string err;
long long n = runCount(ns.c_str(), cmdObj, err);
@@ -748,11 +860,14 @@ namespace mongo {
}
virtual LockType locktype() const { return WRITE; }
virtual void help( stringstream& help ) const {
- help << "create a collection";
+ help << "create a collection explicitly\n"
+ "{ create: <ns>[, capped: <bool>, size: <collSizeInBytes>, max: <nDocs>] }";
}
- virtual bool run(const string& dbname , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl ) {
+ virtual bool run(const string& dbname , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl ) {
+ uassert(15888, "must pass name of collection to create", cmdObj.firstElement().valuestrsafe()[0] != '\0');
string ns = dbname + '.' + cmdObj.firstElement().valuestr();
string err;
+ uassert(14832, "specify size:<n> when capped is true", !cmdObj["capped"].trueValue() || cmdObj["size"].isNumber() || cmdObj.hasField("$nExtents"));
bool ok = userCreateNS(ns.c_str(), cmdObj, err, ! fromRepl );
if ( !ok && !err.empty() )
errmsg = err;
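The two uasserts above (15888, 14832) pin down the explicit-create contract spelled out in the new help text. A hedged example that satisfies both (fragment; collection name assumed):

    mongo::BSONObj res;
    // capped:true now requires size (or $nExtents); a 1 MB cap shown
    c.runCommand("test",
                 BSON("create" << "events" << "capped" << true << "size" << 1048576),
                 res);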
@@ -774,7 +889,7 @@ namespace mongo {
help << "drop indexes for a collection";
}
CmdDropIndexes() : Command("dropIndexes", false, "deleteIndexes") { }
- bool run(const string& dbname, BSONObj& jsobj, string& errmsg, BSONObjBuilder& anObjBuilder, bool /*fromRepl*/) {
+ bool run(const string& dbname, BSONObj& jsobj, int, string& errmsg, BSONObjBuilder& anObjBuilder, bool /*fromRepl*/) {
BSONElement e = jsobj.firstElement();
string toDeleteNs = dbname + '.' + e.valuestr();
NamespaceDetails *d = nsdetails(toDeleteNs.c_str());
@@ -819,7 +934,7 @@ namespace mongo {
help << "re-index a collection";
}
CmdReIndex() : Command("reIndex") { }
- bool run(const string& dbname , BSONObj& jsobj, string& errmsg, BSONObjBuilder& result, bool /*fromRepl*/) {
+ bool run(const string& dbname , BSONObj& jsobj, int, string& errmsg, BSONObjBuilder& result, bool /*fromRepl*/) {
static DBDirectClient db;
BSONElement e = jsobj.firstElement();
@@ -837,7 +952,7 @@ namespace mongo {
auto_ptr<DBClientCursor> i = db.getIndexes( toDeleteNs );
BSONObjBuilder b;
while ( i->more() ) {
- BSONObj o = i->next().getOwned();
+ BSONObj o = i->next().removeField("v").getOwned();
b.append( BSONObjBuilder::numStr( all.size() ) , o );
all.push_back( o );
}
@@ -851,21 +966,9 @@ namespace mongo {
for ( list<BSONObj>::iterator i=all.begin(); i!=all.end(); i++ ) {
BSONObj o = *i;
- if ( o.getIntField("v") > 0 ) {
- BSONObjBuilder b;
- BSONObjIterator i( o );
- while ( i.more() ) {
- BSONElement e = i.next();
- if ( str::equals( e.fieldName() , "v" ) )
- continue;
- b.append( e );
- }
- o = b.obj();
- }
theDataFileMgr.insertWithObjMod( Namespace( toDeleteNs.c_str() ).getSisterNS( "system.indexes" ).c_str() , o , true );
}
- result.append( "ok" , 1 );
result.append( "nIndexes" , (int)all.size() );
result.appendArray( "indexes" , b.obj() );
return true;
@@ -883,10 +986,10 @@ namespace mongo {
virtual bool adminOnly() const {
return true;
}
- virtual LockType locktype() const { return READ; }
+ virtual LockType locktype() const { return NONE; }
virtual void help( stringstream& help ) const { help << "list databases on this server"; }
CmdListDatabases() : Command("listDatabases" , true ) {}
- bool run(const string& dbname , BSONObj& jsobj, string& errmsg, BSONObjBuilder& result, bool /*fromRepl*/) {
+ bool run(const string& dbname , BSONObj& jsobj, int, string& errmsg, BSONObjBuilder& result, bool /*fromRepl*/) {
vector< string > dbNames;
getDatabaseNames( dbNames );
vector< BSONObj > dbInfos;
@@ -895,12 +998,18 @@ namespace mongo {
boost::intmax_t totalSize = 0;
for ( vector< string >::iterator i = dbNames.begin(); i != dbNames.end(); ++i ) {
BSONObjBuilder b;
- b.append( "name", i->c_str() );
+ b.append( "name", *i );
+
boost::intmax_t size = dbSize( i->c_str() );
b.append( "sizeOnDisk", (double) size );
- Client::Context ctx( *i );
- b.appendBool( "empty", ctx.db()->isEmpty() );
totalSize += size;
+
+ {
+ readlock lk( *i );
+ Client::Context ctx( *i );
+ b.appendBool( "empty", ctx.db()->isEmpty() );
+ }
+
dbInfos.push_back( b.obj() );
seen.insert( i->c_str() );
@@ -908,7 +1017,11 @@ namespace mongo {
// TODO: erh 1/1/2010 I think this is broken where path != dbpath ??
set<string> allShortNames;
- dbHolder.getAllShortNames( allShortNames );
+ {
+ readlock lk;
+ dbHolder.getAllShortNames( allShortNames );
+ }
+
for ( set<string>::iterator i = allShortNames.begin(); i != allShortNames.end(); i++ ) {
string name = *i;
@@ -916,9 +1029,14 @@ namespace mongo {
continue;
BSONObjBuilder b;
- b << "name" << name << "sizeOnDisk" << double( 1 );
- Client::Context ctx( name );
- b.appendBool( "empty", ctx.db()->isEmpty() );
+ b.append( "name" , name );
+ b.append( "sizeOnDisk" , (double)1.0 );
+
+ {
+ readlock lk( name );
+ Client::Context ctx( name );
+ b.appendBool( "empty", ctx.db()->isEmpty() );
+ }
dbInfos.push_back( b.obj() );
}
@@ -940,7 +1058,7 @@ namespace mongo {
virtual LockType locktype() const { return WRITE; }
CmdCloseAllDatabases() : Command( "closeAllDatabases" ) {}
- bool run(const string& dbname , BSONObj& jsobj, string& errmsg, BSONObjBuilder& result, bool /*fromRepl*/) {
+ bool run(const string& dbname , BSONObj& jsobj, int, string& errmsg, BSONObjBuilder& result, bool /*fromRepl*/) {
bool ok;
try {
ok = dbHolder.closeAll( dbpath , result, false );
@@ -967,7 +1085,7 @@ namespace mongo {
help << " example: { filemd5 : ObjectId(aaaaaaa) , root : \"fs\" }";
}
virtual LockType locktype() const { return READ; }
- bool run(const string& dbname, BSONObj& jsobj, string& errmsg, BSONObjBuilder& result, bool fromRepl ) {
+ bool run(const string& dbname, BSONObj& jsobj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl ) {
string ns = dbname;
ns += ".";
{
@@ -986,7 +1104,7 @@ namespace mongo {
BSONObj sort = BSON( "files_id" << 1 << "n" << 1 );
shared_ptr<Cursor> cursor = bestGuessCursor(ns.c_str(), query, sort);
- scoped_ptr<ClientCursor> cc (new ClientCursor(QueryOption_NoCursorTimeout, cursor, ns.c_str()));
+ auto_ptr<ClientCursor> cc (new ClientCursor(QueryOption_NoCursorTimeout, cursor, ns.c_str()));
int n = 0;
while ( cursor->ok() ) {
@@ -1000,37 +1118,31 @@ namespace mongo {
BSONObj obj = cursor->current();
cursor->advance();
- ClientCursor::YieldLock yield (cc);
- try {
-
- BSONElement ne = obj["n"];
- assert(ne.isNumber());
- int myn = ne.numberInt();
- if ( n != myn ) {
- log() << "should have chunk: " << n << " have:" << myn << endl;
-
- DBDirectClient client;
- Query q(query);
- q.sort(sort);
- auto_ptr<DBClientCursor> c = client.query(ns, q);
- while(c->more())
- PRINT(c->nextSafe());
+ BSONElement ne = obj["n"];
+ assert(ne.isNumber());
+ int myn = ne.numberInt();
+ if ( n != myn ) {
+ log() << "should have chunk: " << n << " have:" << myn << endl;
+ dumpChunks( ns , query , sort );
+ uassert( 10040 , "chunks out of order" , n == myn );
+ }
- uassert( 10040 , "chunks out of order" , n == myn );
- }
+ int len;
+ const char * data = obj["data"].binDataClean( len );
- int len;
- const char * data = obj["data"].binDataClean( len );
+ ClientCursor::YieldLock yield (cc.get());
+ try {
md5_append( &st , (const md5_byte_t*)(data) , len );
-
n++;
}
catch (...) {
- yield.relock(); // needed before yield goes out of scope
+ if ( ! yield.stillOk() ) // relocks
+ cc.release();
throw;
}
if ( ! yield.stillOk() ) {
+ cc.release();
uasserted(13281, "File deleted during filemd5 command");
}
}
@@ -1041,6 +1153,15 @@ namespace mongo {
result.append( "md5" , digestToString( d ) );
return true;
}
+
+ void dumpChunks( const string& ns , const BSONObj& query , const BSONObj& sort ) {
+ DBDirectClient client;
+ Query q(query);
+ q.sort(sort);
+ auto_ptr<DBClientCursor> c = client.query(ns, q);
+ while(c->more())
+ PRINT(c->nextSafe());
+ }
} cmdFileMD5;
static IndexDetails *cmdIndexDetailsForRange( const char *ns, string &errmsg, BSONObj &min, BSONObj &max, BSONObj &keyPattern ) {
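filemd5 above now yields only around the md5_append call and dumps chunk state through the extracted dumpChunks helper before asserting on out-of-order chunks. A hedged caller-side fragment (the fs.files document and connection c are assumptions):

    mongo::BSONObj fileDoc = c.findOne("test.fs.files", mongo::Query());
    mongo::BSONObjBuilder cmd;
    cmd.appendAs( fileDoc["_id"], "filemd5" );   // files_id of the GridFS file
    cmd.append( "root", "fs" );
    mongo::BSONObj res;
    c.runCommand("test", cmd.obj(), res);
    std::cout << res["md5"].str() << std::endl;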
@@ -1063,7 +1184,7 @@ namespace mongo {
"\nkeyPattern, min, and max parameters are optional."
"\nnote: This command may take a while to run";
}
- bool run(const string& dbname, BSONObj& jsobj, string& errmsg, BSONObjBuilder& result, bool fromRepl ) {
+ bool run(const string& dbname, BSONObj& jsobj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl ) {
Timer timer;
string ns = jsobj.firstElement().String();
@@ -1103,7 +1224,7 @@ namespace mongo {
if ( idx == 0 )
return false;
- c.reset( new BtreeCursor( d, d->idxNo(*idx), *idx, min, max, false, 1 ) );
+ c.reset( BtreeCursor::make( d, d->idxNo(*idx), *idx, min, max, false, 1 ) );
}
long long avgObjSize = d->stats.datasize / d->stats.nrecords;
@@ -1178,9 +1299,10 @@ namespace mongo {
virtual bool slaveOk() const { return true; }
virtual LockType locktype() const { return READ; }
virtual void help( stringstream &help ) const {
- help << "{ collStats:\"blog.posts\" , scale : 1 } scale divides sizes e.g. for KB use 1024";
+ help << "{ collStats:\"blog.posts\" , scale : 1 } scale divides sizes e.g. for KB use 1024\n"
+ " avgObjSize - in bytes";
}
- bool run(const string& dbname, BSONObj& jsobj, string& errmsg, BSONObjBuilder& result, bool fromRepl ) {
+ bool run(const string& dbname, BSONObj& jsobj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl ) {
string ns = dbname + "." + jsobj.firstElement().valuestr();
Client::Context cx( ns );
@@ -1199,7 +1321,6 @@ namespace mongo {
errmsg = "scale has to be > 0";
return false;
}
-
}
else if ( jsobj["scale"].trueValue() ) {
errmsg = "scale has to be a number > 0";
@@ -1246,9 +1367,24 @@ namespace mongo {
virtual bool slaveOk() const { return true; }
virtual LockType locktype() const { return READ; }
virtual void help( stringstream &help ) const {
- help << " example: { dbStats:1 } ";
+ help <<
+ "Get stats on a database. Not instantaneous. Slower for databases with large .ns files.\n" <<
+ "Example: { dbStats:1, scale:1 }";
}
- bool run(const string& dbname, BSONObj& jsobj, string& errmsg, BSONObjBuilder& result, bool fromRepl ) {
+ bool run(const string& dbname, BSONObj& jsobj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl ) {
+ int scale = 1;
+ if ( jsobj["scale"].isNumber() ) {
+ scale = jsobj["scale"].numberInt();
+ if ( scale <= 0 ) {
+ errmsg = "scale has to be > 0";
+ return false;
+ }
+ }
+ else if ( jsobj["scale"].trueValue() ) {
+ errmsg = "scale has to be a number > 0";
+ return false;
+ }
+
list<string> collections;
Database* d = cc().database();
if ( d )
@@ -1288,12 +1424,14 @@ namespace mongo {
result.appendNumber( "collections" , ncollections );
result.appendNumber( "objects" , objects );
result.append ( "avgObjSize" , objects == 0 ? 0 : double(size) / double(objects) );
- result.appendNumber( "dataSize" , size );
- result.appendNumber( "storageSize" , storageSize);
+ result.appendNumber( "dataSize" , size / scale );
+ result.appendNumber( "storageSize" , storageSize / scale);
result.appendNumber( "numExtents" , numExtents );
result.appendNumber( "indexes" , indexes );
- result.appendNumber( "indexSize" , indexSize );
- result.appendNumber( "fileSize" , d->fileSize() );
+ result.appendNumber( "indexSize" , indexSize / scale );
+ result.appendNumber( "fileSize" , d->fileSize() / scale );
+ if( d )
+ result.appendNumber( "nsSizeMB", (int) d->namespaceIndex.fileLength() / 1024 / 1024 );
return true;
}
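dbStats now takes the same scale option as collStats, with the size fields divided accordingly, and reports the .ns file size as nsSizeMB. A hedged fragment (connection c assumed):

    mongo::BSONObj res;
    c.runCommand("test", BSON("dbStats" << 1 << "scale" << 1024), res);
    long long dataKiB = res["dataSize"].numberLong();   // sizes now in KiB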
@@ -1308,7 +1446,7 @@ namespace mongo {
virtual void help( stringstream &help ) const {
help << "{ cloneCollectionAsCapped:<fromName>, toCollection:<toName>, size:<sizeInBytes> }";
}
- bool run(const string& dbname, BSONObj& jsobj, string& errmsg, BSONObjBuilder& result, bool fromRepl ) {
+ bool run(const string& dbname, BSONObj& jsobj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl ) {
string from = jsobj.getStringField( "cloneCollectionAsCapped" );
string to = jsobj.getStringField( "toCollection" );
long long size = (long long)jsobj.getField( "size" ).number();
@@ -1350,6 +1488,7 @@ namespace mongo {
while( c->more() ) {
BSONObj obj = c->next();
theDataFileMgr.insertAndLog( toNs.c_str(), obj, true );
+ getDur().commitIfNeeded();
}
return true;
@@ -1369,7 +1508,7 @@ namespace mongo {
virtual void help( stringstream &help ) const {
help << "{ convertToCapped:<fromCollectionName>, size:<sizeInBytes> }";
}
- bool run(const string& dbname, BSONObj& jsobj, string& errmsg, BSONObjBuilder& result, bool fromRepl ) {
+ bool run(const string& dbname, BSONObj& jsobj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl ) {
BackgroundOperation::assertNoBgOpInProgForDb(dbname.c_str());
string from = jsobj.getStringField( "convertToCapped" );
@@ -1411,116 +1550,6 @@ namespace mongo {
}
} cmdConvertToCapped;
- /* Find and Modify an object returning either the old (default) or new value*/
- class CmdFindAndModify : public Command {
- public:
- virtual void help( stringstream &help ) const {
- help <<
- "{ findAndModify: \"collection\", query: {processed:false}, update: {$set: {processed:true}}, new: true}\n"
- "{ findAndModify: \"collection\", query: {processed:false}, remove: true, sort: {priority:-1}}\n"
- "Either update or remove is required, all other fields have default values.\n"
- "Output is in the \"value\" field\n";
- }
-
- CmdFindAndModify() : Command("findAndModify", false, "findandmodify") { }
- virtual bool logTheOp() {
- return false; // the modification will be logged directly
- }
- virtual bool slaveOk() const {
- return false;
- }
- virtual LockType locktype() const { return WRITE; }
- virtual bool run(const string& dbname, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool) {
- static DBDirectClient db;
-
- string ns = dbname + '.' + cmdObj.firstElement().valuestr();
-
- BSONObj origQuery = cmdObj.getObjectField("query"); // defaults to {}
- Query q (origQuery);
- BSONElement sort = cmdObj["sort"];
- if (!sort.eoo())
- q.sort(sort.embeddedObjectUserCheck());
-
- bool upsert = cmdObj["upsert"].trueValue();
-
- BSONObj fieldsHolder (cmdObj.getObjectField("fields"));
- const BSONObj* fields = (fieldsHolder.isEmpty() ? NULL : &fieldsHolder);
-
- BSONObj out = db.findOne(ns, q, fields);
- if (out.isEmpty()) {
- if (!upsert) {
- errmsg = "No matching object found";
- return false;
- }
-
- BSONElement update = cmdObj["update"];
- uassert(13329, "upsert mode requires update field", !update.eoo());
- uassert(13330, "upsert mode requires query field", !origQuery.isEmpty());
- db.update(ns, origQuery, update.embeddedObjectUserCheck(), true);
-
- BSONObj gle = db.getLastErrorDetailed();
- if (gle["err"].type() == String) {
- errmsg = gle["err"].String();
- return false;
- }
-
- if (cmdObj["new"].trueValue()) {
- BSONElement _id = gle["upserted"];
- if (_id.eoo())
- _id = origQuery["_id"];
-
- out = db.findOne(ns, QUERY("_id" << _id), fields);
- }
-
- }
- else {
-
- if (cmdObj["remove"].trueValue()) {
- uassert(12515, "can't remove and update", cmdObj["update"].eoo());
- db.remove(ns, QUERY("_id" << out["_id"]), 1);
-
- }
- else { // update
-
- BSONElement queryId = origQuery["_id"];
- if (queryId.eoo() || getGtLtOp(queryId) != BSONObj::Equality) {
- // need to include original query for $ positional operator
-
- BSONObjBuilder b;
- b.append(out["_id"]);
- BSONObjIterator it(origQuery);
- while (it.more()) {
- BSONElement e = it.next();
- if (strcmp(e.fieldName(), "_id"))
- b.append(e);
- }
- q = Query(b.obj());
- }
-
- if (q.isComplex()) // update doesn't work with complex queries
- q = Query(q.getFilter().getOwned());
-
- BSONElement update = cmdObj["update"];
- uassert(12516, "must specify remove or update", !update.eoo());
- db.update(ns, q, update.embeddedObjectUserCheck());
-
- BSONObj gle = db.getLastErrorDetailed();
- if (gle["err"].type() == String) {
- errmsg = gle["err"].String();
- return false;
- }
-
- if (cmdObj["new"].trueValue())
- out = db.findOne(ns, QUERY("_id" << out["_id"]), fields);
- }
- }
-
- result.append("value", out);
-
- return true;
- }
- } cmdFindAndModify;
-
/* Returns client's uri */
class CmdWhatsMyUri : public Command {
public:
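The CmdFindAndModify deletion above is a relocation, not a removal: the filters hunk earlier in this diff adds commands\find_and_modify.cpp. The command shape from the deleted help text still applies; a hedged fragment (connection c and names assumed):

    mongo::BSONObj res;
    c.runCommand("test",
                 BSON("findAndModify" << "queue"
                      << "query"  << BSON("processed" << false)
                      << "update" << BSON("$set" << BSON("processed" << true))
                      << "new"    << true),
                 res);                    // the modified doc is in res["value"]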
@@ -1535,7 +1564,7 @@ namespace mongo {
virtual void help( stringstream &help ) const {
help << "{whatsmyuri:1}";
}
- virtual bool run(const string& dbname, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool) {
+ virtual bool run(const string& dbname, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool) {
BSONObj info = cc().curop()->infoNoauth();
result << "you" << info[ "client" ];
return true;
@@ -1550,7 +1579,7 @@ namespace mongo {
return true;
}
virtual bool slaveOk() const {
- return false;
+ return true;
}
virtual LockType locktype() const { return WRITE; }
virtual bool requiresAuth() {
@@ -1559,7 +1588,7 @@ namespace mongo {
virtual void help( stringstream &help ) const {
help << "internal. for testing only.";
}
- virtual bool run(const string& dbname, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool) {
+ virtual bool run(const string& dbname, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool) {
string coll = cmdObj[ "godinsert" ].valuestrsafe();
uassert( 13049, "godinsert must specify a collection", !coll.empty() );
string ns = dbname + "." + coll;
@@ -1574,7 +1603,7 @@ namespace mongo {
DBHashCmd() : Command( "dbHash", false, "dbhash" ) {}
virtual bool slaveOk() const { return true; }
virtual LockType locktype() const { return READ; }
- virtual bool run(const string& dbname , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool) {
+ virtual bool run(const string& dbname , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool) {
list<string> colls;
Database* db = cc().database();
if ( db )
@@ -1611,7 +1640,7 @@ namespace mongo {
int idNum = nsd->findIdIndex();
if ( idNum >= 0 ) {
- cursor.reset( new BtreeCursor( nsd , idNum , nsd->idx( idNum ) , BSONObj() , BSONObj() , false , 1 ) );
+ cursor.reset( BtreeCursor::make( nsd , idNum , nsd->idx( idNum ) , BSONObj() , BSONObj() , false , 1 ) );
}
else if ( c.find( ".system." ) != string::npos ) {
continue;
@@ -1620,9 +1649,8 @@ namespace mongo {
cursor = findTableScan( c.c_str() , BSONObj() );
}
else {
- bb.done();
- errmsg = (string)"can't find _id index for: " + c;
- return 0;
+ log() << "can't find _id index for: " << c << endl;
+ continue;
}
md5_state_t st;
@@ -1665,16 +1693,13 @@ namespace mongo {
virtual bool slaveOk() const { return true; }
virtual void help( stringstream& help ) const {
help << "internal testing command. Makes db block (in a read lock) for 100 seconds\n";
- help << "w:true write lock";
+ help << "w:true write lock. secs:<seconds>";
}
CmdSleep() : Command("sleep") { }
- bool run(const string& ns, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
-
-
+ bool run(const string& ns, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
int secs = 100;
if ( cmdObj["secs"].isNumber() )
secs = cmdObj["secs"].numberInt();
-
if( cmdObj.getBoolField("w") ) {
writelock lk("");
sleepsecs(secs);
@@ -1683,7 +1708,6 @@ namespace mongo {
readlock lk("");
sleepsecs(secs);
}
-
return true;
}
} cmdSleep;
@@ -1695,7 +1719,7 @@ namespace mongo {
virtual bool slaveOk() const { return false; }
virtual LockType locktype() const { return WRITE; }
virtual bool requiresAuth() { return true; }
- virtual bool run(const string& dbname , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool) {
+ virtual bool run(const string& dbname , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool) {
string coll = cmdObj[ "captrunc" ].valuestrsafe();
uassert( 13416, "captrunc must specify a collection", !coll.empty() );
string ns = dbname + "." + coll;
@@ -1722,7 +1746,7 @@ namespace mongo {
virtual bool slaveOk() const { return false; }
virtual LockType locktype() const { return WRITE; }
virtual bool requiresAuth() { return true; }
- virtual bool run(const string& dbname , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool) {
+ virtual bool run(const string& dbname , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool) {
string coll = cmdObj[ "emptycapped" ].valuestrsafe();
uassert( 13428, "emptycapped must specify a collection", !coll.empty() );
string ns = dbname + "." + coll;
@@ -1787,13 +1811,31 @@ namespace mongo {
if ( c->adminOnly() )
log( 2 ) << "command: " << cmdObj << endl;
+ if (c->maintenanceMode() && theReplSet && theReplSet->isSecondary()) {
+ theReplSet->setMaintenanceMode(true);
+ }
+
if ( c->locktype() == Command::NONE ) {
// we also trust that this won't crash
+
+ if ( c->requiresAuth() ) {
+ // test that the user has at least read permissions
+ if ( ! client.getAuthenticationInfo()->isAuthorizedReads( dbname ) ) {
+ result.append( "errmsg" , "need to login" );
+ return false;
+ }
+ }
+
client.curop()->ensureStarted();
string errmsg;
- int ok = c->run( dbname , cmdObj , errmsg , result , fromRepl );
+ int ok = c->run( dbname , cmdObj , queryOptions, errmsg , result , fromRepl );
if ( ! ok )
result.append( "errmsg" , errmsg );
+
+ if (c->maintenanceMode() && theReplSet) {
+ theReplSet->setMaintenanceMode(false);
+ }
+
return ok;
}
@@ -1807,11 +1849,13 @@ namespace mongo {
client.curop()->ensureStarted();
Client::Context ctx( dbname , dbpath , &lk , c->requiresAuth() );
+ bool retval = true;
+
try {
string errmsg;
- if ( ! c->run(dbname, cmdObj, errmsg, result, fromRepl ) ) {
+ if ( ! c->run(dbname, cmdObj, queryOptions, errmsg, result, fromRepl ) ) {
result.append( "errmsg" , errmsg );
- return false;
+ retval = false;
}
}
catch ( DBException& e ) {
@@ -1819,14 +1863,18 @@ namespace mongo {
ss << "exception: " << e.what();
result.append( "errmsg" , ss.str() );
result.append( "code" , e.getCode() );
- return false;
+ retval = false;
}
- if ( c->logTheOp() && ! fromRepl ) {
+ if ( retval && c->logTheOp() && ! fromRepl ) {
logOp("c", cmdns, cmdObj);
}
- return true;
+ if (c->maintenanceMode() && theReplSet) {
+ theReplSet->setMaintenanceMode(false);
+ }
+
+ return retval;
}
@@ -1850,7 +1898,10 @@ namespace mongo {
BSONObj jsobj;
{
BSONElement e = _cmdobj.firstElement();
- if ( e.type() == Object && string("query") == e.fieldName() ) {
+ if ( e.type() == Object && (e.fieldName()[0] == '$'
+ ? str::equals("query", e.fieldName()+1)
+ : str::equals("query", e.fieldName())))
+ {
jsobj = e.embeddedObject();
}
else {
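The widened fieldName test above accepts a command wrapped as either { query: {...} } or { $query: {...} }. A hedged illustration:

    // both unwrap to the same inner command object under the new check
    mongo::BSONObj a = BSON( "query" << BSON( "isMaster" << 1 ) );
    mongo::BSONObj b = BSON( "$query" << BSON( "isMaster" << 1 ) );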
diff --git a/db/dbcommands_admin.cpp b/db/dbcommands_admin.cpp
index 82a9c91..566027f 100644
--- a/db/dbcommands_admin.cpp
+++ b/db/dbcommands_admin.cpp
@@ -33,6 +33,7 @@
#include "../util/background.h"
#include "../util/logfile.h"
#include "../util/alignedbuilder.h"
+#include "../util/paths.h"
#include "../scripting/engine.h"
namespace mongo {
@@ -46,7 +47,7 @@ namespace mongo {
virtual void help(stringstream& h) const { h << "internal"; }
- bool run(const string& dbname, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl ) {
+ bool run(const string& dbname, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl ) {
string dropns = dbname + "." + cmdObj.firstElement().valuestrsafe();
if ( !cmdLine.quiet )
@@ -81,7 +82,7 @@ namespace mongo {
virtual bool adminOnly() const { return true; }
virtual void help(stringstream& h) const { h << "test how long to write and fsync to a test file in the journal/ directory"; }
- bool run(const string& dbname, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl ) {
+ bool run(const string& dbname, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl ) {
filesystem::path p = dur::getJournalDir();
p /= "journalLatencyTest";
@@ -133,6 +134,11 @@ namespace mongo {
}
catch(...) { }
+ try {
+ result.append("onSamePartition", onSamePartition(dur::getJournalDir().string(), dbpath));
+ }
+ catch(...) { }
+
return 1;
}
} journalLatencyTestCmd;
@@ -145,12 +151,13 @@ namespace mongo {
return true;
}
- virtual void help(stringstream& h) const { h << "Validate contents of a namespace by scanning its data structures for correctness. Slow."; }
+ virtual void help(stringstream& h) const { h << "Validate contents of a namespace by scanning its data structures for correctness. Slow.\n"
+ "Add full:true option to do a more thorough check"; }
virtual LockType locktype() const { return READ; }
- //{ validate: "collectionnamewithoutthedbpart" [, scandata: <bool>] } */
+ //{ validate: "collectionnamewithoutthedbpart" [, scandata: <bool>] [, full: <bool>] } */
- bool run(const string& dbname , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl ) {
+ bool run(const string& dbname , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl ) {
string ns = dbname + "." + cmdObj.firstElement().valuestrsafe();
NamespaceDetails * d = nsdetails( ns.c_str() );
if ( !cmdLine.quiet )
@@ -162,24 +169,27 @@ namespace mongo {
}
result.append( "ns", ns );
- result.append( "result" , validateNS( ns.c_str() , d, &cmdObj ) );
+ validateNS( ns.c_str() , d, cmdObj, result);
return 1;
}
+ private:
+ void validateNS(const char *ns, NamespaceDetails *d, const BSONObj& cmdObj, BSONObjBuilder& result) {
+ const bool full = cmdObj["full"].trueValue();
+ const bool scanData = full || cmdObj["scandata"].trueValue();
- string validateNS(const char *ns, NamespaceDetails *d, BSONObj *cmdObj) {
- bool scanData = true;
- if( cmdObj && cmdObj->hasElement("scandata") && !cmdObj->getBoolField("scandata") )
- scanData = false;
bool valid = true;
- stringstream ss;
- ss << "\nvalidate\n";
- //ss << " details: " << hex << d << " ofs:" << nsindex(ns)->detailsOffset(d) << dec << endl;
- if ( d->capped )
- ss << " capped:" << d->capped << " max:" << d->max << '\n';
-
- ss << " firstExtent:" << d->firstExtent.toString() << " ns:" << d->firstExtent.ext()->nsDiagnostic.toString()<< '\n';
- ss << " lastExtent:" << d->lastExtent.toString() << " ns:" << d->lastExtent.ext()->nsDiagnostic.toString() << '\n';
+ BSONArrayBuilder errors; // explanation(s) for why valid = false
+ if ( d->capped ){
+ result.append("capped", d->capped);
+ result.append("max", d->max);
+ }
+
+ result.append("firstExtent", str::stream() << d->firstExtent.toString() << " ns:" << d->firstExtent.ext()->nsDiagnostic.toString());
+ result.append( "lastExtent", str::stream() << d->lastExtent.toString() << " ns:" << d->lastExtent.ext()->nsDiagnostic.toString());
+
+ BSONArrayBuilder extentData;
+
try {
d->firstExtent.ext()->assertOk();
d->lastExtent.ext()->assertOk();
@@ -191,32 +201,46 @@ namespace mongo {
e->assertOk();
el = e->xnext;
ne++;
+ if ( full )
+ extentData << e->dump();
+
killCurrentOp.checkForInterrupt();
}
- ss << " # extents:" << ne << '\n';
+ result.append("extentCount", ne);
}
catch (...) {
valid=false;
- ss << " extent asserted ";
+ errors << "extent asserted";
}
- ss << " datasize?:" << d->stats.datasize << " nrecords?:" << d->stats.nrecords << " lastExtentSize:" << d->lastExtentSize << '\n';
- ss << " padding:" << d->paddingFactor << '\n';
+ if ( full )
+ result.appendArray( "extents" , extentData.arr() );
+
+
+ result.appendNumber("datasize", d->stats.datasize);
+ result.appendNumber("nrecords", d->stats.nrecords);
+ result.appendNumber("lastExtentSize", d->lastExtentSize);
+ result.appendNumber("padding", d->paddingFactor);
+
+
try {
try {
- ss << " first extent:\n";
- d->firstExtent.ext()->dump(ss);
- valid = valid && d->firstExtent.ext()->validates();
+ result.append("firstExtentDetails", d->firstExtent.ext()->dump());
+
+ valid = valid && d->firstExtent.ext()->validates() &&
+ d->firstExtent.ext()->xprev.isNull();
}
catch (...) {
- ss << "\n exception firstextent\n" << endl;
+ errors << "exception firstextent";
+ valid = false;
}
set<DiskLoc> recs;
if( scanData ) {
shared_ptr<Cursor> c = theDataFileMgr.findAll(ns);
int n = 0;
+ int nInvalid = 0;
long long len = 0;
long long nlen = 0;
int outOfOrder = 0;
@@ -236,27 +260,54 @@ namespace mongo {
Record *r = c->_current();
len += r->lengthWithHeaders;
nlen += r->netLength();
+
+ if (full){
+ BSONObj obj(r);
+ if (!obj.isValid() || !obj.valid()){ // both fast and deep checks
+ valid = false;
+ if (nInvalid == 0) // only log once
+ errors << "invalid bson object detected (see logs for more info)";
+
+ nInvalid++;
+ if (strcmp("_id", obj.firstElementFieldName()) == 0){
+ try {
+ obj.firstElement().validate(); // throws on error
+ log() << "Invalid bson detected in " << ns << " with _id: " << obj.firstElement().toString(false) << endl;
+ }
+ catch(...){
+ log() << "Invalid bson detected in " << ns << " with corrupt _id" << endl;
+ }
+ }
+ else {
+ log() << "Invalid bson detected in " << ns << " and couldn't find _id" << endl;
+ }
+ }
+ }
+
c->advance();
}
if ( d->capped && !d->capLooped() ) {
- ss << " capped outOfOrder:" << outOfOrder;
+ result.append("cappedOutOfOrder", outOfOrder);
if ( outOfOrder > 1 ) {
valid = false;
- ss << " ???";
+ errors << "too many out of order records";
}
- else ss << " (OK)";
- ss << '\n';
}
- ss << " " << n << " objects found, nobj:" << d->stats.nrecords << '\n';
- ss << " " << len << " bytes data w/headers\n";
- ss << " " << nlen << " bytes data wout/headers\n";
+ result.append("objectsFound", n);
+
+ if (full) {
+ result.append("invalidObjects", nInvalid);
+ }
+
+ result.appendNumber("bytesWithHeaders", len);
+ result.appendNumber("bytesWithoutHeaders", nlen);
}
- ss << " deletedList: ";
+ BSONArrayBuilder deletedListArray;
for ( int i = 0; i < Buckets; i++ ) {
- ss << (d->deletedList[i].isNull() ? '0' : '1');
+ deletedListArray << d->deletedList[i].isNull();
}
- ss << endl;
+
int ndel = 0;
long long delSize = 0;
int incorrect = 0;
@@ -278,7 +329,9 @@ namespace mongo {
}
if ( loc.a() <= 0 || strstr(ns, "hudsonSmall") == 0 ) {
- ss << " ?bad deleted loc: " << loc.toString() << " bucket:" << i << " k:" << k << endl;
+ string err (str::stream() << "bad deleted loc: " << loc.toString() << " bucket:" << i << " k:" << k);
+ errors << err;
+
valid = false;
break;
}
@@ -292,47 +345,60 @@ namespace mongo {
}
}
catch (...) {
- ss <<" ?exception in deleted chain for bucket " << i << endl;
+ errors << ("exception in deleted chain for bucket " + BSONObjBuilder::numStr(i));
valid = false;
}
}
- ss << " deleted: n: " << ndel << " size: " << delSize << endl;
+ result.appendNumber("deletedCount", ndel);
+ result.appendNumber("deletedSize", delSize);
+
if ( incorrect ) {
- ss << " ?corrupt: " << incorrect << " records from datafile are in deleted list\n";
+ errors << (BSONObjBuilder::numStr(incorrect) + " records from datafile are in deleted list");
valid = false;
}
int idxn = 0;
try {
- ss << " nIndexes:" << d->nIndexes << endl;
+ result.append("nIndexes", d->nIndexes);
+ BSONObjBuilder indexes; // not using subObjStart to be exception safe
NamespaceDetails::IndexIterator i = d->ii();
while( i.more() ) {
IndexDetails& id = i.next();
- ss << " " << id.indexNamespace() << " keys:" <<
- id.head.btree()->fullValidate(id.head, id.keyPattern()) << endl;
+ long long keys = id.idxInterface().fullValidate(id.head, id.keyPattern());
+ indexes.appendNumber(id.indexNamespace(), keys);
}
+ result.append("keysPerIndex", indexes.done());
}
catch (...) {
- ss << "\n exception during index validate idxn:" << idxn << endl;
+ errors << ("exception during index validate idxn " + BSONObjBuilder::numStr(idxn));
valid=false;
}
}
catch (AssertionException) {
- ss << "\n exception during validate\n" << endl;
+ errors << "exception during validate";
valid = false;
}
- if ( !valid )
- ss << " ns corrupt, requires dbchk\n";
+ result.appendBool("valid", valid);
+ result.append("errors", errors.arr());
+
+ if ( !full ){
+ result.append("warning", "Some checks omitted for speed. use {full:true} option to do more thorough scan.");
+ }
+
+ if ( !valid ) {
+ result.append("advice", "ns corrupt, requires repair");
+ }
- return ss.str();
}
} validateCmd;
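
The rewritten validateNS above replaces the old free-form stringstream report with a structured BSON document: scalar stats go straight into the caller's BSONObjBuilder, failure explanations accumulate in a BSONArrayBuilder, and a single boolean verdict is appended at the end. A minimal sketch of that pattern — the check and the field names here are illustrative only:

    #include "jsobj.h"   // BSONObjBuilder, BSONArrayBuilder

    void buildValidateResult( mongo::BSONObjBuilder& result, long long nrecords ) {
        bool valid = true;
        mongo::BSONArrayBuilder errors;            // explanation(s) for why valid = false
        result.appendNumber( "nrecords", nrecords );
        if ( nrecords < 0 ) {                      // hypothetical consistency check
            valid = false;
            errors << "negative record count";
        }
        result.appendBool( "valid", valid );
        result.append( "errors", errors.arr() );   // always present, possibly empty
    }
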
- extern bool unlockRequested;
- extern unsigned lockedForWriting;
- extern mongo::mutex lockedForWritingMutex;
+ bool lockedForWriting = false; // read from db/instance.cpp
+ static bool unlockRequested = false;
+ static mongo::mutex fsyncLockMutex("fsyncLock");
+ static boost::condition fsyncLockCondition;
+ static OID fsyncLockID; // identifies the current lock job
/*
class UnlockCommand : public Command {
@@ -360,6 +426,7 @@ namespace mongo {
db.$cmd.sys.unlock.findOne()
*/
class FSyncCommand : public Command {
+ static const char* url() { return "http://www.mongodb.org/display/DOCS/fsync+Command"; }
class LockDBJob : public BackgroundJob {
protected:
virtual string name() const { return "lockdbjob"; }
@@ -367,23 +434,26 @@ namespace mongo {
Client::initThread("fsyncjob");
Client& c = cc();
{
- scoped_lock lk(lockedForWritingMutex);
- lockedForWriting++;
+ scoped_lock lk(fsyncLockMutex);
+                    while (lockedForWriting){ // there is a small window for two LockDBJobs to be active. This prevents it.
+ fsyncLockCondition.wait(lk.boost());
+ }
+ lockedForWriting = true;
+ fsyncLockID.init();
}
readlock lk("");
MemoryMappedFile::flushAll(true);
- log() << "db is now locked for snapshotting, no writes allowed. use db.$cmd.sys.unlock.findOne() to unlock" << endl;
+ log() << "db is now locked for snapshotting, no writes allowed. db.fsyncUnlock() to unlock" << endl;
+ log() << " For more info see " << FSyncCommand::url() << endl;
_ready = true;
- while( 1 ) {
- if( unlockRequested ) {
- unlockRequested = false;
- break;
- }
- sleepmillis(20);
- }
{
- scoped_lock lk(lockedForWritingMutex);
- lockedForWriting--;
+ scoped_lock lk(fsyncLockMutex);
+ while( !unlockRequested ) {
+ fsyncLockCondition.wait(lk.boost());
+ }
+ unlockRequested = false;
+ lockedForWriting = false;
+ fsyncLockCondition.notify_all();
}
c.shutdown();
}
@@ -402,8 +472,8 @@ namespace mongo {
string x = cmdObj["exec"].valuestrsafe();
return !x.empty();
}*/
- virtual void help(stringstream& h) const { h << "http://www.mongodb.org/display/DOCS/fsync+Command"; }
- virtual bool run(const string& dbname, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ virtual void help(stringstream& h) const { h << url(); }
+ virtual bool run(const string& dbname, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
bool sync = !cmdObj["async"].trueValue(); // async means do an fsync, but return immediately
bool lock = cmdObj["lock"].trueValue();
log() << "CMD fsync: sync:" << sync << " lock:" << lock << endl;
@@ -433,13 +503,19 @@ namespace mongo {
LockDBJob *l = new LockDBJob(ready);
dbMutex.releaseEarly();
+
+ // There is a narrow window for another lock request to come in
+ // here before the LockDBJob grabs the readlock. LockDBJob will
+            // ensure that the requests are serialized and never run
+            // concurrently
l->go();
// don't return until background thread has acquired the read lock
while( !ready ) {
sleepmillis(10);
}
- result.append("info", "now locked against writes, use db.$cmd.sys.unlock.findOne() to unlock");
+ result.append("info", "now locked against writes, use db.fsyncUnlock() to unlock");
+ result.append("seeAlso", url());
}
else {
// the simple fsync command case
@@ -453,7 +529,21 @@ namespace mongo {
} fsyncCmd;
-
-
+    // Note that this will only unlock the current lock. If another thread
+    // relocks before we return, we still consider the unlocking successful.
+    // This is important because if two scripts are trying to fsync-lock, each
+    // one must be assured that the database stays fully locked between the
+    // fsync return and its own call to unlock.
+ void unlockFsyncAndWait(){
+ scoped_lock lk(fsyncLockMutex);
+ if (lockedForWriting) { // could have handled another unlock before we grabbed the lock
+ OID curOp = fsyncLockID;
+ unlockRequested = true;
+ fsyncLockCondition.notify_all();
+ while (lockedForWriting && fsyncLockID == curOp){
+ fsyncLockCondition.wait( lk.boost() );
+ }
+ }
+ }
}
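
The fsync lock machinery above replaces the old 20ms sleep-poll loop with a condition variable plus a per-lock OID, which is what lets unlockFsyncAndWait() distinguish the release of its own lock from a later relock. A self-contained sketch of that handshake using plain boost primitives — a counter stands in for the OID, and the readlock/flushAll work the real job does between acquiring the lock and waiting is elided:

    #include <boost/thread/mutex.hpp>
    #include <boost/thread/condition.hpp>

    static boost::mutex m;
    static boost::condition cond;
    static bool locked = false;             // cf. lockedForWriting
    static bool unlockRequested = false;
    static unsigned long long gen = 0;      // cf. fsyncLockID

    void lockJobBody() {                    // cf. LockDBJob::run()
        boost::mutex::scoped_lock lk(m);
        while( locked )                     // serialize overlapping lock jobs
            cond.wait(lk);
        locked = true;
        ++gen;                              // new lock generation
        while( !unlockRequested )
            cond.wait(lk);
        unlockRequested = false;
        locked = false;
        cond.notify_all();
    }

    void unlockAndWait() {                  // cf. unlockFsyncAndWait()
        boost::mutex::scoped_lock lk(m);
        if( !locked ) return;               // someone else already unlocked
        unsigned long long cur = gen;
        unlockRequested = true;
        cond.notify_all();
        while( locked && gen == cur )       // wait out *this* lock job only
            cond.wait(lk);
    }
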
diff --git a/db/dbcommands_generic.cpp b/db/dbcommands_generic.cpp
index a555b6c..69b51c7 100644
--- a/db/dbcommands_generic.cpp
+++ b/db/dbcommands_generic.cpp
@@ -20,7 +20,7 @@
*/
#include "pch.h"
-#include "query.h"
+#include "ops/query.h"
#include "pdfile.h"
#include "jsobj.h"
#include "../bson/util/builder.h"
@@ -33,17 +33,17 @@
#include "json.h"
#include "repl.h"
#include "repl_block.h"
-#include "replpair.h"
+#include "replutil.h"
#include "commands.h"
#include "db.h"
#include "instance.h"
#include "lasterror.h"
#include "security.h"
-#include "queryoptimizer.h"
#include "../scripting/engine.h"
#include "stats/counters.h"
#include "background.h"
#include "../util/version.h"
+#include "../util/ramlog.h"
namespace mongo {
@@ -57,8 +57,9 @@ namespace mongo {
help << "get version #, etc.\n";
help << "{ buildinfo:1 }";
}
- bool run(const string& dbname, BSONObj& jsobj, string& errmsg, BSONObjBuilder& result, bool fromRepl ) {
+ bool run(const string& dbname, BSONObj& jsobj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl ) {
result << "version" << versionString << "gitVersion" << gitVersion() << "sysInfo" << sysInfo();
+ result << "versionArray" << versionArray;
result << "bits" << ( sizeof( int* ) == 4 ? 32 : 64 );
result.appendBool( "debug" , debug );
result.appendNumber("maxBsonObjectSize", BSONObjMaxUserSize);
@@ -87,7 +88,7 @@ namespace mongo {
help << " syncdelay\n";
help << "{ getParameter:'*' } to get everything\n";
}
- bool run(const string& dbname, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl ) {
+ bool run(const string& dbname, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl ) {
bool all = *cmdObj.firstElement().valuestrsafe() == '*';
int before = result.len();
@@ -116,6 +117,9 @@ namespace mongo {
}
} cmdGet;
+ // tempish
+ bool setParmsMongodSpecific(const string& dbname, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl );
+
class CmdSet : public Command {
public:
CmdSet() : Command( "setParameter" ) { }
@@ -123,37 +127,58 @@ namespace mongo {
virtual bool adminOnly() const { return true; }
virtual LockType locktype() const { return NONE; }
virtual void help( stringstream &help ) const {
- help << "set administrative option(s)\nexample:\n";
- help << "{ setParameter:1, notablescan:true }\n";
+ help << "set administrative option(s)\n";
+ help << "{ setParameter:1, <param>:<value> }\n";
help << "supported so far:\n";
- help << " notablescan\n";
+ help << " journalCommitInterval\n";
help << " logLevel\n";
+ help << " notablescan\n";
help << " quiet\n";
+ help << " syncdelay\n";
}
- bool run(const string& dbname, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl ) {
+ bool run(const string& dbname, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl ) {
int s = 0;
+ bool found = setParmsMongodSpecific(dbname, cmdObj, errmsg, result, fromRepl);
+ if( cmdObj.hasElement("journalCommitInterval") ) {
+ if( !cmdLine.dur ) {
+ errmsg = "journaling is off";
+ return false;
+ }
+ int x = (int) cmdObj["journalCommitInterval"].Number();
+ assert( x > 1 && x < 500 );
+ cmdLine.journalCommitInterval = x;
+ log() << "setParameter journalCommitInterval=" << x << endl;
+ s++;
+ }
if( cmdObj.hasElement("notablescan") ) {
- result.append("was", cmdLine.noTableScan);
+ assert( !cmdLine.isMongos() );
+ if( s == 0 )
+ result.append("was", cmdLine.noTableScan);
cmdLine.noTableScan = cmdObj["notablescan"].Bool();
s++;
}
if( cmdObj.hasElement("quiet") ) {
- result.append("was", cmdLine.quiet );
+ if( s == 0 )
+ result.append("was", cmdLine.quiet );
cmdLine.quiet = cmdObj["quiet"].Bool();
s++;
}
if( cmdObj.hasElement("syncdelay") ) {
- result.append("was", cmdLine.syncdelay );
+ assert( !cmdLine.isMongos() );
+ if( s == 0 )
+ result.append("was", cmdLine.syncdelay );
cmdLine.syncdelay = cmdObj["syncdelay"].Number();
s++;
}
if( cmdObj.hasElement( "logLevel" ) ) {
- result.append("was", logLevel );
+ if( s == 0 )
+ result.append("was", logLevel );
logLevel = cmdObj["logLevel"].numberInt();
s++;
}
if( cmdObj.hasElement( "replApplyBatchSize" ) ) {
- result.append("was", replApplyBatchSize );
+ if( s == 0 )
+ result.append("was", replApplyBatchSize );
BSONElement e = cmdObj["replApplyBatchSize"];
ParameterValidator * v = ParameterValidator::get( e.fieldName() );
assert( v );
@@ -163,8 +188,8 @@ namespace mongo {
s++;
}
- if( s == 0 ) {
- errmsg = "no option found to set, use '*' to get all ";
+ if( s == 0 && !found ) {
+ errmsg = "no option found to set, use help:true to see options ";
return false;
}
@@ -179,7 +204,7 @@ namespace mongo {
virtual void help( stringstream &help ) const { help << "a way to check that the server is alive. responds immediately even if server is in a db lock."; }
virtual LockType locktype() const { return NONE; }
virtual bool requiresAuth() { return false; }
- virtual bool run(const string& badns, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool) {
+ virtual bool run(const string& badns, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool) {
// IMPORTANT: Don't put anything in here that might lock db - including authentication
return true;
}
@@ -192,7 +217,7 @@ namespace mongo {
virtual bool slaveOk() const { return true; }
virtual bool readOnly() { return true; }
virtual LockType locktype() const { return NONE; }
- virtual bool run(const string& ns, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ virtual bool run(const string& ns, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
if ( globalScriptEngine ) {
BSONObjBuilder bb( result.subobjStart( "js" ) );
result.append( "utf8" , globalScriptEngine->utf8Ok() );
@@ -214,7 +239,7 @@ namespace mongo {
virtual LockType locktype() const { return NONE; }
virtual bool slaveOk() const { return true; }
virtual bool adminOnly() const { return true; }
- virtual bool run(const string& ns, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ virtual bool run(const string& ns, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
rotateLogs();
return 1;
}
@@ -228,7 +253,7 @@ namespace mongo {
virtual LockType locktype() const { return NONE; }
virtual bool slaveOk() const { return true; }
virtual bool adminOnly() const { return false; }
- virtual bool run(const string& ns, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ virtual bool run(const string& ns, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
BSONObjBuilder b( result.subobjStart( "commands" ) );
for ( map<string,Command*>::iterator i=_commands->begin(); i!=_commands->end(); ++i ) {
Command * c = i->second;
@@ -256,35 +281,18 @@ namespace mongo {
} listCommandsCmd;
- class CmdShutdown : public Command {
- public:
- virtual bool requiresAuth() { return true; }
- virtual bool adminOnly() const { return true; }
- virtual bool localHostOnlyIfNoAuth(const BSONObj& cmdObj) { return true; }
- virtual bool logTheOp() {
- return false;
- }
- virtual bool slaveOk() const {
- return true;
- }
- virtual LockType locktype() const { return NONE; }
- virtual void help( stringstream& help ) const {
- help << "shutdown the database. must be ran against admin db and either (1) ran from localhost or (2) authenticated.\n";
+ bool CmdShutdown::shutdownHelper() {
+ Client * c = currentClient.get();
+ if ( c ) {
+ c->shutdown();
}
- CmdShutdown() : Command("shutdown") {}
- bool run(const string& dbname, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
- Client * c = currentClient.get();
- if ( c ) {
- c->shutdown();
- }
- log() << "terminating, shutdown command received" << endl;
+ log() << "terminating, shutdown command received" << endl;
- dbexit( EXIT_CLEAN , "shutdown called" , true ); // this never returns
- assert(0);
- return true;
- }
- } cmdShutdown;
+ dbexit( EXIT_CLEAN , "shutdown called" , true ); // this never returns
+ assert(0);
+ return true;
+ }
/* for testing purposes only */
class CmdForceError : public Command {
@@ -300,7 +308,7 @@ namespace mongo {
}
virtual LockType locktype() const { return NONE; }
CmdForceError() : Command("forceerror") {}
- bool run(const string& dbnamne, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ bool run(const string& dbnamne, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
uassert( 10038 , "forced error", false);
return true;
}
@@ -312,11 +320,57 @@ namespace mongo {
virtual bool slaveOk() const { return true; }
virtual LockType locktype() const { return NONE; }
virtual bool requiresAuth() { return false; }
- virtual bool run(const string& dbname , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool) {
+ virtual bool run(const string& dbname , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool) {
result << "options" << QueryOption_AllSupported;
return true;
}
} availableQueryOptionsCmd;
+ class GetLogCmd : public Command {
+ public:
+ GetLogCmd() : Command( "getLog" ){}
+
+ virtual bool slaveOk() const { return true; }
+ virtual LockType locktype() const { return NONE; }
+ virtual bool requiresAuth() { return false; }
+ virtual bool adminOnly() const { return true; }
+
+ virtual void help( stringstream& help ) const {
+ help << "{ getLog : '*' } OR { getLog : 'global' }";
+ }
+
+ virtual bool run(const string& dbname , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool) {
+ string p = cmdObj.firstElement().String();
+ if ( p == "*" ) {
+ vector<string> names;
+ RamLog::getNames( names );
+
+ BSONArrayBuilder arr;
+ for ( unsigned i=0; i<names.size(); i++ ) {
+ arr.append( names[i] );
+ }
+
+ result.appendArray( "names" , arr.arr() );
+ }
+ else {
+ RamLog* rl = RamLog::get( p );
+ if ( ! rl ) {
+ errmsg = str::stream() << "no RamLog named: " << p;
+ return false;
+ }
+
+ vector<const char*> lines;
+ rl->get( lines );
+
+ BSONArrayBuilder arr( result.subarrayStart( "log" ) );
+ for ( unsigned i=0; i<lines.size(); i++ )
+ arr.append( lines[i] );
+ arr.done();
+ }
+ return true;
+ }
+
+ } getLogCmd;
+
}
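
GetLogCmd above is a thin wrapper over two RamLog lookups. The same calls in isolation, using exactly the API exercised by the hunk:

    #include "../util/ramlog.h"

    void dumpGlobalLog() {
        std::vector<std::string> names;
        mongo::RamLog::getNames( names );                    // the { getLog: '*' } path

        mongo::RamLog* rl = mongo::RamLog::get( "global" );  // the { getLog: 'global' } path
        if ( rl ) {
            std::vector<const char*> lines;
            rl->get( lines );                                // buffered log lines
        }
    }
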
diff --git a/db/dbeval.cpp b/db/dbeval.cpp
index 31d5260..5fe137f 100644
--- a/db/dbeval.cpp
+++ b/db/dbeval.cpp
@@ -18,7 +18,7 @@
*/
#include "pch.h"
-#include "query.h"
+#include "ops/query.h"
#include "pdfile.h"
#include "jsobj.h"
#include "../bson/util/builder.h"
@@ -86,7 +86,7 @@ namespace mongo {
int res;
{
Timer t;
- res = s->invoke(f,args, cmdLine.quota ? 10 * 60 * 1000 : 0 );
+ res = s->invoke(f, &args, 0, cmdLine.quota ? 10 * 60 * 1000 : 0 );
int m = t.millis();
if ( m > cmdLine.slowMS ) {
out() << "dbeval slow, time: " << dec << m << "ms " << dbName << endl;
@@ -121,7 +121,7 @@ namespace mongo {
}
virtual LockType locktype() const { return NONE; }
CmdEval() : Command("eval", false, "$eval") { }
- bool run(const string& dbname , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ bool run(const string& dbname , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
AuthenticationInfo *ai = cc().getAuthenticationInfo();
uassert( 12598 , "$eval reads unauthorized", ai->isAuthorizedReads(dbname.c_str()) );
diff --git a/db/dbhelpers.cpp b/db/dbhelpers.cpp
index 5e49589..cc4fdba 100644
--- a/db/dbhelpers.cpp
+++ b/db/dbhelpers.cpp
@@ -19,12 +19,13 @@
#include "pch.h"
#include "db.h"
#include "dbhelpers.h"
-#include "query.h"
#include "json.h"
#include "queryoptimizer.h"
#include "btree.h"
#include "pdfile.h"
#include "oplog.h"
+#include "ops/update.h"
+#include "ops/delete.h"
namespace mongo {
@@ -63,7 +64,7 @@ namespace mongo {
public:
FindOne( bool requireIndex ) : requireIndex_( requireIndex ) {}
virtual void _init() {
- if ( requireIndex_ && strcmp( qp().indexKey().firstElement().fieldName(), "$natural" ) == 0 )
+ if ( requireIndex_ && strcmp( qp().indexKey().firstElementFieldName(), "$natural" ) == 0 )
throw MsgAssertionException( 9011 , "Not an index cursor" );
c_ = qp().newCursor();
if ( !c_->ok() ) {
@@ -75,7 +76,7 @@ namespace mongo {
setComplete();
return;
}
- if ( matcher()->matches( c_->currKey(), c_->currLoc() ) ) {
+ if ( matcher( c_ )->matchesCurrent( c_.get() ) ) {
one_ = c_->current();
loc_ = c_->currLoc();
setStop();
@@ -148,7 +149,7 @@ namespace mongo {
BSONObj key = i.getKeyFromQuery( query );
- DiskLoc loc = i.head.btree()->findSingle( i , i.head , key );
+ DiskLoc loc = i.idxInterface().findSingle(i , i.head , key);
if ( loc.isNull() )
return false;
result = loc.obj();
@@ -160,7 +161,7 @@ namespace mongo {
uassert(13430, "no _id index", idxNo>=0);
IndexDetails& i = d->idx( idxNo );
BSONObj key = i.getKeyFromQuery( idquery );
- return i.head.btree()->findSingle( i , i.head , key );
+ return i.idxInterface().findSingle(i , i.head , key);
}
bool Helpers::isEmpty(const char *ns, bool doAuth) {
@@ -178,10 +179,13 @@ namespace mongo {
Client::Context context(ns);
shared_ptr<Cursor> c = DataFileMgr::findAll(ns);
- if ( !c->ok() )
+ if ( !c->ok() ) {
+ context.getClient()->curop()->done();
return false;
+ }
result = c->current();
+ context.getClient()->curop()->done();
return true;
}
@@ -208,12 +212,14 @@ namespace mongo {
OpDebug debug;
Client::Context context(ns);
updateObjects(ns, obj, /*pattern=*/BSONObj(), /*upsert=*/true, /*multi=*/false , /*logtheop=*/true , debug );
+ context.getClient()->curop()->done();
}
void Helpers::putSingletonGod(const char *ns, BSONObj obj, bool logTheOp) {
OpDebug debug;
Client::Context context(ns);
_updateObjects(/*god=*/true, ns, obj, /*pattern=*/BSONObj(), /*upsert=*/true, /*multi=*/false , logTheOp , debug );
+ context.getClient()->curop()->done();
}
BSONObj Helpers::toKeyFormat( const BSONObj& o , BSONObj& key ) {
@@ -248,11 +254,21 @@ namespace mongo {
IndexDetails& i = nsd->idx( ii );
- shared_ptr<Cursor> c( new BtreeCursor( nsd , ii , i , minClean , maxClean , maxInclusive, 1 ) );
+ shared_ptr<Cursor> c( BtreeCursor::make( nsd , ii , i , minClean , maxClean , maxInclusive, 1 ) );
auto_ptr<ClientCursor> cc( new ClientCursor( QueryOption_NoCursorTimeout , c , ns ) );
cc->setDoingDeletes( true );
while ( c->ok() ) {
+
+ if ( yield && ! cc->yieldSometimes( ClientCursor::WillNeed) ) {
+ // cursor got finished by someone else, so we're done
+ cc.release(); // if the collection/db is dropped, cc may be deleted
+ break;
+ }
+
+ if ( ! c->ok() )
+ break;
+
DiskLoc rloc = c->currLoc();
if ( callback )
@@ -269,11 +285,7 @@ namespace mongo {
getDur().commitIfNeeded();
- if ( yield && ! cc->yield() ) {
- // cursor got finished by someone else, so we're done
- cc.release(); // if the collection/db is dropped, cc may be deleted
- break;
- }
+
}
return num;
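
The dbhelpers hunk above moves the yield from the bottom of the delete loop to the top, so the cursor position is re-validated before currLoc() is touched after a yield. The resulting loop shape, reduced to a sketch (types and helpers as in the surrounding code):

    while ( c->ok() ) {
        if ( yield && ! cc->yieldSometimes( ClientCursor::WillNeed ) ) {
            cc.release();            // collection/db may have been dropped; cc already deleted
            break;
        }
        if ( ! c->ok() )             // the yield may have exhausted the cursor
            break;
        DiskLoc rloc = c->currLoc(); // safe: position checked after the yield
        // ... callback, advance, delete rloc, commitIfNeeded ...
    }
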
diff --git a/db/dbmessage.cpp b/db/dbmessage.cpp
new file mode 100644
index 0000000..c86b5a0
--- /dev/null
+++ b/db/dbmessage.cpp
@@ -0,0 +1,108 @@
+// dbmessage.cpp
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+#include "dbmessage.h"
+#include "../client/dbclient.h"
+
+namespace mongo {
+
+ string Message::toString() const {
+ stringstream ss;
+ ss << "op: " << opToString( operation() ) << " len: " << size();
+ if ( operation() >= 2000 && operation() < 2100 ) {
+ DbMessage d(*this);
+ ss << " ns: " << d.getns();
+ switch ( operation() ) {
+ case dbUpdate: {
+ int flags = d.pullInt();
+ BSONObj q = d.nextJsObj();
+ BSONObj o = d.nextJsObj();
+ ss << " flags: " << flags << " query: " << q << " update: " << o;
+ break;
+ }
+ case dbInsert:
+ ss << d.nextJsObj();
+ break;
+ case dbDelete: {
+ int flags = d.pullInt();
+ BSONObj q = d.nextJsObj();
+ ss << " flags: " << flags << " query: " << q;
+ break;
+ }
+ default:
+ ss << " CANNOT HANDLE YET";
+ }
+
+
+ }
+ return ss.str();
+ }
+
+
+ void replyToQuery(int queryResultFlags,
+ AbstractMessagingPort* p, Message& requestMsg,
+ void *data, int size,
+ int nReturned, int startingFrom,
+ long long cursorId
+ ) {
+ BufBuilder b(32768);
+ b.skip(sizeof(QueryResult));
+ b.appendBuf(data, size);
+ QueryResult *qr = (QueryResult *) b.buf();
+ qr->_resultFlags() = queryResultFlags;
+ qr->len = b.len();
+ qr->setOperation(opReply);
+ qr->cursorId = cursorId;
+ qr->startingFrom = startingFrom;
+ qr->nReturned = nReturned;
+ b.decouple();
+ Message resp(qr, true);
+ p->reply(requestMsg, resp, requestMsg.header()->id);
+ }
+
+ void replyToQuery(int queryResultFlags,
+ AbstractMessagingPort* p, Message& requestMsg,
+ BSONObj& responseObj) {
+ replyToQuery(queryResultFlags,
+ p, requestMsg,
+ (void *) responseObj.objdata(), responseObj.objsize(), 1);
+ }
+
+ void replyToQuery(int queryResultFlags, Message &m, DbResponse &dbresponse, BSONObj obj) {
+ BufBuilder b;
+ b.skip(sizeof(QueryResult));
+ b.appendBuf((void*) obj.objdata(), obj.objsize());
+ QueryResult* msgdata = (QueryResult *) b.buf();
+ b.decouple();
+ QueryResult *qr = msgdata;
+ qr->_resultFlags() = queryResultFlags;
+ qr->len = b.len();
+ qr->setOperation(opReply);
+ qr->cursorId = 0;
+ qr->startingFrom = 0;
+ qr->nReturned = 1;
+ Message *resp = new Message();
+ resp->setData(msgdata, true); // transport will free
+ dbresponse.response = resp;
+ dbresponse.responseTo = m.header()->id;
+ }
+
+
+
+}
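
Message::toString above walks the wire format with DbMessage's pull helpers. The same decoding steps in isolation, for an OP_UPDATE body (the function name is illustrative; m is assumed to be a received Message):

    #include "dbmessage.h"

    void decodeUpdate( const mongo::Message& m ) {
        mongo::DbMessage d( m );
        const char* ns = d.getns();            // target collection
        int flags = d.pullInt();               // 1 = upsert, per the format comment in dbmessage.h
        mongo::BSONObj query = d.nextJsObj();  // selector
        mongo::BSONObj mod   = d.nextJsObj();  // update spec
    }
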
diff --git a/db/dbmessage.h b/db/dbmessage.h
index cc1d1d8..a789bff 100644
--- a/db/dbmessage.h
+++ b/db/dbmessage.h
@@ -1,3 +1,5 @@
+// dbmessage.h
+
/**
* Copyright (C) 2008 10gen Inc.
*
@@ -19,8 +21,9 @@
#include "diskloc.h"
#include "jsobj.h"
#include "namespace-inl.h"
-#include "../util/message.h"
+#include "../util/net/message.h"
#include "../client/constants.h"
+#include "instance.h"
namespace mongo {
@@ -34,7 +37,48 @@ namespace mongo {
list of marshalled JSObjects;
*/
- extern bool objcheck;
+/* db request message format
+
+   unsigned opid;         // arbitrary; will be echoed back
+ byte operation;
+ int options;
+
+ then for:
+
+ dbInsert:
+ string collection;
+ a series of JSObjects
+ dbDelete:
+ string collection;
+ int flags=0; // 1=DeleteSingle
+ JSObject query;
+ dbUpdate:
+ string collection;
+ int flags; // 1=upsert
+ JSObject query;
+ JSObject objectToUpdate;
+ objectToUpdate may include { $inc: <field> } or { $set: ... }, see struct Mod.
+ dbQuery:
+ string collection;
+ int nToSkip;
+ int nToReturn; // how many you want back as the beginning of the cursor data (0=no limit)
+ // greater than zero is simply a hint on how many objects to send back per "cursor batch".
+ // a negative number indicates a hard limit.
+ JSObject query;
+ [JSObject fieldsToReturn]
+ dbGetMore:
+ string collection; // redundant, might use for security.
+ int nToReturn;
+ int64 cursorID;
+ dbKillCursors=2007:
+ int n;
+ int64 cursorIDs[n];
+
+ Note that on Update, there is only one object, which is different
+ from insert where you can pass a list of objects to insert in the db.
+   Note that the update field layout is very similar to the Query layout.
+*/
+
#pragma pack(1)
struct QueryResult : public MsgData {
@@ -53,7 +97,11 @@ namespace mongo {
void setResultFlagsToOk() {
_resultFlags() = ResultFlag_AwaitCapable;
}
+ void initializeResultFlags() {
+ _resultFlags() = 0;
+ }
};
+
#pragma pack()
/* For the database/server protocol, these objects and functions encapsulate
@@ -72,7 +120,11 @@ namespace mongo {
nextjsobj = data;
}
- /** the 32 bit field before the ns */
+ /** the 32 bit field before the ns
+         * track all bit usage here as it is shared across ops
+ * 0: InsertOption_ContinueOnError
+ * 1: fromWriteback
+ */
int& reservedField() { return *reserved; }
const char * getns() const {
@@ -150,7 +202,7 @@ namespace mongo {
massert( 10305 , "Client Error: Invalid object size", js.objsize() > 3 );
massert( 10306 , "Client Error: Next object larger than space left in message",
js.objsize() < ( theEnd - data ) );
- if ( objcheck && !js.valid() ) {
+ if ( cmdLine.objcheck && !js.valid() ) {
massert( 10307 , "Client Error: bad object in message", false);
}
nextjsobj += js.objsize();
@@ -178,6 +230,12 @@ namespace mongo {
const char *theEnd;
const char * mark;
+
+ public:
+ enum ReservedOptions {
+ Reserved_InsertOption_ContinueOnError = 1 << 0 ,
+ Reserved_FromWriteback = 1 << 1
+ };
};
@@ -204,70 +262,21 @@ namespace mongo {
}
};
-} // namespace mongo
+ void replyToQuery(int queryResultFlags,
+ AbstractMessagingPort* p, Message& requestMsg,
+ void *data, int size,
+ int nReturned, int startingFrom = 0,
+ long long cursorId = 0
+ );
-#include "../client/dbclient.h"
-
-namespace mongo {
-
- inline void replyToQuery(int queryResultFlags,
- AbstractMessagingPort* p, Message& requestMsg,
- void *data, int size,
- int nReturned, int startingFrom = 0,
- long long cursorId = 0
- ) {
- BufBuilder b(32768);
- b.skip(sizeof(QueryResult));
- b.appendBuf(data, size);
- QueryResult *qr = (QueryResult *) b.buf();
- qr->_resultFlags() = queryResultFlags;
- qr->len = b.len();
- qr->setOperation(opReply);
- qr->cursorId = cursorId;
- qr->startingFrom = startingFrom;
- qr->nReturned = nReturned;
- b.decouple();
- Message resp(qr, true);
- p->reply(requestMsg, resp, requestMsg.header()->id);
- }
-
-} // namespace mongo
-
-//#include "bsonobj.h"
-
-#include "instance.h"
-
-namespace mongo {
/* object reply helper. */
- inline void replyToQuery(int queryResultFlags,
- AbstractMessagingPort* p, Message& requestMsg,
- BSONObj& responseObj) {
- replyToQuery(queryResultFlags,
- p, requestMsg,
- (void *) responseObj.objdata(), responseObj.objsize(), 1);
- }
+ void replyToQuery(int queryResultFlags,
+ AbstractMessagingPort* p, Message& requestMsg,
+ BSONObj& responseObj);
/* helper to do a reply using a DbResponse object */
- inline void replyToQuery(int queryResultFlags, Message &m, DbResponse &dbresponse, BSONObj obj) {
- BufBuilder b;
- b.skip(sizeof(QueryResult));
- b.appendBuf((void*) obj.objdata(), obj.objsize());
- QueryResult* msgdata = (QueryResult *) b.buf();
- b.decouple();
- QueryResult *qr = msgdata;
- qr->_resultFlags() = queryResultFlags;
- qr->len = b.len();
- qr->setOperation(opReply);
- qr->cursorId = 0;
- qr->startingFrom = 0;
- qr->nReturned = 1;
- Message *resp = new Message();
- resp->setData(msgdata, true); // transport will free
- dbresponse.response = resp;
- dbresponse.responseTo = m.header()->id;
- }
-
- string debugString( Message& m );
+ void replyToQuery(int queryResultFlags, Message &m, DbResponse &dbresponse, BSONObj obj);
+
} // namespace mongo
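
This header hunk deletes the inline replyToQuery definitions — and with them the mid-file includes of ../client/dbclient.h — leaving only declarations; the bodies moved into the new db/dbmessage.cpp shown earlier. The shape of the split, reduced to one function:

    // dbmessage.h: declaration only, so the header stays light
    void replyToQuery( int queryResultFlags, Message& m, DbResponse& dbresponse, BSONObj obj );

    // dbmessage.cpp: definition, where heavy includes are harmless
    #include "dbmessage.h"
    #include "../client/dbclient.h"
    void replyToQuery( int queryResultFlags, Message& m, DbResponse& dbresponse, BSONObj obj ) {
        // build the QueryResult and attach it to dbresponse, as above
    }
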
diff --git a/db/dbwebserver.cpp b/db/dbwebserver.cpp
index 7aa6148..78c09c0 100644
--- a/db/dbwebserver.cpp
+++ b/db/dbwebserver.cpp
@@ -20,7 +20,7 @@
*/
#include "pch.h"
-#include "../util/miniwebserver.h"
+#include "../util/net/miniwebserver.h"
#include "../util/mongoutils/html.h"
#include "../util/md5.hpp"
#include "db.h"
@@ -31,7 +31,7 @@
#include "commands.h"
#include "../util/version.h"
#include "../util/ramlog.h"
-#include <pcrecpp.h>
+#include "pcrecpp.h"
#include "../util/admin_access.h"
#include "dbwebserver.h"
#include <boost/date_time/posix_time/posix_time.hpp>
@@ -61,7 +61,7 @@ namespace mongo {
class DbWebServer : public MiniWebServer {
public:
DbWebServer(const string& ip, int port, const AdminAccess* webUsers)
- : MiniWebServer(ip, port), _webUsers(webUsers) {
+ : MiniWebServer("admin web console", ip, port), _webUsers(webUsers) {
WebStatusPlugin::initAll();
}
@@ -148,7 +148,7 @@ namespace mongo {
if ( ! allowed( rq , headers, from ) ) {
responseCode = 401;
- headers.push_back( "Content-Type: text/plain" );
+ headers.push_back( "Content-Type: text/plain;charset=utf-8" );
responseMsg = "not allowed\n";
return;
}
@@ -187,7 +187,7 @@ namespace mongo {
}
responseCode = 404;
- headers.push_back( "Content-Type: text/html" );
+ headers.push_back( "Content-Type: text/html;charset=utf-8" );
responseMsg = "<html><body>unknown url</body></html>\n";
return;
}
@@ -196,6 +196,7 @@ namespace mongo {
if ( ! allowed( rq , headers, from ) ) {
responseCode = 401;
+ headers.push_back( "Content-Type: text/plain;charset=utf-8" );
responseMsg = "not allowed\n";
return;
}
@@ -248,6 +249,7 @@ namespace mongo {
ss << "</body></html>\n";
responseMsg = ss.str();
+ headers.push_back( "Content-Type: text/html;charset=utf-8" );
}
void _rejectREST( string& responseMsg , int& responseCode, vector<string>& headers ) {
@@ -256,7 +258,7 @@ namespace mongo {
ss << "REST is not enabled. use --rest to turn on.\n";
ss << "check that port " << _port << " is secured for the network too.\n";
responseMsg = ss.str();
- headers.push_back( "Content-Type: text/plain" );
+ headers.push_back( "Content-Type: text/plain;charset=utf-8" );
}
};
@@ -312,9 +314,11 @@ namespace mongo {
}
virtual void init() {
- assert( ! _log );
- _log = new RamLog();
- Logstream::get().addGlobalTee( _log );
+ _log = RamLog::get( "global" );
+ if ( ! _log ) {
+ _log = new RamLog("global");
+ Logstream::get().addGlobalTee( _log );
+ }
}
virtual void run( stringstream& ss ) {
@@ -374,7 +378,7 @@ namespace mongo {
string& responseMsg, int& responseCode,
vector<string>& headers, const SockAddr &from ) {
responseCode = 404;
- headers.push_back( "Content-Type: text/plain" );
+ headers.push_back( "Content-Type: text/plain;charset=utf-8" );
responseMsg = "no favicon\n";
}
@@ -387,7 +391,7 @@ namespace mongo {
virtual void handle( const char *rq, string url, BSONObj params,
string& responseMsg, int& responseCode,
vector<string>& headers, const SockAddr &from ) {
- headers.push_back( "Content-Type: application/json" );
+ headers.push_back( "Content-Type: application/json;charset=utf-8" );
responseCode = 200;
static vector<string> commands;
@@ -420,7 +424,7 @@ namespace mongo {
string errmsg;
BSONObjBuilder sub;
- if ( ! c->run( "admin.$cmd" , co , errmsg , sub , false ) )
+ if ( ! c->run( "admin.$cmd" , co , 0, errmsg , sub , false ) )
buf.append( cmd , errmsg );
else
buf.append( cmd , sub.obj() );
@@ -439,7 +443,7 @@ namespace mongo {
virtual void handle( const char *rq, string url, BSONObj params,
string& responseMsg, int& responseCode,
vector<string>& headers, const SockAddr &from ) {
- headers.push_back( "Content-Type: text/html" );
+ headers.push_back( "Content-Type: text/html;charset=utf-8" );
responseCode = 200;
stringstream ss;
@@ -509,11 +513,11 @@ namespace mongo {
responseMsg = j;
if( text ) {
- headers.push_back( "Content-Type: text/plain" );
+ headers.push_back( "Content-Type: text/plain;charset=utf-8" );
responseMsg += '\n';
}
else {
- headers.push_back( "Content-Type: application/json" );
+ headers.push_back( "Content-Type: application/json;charset=utf-8" );
}
}
@@ -527,7 +531,6 @@ namespace mongo {
Client::initThread("websvr");
const int p = cmdLine.port + 1000;
DbWebServer mini(cmdLine.bind_ip, p, adminAccessPtr.get());
- log() << "web admin interface listening on port " << p << endl;
mini.initAndListen();
cc().shutdown();
}
diff --git a/db/diskloc.h b/db/diskloc.h
index f356c73..e717556 100644
--- a/db/diskloc.h
+++ b/db/diskloc.h
@@ -29,26 +29,28 @@ namespace mongo {
class Record;
class DeletedRecord;
class Extent;
- class BtreeBucket;
class MongoDataFile;
+ template< class Version > class BtreeBucket;
+
#pragma pack(1)
/** represents a disk location/offset on disk in a database. 64 bits.
it is assumed these will be passed around by value a lot so don't do anything to make them large
(such as adding a virtual function)
*/
class DiskLoc {
- int _a; // this will be volume, file #, etc. but is a logical value could be anything depending on storage engine
+        int _a;     // this will be volume, file #, etc. but is a logical value that could be anything depending on storage engine
int ofs;
public:
enum SentinelValues {
+            /* note NullOfs is different. todo: clean up. see refs to NullOfs in code - its use is valid but occurs outside a DiskLoc context, so it is confusing as-is. */
NullOfs = -1,
MaxFiles=16000 // thus a limit of about 32TB of data per db
};
- DiskLoc(int a, int b) : _a(a), ofs(b) { }
+ DiskLoc(int a, int Ofs) : _a(a), ofs(Ofs) { }
DiskLoc() { Null(); }
DiskLoc(const DiskLoc& l) {
_a=l._a;
@@ -139,9 +141,13 @@ namespace mongo {
Record* rec() const;
DeletedRecord* drec() const;
Extent* ext() const;
- const BtreeBucket* btree() const;
+
+ template< class V >
+ const BtreeBucket<V> * btree() const;
+
// Explicitly signals we are writing and casts away const
- BtreeBucket* btreemod() const;
+ template< class V >
+ BtreeBucket<V> * btreemod() const;
/*MongoDataFile& pdf() const;*/
};
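
diskloc.h now forward-declares BtreeBucket as a class template over the on-disk index format version, and btree()/btreemod() become member function templates. A hedged sketch of a call site — the V1 tag is an assumption here, since the concrete version types live in btree.h, not in this hunk:

    // hypothetical call site; V1 names an index-format version type
    const mongo::BtreeBucket<mongo::V1>* b  = loc.btree<mongo::V1>();
    mongo::BtreeBucket<mongo::V1>*       wb = loc.btreemod<mongo::V1>();  // writable, casts away const
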
diff --git a/db/driverHelpers.cpp b/db/driverHelpers.cpp
index d98a33b..12aa018 100644
--- a/db/driverHelpers.cpp
+++ b/db/driverHelpers.cpp
@@ -46,7 +46,7 @@ namespace mongo {
class ObjectIdTest : public BasicDriverHelper {
public:
ObjectIdTest() : BasicDriverHelper( "driverOIDTest" ) {}
- virtual bool run(const string& , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ virtual bool run(const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
if ( cmdObj.firstElement().type() != jstOID ) {
errmsg = "not oid";
return false;
diff --git a/db/dur.cpp b/db/dur.cpp
index 15b4565..4861773 100644
--- a/db/dur.cpp
+++ b/db/dur.cpp
@@ -17,7 +17,7 @@
*/
/*
- phases
+ phases:
PREPLOGBUFFER
we will build an output buffer ourself and then use O_DIRECT
@@ -36,6 +36,22 @@
there could be a slow down immediately after remapping as fresh copy-on-writes for commonly written pages will
be required. so doing these remaps fractionally is helpful.
+ mutexes:
+
+ READLOCK dbMutex
+ LOCK groupCommitMutex
+ PREPLOGBUFFER()
+ READLOCK mmmutex
+ commitJob.reset()
+ UNLOCK dbMutex // now other threads can write
+ WRITETOJOURNAL()
+ WRITETODATAFILES()
+ UNLOCK mmmutex
+ UNLOCK groupCommitMutex
+
+ on the next write lock acquisition for dbMutex: // see MongoMutex::_acquiredWriteLock()
+ REMAPPRIVATEVIEW()
+
@see https://docs.google.com/drawings/edit?id=1TklsmZzm7ohIZkwgeK6rMvsdaR13KjtJYMsfLr175Zc
*/
@@ -46,11 +62,11 @@
#include "dur_journal.h"
#include "dur_commitjob.h"
#include "dur_recover.h"
+#include "dur_stats.h"
#include "../util/concurrency/race.h"
#include "../util/mongoutils/hash.h"
#include "../util/mongoutils/str.h"
#include "../util/timer.h"
-#include "dur_stats.h"
using namespace mongoutils;
@@ -58,8 +74,9 @@ namespace mongo {
namespace dur {
- void WRITETODATAFILES();
- void PREPLOGBUFFER();
+ void PREPLOGBUFFER(JSectHeader& outParm);
+ void WRITETOJOURNAL(JSectHeader h, AlignedBuilder& uncompressed);
+ void WRITETODATAFILES(const JSectHeader& h, AlignedBuilder& uncompressed);
/** declared later in this file
only used in this file -- use DurableInterface::commitNow() outside
@@ -84,12 +101,36 @@ namespace mongo {
Stats::S * Stats::other() {
return curr == &_a ? &_b : &_a;
}
+ string _CSVHeader();
+
+ string Stats::S::_CSVHeader() {
+ return "cmts jrnMB\twrDFMB\tcIWLk\tearly\tprpLgB wrToJ\twrToDF\trmpPrVw";
+ }
+
+ string Stats::S::_asCSV() {
+ stringstream ss;
+ ss <<
+ setprecision(2) <<
+ _commits << '\t' << fixed <<
+ _journaledBytes / 1000000.0 << '\t' <<
+ _writeToDataFilesBytes / 1000000.0 << '\t' <<
+ _commitsInWriteLock << '\t' <<
+ _earlyCommits << '\t' <<
+ (unsigned) (_prepLogBufferMicros/1000) << '\t' <<
+ (unsigned) (_writeToJournalMicros/1000) << '\t' <<
+ (unsigned) (_writeToDataFilesMicros/1000) << '\t' <<
+ (unsigned) (_remapPrivateViewMicros/1000);
+ return ss.str();
+ }
+ //int getAgeOutJournalFiles();
BSONObj Stats::S::_asObj() {
- return BSON(
+ BSONObjBuilder b;
+ b <<
"commits" << _commits <<
"journaledMB" << _journaledBytes / 1000000.0 <<
"writeToDataFilesMB" << _writeToDataFilesBytes / 1000000.0 <<
+ "compression" << _journaledBytes / (_uncompressedBytes+1.0) <<
"commitsInWriteLock" << _commitsInWriteLock <<
"earlyCommits" << _earlyCommits <<
"timeMs" <<
@@ -98,8 +139,15 @@ namespace mongo {
"writeToJournal" << (unsigned) (_writeToJournalMicros/1000) <<
"writeToDataFiles" << (unsigned) (_writeToDataFilesMicros/1000) <<
"remapPrivateView" << (unsigned) (_remapPrivateViewMicros/1000)
- )
- );
+ );
+ /*int r = getAgeOutJournalFiles();
+ if( r == -1 )
+ b << "ageOutJournalFiles" << "mutex timeout";
+ if( r == 0 )
+ b << "ageOutJournalFiles" << false;*/
+ if( cmdLine.journalCommitInterval != 0 )
+ b << "journalCommitIntervalMs" << cmdLine.journalCommitInterval;
+ return b.obj();
}
BSONObj Stats::asObj() {
@@ -123,14 +171,22 @@ namespace mongo {
}
void DurableImpl::setNoJournal(void *dst, void *src, unsigned len) {
+ // we are at least read locked, so we need not worry about REMAPPRIVATEVIEW herein.
+ DEV dbMutex.assertAtLeastReadLocked();
+
MemoryMappedFile::makeWritable(dst, len);
+ // we enter the RecoveryJob mutex here, so that if WRITETODATAFILES is happening we do not
+ // conflict with it
+ scoped_lock lk1( RecoveryJob::get()._mx );
+
// we stay in this mutex for everything to work with DurParanoid/validateSingleMapMatches
//
- // this also makes setNoJournal threadsafe, which is good as we call it from a read (not a write) lock
- // in class SlaveTracking
+ // either of these mutexes also makes setNoJournal threadsafe, which is good as we call it from a read
+ // (not a write) lock in class SlaveTracking
//
scoped_lock lk( privateViews._mutex() );
+
size_t ofs;
MongoMMF *f = privateViews.find_inlock(dst, ofs);
assert(f);
@@ -171,7 +227,7 @@ namespace mongo {
}
bool DurableImpl::awaitCommit() {
- commitJob.awaitNextCommit();
+ commitJob._notify.awaitBeyondNow();
return true;
}
@@ -211,7 +267,15 @@ namespace mongo {
return p;
}
+ bool DurableImpl::aCommitIsNeeded() const {
+ DEV commitJob._nSinceCommitIfNeededCall = 0;
+ return commitJob.bytes() > UncommittedBytesLimit;
+ }
+
bool DurableImpl::commitIfNeeded() {
+ if ( ! dbMutex.isWriteLocked() ) // we implicitly commit if needed when releasing write lock
+ return false;
+
DEV commitJob._nSinceCommitIfNeededCall = 0;
if (commitJob.bytes() > UncommittedBytesLimit) { // should this also fire if CmdLine::DurAlwaysCommit?
stats.curr->_earlyCommits++;
@@ -259,7 +323,7 @@ namespace mongo {
return;
}
}
- log() << "dur data after write area " << i.start() << " does not agree" << endl;
+ log() << "journal data after write area " << i.start() << " does not agree" << endl;
log() << " was: " << ((void*)b) << " " << hexdump((char*)b, 8) << endl;
log() << " now: " << ((void*)a) << " " << hexdump((char*)a, 8) << endl;
log() << " n: " << n << endl;
@@ -268,15 +332,6 @@ namespace mongo {
}
#endif
- /** write the buffer we have built to the journal and fsync it.
- outside of lock as that could be slow.
- */
- static void WRITETOJOURNAL(AlignedBuilder& ab) {
- Timer t;
- journal(ab);
- stats.curr->_writeToJournalMicros += t.micros();
- }
-
// Functor to be called over all MongoFiles
class validateSingleMapMatches {
@@ -285,8 +340,8 @@ namespace mongo {
void operator () (MongoFile *mf) {
if( mf->isMongoMMF() ) {
MongoMMF *mmf = (MongoMMF*) mf;
- const char *p = (const char *) mmf->getView();
- const char *w = (const char *) mmf->view_write();
+ const unsigned char *p = (const unsigned char *) mmf->getView();
+ const unsigned char *w = (const unsigned char *) mmf->view_write();
if (!p || !w) return; // File not fully opened yet
@@ -310,6 +365,8 @@ namespace mongo {
log() << endl; // separate blocks of mismatches
lastMismatch= i;
if( ++logged < 60 ) {
+ if( logged == 1 )
+ log() << "ofs % 628 = 0x" << hex << (i%628) << endl; // for .ns files to find offset in record
stringstream ss;
ss << "mismatch ofs:" << hex << i << "\tfilemap:" << setw(2) << (unsigned) w[i] << "\tprivmap:" << setw(2) << (unsigned) p[i];
if( p[i] > 32 && p[i] <= 126 )
@@ -324,7 +381,7 @@ namespace mongo {
}
if( low != 0xffffffff ) {
std::stringstream ss;
- ss << "dur error warning views mismatch " << mmf->filename() << ' ' << (hex) << low << ".." << high << " len:" << high-low+1;
+ ss << "journal error warning views mismatch " << mmf->filename() << ' ' << (hex) << low << ".." << high << " len:" << high-low+1;
log() << ss.str() << endl;
log() << "priv loc: " << (void*)(p+low) << ' ' << endl;
set<WriteIntent>& b = commitJob.writes();
@@ -357,6 +414,9 @@ namespace mongo {
Call within write lock.
*/
void _REMAPPRIVATEVIEW() {
+ // todo: Consider using ProcessInfo herein and watching for getResidentSize to drop. that could be a way
+ // to assure very good behavior here.
+
static unsigned startAt;
static unsigned long long lastRemap;
@@ -370,9 +430,11 @@ namespace mongo {
// remapping.
unsigned long long now = curTimeMicros64();
double fraction = (now-lastRemap)/2000000.0;
+ if( cmdLine.durOptions & CmdLine::DurAlwaysRemap )
+ fraction = 1;
lastRemap = now;
- rwlock lk(MongoFile::mmmutex, false);
+ RWLockRecursive::Shared lk(MongoFile::mmmutex);
set<MongoFile*>& files = MongoFile::getAllFiles();
unsigned sz = files.size();
if( sz == 0 )
@@ -422,11 +484,79 @@ namespace mongo {
stats.curr->_remapPrivateViewMicros += t.micros();
}
+ // lock order: dbMutex first, then this
mutex groupCommitMutex("groupCommit");
- /** locking: in read lock when called. */
+ bool _groupCommitWithLimitedLocks() {
+ scoped_ptr<readlocktry> lk1( new readlocktry("", 500) );
+ if( !lk1->got() )
+ return false;
+
+ scoped_lock lk2(groupCommitMutex);
+
+ commitJob.beginCommit();
+
+ if( !commitJob.hasWritten() ) {
+ // getlasterror request could have came after the data was already committed
+ commitJob.notifyCommitted();
+ return true;
+ }
+ JSectHeader h;
+ PREPLOGBUFFER(h);
+
+ RWLockRecursive::Shared lk3(MongoFile::mmmutex);
+
+ unsigned abLen = commitJob._ab.len();
+ commitJob.reset(); // must be reset before allowing anyone to write
+ DEV assert( !commitJob.hasWritten() );
+
+ // release the readlock -- allowing others to now write while we are writing to the journal (etc.)
+ lk1.reset();
+
+ // ****** now other threads can do writes ******
+ WRITETOJOURNAL(h, commitJob._ab);
+ assert( abLen == commitJob._ab.len() ); // a check that no one touched the builder while we were doing work. if so, our locking is wrong.
+
+ // data is now in the journal, which is sufficient for acknowledging getLastError.
+ // (ok to crash after that)
+ commitJob.notifyCommitted();
+
+ WRITETODATAFILES(h, commitJob._ab);
+ assert( abLen == commitJob._ab.len() ); // check again wasn't modded
+ commitJob._ab.reset();
+
+ // can't : dbMutex._remapPrivateViewRequested = true;
+
+ return true;
+ }
+
+ /** @return true if committed; false if lock acquisition timed out (we only try for a read lock herein and only wait for a certain duration). */
+ bool groupCommitWithLimitedLocks() {
+ try {
+ return _groupCommitWithLimitedLocks();
+ }
+ catch(DBException& e ) {
+ log() << "dbexception in groupCommitLL causing immediate shutdown: " << e.toString() << endl;
+ mongoAbort("dur1");
+ }
+ catch(std::ios_base::failure& e) {
+ log() << "ios_base exception in groupCommitLL causing immediate shutdown: " << e.what() << endl;
+ mongoAbort("dur2");
+ }
+ catch(std::bad_alloc& e) {
+ log() << "bad_alloc exception in groupCommitLL causing immediate shutdown: " << e.what() << endl;
+ mongoAbort("dur3");
+ }
+ catch(std::exception& e) {
+ log() << "exception in dur::groupCommitLL causing immediate shutdown: " << e.what() << endl;
+ mongoAbort("dur4");
+ }
+ return false;
+ }
+
+ /** locking: in read lock when called. */
static void _groupCommit() {
- stats.curr->_commits++;
+ commitJob.beginCommit();
if( !commitJob.hasWritten() ) {
// getlasterror request could have came after the data was already committed
@@ -438,20 +568,23 @@ namespace mongo {
// (and we are only read locked in the dbMutex, so it could happen)
scoped_lock lk(groupCommitMutex);
- PREPLOGBUFFER();
+ JSectHeader h;
+ PREPLOGBUFFER(h);
// todo : write to the journal outside locks, as this write can be slow.
// however, be careful then about remapprivateview as that cannot be done
// if new writes are then pending in the private maps.
- WRITETOJOURNAL(commitJob._ab);
+ WRITETOJOURNAL(h, commitJob._ab);
// data is now in the journal, which is sufficient for acknowledging getLastError.
// (ok to crash after that)
commitJob.notifyCommitted();
- WRITETODATAFILES();
+ WRITETODATAFILES(h, commitJob._ab);
+ debugValidateAllMapsMatch();
commitJob.reset();
+ commitJob._ab.reset();
// REMAPPRIVATEVIEW
//
@@ -463,7 +596,7 @@ namespace mongo {
// this needs done in a write lock (as there is a short window during remapping when each view
// might not exist) thus we do it on the next acquisition of that instead of here (there is no
// rush if you aren't writing anyway -- but it must happen, if it is done, before any uncommitted
- // writes occur). If desired, perhpas this can be eliminated on posix as it may be that the remap
+ // writes occur). If desired, perhaps this can be eliminated on posix as it may be that the remap
// is race-free there.
//
dbMutex._remapPrivateViewRequested = true;
@@ -478,7 +611,8 @@ namespace mongo {
}
}
- /** locking in read lock when called
+ /** locking: in read lock when called
+ or, for early commits (commitIfNeeded), in write lock
@see MongoMMF::close()
*/
static void groupCommit() {
@@ -491,29 +625,33 @@ namespace mongo {
}
catch(DBException& e ) {
log() << "dbexception in groupCommit causing immediate shutdown: " << e.toString() << endl;
- abort();
+ mongoAbort("gc1");
}
catch(std::ios_base::failure& e) {
log() << "ios_base exception in groupCommit causing immediate shutdown: " << e.what() << endl;
- abort();
+ mongoAbort("gc2");
}
catch(std::bad_alloc& e) {
log() << "bad_alloc exception in groupCommit causing immediate shutdown: " << e.what() << endl;
- abort();
+ mongoAbort("gc3");
}
catch(std::exception& e) {
log() << "exception in dur::groupCommit causing immediate shutdown: " << e.what() << endl;
- abort(); // based on myTerminate()
+ mongoAbort("gc4");
}
}
static void go() {
- if( !commitJob.hasWritten() ){
- commitJob.notifyCommitted();
- return;
+ const int N = 10;
+ static int n;
+ if( privateMapBytes < UncommittedBytesLimit && ++n % N && (cmdLine.durOptions&CmdLine::DurAlwaysRemap)==0 ) {
+ // limited locks version doesn't do any remapprivateview at all, so only try this if privateMapBytes
+ // is in an acceptable range. also every Nth commit, we do everything so we can do some remapping;
+                // remapping a lot at once could cause jitter from a large burst of copy-on-writes.
+ if( groupCommitWithLimitedLocks() )
+ return;
}
-
- {
+ else {
readlocktry lk("", 1000);
if( lk.got() ) {
groupCommit();
@@ -542,45 +680,53 @@ namespace mongo {
else {
assert( inShutdown() );
if( commitJob.hasWritten() ) {
- log() << "dur warning files are closing outside locks with writes pending" << endl;
+ log() << "journal warning files are closing outside locks with writes pending" << endl;
}
}
}
- CodeBlock durThreadMain;
+ filesystem::path getJournalDir();
void durThread() {
- Client::initThread("dur");
- const int HowOftenToGroupCommitMs = 90;
+ Client::initThread("journal");
+
+ bool samePartition = true;
+ try {
+ const string dbpathDir = boost::filesystem::path(dbpath).native_directory_string();
+ samePartition = onSamePartition(getJournalDir().string(), dbpathDir);
+ }
+ catch(...) {
+ }
+
while( !inShutdown() ) {
- sleepmillis(10);
- CodeBlock::Within w(durThreadMain);
+ unsigned ms = cmdLine.journalCommitInterval;
+ if( ms == 0 ) {
+ // use default
+ ms = samePartition ? 100 : 30;
+ }
+
+ unsigned oneThird = (ms / 3) + 1; // +1 so never zero
+
try {
- int millis = HowOftenToGroupCommitMs;
- {
- stats.rotate();
- {
- Timer t;
- journalRotate(); // note we do this part outside of mongomutex
- millis -= t.millis();
- assert( millis <= HowOftenToGroupCommitMs );
- if( millis < 5 )
- millis = 5;
- }
+ stats.rotate();
- // we do this in a couple blocks, which makes it a tiny bit faster (only a little) on throughput,
- // but is likely also less spiky on our cpu usage, which is good:
- sleepmillis(millis/2);
- commitJob.wi()._deferred.invoke();
- sleepmillis(millis/2);
+                // we do this in a couple of blocks (the invoke()), which makes throughput a tiny bit faster
+                // and is likely also less spiky on our cpu usage, which is good.
+
+                // commit sooner if one or more getLastError j:true requests are pending
+ sleepmillis(oneThird);
+ for( unsigned i = 1; i <= 2; i++ ) {
+ if( commitJob._notify.nWaiting() )
+ break;
commitJob.wi()._deferred.invoke();
+ sleepmillis(oneThird);
}
go();
}
catch(std::exception& e) {
log() << "exception in durThread causing immediate shutdown: " << e.what() << endl;
- abort(); // based on myTerminate()
+ mongoAbort("exception in durThread");
}
}
cc().shutdown();
@@ -604,6 +750,19 @@ namespace mongo {
if( !cmdLine.dur )
return;
+#if defined(_DURABLEDEFAULTON)
+ DEV {
+ if( time(0) & 1 ) {
+ cmdLine.durOptions |= CmdLine::DurAlwaysCommit;
+ log() << "_DEBUG _DURABLEDEFAULTON : forcing DurAlwaysCommit mode for this run" << endl;
+ }
+ if( time(0) & 2 ) {
+ cmdLine.durOptions |= CmdLine::DurAlwaysRemap;
+ log() << "_DEBUG _DURABLEDEFAULTON : forcing DurAlwaysRemap mode for this run" << endl;
+ }
+ }
+#endif
+
DurableInterface::enableDurability();
journalMakeDir();
@@ -623,6 +782,13 @@ namespace mongo {
void DurableImpl::syncDataAndTruncateJournal() {
dbMutex.assertWriteLocked();
+ // a commit from the commit thread won't begin while we are in the write lock,
+ // but it may already be in progress and the end of that work is done outside
+ // (dbMutex) locks. This line waits for that to complete if already underway.
+ {
+ scoped_lock lk(groupCommitMutex);
+ }
+
groupCommit();
MongoFile::flushAll(true);
journalCleanup();
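
The new empty block in syncDataAndTruncateJournal deserves a second look: taking and immediately dropping groupCommitMutex is a barrier, not a critical section, because a group commit holds that mutex from PREPLOGBUFFER through WRITETODATAFILES. The idiom in isolation:

    {
        scoped_lock lk( groupCommitMutex );  // cannot be acquired until any
    }                                        // in-flight group commit finishes
    // past this point no commit that began earlier is still running, so the
    // groupCommit()/flushAll/journalCleanup sequence starts from a quiet state
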
diff --git a/db/dur.h b/db/dur.h
index a8035e4..f06ff50 100644
--- a/db/dur.h
+++ b/db/dur.h
@@ -9,6 +9,9 @@ namespace mongo {
class NamespaceDetails;
+ void mongoAbort(const char *msg);
+ void abort(); // not defined -- use mongoAbort() instead
+
namespace dur {
// a smaller limit is likely better on 32 bit
@@ -100,6 +103,9 @@ namespace mongo {
*/
virtual bool commitIfNeeded() = 0;
+ /** @return true if time to commit but does NOT do a commit */
+ virtual bool aCommitIsNeeded() const = 0;
+
/** Declare write intent for a DiskLoc. @see DiskLoc::writing() */
inline DiskLoc& writingDiskLoc(DiskLoc& d) { return *((DiskLoc*) writingPtr(&d, sizeof(d))); }
@@ -152,7 +158,7 @@ namespace mongo {
*/
Record* writing(Record* r);
/** Intentionally unimplemented method. BtreeBuckets are allocated in buffers larger than sizeof( BtreeBucket ). */
- BtreeBucket* writing( BtreeBucket* );
+// BtreeBucket* writing( BtreeBucket* );
/** Intentionally unimplemented method. NamespaceDetails may be based on references to 'Extra' objects. */
NamespaceDetails* writing( NamespaceDetails* );
@@ -174,6 +180,7 @@ namespace mongo {
bool awaitCommit() { return false; }
bool commitNow() { return false; }
bool commitIfNeeded() { return false; }
+ bool aCommitIsNeeded() const { return false; }
void setNoJournal(void *dst, void *src, unsigned len);
void syncDataAndTruncateJournal() {}
};
@@ -186,6 +193,7 @@ namespace mongo {
void createdFile(string filename, unsigned long long len);
bool awaitCommit();
bool commitNow();
+ bool aCommitIsNeeded() const;
bool commitIfNeeded();
void setNoJournal(void *dst, void *src, unsigned len);
void syncDataAndTruncateJournal();
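
Note the pairing at the top of this header: mongoAbort() is declared (and defined elsewhere), while abort() is declared but intentionally never defined, so an unqualified abort() call inside namespace mongo fails at link time and steers the caller to mongoAbort() — which is exactly what the gc1..gc4/dur1..dur4 call sites in dur.cpp now use. The trick in isolation:

    namespace mongo {
        void mongoAbort(const char *msg);  // real: records msg, then aborts
        void abort();                      // poison: no definition anywhere, so
                                           // stray abort() calls fail to link
    }
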
diff --git a/db/dur_commitjob.cpp b/db/dur_commitjob.cpp
index af77c4f..a459cd4 100644
--- a/db/dur_commitjob.cpp
+++ b/db/dur_commitjob.cpp
@@ -18,6 +18,7 @@
#include "pch.h"
#include "dur_commitjob.h"
+#include "dur_stats.h"
#include "taskqueue.h"
namespace mongo {
@@ -126,17 +127,24 @@ namespace mongo {
size_t privateMapBytes = 0; // used by _REMAPPRIVATEVIEW to track how much / how fast to remap
+ void CommitJob::beginCommit() {
+ DEV dbMutex.assertAtLeastReadLocked();
+ _commitNumber = _notify.now();
+ stats.curr->_commits++;
+ }
+
void CommitJob::reset() {
_hasWritten = false;
_wi.clear();
- _ab.reset();
privateMapBytes += _bytes;
_bytes = 0;
_nSinceCommitIfNeededCall = 0;
}
CommitJob::CommitJob() : _ab(4 * 1024 * 1024) , _hasWritten(false),
- _bytes(0), _nSinceCommitIfNeededCall(0) { }
+ _bytes(0), _nSinceCommitIfNeededCall(0) {
+ _commitNumber = 0;
+ }
void CommitJob::note(void* p, int len) {
// from the point of view of the dur module, it would be fine (i think) to only
@@ -149,7 +157,7 @@ namespace mongo {
if( !_hasWritten ) {
// you can't be writing if one of these is pending, so this is a verification.
- assert( !dbMutex._remapPrivateViewRequested );
+ assert( !dbMutex._remapPrivateViewRequested ); // safe to assert here since it must be the first write in a write lock
// we don't bother doing a group commit when nothing is written, so we have a var to track that
_hasWritten = true;
@@ -196,8 +204,11 @@ namespace mongo {
#if defined(_DEBUG)
_nSinceCommitIfNeededCall++;
if( _nSinceCommitIfNeededCall >= 80 ) {
- if( _nSinceCommitIfNeededCall % 40 == 0 )
+ if( _nSinceCommitIfNeededCall % 40 == 0 ) {
log() << "debug nsincecommitifneeded:" << _nSinceCommitIfNeededCall << " bytes:" << _bytes << endl;
+ if( _nSinceCommitIfNeededCall == 120 || _nSinceCommitIfNeededCall == 1200 )
+ printStackTrace();
+ }
}
#endif
if (_bytes > UncommittedBytesLimit * 3) {
diff --git a/db/dur_commitjob.h b/db/dur_commitjob.h
index 104d054..a5f8515 100644
--- a/db/dur_commitjob.h
+++ b/db/dur_commitjob.h
@@ -38,8 +38,8 @@ namespace mongo {
* since that is heavily used in set lookup.
*/
struct WriteIntent { /* copyable */
- WriteIntent() : w_ptr(0), p(0) { }
- WriteIntent(void *a, unsigned b) : w_ptr(0), p((char*)a+b), len(b) { }
+ WriteIntent() : /*w_ptr(0), */ p(0) { }
+ WriteIntent(void *a, unsigned b) : /*w_ptr(0), */ p((char*)a+b), len(b) { }
void* start() const { return (char*)p - len; }
void* end() const { return p; }
@@ -64,7 +64,7 @@ namespace mongo {
return (out << "p: " << wi.p << " end: " << wi.end() << " len: " << wi.len);
}
- mutable void *w_ptr; // writable mapping of p.
+ //mutable void *w_ptr; // writable mapping of p.
// mutable because set::iterator is const but this isn't used in op<
#if defined(_EXPERIMENTAL)
mutable unsigned ofsInJournalBuffer;
@@ -189,14 +189,10 @@ namespace mongo {
/** we use the commitjob object over and over, calling reset() rather than reconstructing */
void reset();
- /** the commit code calls this when data reaches the journal (on disk) */
- void notifyCommitted() { _notify.notifyAll(); }
+ void beginCommit();
- /** Wait until the next group commit occurs. That is, wait until someone calls notifyCommitted. */
- void awaitNextCommit() {
- if( hasWritten() )
- _notify.wait();
- }
+ /** the commit code calls this when data reaches the journal (on disk) */
+ void notifyCommitted() { _notify.notifyAll(_commitNumber); }
/** we check how much written and if it is getting to be a lot, we commit sooner. */
size_t bytes() const { return _bytes; }
@@ -207,11 +203,12 @@ namespace mongo {
Writes& wi() { return _wi; }
private:
+ NotifyAll::When _commitNumber;
bool _hasWritten;
Writes _wi; // todo: fix name
size_t _bytes;
- NotifyAll _notify; // for getlasterror fsync:true acknowledgements
public:
+ NotifyAll _notify; // for getlasterror fsync:true acknowledgements
unsigned _nSinceCommitIfNeededCall;
};
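
The reshuffle above replaces a bare notifyAll() with an epoch scheme: beginCommit() stamps the commit with _notify.now(), and notifyCommitted() releases only waiters whose epoch is at or below that stamp, so a getlasterror fsync:true waiter cannot be satisfied by a commit that began before its writes did. A minimal standalone sketch of such a notifier (C++11 primitives for brevity; mongo's actual NotifyAll in util/concurrency differs in detail):

    #include <mutex>
    #include <condition_variable>

    class EpochNotify { // sketch of an epoch-based notify-all
    public:
        typedef unsigned long long When;
        When now() {                  // stamp: "the next commit completed after this satisfies me"
            std::lock_guard<std::mutex> lk(_m);
            return ++_lastIssued;
        }
        void waitFor(When when) {     // block until a commit stamped >= when completes
            std::unique_lock<std::mutex> lk(_m);
            _cv.wait(lk, [&] { return _lastDone >= when; });
        }
        void notifyAll(When when) {   // committer passes the stamp taken in beginCommit()
            { std::lock_guard<std::mutex> lk(_m); _lastDone = when; }
            _cv.notify_all();
        }
    private:
        std::mutex _m;
        std::condition_variable _cv;
        When _lastIssued = 0, _lastDone = 0;
    };

A waiter captures now() before requesting the commit and then waitFor()s it; since the committer's own beginCommit() stamp is taken later, releasing everything up to that stamp is exactly right.
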
diff --git a/db/dur_journal.cpp b/db/dur_journal.cpp
index 946f94c..95a95c9 100644
--- a/db/dur_journal.cpp
+++ b/db/dur_journal.cpp
@@ -25,7 +25,7 @@
#include "../util/logfile.h"
#include "../util/timer.h"
#include "../util/alignedbuilder.h"
-#include "../util/message.h" // getelapsedtimemillis
+#include "../util/net/listen.h" // getelapsedtimemillis
#include "../util/concurrency/race.h"
#include <boost/static_assert.hpp>
#undef assert
@@ -33,6 +33,8 @@
#include "../util/mongoutils/str.h"
#include "dur_journalimpl.h"
#include "../util/file.h"
+#include "../util/checksum.h"
+#include "../util/compress.h"
using namespace mongoutils;
@@ -40,7 +42,25 @@ namespace mongo {
class AlignedBuilder;
+ unsigned goodRandomNumberSlow();
+
namespace dur {
+ // Rotate after reaching this data size in a journal (j._<n>) file
+ // We use a smaller size for 32 bit as the journal is mmapped during recovery (only)
+ // Note if you take a set of datafiles, including journal files, from 32->64 or vice-versa, it must
+ // work (and should, as-is).
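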
+ // --smallfiles makes the limit small.
+
+#if defined(_DEBUG)
+ unsigned long long DataLimitPerJournalFile = 128 * 1024 * 1024;
+#elif defined(__APPLE__)
+ // assuming a developer box if OS X
+ unsigned long long DataLimitPerJournalFile = 256 * 1024 * 1024;
+#else
+ unsigned long long DataLimitPerJournalFile = (sizeof(void*)==4) ? 256 * 1024 * 1024 : 1 * 1024 * 1024 * 1024;
+#endif
+
+ BOOST_STATIC_ASSERT( sizeof(Checksum) == 16 );
BOOST_STATIC_ASSERT( sizeof(JHeader) == 8192 );
BOOST_STATIC_ASSERT( sizeof(JSectHeader) == 20 );
BOOST_STATIC_ASSERT( sizeof(JSectFooter) == 32 );
@@ -61,8 +81,6 @@ namespace mongo {
return getJournalDir()/"lsn";
}
- extern CodeBlock durThreadMain;
-
/** this should be called when something really bad happens so that we can flag appropriately
*/
void journalingFailure(const char *msg) {
@@ -75,6 +93,35 @@ namespace mongo {
assert(false);
}
+ JSectFooter::JSectFooter() {
+ memset(this, 0, sizeof(*this));
+ sentinel = JEntry::OpCode_Footer;
+ }
+
+ JSectFooter::JSectFooter(const void* begin, int len) { // needs buffer to compute hash
+ sentinel = JEntry::OpCode_Footer;
+ reserved = 0;
+ magic[0] = magic[1] = magic[2] = magic[3] = '\n';
+
+ Checksum c;
+ c.gen(begin, (unsigned) len);
+ memcpy(hash, c.bytes, sizeof(hash));
+ }
+
+ bool JSectFooter::checkHash(const void* begin, int len) const {
+ if( !magicOk() ) {
+ log() << "journal footer not valid" << endl;
+ return false;
+ }
+ Checksum c;
+ c.gen(begin, len);
+ DEV log() << "checkHash len:" << len << " hash:" << toHex(hash, 16) << " current:" << toHex(c.bytes, 16) << endl;
+ if( memcmp(hash, c.bytes, sizeof(hash)) == 0 )
+ return true;
+ log() << "journal checkHash mismatch, got: " << toHex(c.bytes, 16) << " expected: " << toHex(hash,16) << endl;
+ return false;
+ }
+
JHeader::JHeader(string fname) {
magic[0] = 'j'; magic[1] = '\n';
_version = CurrentVersion;
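
The JSectFooter above replaces the old md5 digest with the 16-byte Checksum from util/checksum.h, computed over the section header plus the compressed body (the writer hashes that same range before appending the footer). A hedged standalone sketch of the verification shape, with a throwaway two-pass FNV digest standing in for the real Checksum:

    #include <cstring>
    #include <cstdint>

    // Placeholder 16-byte digest: two FNV-1a-style 64-bit passes.
    // mongo's real Checksum (util/checksum.h) differs; this only shows the shape.
    static void gen16(const void* p, size_t len, unsigned char out[16]) {
        const unsigned char* b = (const unsigned char*)p;
        uint64_t h1 = 1469598103934665603ULL, h2 = 1099511628211ULL;
        for (size_t i = 0; i < len; i++) {
            h1 = (h1 ^ b[i]) * 1099511628211ULL;
            h2 = (h2 ^ b[i]) * 14695981039346656037ULL;
        }
        memcpy(out, &h1, 8);
        memcpy(out + 8, &h2, 8);
    }

    // Verify a section: magic bytes first (cheap), then the hash over
    // header+body -- the same range the writer hashed before appending the footer.
    bool checkFooter(const unsigned char storedHash[16], const char magic[4],
                     const void* begin, size_t len) {
        if (memcmp(magic, "\n\n\n\n", 4) != 0) return false;  // not a valid footer
        unsigned char current[16];
        gen16(begin, len, current);
        return memcmp(storedHash, current, 16) == 0;
    }
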
@@ -85,21 +132,20 @@ namespace mongo {
strncpy(dbpath, fname.c_str(), sizeof(dbpath)-1);
{
fileId = t&0xffffffff;
- fileId |= ((unsigned long long)getRandomNumber()) << 32;
+ fileId |= ((unsigned long long)goodRandomNumberSlow()) << 32;
}
memset(reserved3, 0, sizeof(reserved3));
txt2[0] = txt2[1] = '\n';
n1 = n2 = n3 = n4 = '\n';
}
- // class Journal
-
Journal j;
const unsigned long long LsnShutdownSentinel = ~((unsigned long long)0);
Journal::Journal() :
_curLogFileMutex("JournalLfMutex") {
+ _ageOut = true;
_written = 0;
_nextFileNumber = 0;
_curLogFile = 0;
@@ -163,15 +209,20 @@ namespace mongo {
throw;
}
assert(!haveJournalFiles());
+
+ flushMyDirectory(getJournalDir() / "file"); // flushes parent of argument (in this case journal dir)
+
log(1) << "removeJournalFiles end" << endl;
}
/** at clean shutdown */
bool okToCleanUp = false; // successful recovery would set this to true
- void Journal::cleanup() {
+ void Journal::cleanup(bool _log) {
if( !okToCleanUp )
return;
+ if( _log )
+ log() << "journalCleanup..." << endl;
try {
scoped_lock lk(_curLogFileMutex);
closeCurrentJournalFile();
@@ -182,7 +233,7 @@ namespace mongo {
throw;
}
}
- void journalCleanup() { j.cleanup(); }
+ void journalCleanup(bool log) { j.cleanup(log); }
bool _preallocateIsFaster() {
bool faster = false;
@@ -215,21 +266,45 @@ namespace mongo {
return faster;
}
bool preallocateIsFaster() {
- return _preallocateIsFaster() && _preallocateIsFaster() && _preallocateIsFaster();
+ Timer t;
+ bool res = false;
+ if( _preallocateIsFaster() && _preallocateIsFaster() ) {
+ // maybe system is just super busy at the moment? sleep a second to let it calm down.
+ // deciding to prealloc is a medium-big decision:
+ sleepsecs(1);
+ res = _preallocateIsFaster();
+ }
+ if( t.millis() > 3000 )
+ log() << "preallocateIsFaster check took " << t.millis()/1000.0 << " secs" << endl;
+ return res;
}
// throws
void preallocateFile(filesystem::path p, unsigned long long len) {
if( exists(p) )
return;
+
+ log() << "preallocating a journal file " << p.string() << endl;
const unsigned BLKSZ = 1024 * 1024;
- log() << "preallocating a journal file " << p.string() << endl;
- LogFile f(p.string());
- AlignedBuilder b(BLKSZ);
- for( unsigned long long x = 0; x < len; x += BLKSZ ) {
- f.synchronousAppend(b.buf(), BLKSZ);
+ assert( len % BLKSZ == 0 );
+
+ AlignedBuilder b(BLKSZ);
+ memset((void*)b.buf(), 0, BLKSZ);
+
+ ProgressMeter m(len, 3/*secs*/, 10/*hits between time check (once every 6.4MB)*/);
+
+ File f;
+ f.open( p.string().c_str() , /*read-only*/false , /*direct-io*/false );
+ assert( f.is_open() );
+ fileofs loc = 0;
+ while ( loc < len ) {
+ f.write( loc , b.buf() , BLKSZ );
+ loc += BLKSZ;
+ m.hit(BLKSZ);
}
+ assert( loc == len );
+ f.fsync();
}
// throws
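
preallocateIsFaster() above no longer takes three back-to-back wins at face value: it requires two wins, sleeps a second in case the box is transiently busy, and then requires a deciding third. That retest-after-cool-down pattern in isolation (a sketch; probe is any cheap boolean benchmark):

    #include <chrono>
    #include <thread>

    // Run a noisy boolean benchmark with a cool-down before the deciding run,
    // so a transient burst of system load can't flip a medium-big decision.
    template <class Probe>
    bool stableResult(Probe probe) {
        if (!(probe() && probe()))
            return false;                 // two quick losses: the answer is no
        std::this_thread::sleep_for(std::chrono::seconds(1)); // let the box calm down
        return probe();                   // deciding run after the pause
    }
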
@@ -238,7 +313,7 @@ namespace mongo {
string fn = str::stream() << "prealloc." << i;
filesystem::path filepath = getJournalDir() / fn;
- unsigned long long limit = Journal::DataLimit;
+ unsigned long long limit = DataLimitPerJournalFile;
if( debug && i == 1 ) {
// moving 32->64, the prealloc files would be short. that is "ok", but we want to exercise that
// case, so we force exercising here when _DEBUG is set by arbitrarily stopping prealloc at a low
@@ -251,14 +326,14 @@ namespace mongo {
}
void preallocateFiles() {
- if( preallocateIsFaster() ||
- exists(getJournalDir()/"prealloc.0") || // if enabled previously, keep using
- exists(getJournalDir()/"prealloc.1") ) {
+ if( exists(getJournalDir()/"prealloc.0") || // if enabled previously, keep using
+ exists(getJournalDir()/"prealloc.1") ||
+ ( cmdLine.preallocj && preallocateIsFaster() ) ) {
usingPreallocate = true;
try {
_preallocateFiles();
}
- catch(...) {
+ catch(...) {
log() << "warning caught exception in preallocateFiles, continuing" << endl;
}
}
@@ -273,7 +348,19 @@ namespace mongo {
filesystem::path filepath = getJournalDir() / fn;
if( !filesystem::exists(filepath) ) {
// we can recycle this file into this prealloc file location
- boost::filesystem::rename(p, filepath);
+ filesystem::path temppath = getJournalDir() / (fn+".temp");
+ boost::filesystem::rename(p, temppath);
+ {
+ // zero the header
+ File f;
+ f.open(temppath.string().c_str(), false, false);
+ char buf[8192];
+ memset(buf, 0, 8192);
+ f.write(0, buf, 8192);
+ f.truncate(DataLimitPerJournalFile);
+ f.fsync();
+ }
+ boost::filesystem::rename(temppath, filepath);
return;
}
}
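
The recycle path above is arranged so a crash can never leave a half-scrubbed file under the final prealloc name: rename aside to a .temp name, zero the 8KB header and fsync, then rename into place. A POSIX-flavored sketch of the same sequence (std::filesystem and stdio here; the tree uses boost::filesystem and mongo's File class, and additionally truncates to the journal size limit, omitted below):

    #include <filesystem>
    #include <cstdio>
    #include <cstring>
    #include <unistd.h>   // fsync, fileno (POSIX)

    namespace fs = std::filesystem;

    // Recycle 'src' into 'dst' so a crash never leaves a stale header at 'dst'.
    bool recycleFile(const fs::path& src, const fs::path& dst) {
        fs::path temp = dst;
        temp += ".temp";
        fs::rename(src, temp);                  // step 1: move aside
        FILE* f = fopen(temp.string().c_str(), "r+b");
        if (!f) return false;
        char zeros[8192];
        memset(zeros, 0, sizeof(zeros));
        fwrite(zeros, 1, sizeof(zeros), f);     // step 2: scrub the old header
        fflush(f);
        fsync(fileno(f));                       // make the scrub durable
        fclose(f);
        fs::rename(temp, dst);                  // step 3: publish under the final name
        return true;
    }
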
@@ -385,7 +472,7 @@ namespace mongo {
if something highly surprising happens, throws to abort
*/
unsigned long long LSNFile::get() {
- uassert(13614, "unexpected version number of lsn file in journal/ directory", ver == 0);
+ uassert(13614, str::stream() << "unexpected version number of lsn file in journal/ directory, got: " << ver , ver == 0);
if( ~lsn != checkbytes ) {
log() << "lsnfile not valid. recovery will be from log start. lsn: " << hex << lsn << " checkbytes: " << hex << checkbytes << endl;
return 0;
@@ -396,12 +483,6 @@ namespace mongo {
/** called during recovery (the error message text below assumes that)
*/
unsigned long long journalReadLSN() {
- if( !debug ) {
- // in nondebug build, for now, be conservative until more tests written, and apply the whole journal.
- // however we will still write the lsn file to exercise that code, and use in _DEBUG build.
- return 0;
- }
-
if( !MemoryMappedFile::exists(lsnPath()) ) {
log() << "info no lsn file in journal/ directory" << endl;
return 0;
@@ -414,6 +495,11 @@ namespace mongo {
File f;
f.open(lsnPath().string().c_str());
assert(f.is_open());
+ if( f.len() == 0 ) {
+ // this could be 'normal' if we crashed at the right moment
+ log() << "info lsn file is zero bytes long" << endl;
+ return 0;
+ }
f.read(0,(char*)&L, sizeof(L));
unsigned long long lsn = L.get();
return lsn;
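
The lsn file protects itself with a stored complement: checkbytes must equal ~lsn, so a torn or zeroed write is caught and recovery simply replays from the start of the log. The self-checking record, reduced to a sketch (the real LSNFile additionally uasserts on an unexpected version):

    #include <cstdint>

    // Self-checking on-disk record: value plus its bitwise complement.
    // A torn write (partial zeros, garbage) fails the check, and the reader
    // falls back to a safe default instead of trusting a bad LSN.
    struct SelfCheckedLsn {
        uint64_t ver;         // format version, expected 0
        uint64_t lsn;
        uint64_t checkbytes;  // must be ~lsn

        void set(uint64_t x) { ver = 0; lsn = x; checkbytes = ~x; }

        // Returns the stored lsn, or 0 (replay everything) if invalid.
        uint64_t get() const {
            if (ver != 0) return 0;
            return (~lsn == checkbytes) ? lsn : 0;
        }
    };
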
@@ -434,7 +520,6 @@ namespace mongo {
void Journal::updateLSNFile() {
if( !_writeToLSNNeeded )
return;
- durThreadMain.assertWithin();
_writeToLSNNeeded = false;
try {
// os can flush as it likes. if it flushes slowly, we will just do extra work on recovery.
@@ -446,10 +531,12 @@ namespace mongo {
log() << "warning: open of lsn file failed" << endl;
return;
}
- log() << "lsn set " << _lastFlushTime << endl;
+ LOG(1) << "lsn set " << _lastFlushTime << endl;
LSNFile lsnf;
lsnf.set(_lastFlushTime);
f.write(0, (char*)&lsnf, sizeof(lsnf));
+ // do we want to fsync here? if we do it probably needs to be async so the durthread
+ // is not delayed.
}
catch(std::exception& e) {
log() << "warning: write to lsn file failed " << e.what() << endl;
@@ -502,32 +589,29 @@ namespace mongo {
}
}
- /** check if time to rotate files. assure a file is open.
- done separately from the journal() call as we can do this part
- outside of lock.
- thread: durThread()
- */
- void journalRotate() {
- j.rotate();
+ /*int getAgeOutJournalFiles() {
+ mutex::try_lock lk(j._curLogFileMutex, 4000);
+ if( !lk.ok )
+ return -1;
+ return j._ageOut ? 1 : 0;
+ }*/
+ void setAgeOutJournalFiles(bool a) {
+ scoped_lock lk(j._curLogFileMutex);
+ j._ageOut = a;
}
- void Journal::rotate() {
- assert( !dbMutex.atLeastReadLocked() );
- durThreadMain.assertWithin();
-
- scoped_lock lk(_curLogFileMutex);
+ void Journal::_rotate() {
if ( inShutdown() || !_curLogFile )
return;
j.updateLSNFile();
- if( _curLogFile && _written < DataLimit )
+ if( _curLogFile && _written < DataLimitPerJournalFile )
return;
if( _curLogFile ) {
-
+ _curLogFile->truncate();
closeCurrentJournalFile();
-
removeUnneededJournalFiles();
}
@@ -545,24 +629,74 @@ namespace mongo {
}
}
- /** write to journal
+ /** write (append) the buffer we have built to the journal and fsync it.
+ outside of dbMutex lock as this could be slow.
+ @param uncompressed - a buffer that will be written to the journal after compression
+ will not return until on disk
*/
- void journal(const AlignedBuilder& b) {
- j.journal(b);
- }
- void Journal::journal(const AlignedBuilder& b) {
+ void WRITETOJOURNAL(JSectHeader h, AlignedBuilder& uncompressed) {
+ Timer t;
+ j.journal(h, uncompressed);
+ stats.curr->_writeToJournalMicros += t.micros();
+ }
+ void Journal::journal(const JSectHeader& h, const AlignedBuilder& uncompressed) {
+ RACECHECK
+ static AlignedBuilder b(32*1024*1024);
+ /* buffer to journal will be
+ JSectHeader
+ compressed operations
+ JSectFooter
+ */
+ const unsigned headTailSize = sizeof(JSectHeader) + sizeof(JSectFooter);
+ const unsigned max = maxCompressedLength(uncompressed.len()) + headTailSize;
+ b.reset(max);
+
+ {
+ dassert( h.sectionLen() == (unsigned) 0xffffffff ); // we will backfill later
+ b.appendStruct(h);
+ }
+
+ size_t compressedLength = 0;
+ rawCompress(uncompressed.buf(), uncompressed.len(), b.cur(), &compressedLength);
+ assert( compressedLength < 0xffffffff );
+ assert( compressedLength < max );
+ b.skip(compressedLength);
+
+ // footer
+ unsigned L = 0xffffffff;
+ {
+ // pad to alignment, and set the total section length in the JSectHeader
+ assert( 0xffffe000 == (~(Alignment-1)) );
+ unsigned lenUnpadded = b.len() + sizeof(JSectFooter);
+ L = (lenUnpadded + Alignment-1) & (~(Alignment-1));
+ dassert( L >= lenUnpadded );
+
+ ((JSectHeader*)b.atOfs(0))->setSectionLen(lenUnpadded);
+
+ JSectFooter f(b.buf(), b.len()); // computes checksum
+ b.appendStruct(f);
+ dassert( b.len() == lenUnpadded );
+
+ b.skip(L - lenUnpadded);
+ dassert( b.len() % Alignment == 0 );
+ }
+
try {
mutex::scoped_lock lk(_curLogFileMutex);
// must already be open -- so that _curFileId is correct for previous buffer building
assert( _curLogFile );
- stats.curr->_journaledBytes += b.len();
- _written += b.len();
- _curLogFile->synchronousAppend((void *) b.buf(), b.len());
+ stats.curr->_uncompressedBytes += b.len();
+ unsigned w = b.len();
+ _written += w;
+ assert( w <= L );
+ stats.curr->_journaledBytes += L;
+ _curLogFile->synchronousAppend((const void *) b.buf(), L);
+ _rotate();
}
catch(std::exception& e) {
- log() << "warning exception in dur::journal " << e.what() << endl;
+ log() << "error exception in dur::journal " << e.what() << endl;
throw;
}
}
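
The padding block above leans on Alignment (8192) being a power of two: rounding up is (len + A-1) & ~(A-1), and the assert pins ~(Alignment-1) to 0xffffe000 for a 32-bit unsigned. A compile-time spot-check of that arithmetic (C++11 static_assert here; the tree itself uses BOOST_STATIC_ASSERT):

    #include <cstdint>

    constexpr uint32_t Alignment = 8192;  // must be a power of two

    constexpr uint32_t roundUpToAlignment(uint32_t len) {
        return (len + Alignment - 1) & ~(Alignment - 1);
    }

    static_assert(~(Alignment - 1) == 0xffffe000u, "mask matches the dassert above");
    static_assert(roundUpToAlignment(0)    == 0,     "zero is already aligned");
    static_assert(roundUpToAlignment(1)    == 8192,  "one byte pads to a full block");
    static_assert(roundUpToAlignment(8192) == 8192,  "an exact block needs no pad");
    static_assert(roundUpToAlignment(8193) == 16384, "one over pads to two blocks");
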
diff --git a/db/dur_journal.h b/db/dur_journal.h
index 81957b5..664f639 100644
--- a/db/dur_journal.h
+++ b/db/dur_journal.h
@@ -27,8 +27,12 @@ namespace mongo {
*/
extern bool okToCleanUp;
- /** at termination after db files closed & fsynced */
- void journalCleanup();
+ /** at termination after db files closed & fsynced
+ also after recovery
+ closes and removes journal files
+ @param log report in log that we are cleaning up if we actually do any work
+ */
+ void journalCleanup(bool log = false);
/** assure journal/ dir exists. throws */
void journalMakeDir();
@@ -40,12 +44,6 @@ namespace mongo {
*/
void journalRotate();
- /** write/append to journal file *
- @param buf - a buffer that will be written to the journal.
- will not return until on disk
- */
- void journal(const AlignedBuilder& buf);
-
/** flag that something has gone wrong during writing to the journal
(not for recovery mode)
*/
@@ -64,5 +62,7 @@ namespace mongo {
// in case disk controller buffers writes
const long long ExtraKeepTimeMs = 10000;
+ const unsigned JournalCommitIntervalDefault = 100;
+
}
}
diff --git a/db/dur_journalformat.h b/db/dur_journalformat.h
index d29f94d..10ed848 100644
--- a/db/dur_journalformat.h
+++ b/db/dur_journalformat.h
@@ -18,12 +18,12 @@
#pragma once
-#include "../util/md5.hpp"
-
namespace mongo {
namespace dur {
+ const unsigned Alignment = 8192;
+
#pragma pack(1)
/** beginning header for a journal/j._<n> file
there is nothing important in this header at this time, except perhaps the version #.
@@ -36,7 +36,11 @@ namespace mongo {
// x4142 is ascii-readable if you look at the file with head/less -- thus the starting values were near
// that. simply incrementing the version # is safe on a fwd basis.
- enum { CurrentVersion = 0x4147 };
+#if defined(_NOCOMPRESS)
+ enum { CurrentVersion = 0x4148 };
+#else
+ enum { CurrentVersion = 0x4149 };
+#endif
unsigned short _version;
// these are just for diagnostic ease (make header more useful as plain text)
@@ -57,11 +61,25 @@ namespace mongo {
/** "Section" header. A section corresponds to a group commit.
len is length of the entire section including header and footer.
+ header and footer are not compressed, just the stuff in between.
*/
struct JSectHeader {
- unsigned len; // length in bytes of the whole section
+ private:
+ unsigned _sectionLen; // unpadded length in bytes of the whole section
+ public:
unsigned long long seqNumber; // sequence number that can be used on recovery to not do too much work
unsigned long long fileId; // matches JHeader::fileId
+ unsigned sectionLen() const { return _sectionLen; }
+
+ // we store the unpadded length so we can use that when we uncompress. to
+ // get the true total size this must be rounded up to the Alignment.
+ void setSectionLen(unsigned lenUnpadded) { _sectionLen = lenUnpadded; }
+
+ unsigned sectionLenWithPadding() const {
+ unsigned x = (sectionLen() + (Alignment-1)) & (~(Alignment-1));
+ dassert( x % Alignment == 0 );
+ return x;
+ }
};
/** an individual write operation within a group commit section. Either the entire section should
@@ -113,31 +131,21 @@ namespace mongo {
/** group commit section footer. md5 is a key field. */
struct JSectFooter {
- JSectFooter(const void* begin, int len) { // needs buffer to compute hash
- sentinel = JEntry::OpCode_Footer;
- reserved = 0;
- magic[0] = magic[1] = magic[2] = magic[3] = '\n';
-
- // skip section header since size modified after hashing
- (const char*&)begin += sizeof(JSectHeader);
- len -= sizeof(JSectHeader);
-
- md5(begin, len, hash);
- }
+ JSectFooter();
+ JSectFooter(const void* begin, int len); // needs buffer to compute hash
unsigned sentinel;
- md5digest hash; // unsigned char[16]
+ unsigned char hash[16];
unsigned long long reserved;
char magic[4]; // "\n\n\n\n"
- bool checkHash(const void* begin, int len) const {
- // skip section header since size modified after hashing
- (const char*&)begin += sizeof(JSectHeader);
- len -= sizeof(JSectHeader);
- md5digest current;
- md5(begin, len, current);
- DEV log() << "checkHash len:" << len << " hash:" << toHex(hash, 16) << " current:" << toHex(current, 16) << endl;
- return (memcmp(hash, current, sizeof(hash)) == 0);
- }
+ /** used by recovery to see if buffer is valid
+ @param begin the buffer
+ @param len buffer len
+ @return true if buffer looks valid
+ */
+ bool checkHash(const void* begin, int len) const;
+
+ bool magicOk() const { return *((unsigned*)magic) == 0x0a0a0a0a; }
};
/** declares "the next entry(s) are for this database / file path prefix" */
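
magicOk(), declared above, compares the four newline bytes read as one unsigned against 0x0a0a0a0a; since all four bytes are identical the comparison is endian-neutral, making it a cheap sanity check before paying for the hash. A standalone demonstration (memcpy used to sidestep alignment/aliasing concerns that the in-struct cast avoids by layout):

    #include <cstring>
    #include <cassert>

    bool magicOk(const char magic[4]) {
        unsigned v;
        memcpy(&v, magic, 4);       // four identical bytes: byte order can't matter
        return v == 0x0a0a0a0au;    // '\n' == 0x0a in each byte
    }

    int main() {
        char good[4] = { '\n', '\n', '\n', '\n' };
        char bad[4]  = { '\n', '\n', '\n', 'x' };
        assert(magicOk(good) && !magicOk(bad));
    }
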
diff --git a/db/dur_journalimpl.h b/db/dur_journalimpl.h
index 9566dff..bf771c5 100644
--- a/db/dur_journalimpl.h
+++ b/db/dur_journalimpl.h
@@ -18,6 +18,7 @@
#pragma once
+#include "dur_journalformat.h"
#include "../util/logfile.h"
namespace mongo {
@@ -40,20 +41,14 @@ namespace mongo {
*/
void rotate();
- /** write to journal
+ /** append to the journal file
*/
- void journal(const AlignedBuilder& b);
+ void journal(const JSectHeader& h, const AlignedBuilder& b);
boost::filesystem::path getFilePathFor(int filenumber) const;
unsigned long long lastFlushTime() const { return _lastFlushTime; }
- void cleanup();
-
- // Rotate after reaching this data size in a journal (j._<n>) file
- // We use a smaller size for 32 bit as the journal is mmapped during recovery (only)
- // Note if you take a set of datafiles, including journal files, from 32->64 or vice-versa, it must
- // work. (and should as-is)
- static const unsigned long long DataLimit = (sizeof(void*)==4) ? 256 * 1024 * 1024 : 1 * 1024 * 1024 * 1024;
+ void cleanup(bool log); // closes and removes journal files
unsigned long long curFileId() const { return _curFileId; }
@@ -67,14 +62,21 @@ namespace mongo {
void open();
private:
+ /** check if time to rotate files. assure a file is open.
+ * internally called with every commit
+ */
+ void _rotate();
+
void _open();
void closeCurrentJournalFile();
void removeUnneededJournalFiles();
unsigned long long _written; // bytes written so far to the current journal (log) file
unsigned _nextFileNumber;
-
+ public:
mutex _curLogFileMutex;
+ bool _ageOut;
+ private:
LogFile *_curLogFile; // use _curLogFileMutex
unsigned long long _curFileId; // current file id see JHeader::fileId
diff --git a/db/dur_preplogbuffer.cpp b/db/dur_preplogbuffer.cpp
index 1648e89..0d8ef36 100644
--- a/db/dur_preplogbuffer.cpp
+++ b/db/dur_preplogbuffer.cpp
@@ -35,6 +35,7 @@
#include "../util/alignedbuilder.h"
#include "../util/timer.h"
#include "dur_stats.h"
+#include "../server.h"
using namespace mongoutils;
@@ -58,9 +59,8 @@ namespace mongo {
void prepBasicWrite_inlock(AlignedBuilder&bb, const WriteIntent *i, RelativePath& lastDbPath) {
size_t ofs = 1;
MongoMMF *mmf = findMMF_inlock(i->start(), /*out*/ofs);
- dassert( i->w_ptr == 0 );
- if( !mmf->willNeedRemap() ) {
+ if( unlikely(!mmf->willNeedRemap()) ) {
// tag this mmf as needing a remap of its private view later.
// usually it will already be dirty/already set, so we do the if above first
// to avoid possibility of cpu cache line contention
@@ -69,8 +69,13 @@ namespace mongo {
// since we have already looked up the mmf, we go ahead and remember the write view location
// so we don't have to find the MongoMMF again later in WRITETODATAFILES()
+ //
+ // this was for WRITETODATAFILES_Impl2, so it is commented out now
+ //
+ /*
dassert( i->w_ptr == 0 );
i->w_ptr = ((char*)mmf->view_write()) + ofs;
+ */
JEntry e;
e.len = min(i->length(), (unsigned)(mmf->length() - ofs)); // don't write past end of file
@@ -92,8 +97,8 @@ namespace mongo {
#endif
bb.appendBuf(i->start(), e.len);
- if (e.len != (unsigned)i->length()) {
- log() << "dur info splitting prepBasicWrite at boundary" << endl;
+ if (unlikely(e.len != (unsigned)i->length())) {
+ log() << "journal info splitting prepBasicWrite at boundary" << endl;
// This only happens if we write to the last byte in a file and
// the fist byte in another file that is mapped adjacently. I
@@ -120,23 +125,20 @@ namespace mongo {
}
}
- void resetLogBuffer(AlignedBuilder& bb) {
+ void resetLogBuffer(/*out*/JSectHeader& h, AlignedBuilder& bb) {
bb.reset();
- // JSectHeader
- JSectHeader h;
- h.len = (unsigned) 0xffffffff; // total length, will fill in later
+ h.setSectionLen(0xffffffff); // total length, will fill in later
h.seqNumber = getLastDataFileFlushTime();
h.fileId = j.curFileId();
-
- bb.appendStruct(h);
}
/** we will build an output buffer ourself and then use O_DIRECT
we could be in read lock for this
caller handles locking
+ @param h (out) partially populated sectheader; commitJob._ab is set as a side effect
*/
- void _PREPLOGBUFFER() {
+ void _PREPLOGBUFFER(JSectHeader& h) {
assert( cmdLine.dur );
{
@@ -148,7 +150,7 @@ namespace mongo {
}
AlignedBuilder& bb = commitJob._ab;
- resetLogBuffer(bb);
+ resetLogBuffer(h, bb); // adds JSectHeader
// ops other than basic writes (DurOp's)
{
@@ -157,34 +159,14 @@ namespace mongo {
}
}
- {
- prepBasicWrites(bb);
- }
-
- {
- JSectFooter f(bb.buf(), bb.len());
- bb.appendStruct(f);
- }
-
- {
- // pad to alignment, and set the total section length in the JSectHeader
- assert( 0xffffe000 == (~(Alignment-1)) );
- unsigned L = (bb.len() + Alignment-1) & (~(Alignment-1));
- dassert( L >= (unsigned) bb.len() );
-
- *((unsigned*)bb.atOfs(0)) = L;
-
- unsigned padding = L - bb.len();
- bb.skip(padding);
- dassert( bb.len() % Alignment == 0 );
- }
+ prepBasicWrites(bb);
return;
}
- void PREPLOGBUFFER() {
+ void PREPLOGBUFFER(/*out*/ JSectHeader& h) {
Timer t;
j.assureLogFileOpen(); // so fileId is set
- _PREPLOGBUFFER();
+ _PREPLOGBUFFER(h);
stats.curr->_prepLogBufferMicros += t.micros();
}
diff --git a/db/dur_recover.cpp b/db/dur_recover.cpp
index 1480a59..3c9fee7 100644
--- a/db/dur_recover.cpp
+++ b/db/dur_recover.cpp
@@ -19,6 +19,7 @@
#include "pch.h"
#include "dur.h"
+#include "dur_stats.h"
#include "dur_recover.h"
#include "dur_journal.h"
#include "dur_journalformat.h"
@@ -26,13 +27,16 @@
#include "namespace.h"
#include "../util/mongoutils/str.h"
#include "../util/bufreader.h"
+#include "../util/concurrency/race.h"
#include "pdfile.h"
#include "database.h"
#include "db.h"
#include "../util/unittest.h"
+#include "../util/checksum.h"
#include "cmdline.h"
#include "curop.h"
#include "mongommf.h"
+#include "../util/compress.h"
#include <sys/stat.h>
#include <fcntl.h>
@@ -90,62 +94,73 @@ namespace mongo {
throws
*/
class JournalSectionIterator : boost::noncopyable {
+ auto_ptr<BufReader> _entries;
+ const JSectHeader _h;
+ const char *_lastDbName; // pointer into mmaped journal file
+ const bool _doDurOps;
+ string _uncompressed;
public:
- JournalSectionIterator(const void *p, unsigned len, bool doDurOps)
- : _br(p, len)
- , _sectHead(static_cast<const JSectHeader*>(_br.skip(sizeof(JSectHeader))))
- , _lastDbName(NULL)
- , _doDurOps(doDurOps)
- {}
+ JournalSectionIterator(const JSectHeader& h, const void *compressed, unsigned compressedLen, bool doDurOpsRecovering) :
+ _h(h),
+ _lastDbName(0)
+ , _doDurOps(doDurOpsRecovering)
+ {
+ assert( doDurOpsRecovering );
+ bool ok = uncompress((const char *)compressed, compressedLen, &_uncompressed);
+ if( !ok ) {
+ // it should always be ok (i think?) as there is a previous check to see that the JSectFooter is ok
+ log() << "couldn't uncompress journal section" << endl;
+ msgasserted(15874, "couldn't uncompress journal section");
+ }
+ const char *p = _uncompressed.c_str();
+ assert( compressedLen == _h.sectionLen() - sizeof(JSectFooter) - sizeof(JSectHeader) );
+ _entries = auto_ptr<BufReader>( new BufReader(p, _uncompressed.size()) );
+ }
+
+ // we work with the uncompressed buffer when doing a WRITETODATAFILES (for speed)
+ JournalSectionIterator(const JSectHeader &h, const void *p, unsigned len) :
+ _entries( new BufReader((const char *) p, len) ),
+ _h(h),
+ _lastDbName(0)
+ , _doDurOps(false)
- bool atEof() const { return _br.atEof(); }
+ { }
- unsigned long long seqNumber() const { return _sectHead->seqNumber; }
+ bool atEof() const { return _entries->atEof(); }
+
+ unsigned long long seqNumber() const { return _h.seqNumber; }
/** get the next entry from the log. this function parses and combines JDbContext and JEntry's.
- * @return true if got an entry. false at successful end of section (and no entry returned).
* throws on premature end of section.
*/
- bool next(ParsedJournalEntry& e) {
+ void next(ParsedJournalEntry& e) {
unsigned lenOrOpCode;
- _br.read(lenOrOpCode);
+ _entries->read(lenOrOpCode);
if (lenOrOpCode > JEntry::OpCode_Min) {
switch( lenOrOpCode ) {
case JEntry::OpCode_Footer: {
- if (_doDurOps) {
- const char* pos = (const char*) _br.pos();
- pos -= sizeof(lenOrOpCode); // rewind to include OpCode
- const JSectFooter& footer = *(const JSectFooter*)pos;
- int len = pos - (char*)_sectHead;
- if (!footer.checkHash(_sectHead, len)) {
- massert(13594, str::stream() << "Journal checksum doesn't match. recorded: "
- << toHex(footer.hash, sizeof(footer.hash))
- << " actual: " << md5simpledigest(_sectHead, len)
- , false);
- }
- }
- return false; // false return value denotes end of section
+ assert( false );
}
case JEntry::OpCode_FileCreated:
case JEntry::OpCode_DropDb: {
e.dbName = 0;
- boost::shared_ptr<DurOp> op = DurOp::read(lenOrOpCode, _br);
+ boost::shared_ptr<DurOp> op = DurOp::read(lenOrOpCode, *_entries);
if (_doDurOps) {
e.op = op;
}
- return true;
+ return;
}
case JEntry::OpCode_DbContext: {
- _lastDbName = (const char*) _br.pos();
- const unsigned limit = std::min((unsigned)Namespace::MaxNsLen, _br.remaining());
+ _lastDbName = (const char*) _entries->pos();
+ const unsigned limit = std::min((unsigned)Namespace::MaxNsLen, _entries->remaining());
const unsigned len = strnlen(_lastDbName, limit);
massert(13533, "problem processing journal file during recovery", _lastDbName[len] == '\0');
- _br.skip(len+1); // skip '\0' too
- _br.read(lenOrOpCode);
+ _entries->skip(len+1); // skip '\0' too
+ _entries->read(lenOrOpCode); // read this for the fall through
}
// fall through as a basic operation always follows jdbcontext, and we don't have anything to return yet
@@ -157,18 +172,13 @@ namespace mongo {
// JEntry - a basic write
assert( lenOrOpCode && lenOrOpCode < JEntry::OpCode_Min );
- _br.rewind(4);
- e.e = (JEntry *) _br.skip(sizeof(JEntry));
+ _entries->rewind(4);
+ e.e = (JEntry *) _entries->skip(sizeof(JEntry));
e.dbName = e.e->isLocalDbContext() ? "local" : _lastDbName;
assert( e.e->len == lenOrOpCode );
- _br.skip(e.e->len);
- return true;
+ _entries->skip(e.e->len);
}
- private:
- BufReader _br;
- const JSectHeader* _sectHead;
- const char *_lastDbName; // pointer into mmaped journal file
- const bool _doDurOps;
+
};
static string fileName(const char* dbName, int fileNo) {
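
The uncompress()/rawCompress()/maxCompressedLength() calls used above come from util/compress.h; assuming (as the 2.0-era tree suggests) those are thin wrappers over snappy, the write-side sizing and the recovery-side guard round-trip like this standalone sketch:

    #include <snappy.h>
    #include <string>
    #include <vector>

    // Round-trip sketch of the journal's compress/uncompress pair, using
    // snappy directly; mongo's util/compress.h wrappers are assumed to map
    // onto these calls.
    bool roundTrip(const char* src, size_t len, std::string* restored) {
        std::vector<char> buf(snappy::MaxCompressedLength(len)); // worst-case size
        size_t clen = 0;
        snappy::RawCompress(src, len, &buf[0], &clen);           // write side
        return snappy::Uncompress(&buf[0], clen, restored);      // recovery side
    }
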
@@ -204,6 +214,11 @@ namespace mongo {
}
void RecoveryJob::write(const ParsedJournalEntry& entry) {
+ //TODO(mathias): look into making some of these dasserts
+ assert(entry.e);
+ assert(entry.dbName);
+ assert(strnlen(entry.dbName, MaxDatabaseNameLen) < MaxDatabaseNameLen);
+
const string fn = fileName(entry.dbName, entry.e->getFileNo());
MongoFile* file;
{
@@ -225,8 +240,12 @@ namespace mongo {
}
if ((entry.e->ofs + entry.e->len) <= mmf->length()) {
+ assert(mmf->view_write());
+ assert(entry.e->srcData());
+
void* dest = (char*)mmf->view_write() + entry.e->ofs;
memcpy(dest, entry.e->srcData(), entry.e->len);
+ stats.curr->_writeToDataFilesBytes += entry.e->len;
}
else {
massert(13622, "Trying to write past end of file in WRITETODATAFILES", _recovering);
@@ -278,27 +297,64 @@ namespace mongo {
log() << "END section" << endl;
}
- void RecoveryJob::processSection(const void *p, unsigned len) {
+ void RecoveryJob::processSection(const JSectHeader *h, const void *p, unsigned len, const JSectFooter *f) {
scoped_lock lk(_mx);
+ RACECHECK
+
+ /** todo: we should really verify the checksum to see that seqNumber is ok?
+ that is expensive maybe there is some sort of checksum of just the header
+ within the header itself
+ */
+ if( _recovering && _lastDataSyncedFromLastRun > h->seqNumber + ExtraKeepTimeMs ) {
+ if( h->seqNumber != _lastSeqMentionedInConsoleLog ) {
+ static int n;
+ if( ++n < 10 ) {
+ log() << "recover skipping application of section seq:" << h->seqNumber << " < lsn:" << _lastDataSyncedFromLastRun << endl;
+ }
+ else if( n == 10 ) {
+ log() << "recover skipping application of section more..." << endl;
+ }
+ _lastSeqMentionedInConsoleLog = h->seqNumber;
+ }
+ return;
+ }
- vector<ParsedJournalEntry> entries;
- JournalSectionIterator i(p, len, _recovering);
+ auto_ptr<JournalSectionIterator> i;
+ if( _recovering ) {
+ i = auto_ptr<JournalSectionIterator>(new JournalSectionIterator(*h, p, len, _recovering));
+ }
+ else {
+ i = auto_ptr<JournalSectionIterator>(new JournalSectionIterator(*h, /*after header*/p, /*w/out header*/len));
+ }
- //DEV log() << "recovery processSection seq:" << i.seqNumber() << endl;
- if( _recovering && _lastDataSyncedFromLastRun > i.seqNumber() + ExtraKeepTimeMs ) {
- if( i.seqNumber() != _lastSeqMentionedInConsoleLog ) {
- log() << "recover skipping application of section seq:" << i.seqNumber() << " < lsn:" << _lastDataSyncedFromLastRun << endl;
- _lastSeqMentionedInConsoleLog = i.seqNumber();
+ // we use a static so that we don't have to reallocate every time through. occasionally we
+ // go back to a small allocation so that if there were a spiky growth it won't stick forever.
+ static vector<ParsedJournalEntry> entries;
+ entries.clear();
+/** TEMP uncomment
+ RARELY OCCASIONALLY {
+ if( entries.capacity() > 2048 ) {
+ entries.shrink_to_fit();
+ entries.reserve(2048);
}
- return;
}
+*/
// first read all entries to make sure this section is valid
ParsedJournalEntry e;
- while( i.next(e) ) {
+ while( !i->atEof() ) {
+ i->next(e);
entries.push_back(e);
}
+ // after the entries check the footer checksum
+ if( _recovering ) {
+ assert( ((const char *)h) + sizeof(JSectHeader) == p );
+ if( !f->checkHash(h, len + sizeof(JSectHeader)) ) {
+ msgasserted(13594, "journal checksum doesn't match");
+ }
+ }
+
// got all the entries for one group commit. apply them:
applyEntries(entries);
}
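
The skip path above logs the first ten skipped sections individually and then a single "more..." line, so replaying a journal that is mostly already synced does not flood the console. The idiom in isolation (not thread-safe on its own; the real call site runs under the RecoveryJob mutex):

    #include <iostream>

    // Log the first N occurrences verbatim, then one summary line, then nothing.
    void logSkippedSection(unsigned long long seq) {
        static int n = 0;
        if (++n < 10)
            std::cout << "recover skipping application of section seq:" << seq << std::endl;
        else if (n == 10)
            std::cout << "recover skipping application of section more..." << std::endl;
    }
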
@@ -334,11 +390,16 @@ namespace mongo {
if( h.fileId != fileId ) {
if( debug || (cmdLine.durOptions & CmdLine::DurDumpJournal) ) {
log() << "Ending processFileBuffer at differing fileId want:" << fileId << " got:" << h.fileId << endl;
- log() << " sect len:" << h.len << " seqnum:" << h.seqNumber << endl;
+ log() << " sect len:" << h.sectionLen() << " seqnum:" << h.seqNumber << endl;
}
return true;
}
- processSection(br.skip(h.len), h.len);
+ unsigned slen = h.sectionLen();
+ unsigned dataLen = slen - sizeof(JSectHeader) - sizeof(JSectFooter);
+ const char *hdr = (const char *) br.skip(h.sectionLenWithPadding());
+ const char *data = hdr + sizeof(JSectHeader);
+ const char *footer = data + dataLen;
+ processSection((const JSectHeader*) hdr, data, dataLen, (const JSectFooter*) footer);
// ctrl c check
killCurrentOp.checkForInterrupt(false);
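
With header and footer now excluded from compression, processFileBuffer carves a padded on-disk section into three spans using only the unpadded length stored in the header. A sketch of that pointer arithmetic (the 20/32-byte sizes are the ones pinned by the BOOST_STATIC_ASSERTs in dur_journal.cpp):

    #include <cstdint>

    const unsigned kAlignment  = 8192;
    const unsigned kHeaderSize = 20;   // sizeof(JSectHeader) in this format
    const unsigned kFooterSize = 32;   // sizeof(JSectFooter)

    struct SectionView {
        const char* hdr;      // JSectHeader
        const char* data;     // compressed ops
        const char* footer;   // JSectFooter
        unsigned    dataLen;
        const char* next;     // start of the following section
    };

    SectionView slice(const char* sectionStart, unsigned sectionLenUnpadded) {
        SectionView v;
        v.hdr = sectionStart;
        v.data = v.hdr + kHeaderSize;
        v.dataLen = sectionLenUnpadded - kHeaderSize - kFooterSize;
        v.footer = v.data + v.dataLen;
        // on disk the section is padded out to the next alignment boundary
        unsigned padded = (sectionLenUnpadded + kAlignment - 1) & ~(kAlignment - 1);
        v.next = sectionStart + padded;
        return v;
    }
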
@@ -356,6 +417,17 @@ namespace mongo {
/** apply a specific journal file */
bool RecoveryJob::processFile(path journalfile) {
log() << "recover " << journalfile.string() << endl;
+
+ try {
+ if( boost::filesystem::file_size( journalfile.string() ) == 0 ) {
+ log() << "recover info " << journalfile.string() << " has zero length" << endl;
+ return true;
+ }
+ } catch(...) {
+ // if something weird like a permissions problem occurs, keep going so the massert down below can happen (presumably)
+ log() << "recover exception checking filesize" << endl;
+ }
+
MemoryMappedFile f;
void *p = f.mapWithOptions(journalfile.string().c_str(), MongoFile::READONLY | MongoFile::SEQUENTIAL);
massert(13544, str::stream() << "recover error couldn't open " << journalfile.string(), p);
@@ -371,13 +443,19 @@ namespace mongo {
_lastDataSyncedFromLastRun = journalReadLSN();
log() << "recover lsn: " << _lastDataSyncedFromLastRun << endl;
+ // todo: we could truncate the journal file at rotation time to the right length, then this abruptEnd
+ // check can be turned back on. this is relevant when prealloc is being used.
for( unsigned i = 0; i != files.size(); ++i ) {
- /*bool abruptEnd = */processFile(files[i]);
- /*if( abruptEnd && i+1 < files.size() ) {
+ bool abruptEnd = processFile(files[i]);
+ if( abruptEnd && i+1 < files.size() ) {
+#if 1 // Leaving this as a warning for now. TODO: make this an error post 2.0
+ log() << "recover warning: abrupt end to file " << files[i].string() << ", yet it isn't the last journal file" << endl;
+#else
log() << "recover error: abrupt end to file " << files[i].string() << ", yet it isn't the last journal file" << endl;
close();
uasserted(13535, "recover abrupt journal file end");
- }*/
+#endif
+ }
}
close();
diff --git a/db/dur_recover.h b/db/dur_recover.h
index 1022fdc..955e730 100644
--- a/db/dur_recover.h
+++ b/db/dur_recover.h
@@ -2,6 +2,7 @@
#pragma once
+#include "dur_journalformat.h"
#include "../util/concurrency/mutex.h"
#include "../util/file.h"
@@ -15,10 +16,14 @@ namespace mongo {
*/
class RecoveryJob : boost::noncopyable {
public:
- RecoveryJob() :_lastDataSyncedFromLastRun(0), _mx("recovery"), _recovering(false) { _lastSeqMentionedInConsoleLog = 1; }
+ RecoveryJob() : _lastDataSyncedFromLastRun(0),
+ _mx("recovery"), _recovering(false) { _lastSeqMentionedInConsoleLog = 1; }
void go(vector<path>& files);
~RecoveryJob();
- void processSection(const void *, unsigned len);
+
+ /** @param data data between header and footer. compressed if recovering. */
+ void processSection(const JSectHeader *h, const void *data, unsigned len, const JSectFooter *f);
+
void close(); // locks and calls _close()
static RecoveryJob & get() { return _instance; }
@@ -34,9 +39,9 @@ namespace mongo {
unsigned long long _lastDataSyncedFromLastRun;
unsigned long long _lastSeqMentionedInConsoleLog;
-
- mongo::mutex _mx; // protects _mmfs
-
+ public:
+ mongo::mutex _mx; // protects _mmfs; see setNoJournal() too
+ private:
bool _recovering; // are we in recovery or WRITETODATAFILES
static RecoveryJob &_instance;
diff --git a/db/dur_stats.h b/db/dur_stats.h
index 5f5a188..50a26d1 100644
--- a/db/dur_stats.h
+++ b/db/dur_stats.h
@@ -13,11 +13,14 @@ namespace mongo {
unsigned _intervalMicros;
struct S {
BSONObj _asObj();
+ string _asCSV();
+ string _CSVHeader();
void reset();
unsigned _commits;
unsigned _earlyCommits; // count of early commits from commitIfNeeded() or from getDur().commitNow()
unsigned long long _journaledBytes;
+ unsigned long long _uncompressedBytes;
unsigned long long _writeToDataFilesBytes;
unsigned long long _prepLogBufferMicros;
diff --git a/db/dur_writetodatafiles.cpp b/db/dur_writetodatafiles.cpp
index 50797ea..6724f07 100644
--- a/db/dur_writetodatafiles.cpp
+++ b/db/dur_writetodatafiles.cpp
@@ -47,11 +47,13 @@ namespace mongo {
@see https://docs.google.com/drawings/edit?id=1TklsmZzm7ohIZkwgeK6rMvsdaR13KjtJYMsfLr175Zc&hl=en
*/
- void WRITETODATAFILES_Impl1() {
- RecoveryJob::get().processSection(commitJob._ab.buf(), commitJob._ab.len());
+ void WRITETODATAFILES_Impl1(const JSectHeader& h, AlignedBuilder& uncompressed) {
+ RWLockRecursive::Shared lk(MongoFile::mmmutex);
+ RecoveryJob::get().processSection(&h, uncompressed.buf(), uncompressed.len(), 0);
}
- // the old implementation
+#if 0
+ // the old implementation. doesn't work with groupCommitWithLimitedLocks()
void WRITETODATAFILES_Impl2() {
/* we go backwards as what is at the end is most likely in the cpu cache. it won't be much, but we'll take it. */
for( set<WriteIntent>::const_iterator it(commitJob.writes().begin()), end(commitJob.writes().end()); it != end; ++it ) {
@@ -61,8 +63,10 @@ namespace mongo {
memcpy(intent.w_ptr, intent.start(), intent.length());
}
}
+#endif
#if defined(_EXPERIMENTAL)
+ // doesn't work with groupCommitWithLimitedLocks()
void WRITETODATAFILES_Impl3() {
/* we go backwards as what is at the end is most likely in the cpu cache. it won't be much, but we'll take it. */
for( set<WriteIntent>::const_iterator it(commitJob.writes().begin()), end(commitJob.writes().end()); it != end; ++it ) {
@@ -76,23 +80,15 @@ namespace mongo {
}
#endif
- void WRITETODATAFILES() {
- dbMutex.assertAtLeastReadLocked();
-
- MongoFile::markAllWritable(); // for _DEBUG. normally we don't write in a read lock
-
+ // concurrency: in mmmutex, not necessarily in dbMutex
+ void WRITETODATAFILES(const JSectHeader& h, AlignedBuilder& uncompressed) {
Timer t;
#if defined(_EXPERIMENTAL)
WRITETODATAFILES_Impl3();
#else
- WRITETODATAFILES_Impl1();
+ WRITETODATAFILES_Impl1(h, uncompressed);
#endif
stats.curr->_writeToDataFilesMicros += t.micros();
-
- if (!dbMutex.isWriteLocked())
- MongoFile::unmarkAllWritable();
-
- debugValidateAllMapsMatch();
}
}
diff --git a/db/durop.cpp b/db/durop.cpp
index 344b21e..80ee504 100644
--- a/db/durop.cpp
+++ b/db/durop.cpp
@@ -48,7 +48,7 @@ namespace mongo {
op = shared_ptr<DurOp>( new DropDbOp(br) );
break;
default:
- massert(13546, (str::stream() << "dur recover unrecognized opcode in journal " << opcode), false);
+ massert(13546, (str::stream() << "journal recover: unrecognized opcode in journal " << opcode), false);
}
return op;
}
@@ -152,6 +152,7 @@ namespace mongo {
ofs += w;
}
f.fsync();
+ flushMyDirectory(full);
massert(13628, str::stream() << "recover failure writing file " << full, !f.bad() );
}
diff --git a/db/durop.h b/db/durop.h
index c4574c2..9ab1bfc 100644
--- a/db/durop.h
+++ b/db/durop.h
@@ -28,8 +28,6 @@ namespace mongo {
namespace dur {
- const unsigned Alignment = 8192;
-
/** DurOp - Operations we journal that aren't just basic writes.
*
* Basic writes are logged as JEntry's, and indicated in ram temporarily as struct dur::WriteIntent.
diff --git a/db/extsort.cpp b/db/extsort.cpp
index 2e6d8d8..0cc36f1 100644
--- a/db/extsort.cpp
+++ b/db/extsort.cpp
@@ -27,11 +27,12 @@
namespace mongo {
- BSONObj BSONObjExternalSorter::extSortOrder;
+ IndexInterface *BSONObjExternalSorter::extSortIdxInterface;
+ Ordering BSONObjExternalSorter::extSortOrder( Ordering::make(BSONObj()) );
unsigned long long BSONObjExternalSorter::_compares = 0;
- BSONObjExternalSorter::BSONObjExternalSorter( const BSONObj & order , long maxFileSize )
- : _order( order.getOwned() ) , _maxFilesize( maxFileSize ) ,
+ BSONObjExternalSorter::BSONObjExternalSorter( IndexInterface &i, const BSONObj & order , long maxFileSize )
+ : _idxi(i), _order( order.getOwned() ) , _maxFilesize( maxFileSize ) ,
_arraySize(1000000), _cur(0), _curSizeSoFar(0), _sorted(0) {
stringstream rootpath;
@@ -52,7 +53,6 @@ namespace mongo {
delete _cur;
_cur = 0;
}
-
unsigned long removed = remove_all( _root );
wassert( removed == 1 + _files.size() );
}
@@ -61,7 +61,8 @@ namespace mongo {
// extSortComp needs to use globals
// qsort_r only seems available on bsd, which is what i really want to use
dblock l;
- extSortOrder = _order;
+ extSortIdxInterface = &_idxi;
+ extSortOrder = Ordering::make(_order);
_cur->sort( BSONObjExternalSorter::extSortComp );
}
@@ -147,7 +148,7 @@ namespace mongo {
// ---------------------------------
BSONObjExternalSorter::Iterator::Iterator( BSONObjExternalSorter * sorter ) :
- _cmp( sorter->_order ) , _in( 0 ) {
+ _cmp( sorter->_idxi, sorter->_order ) , _in( 0 ) {
for ( list<string>::iterator i=sorter->_files.begin(); i!=sorter->_files.end(); i++ ) {
_files.push_back( new FileIterator( *i ) );
@@ -158,8 +159,6 @@ namespace mongo {
_in = sorter->_cur;
_it = sorter->_cur->begin();
}
-
-
}
BSONObjExternalSorter::Iterator::~Iterator() {
diff --git a/db/extsort.h b/db/extsort.h
index c0791db..ae6a334 100644
--- a/db/extsort.h
+++ b/db/extsort.h
@@ -26,27 +26,47 @@
namespace mongo {
-
/**
- for sorting by BSONObj and attaching a value
+ for external (disk) sorting by BSONObj and attaching a value
*/
class BSONObjExternalSorter : boost::noncopyable {
public:
-
+ BSONObjExternalSorter( IndexInterface &i, const BSONObj & order = BSONObj() , long maxFileSize = 1024 * 1024 * 100 );
+ ~BSONObjExternalSorter();
typedef pair<BSONObj,DiskLoc> Data;
-
+
private:
- static BSONObj extSortOrder;
+ IndexInterface& _idxi;
- static int extSortComp( const void *lv, const void *rv ) {
+ static int _compare(IndexInterface& i, const Data& l, const Data& r, const Ordering& order) {
RARELY killCurrentOp.checkForInterrupt();
_compares++;
+ int x = i.keyCompare(l.first, r.first, order);
+ if ( x )
+ return x;
+ return l.second.compare( r.second );
+ }
+
+ class MyCmp {
+ public:
+ MyCmp( IndexInterface& i, BSONObj order = BSONObj() ) : _i(i), _order( Ordering::make(order) ) {}
+ bool operator()( const Data &l, const Data &r ) const {
+ return _compare(_i, l, r, _order) < 0;
+ };
+ private:
+ IndexInterface& _i;
+ const Ordering _order;
+ };
+
+ static IndexInterface *extSortIdxInterface;
+ static Ordering extSortOrder;
+ static int extSortComp( const void *lv, const void *rv ) {
+ DEV RARELY {
+ dbMutex.assertWriteLocked(); // must be as we use a global var
+ }
Data * l = (Data*)lv;
Data * r = (Data*)rv;
- int cmp = l->first.woCompare( r->first , extSortOrder );
- if ( cmp )
- return cmp;
- return l->second.compare( r->second );
+ return _compare(*extSortIdxInterface, *l, *r, extSortOrder);
};
class FileIterator : boost::noncopyable {
@@ -61,22 +81,6 @@ namespace mongo {
char * _end;
};
- class MyCmp {
- public:
- MyCmp( const BSONObj & order = BSONObj() ) : _order( order ) {}
- bool operator()( const Data &l, const Data &r ) const {
- RARELY killCurrentOp.checkForInterrupt();
- _compares++;
- int x = l.first.woCompare( r.first , _order );
- if ( x )
- return x < 0;
- return l.second.compare( r.second ) < 0;
- };
-
- private:
- BSONObj _order;
- };
-
public:
typedef FastArray<Data> InMemory;
@@ -99,9 +103,6 @@ namespace mongo {
};
- BSONObjExternalSorter( const BSONObj & order = BSONObj() , long maxFileSize = 1024 * 1024 * 100 );
- ~BSONObjExternalSorter();
-
void add( const BSONObj& o , const DiskLoc & loc );
void add( const BSONObj& o , int a , int b ) {
add( o , DiskLoc( a , b ) );
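
The "qsort_r only seems available on bsd" comment above is why extSortComp still routes through the extSortIdxInterface/extSortOrder globals (guarded by the write-lock assert): a plain qsort comparator cannot carry state. MyCmp already shows the alternative used for the merge phase; the same stateful-functor shape with std::sort, as a standalone sketch with stand-in types:

    #include <algorithm>
    #include <string>
    #include <utility>
    #include <vector>

    typedef std::pair<std::string, int> Data;   // stand-in for (key, DiskLoc)

    struct CmpContext {                         // stand-in for IndexInterface + Ordering
        int compare(const Data& l, const Data& r) const {
            int x = l.first.compare(r.first);
            if (x) return x;
            return l.second - r.second;         // tiebreak, like DiskLoc::compare
        }
    };

    struct StatefulCmp {                        // comparator carries its context: no globals
        const CmpContext& ctx;
        StatefulCmp(const CmpContext& c) : ctx(c) {}
        bool operator()(const Data& l, const Data& r) const {
            return ctx.compare(l, r) < 0;
        }
    };

    void sortWithState(std::vector<Data>& v, const CmpContext& ctx) {
        std::sort(v.begin(), v.end(), StatefulCmp(ctx));  // no lock, no static state
    }
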
diff --git a/db/geo/2d.cpp b/db/geo/2d.cpp
index 7b2bf17..b873490 100644
--- a/db/geo/2d.cpp
+++ b/db/geo/2d.cpp
@@ -26,12 +26,31 @@
#include "../btree.h"
#include "../curop-inl.h"
#include "../matcher.h"
-
#include "core.h"
+// Note: we use indexinterface herein to talk to the btree code. In the future it would be nice to
+// be able to use the V1 key class (see key.h) instead of toBson() which has some cost.
+// toBson() is new with v1, so this could be slower than it used to be; a quick profiling
+// pass might make sense.
+
namespace mongo {
+ class GeoKeyNode {
+ GeoKeyNode();
+ public:
+ GeoKeyNode(DiskLoc r, BSONObj k) : recordLoc(r), _key(k) { }
+ const DiskLoc recordLoc;
+ const BSONObj _key;
+ };
+
+ // just use old indexes for geo for now. todo.
+// typedef BtreeBucket<V0> GeoBtreeBucket;
+// typedef GeoBtreeBucket::KeyNode GeoKeyNode;
+
+//#define BTREE btree<V0>
+
#if 0
+# define GEODEBUGGING
# define GEODEBUG(x) cout << x << endl;
# define GEODEBUGPRINT(x) PRINT(x)
inline void PREFIXDEBUG(GeoHash prefix, const GeoConvert* g) {
@@ -77,6 +96,8 @@ namespace mongo {
class Geo2dType : public IndexType , public GeoConvert {
public:
+ virtual ~Geo2dType() { }
+
Geo2dType( const IndexPlugin * plugin , const IndexSpec* spec )
: IndexType( plugin , spec ) {
@@ -98,34 +119,42 @@ namespace mongo {
uassert( 13024 , "no geo field specified" , _geo.size() );
- _bits = _configval( spec , "bits" , 26 ); // for lat/long, ~ 1ft
+ double bits = _configval( spec , "bits" , 26 ); // for lat/long, ~ 1ft
+
+ uassert( 13028 , "bits in geo index must be between 1 and 32" , bits > 0 && bits <= 32 );
- uassert( 13028 , "can't have more than 32 bits in geo index" , _bits <= 32 );
+ _bits = (unsigned) bits;
- _max = _configval( spec , "max" , 180 );
- _min = _configval( spec , "min" , -180 );
+ _max = _configval( spec , "max" , 180.0 );
+ _min = _configval( spec , "min" , -180.0 );
- _scaling = (1024*1024*1024*4.0)/(_max-_min);
+ double numBuckets = (1024 * 1024 * 1024 * 4.0);
+
+ _scaling = numBuckets / ( _max - _min );
_order = orderBuilder.obj();
GeoHash a(0, 0, _bits);
GeoHash b = a;
b.move(1, 1);
- _error = distance(a, b);
+
+ // Epsilon is 1/1000th of a bucket size
+ // TODO: Can we actually find error bounds for the sqrt function?
+ double epsilon = 0.001 / _scaling;
+ _error = distance(a, b) + epsilon;
+
+ // Error in radians
+ _errorSphere = deg2rad( _error );
}
- int _configval( const IndexSpec* spec , const string& name , int def ) {
+ double _configval( const IndexSpec* spec , const string& name , double def ) {
BSONElement e = spec->info[name];
- if ( e.isNumber() )
- return e.numberInt();
+ if ( e.isNumber() ) {
+ return e.numberDouble();
+ }
return def;
}
- ~Geo2dType() {
-
- }
-
virtual BSONObj fixKey( const BSONObj& in ) {
if ( in.firstElement().type() == BinData )
return in;
@@ -148,54 +177,132 @@ namespace mongo {
return b.obj();
}
- virtual void getKeys( const BSONObj &obj, BSONObjSetDefaultOrder &keys ) const {
- BSONElement geo = obj.getFieldDotted(_geo.c_str());
- if ( geo.eoo() )
- return;
+ /** Finds the key objects to put in an index */
+ virtual void getKeys( const BSONObj& obj, BSONObjSet& keys ) const {
+ getKeys( obj, &keys, NULL );
+ }
- BSONObjBuilder b(64);
+ /** Finds all locations in a geo-indexed object */
+ // TODO: Can we just return references to the locs, if they won't change?
+ void getKeys( const BSONObj& obj, vector< BSONObj >& locs ) const {
+ getKeys( obj, NULL, &locs );
+ }
- if ( ! geo.isABSONObj() )
- return;
+ /** Finds the key objects and/or locations for a geo-indexed object */
+ void getKeys( const BSONObj &obj, BSONObjSet* keys, vector< BSONObj >* locs ) const {
+
+ BSONElementMSet bSet;
+
+ // Get all the nested location fields, but don't return individual elements from
+ // the last array, if it exists.
+ obj.getFieldsDotted(_geo.c_str(), bSet, false);
- BSONObj embed = geo.embeddedObject();
- if ( embed.isEmpty() )
+ if( bSet.empty() )
return;
- _hash( embed ).append( b , "" );
+ for( BSONElementMSet::iterator setI = bSet.begin(); setI != bSet.end(); ++setI ) {
- // Go through all the other index keys
- for ( vector<string>::const_iterator i = _other.begin(); i != _other.end(); ++i ){
+ BSONElement geo = *setI;
- // Get *all* fields for the index key
- BSONElementSet eSet;
- obj.getFieldsDotted( *i, eSet );
+ GEODEBUG( "Element " << geo << " found for query " << _geo.c_str() );
+ if ( geo.eoo() || ! geo.isABSONObj() )
+ continue;
- if ( eSet.size() == 0 )
- b.appendAs( _spec->missingField(), "" );
- else if ( eSet.size() == 1 )
- b.appendAs( *(eSet.begin()), "" );
- else{
+ //
+ // Grammar for location lookup:
+ // locs ::= [loc,loc,...,loc]|{<k>:loc,<k>:loc}|loc
+ // loc ::= { <k1> : #, <k2> : # }|[#, #]|{}
+ //
+ // Empty locations are ignored, preserving single-location semantics
+ //
- // If we have more than one key, store as an array of the objects
- // TODO: Store multiple keys?
+ BSONObj embed = geo.embeddedObject();
+ if ( embed.isEmpty() )
+ continue;
- BSONArrayBuilder aBuilder;
+ // Differentiate between location arrays and locations
+ // by seeing if the first element value is a number
+ bool singleElement = embed.firstElement().isNumber();
- for( BSONElementSet::iterator ei = eSet.begin(); ei != eSet.end(); ++ei ){
- aBuilder.append( *ei );
- }
+ BSONObjIterator oi(embed);
- BSONArray arr = aBuilder.arr();
+ while( oi.more() ) {
- b.append( "", arr );
+ BSONObj locObj;
- }
+ if( singleElement ) locObj = embed;
+ else {
+ BSONElement locElement = oi.next();
+
+ uassert( 13654, str::stream() << "location object expected, location array not in correct format",
+ locElement.isABSONObj() );
+
+ locObj = locElement.embeddedObject();
+
+ if( locObj.isEmpty() )
+ continue;
+ }
+
+ BSONObjBuilder b(64);
+
+ // Remember the actual location object if needed
+ if( locs )
+ locs->push_back( locObj );
+
+ // Stop if we don't need to get anything but location objects
+ if( ! keys ) {
+ if( singleElement ) break;
+ else continue;
+ }
+
+ _hash( locObj ).append( b , "" );
+
+ // Go through all the other index keys
+ for ( vector<string>::const_iterator i = _other.begin(); i != _other.end(); ++i ) {
- }
+ // Get *all* fields for the index key
+ BSONElementSet eSet;
+ obj.getFieldsDotted( *i, eSet );
+
+
+ if ( eSet.size() == 0 )
+ b.appendAs( _spec->missingField(), "" );
+ else if ( eSet.size() == 1 )
+ b.appendAs( *(eSet.begin()), "" );
+ else {
+
+ // If we have more than one key, store as an array of the objects
+
+ BSONArrayBuilder aBuilder;
+
+ for( BSONElementSet::iterator ei = eSet.begin(); ei != eSet.end(); ++ei ) {
+ aBuilder.append( *ei );
+ }
+
+ BSONArray arr = aBuilder.arr();
+
+ b.append( "", arr );
+
+ }
+
+ }
+
+ keys->insert( b.obj() );
+
+ if( singleElement ) break;
+
+ }
+ }
- keys.insert( b.obj() );
+ }
+
+ BSONObj _fromBSONHash( const BSONElement& e ) const {
+ return _unhash( _tohash( e ) );
+ }
+
+ BSONObj _fromBSONHash( const BSONObj& o ) const {
+ return _unhash( _tohash( o.firstElement() ) );
}
GeoHash _tohash( const BSONElement& e ) const {
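
The grammar comment in getKeys() above admits several spellings for a location field; illustrative documents hitting each branch, built with mongo's BSON builder macros (snippets for illustration, not code in the tree):

    #include "../jsobj.h"   // BSON / BSON_ARRAY builder macros

    // One location, object form: { loc : { x : 40, y : 5 } }
    BSONObj a = BSON( "loc" << BSON( "x" << 40.0 << "y" << 5.0 ) );

    // One location, array form: { loc : [ 40, 5 ] }  (the first element being a
    // number is how getKeys() tells a single loc from an array of locs)
    BSONObj b = BSON( "loc" << BSON_ARRAY( 40.0 << 5.0 ) );

    // Multiple locations, array-of-locs form: { loc : [ [ 40, 5 ], [ 41, 6 ] ] }
    BSONObj c = BSON( "loc" << BSON_ARRAY( BSON_ARRAY( 40.0 << 5.0 )
                                        << BSON_ARRAY( 41.0 << 6.0 ) ) );

    // Multiple locations, keyed form: { loc : { home : [ 40, 5 ], work : [ 41, 6 ] } }
    BSONObj d = BSON( "loc" << BSON( "home" << BSON_ARRAY( 40.0 << 5.0 )
                                  << "work" << BSON_ARRAY( 41.0 << 6.0 ) ) );
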
@@ -217,6 +324,10 @@ namespace mongo {
return hash( x.number() , y.number() );
}
+ GeoHash hash( const Point& p ) const {
+ return hash( p._x, p._y );
+ }
+
GeoHash hash( double x , double y ) const {
return GeoHash( _convert(x), _convert(y) , _bits );
}
@@ -231,9 +342,9 @@ namespace mongo {
}
unsigned _convert( double in ) const {
- uassert( 13027 , "point not in range" , in <= (_max + _error) && in >= (_min - _error) );
+ uassert( 13027 , str::stream() << "point not in interval of [ " << _min << ", " << _max << " )", in < _max && in >= _min );
in -= _min;
- assert( in > 0 );
+ assert( in >= 0 );
return (unsigned)(in * _scaling);
}
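
_convert() maps a coordinate to an unsigned bucket index: subtract _min, multiply by _scaling = 2^32 / (_max - _min), where the uassert above now enforces the half-open interval [_min, _max). A worked spot-check with the default +/-180 bounds:

    #include <cassert>

    int main() {
        double mn = -180.0, mx = 180.0;
        double scaling = (1024 * 1024 * 1024 * 4.0) / (mx - mn); // buckets per degree
        // the uassert enforces in >= mn && in < mx (half-open interval)
        double in = 0.0;
        unsigned bucket = (unsigned)((in - mn) * scaling);
        assert(bucket == 2147483648u);             // 0 degrees -> exactly 2^31
        unsigned top = (unsigned)((179.999999 - mn) * scaling);
        assert(top < 4294967295u);                 // stays inside 32 bits
    }
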
@@ -269,6 +380,10 @@ namespace mongo {
}
double sizeEdge( const GeoHash& a ) const {
+
+ if( ! a.constrains() )
+ return _max - _min;
+
double ax,ay,bx,by;
GeoHash b = a;
b.move( 1 , 1 );
@@ -297,13 +412,15 @@ namespace mongo {
case BSONObj::opNEAR:
case BSONObj::opWITHIN:
return OPTIMAL;
- default:;
+ default:
+ // We can try to match if there's no other indexing defined,
+ // this is assumed a point
+ return HELPFUL;
}
}
case Array:
- // Non-geo index data is stored in a non-standard way, cannot use for exact lookups with
- // additional criteria
- if ( query.nFields() > 1 ) return USELESS;
+ // We can try to match if there's no other indexing defined,
+ // this is assumed to be a point
return HELPFUL;
default:
return USELESS;
@@ -314,12 +431,13 @@ namespace mongo {
vector<string> _other;
unsigned _bits;
- int _max;
- int _min;
+ double _max;
+ double _min;
double _scaling;
BSONObj _order;
double _error;
+ double _errorSphere;
};
class Box {
@@ -341,6 +459,10 @@ namespace mongo {
Box() {}
+ BSONArray toBSON() const {
+ return BSON_ARRAY( BSON_ARRAY( _min._x << _min._y ) << BSON_ARRAY( _max._x << _max._y ) );
+ }
+
string toString() const {
StringBuilder buf(64);
buf << _min.toString() << " -->> " << _max.toString();
@@ -351,6 +473,10 @@ namespace mongo {
return val + fudge >= min && val <= max + fudge;
}
+ bool onBoundary( double bound, double val, double fudge = 0 ) const {
+ return ( val >= bound - fudge && val <= bound + fudge );
+ }
+
bool mid( double amin , double amax , double bmin , double bmax , bool min , double& res ) const {
assert( amin <= amax );
assert( bmin <= bmax );
@@ -380,18 +506,43 @@ namespace mongo {
Box intersection( boundMin , boundMax );
- return intersection.area() / ( ( area() + other.area() ) / 2 );
+ return intersection.area() / area();
}
double area() const {
return ( _max._x - _min._x ) * ( _max._y - _min._y );
}
+ double maxDim() const {
+ return max( _max._x - _min._x, _max._y - _min._y );
+ }
+
Point center() const {
return Point( ( _min._x + _max._x ) / 2 ,
( _min._y + _max._y ) / 2 );
}
+ void truncate( const Geo2dType* g ) {
+ if( _min._x < g->_min ) _min._x = g->_min;
+ if( _min._y < g->_min ) _min._y = g->_min;
+ if( _max._x > g->_max ) _max._x = g->_max;
+ if( _max._y > g->_max ) _max._y = g->_max;
+ }
+
+ void fudge( const Geo2dType* g ) {
+ _min._x -= g->_error;
+ _min._y -= g->_error;
+ _max._x += g->_error;
+ _max._y += g->_error;
+ }
+
+ bool onBoundary( Point p, double fudge = 0 ) {
+ return onBoundary( _min._x, p._x, fudge ) ||
+ onBoundary( _max._x, p._x, fudge ) ||
+ onBoundary( _min._y, p._y, fudge ) ||
+ onBoundary( _max._y, p._y, fudge );
+ }
+
bool inside( Point p , double fudge = 0 ) {
bool res = inside( p._x , p._y , fudge );
//cout << "is : " << p.toString() << " in " << toString() << " = " << res << endl;
@@ -412,405 +563,481 @@ namespace mongo {
Point _max;
};
- class Geo2dPlugin : public IndexPlugin {
+
+ class Polygon {
public:
- Geo2dPlugin() : IndexPlugin( GEO2DNAME ) {
- }
- virtual IndexType* generate( const IndexSpec* spec ) const {
- return new Geo2dType( this , spec );
+ Polygon( void ) : _centroidCalculated( false ) {}
+
+ Polygon( vector<Point> points ) : _centroidCalculated( false ),
+ _points( points ) { }
+
+ void add( Point p ) {
+ _centroidCalculated = false;
+ _points.push_back( p );
}
- } geo2dplugin;
- struct GeoUnitTest : public UnitTest {
+ int size( void ) const {
+ return _points.size();
+ }
- int round( double d ) {
- return (int)(.5+(d*1000));
+ /**
+ * Determine if the point supplied is contained by the current polygon.
+ *
+ * The algorithm uses a ray casting method.
+ */
+ bool contains( const Point& p ) const {
+ return contains( p, 0 ) > 0;
}
-#define GEOHEQ(a,b) if ( a.toString() != b ){ cout << "[" << a.toString() << "] != [" << b << "]" << endl; assert( a == GeoHash(b) ); }
+ int contains( const Point &p, double fudge ) const {
- void run() {
- assert( ! GeoHash::isBitSet( 0 , 0 ) );
- assert( ! GeoHash::isBitSet( 0 , 31 ) );
- assert( GeoHash::isBitSet( 1 , 31 ) );
+ Box fudgeBox( Point( p._x - fudge, p._y - fudge ), Point( p._x + fudge, p._y + fudge ) );
- IndexSpec i( BSON( "loc" << "2d" ) );
- Geo2dType g( &geo2dplugin , &i );
- {
- double x = 73.01212;
- double y = 41.352964;
- BSONObj in = BSON( "x" << x << "y" << y );
- GeoHash h = g._hash( in );
- BSONObj out = g._unhash( h );
- assert( round(x) == round( out["x"].number() ) );
- assert( round(y) == round( out["y"].number() ) );
- assert( round( in["x"].number() ) == round( out["x"].number() ) );
- assert( round( in["y"].number() ) == round( out["y"].number() ) );
- }
+ int counter = 0;
+ Point p1 = _points[0];
+ for ( int i = 1; i <= size(); i++ ) {
+ Point p2 = _points[i % size()];
- {
- double x = -73.01212;
- double y = 41.352964;
- BSONObj in = BSON( "x" << x << "y" << y );
- GeoHash h = g._hash( in );
- BSONObj out = g._unhash( h );
- assert( round(x) == round( out["x"].number() ) );
- assert( round(y) == round( out["y"].number() ) );
- assert( round( in["x"].number() ) == round( out["x"].number() ) );
- assert( round( in["y"].number() ) == round( out["y"].number() ) );
- }
+ GEODEBUG( "Doing intersection check of " << fudgeBox.toString() << " with seg " << p1.toString() << " to " << p2.toString() );
- {
- GeoHash h( "0000" );
- h.move( 0 , 1 );
- GEOHEQ( h , "0001" );
- h.move( 0 , -1 );
- GEOHEQ( h , "0000" );
+ // We need to check whether or not this segment intersects our error box
+ if( fudge > 0 &&
+ // Points not too far below box
+ fudgeBox._min._y <= std::max( p1._y, p2._y ) &&
+ // Points not too far above box
+ fudgeBox._max._y >= std::min( p1._y, p2._y ) &&
+ // Points not too far to left of box
+ fudgeBox._min._x <= std::max( p1._x, p2._x ) &&
+ // Points not too far to right of box
+ fudgeBox._max._x >= std::min( p1._x, p2._x ) ) {
- h.init( "0001" );
- h.move( 0 , 1 );
- GEOHEQ( h , "0100" );
- h.move( 0 , -1 );
- GEOHEQ( h , "0001" );
+ GEODEBUG( "Doing detailed check" );
+ // If our box contains one or more of these points, we need to do an exact check.
+ if( fudgeBox.inside(p1) ) {
+ GEODEBUG( "Point 1 inside" );
+ return 0;
+ }
+ if( fudgeBox.inside(p2) ) {
+ GEODEBUG( "Point 2 inside" );
+ return 0;
+ }
- h.init( "0000" );
- h.move( 1 , 0 );
- GEOHEQ( h , "0010" );
- }
+ // Do intersection check for vertical sides
+ if ( p1._y != p2._y ) {
- {
- Box b( 5 , 5 , 2 );
- assert( "(5,5) -->> (7,7)" == b.toString() );
- }
+ double invSlope = ( p2._x - p1._x ) / ( p2._y - p1._y );
- {
- GeoHash a = g.hash( 1 , 1 );
- GeoHash b = g.hash( 4 , 5 );
- assert( 5 == (int)(g.distance( a , b ) ) );
- a = g.hash( 50 , 50 );
- b = g.hash( 42 , 44 );
- assert( round(10) == round(g.distance( a , b )) );
- }
+ double xintersT = ( fudgeBox._max._y - p1._y ) * invSlope + p1._x;
+ if( fudgeBox._min._x <= xintersT && fudgeBox._max._x >= xintersT ) {
+ GEODEBUG( "Top intersection @ " << xintersT );
+ return 0;
+ }
- {
- GeoHash x("0000");
- assert( 0 == x.getHash() );
- x.init( 0 , 1 , 32 );
- GEOHEQ( x , "0000000000000000000000000000000000000000000000000000000000000001" )
+ double xintersB = ( fudgeBox._min._y - p1._y ) * invSlope + p1._x;
+ if( fudgeBox._min._x <= xintersB && fudgeBox._max._x >= xintersB ) {
+ GEODEBUG( "Bottom intersection @ " << xintersB );
+ return 0;
+ }
- assert( GeoHash( "1100").hasPrefix( GeoHash( "11" ) ) );
- assert( ! GeoHash( "1000").hasPrefix( GeoHash( "11" ) ) );
- }
+ }
- {
- GeoHash x("1010");
- GEOHEQ( x , "1010" );
- GeoHash y = x + "01";
- GEOHEQ( y , "101001" );
- }
+ // Do intersection check for horizontal sides
+ if( p1._x != p2._x ) {
- {
+ double slope = ( p2._y - p1._y ) / ( p2._x - p1._x );
- GeoHash a = g.hash( 5 , 5 );
- GeoHash b = g.hash( 5 , 7 );
- GeoHash c = g.hash( 100 , 100 );
- /*
- cout << "a: " << a << endl;
- cout << "b: " << b << endl;
- cout << "c: " << c << endl;
+ double yintersR = ( p1._x - fudgeBox._max._x ) * slope + p1._y;
+ if( fudgeBox._min._y <= yintersR && fudgeBox._max._y >= yintersR ) {
+ GEODEBUG( "Right intersection @ " << yintersR );
+ return 0;
+ }
- cout << "a: " << a.toStringHex1() << endl;
- cout << "b: " << b.toStringHex1() << endl;
- cout << "c: " << c.toStringHex1() << endl;
- */
- BSONObj oa = a.wrap();
- BSONObj ob = b.wrap();
- BSONObj oc = c.wrap();
- /*
- cout << "a: " << oa.hexDump() << endl;
- cout << "b: " << ob.hexDump() << endl;
- cout << "c: " << oc.hexDump() << endl;
- */
- assert( oa.woCompare( ob ) < 0 );
- assert( oa.woCompare( oc ) < 0 );
+ double yintersL = ( p1._x - fudgeBox._min._x ) * slope + p1._y;
+ if( fudgeBox._min._y <= yintersL && fudgeBox._max._y >= yintersL ) {
+ GEODEBUG( "Left intersection @ " << yintersL );
+ return 0;
+ }
- }
+ }
- {
- GeoHash x( "000000" );
- x.move( -1 , 0 );
- GEOHEQ( x , "101010" );
- x.move( 1 , -1 );
- GEOHEQ( x , "010101" );
- x.move( 0 , 1 );
- GEOHEQ( x , "000000" );
- }
+ }
+ else if( fudge == 0 ){
- {
- GeoHash prefix( "110011000000" );
- GeoHash entry( "1100110000011100000111000001110000011100000111000001000000000000" );
- assert( ! entry.hasPrefix( prefix ) );
+ // If the point is exactly a vertex, the ray cast won't register an intersection, so check for that explicitly
+ if( p._y == p1._y && p._x == p1._x ) return 1;
+ else if( p._y == p2._y && p._x == p2._x ) return 1;
- entry = GeoHash("1100110000001100000111000001110000011100000111000001000000000000");
- assert( entry.toString().find( prefix.toString() ) == 0 );
- assert( entry.hasPrefix( GeoHash( "1100" ) ) );
- assert( entry.hasPrefix( prefix ) );
+ // If the point lies on a horizontal edge, the ray cast won't register an intersection, so check for that explicitly
+ if( p1._y == p2._y && p._y == p1._y ){
+ // Check that the x-coord lies within the segment
+ if( p._x >= std::min( p1._x, p2._x ) && p._x <= std::max( p1._x, p2._x ) ) return 1;
+ }
+
+ }
+
+ // Normal intersection test.
+ // TODO: Invert these for clearer logic?
+ if ( p._y > std::min( p1._y, p2._y ) ) {
+ if ( p._y <= std::max( p1._y, p2._y ) ) {
+ if ( p._x <= std::max( p1._x, p2._x ) ) {
+ if ( p1._y != p2._y ) {
+ double xinters = (p._y-p1._y)*(p2._x-p1._x)/(p2._y-p1._y)+p1._x;
+ // Special case of point on vertical line
+ if ( p1._x == p2._x && p._x == p1._x ){
+
+ // Need special case for the vertical edges, for example:
+ // 1) \e pe/----->
+ // vs.
+ // 2) \ep---e/----->
+ //
+ // if we count an exact hit as an intersection, then 1 is in but 2 is out;
+ // if we count it as a non-intersection, then 1 is out but 2 is in.
+
+ return 1;
+ }
+ else if( p1._x == p2._x || p._x <= xinters ) {
+ counter++;
+ }
+ }
+ }
+ }
+ }
+
+ p1 = p2;
}
- {
- GeoHash a = g.hash( 50 , 50 );
- GeoHash b = g.hash( 48 , 54 );
- assert( round( 4.47214 ) == round( g.distance( a , b ) ) );
+ if ( counter % 2 == 0 ) {
+ return -1;
}
+ else {
+ return 1;
+ }
+ }
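For reference, once the fudge-box border handling is stripped away, the even-odd rule that contains() implements reduces to a classic crossing count. A minimal standalone sketch with hypothetical names, not the class above:

    #include <cstddef>
    #include <vector>

    struct Pt { double x, y; };

    // Even-odd ray cast: count edges crossed by a ray from p in the +x
    // direction; an odd count means p is inside the polygon.
    bool insidePolygon( const std::vector<Pt>& poly, const Pt& p ) {
        bool in = false;
        for ( std::size_t i = 0, j = poly.size() - 1; i < poly.size(); j = i++ ) {
            if ( ( poly[i].y > p.y ) != ( poly[j].y > p.y ) ) {
                // x where the edge crosses the horizontal line y == p.y
                double xCross = ( poly[j].x - poly[i].x ) * ( p.y - poly[i].y )
                                / ( poly[j].y - poly[i].y ) + poly[i].x;
                if ( p.x < xCross )
                    in = !in;
            }
        }
        return in;
    }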
+ /**
+ * Calculate the centroid, or center of mass of the polygon object.
+ */
+ Point centroid( void ) {
- {
- Box b( Point( 29.762283 , -95.364271 ) , Point( 29.764283000000002 , -95.36227099999999 ) );
- assert( b.inside( 29.763 , -95.363 ) );
- assert( ! b.inside( 32.9570255 , -96.1082497 ) );
- assert( ! b.inside( 32.9570255 , -96.1082497 , .01 ) );
+ /* The centroid is cached; it won't change between calls unless points are added */
+ if ( _centroidCalculated ) {
+ return _centroid;
}
- {
- GeoHash a( "11001111" );
- assert( GeoHash( "11" ) == a.commonPrefix( GeoHash("11") ) );
- assert( GeoHash( "11" ) == a.commonPrefix( GeoHash("11110000") ) );
+ Point cent;
+ double signedArea = 0.0;
+ double area = 0.0; // Partial signed area
+
+ /// For all vertices except last
+ int i = 0;
+ for ( i = 0; i < size() - 1; ++i ) {
+ area = _points[i]._x * _points[i+1]._y - _points[i+1]._x * _points[i]._y ;
+ signedArea += area;
+ cent._x += ( _points[i]._x + _points[i+1]._x ) * area;
+ cent._y += ( _points[i]._y + _points[i+1]._y ) * area;
}
- {
- int N = 10000;
- {
- Timer t;
- for ( int i=0; i<N; i++ ) {
- unsigned x = (unsigned)rand();
- unsigned y = (unsigned)rand();
- GeoHash h( x , y );
- unsigned a,b;
- h.unhash_slow( a,b );
- assert( a == x );
- assert( b == y );
- }
- //cout << "slow: " << t.millis() << endl;
- }
+ // Do last vertex
+ area = _points[i]._x * _points[0]._y - _points[0]._x * _points[i]._y;
+ cent._x += ( _points[i]._x + _points[0]._x ) * area;
+ cent._y += ( _points[i]._y + _points[0]._y ) * area;
+ signedArea += area;
+ signedArea *= 0.5;
+ cent._x /= ( 6 * signedArea );
+ cent._y /= ( 6 * signedArea );
- {
- Timer t;
- for ( int i=0; i<N; i++ ) {
- unsigned x = (unsigned)rand();
- unsigned y = (unsigned)rand();
- GeoHash h( x , y );
- unsigned a,b;
- h.unhash_fast( a,b );
- assert( a == x );
- assert( b == y );
- }
- //cout << "fast: " << t.millis() << endl;
- }
+ _centroidCalculated = true;
+ _centroid = cent;
- }
+ return cent;
+ }
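The loop above is the standard shoelace centroid: a = 0.5 * sum(cross_i) with cross_i = x_i*y_{i+1} - x_{i+1}*y_i, and each centroid coordinate is sum((p_i + p_{i+1}) * cross_i) / (6a). A quick standalone sanity check on a counter-clockwise unit square, values illustrative:

    #include <cassert>

    int main() {
        // CCW unit square: signed area +1, centroid (0.5, 0.5)
        double xs[] = { 0, 1, 1, 0 };
        double ys[] = { 0, 0, 1, 1 };
        double a = 0, cx = 0, cy = 0;
        for ( int i = 0; i < 4; i++ ) {
            int j = ( i + 1 ) % 4;
            double cross = xs[i] * ys[j] - xs[j] * ys[i];   // shoelace term
            a  += cross;
            cx += ( xs[i] + xs[j] ) * cross;
            cy += ( ys[i] + ys[j] ) * cross;
        }
        a *= 0.5;                                           // signed area
        cx /= ( 6 * a );
        cy /= ( 6 * a );
        assert( a == 1.0 && cx == 0.5 && cy == 0.5 );
        return 0;
    }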
- {
- // see http://en.wikipedia.org/wiki/Great-circle_distance#Worked_example
+ Box bounds( void ) {
- {
- Point BNA (-86.67, 36.12);
- Point LAX (-118.40, 33.94);
+ // TODO: Cache this
- double dist1 = spheredist_deg(BNA, LAX);
- double dist2 = spheredist_deg(LAX, BNA);
+ _bounds._max = _points[0];
+ _bounds._min = _points[0];
- // target is 0.45306
- assert( 0.45305 <= dist1 && dist1 <= 0.45307 );
- assert( 0.45305 <= dist2 && dist2 <= 0.45307 );
- }
- {
- Point BNA (-1.5127, 0.6304);
- Point LAX (-2.0665, 0.5924);
+ for ( int i = 1; i < size(); i++ ) {
- double dist1 = spheredist_rad(BNA, LAX);
- double dist2 = spheredist_rad(LAX, BNA);
+ _bounds._max._x = max( _bounds._max._x, _points[i]._x );
+ _bounds._max._y = max( _bounds._max._y, _points[i]._y );
+ _bounds._min._x = min( _bounds._min._x, _points[i]._x );
+ _bounds._min._y = min( _bounds._min._y, _points[i]._y );
- // target is 0.45306
- assert( 0.45305 <= dist1 && dist1 <= 0.45307 );
- assert( 0.45305 <= dist2 && dist2 <= 0.45307 );
- }
- {
- Point JFK (-73.77694444, 40.63861111 );
- Point LAX (-118.40, 33.94);
+ }
- double dist = spheredist_deg(JFK, LAX) * EARTH_RADIUS_MILES;
- assert( dist > 2469 && dist < 2470 );
- }
+ return _bounds;
- {
- Point BNA (-86.67, 36.12);
- Point LAX (-118.40, 33.94);
- Point JFK (-73.77694444, 40.63861111 );
- assert( spheredist_deg(BNA, BNA) < 1e-6);
- assert( spheredist_deg(LAX, LAX) < 1e-6);
- assert( spheredist_deg(JFK, JFK) < 1e-6);
+ }
- Point zero (0, 0);
- Point antizero (0,-180);
+ private:
- // these were known to cause NaN
- assert( spheredist_deg(zero, zero) < 1e-6);
- assert( fabs(M_PI-spheredist_deg(zero, antizero)) < 1e-6);
- assert( fabs(M_PI-spheredist_deg(antizero, zero)) < 1e-6);
- }
- }
+ bool _centroidCalculated;
+ Point _centroid;
+
+ Box _bounds;
+
+ vector<Point> _points;
+ };
+
+ class Geo2dPlugin : public IndexPlugin {
+ public:
+ Geo2dPlugin() : IndexPlugin( GEO2DNAME ) {
}
- } geoUnitTest;
+
+ virtual IndexType* generate( const IndexSpec* spec ) const {
+ return new Geo2dType( this , spec );
+ }
+ } geo2dplugin;
+
+ void __forceLinkGeoPlugin() {
+ geo2dplugin.getName();
+ }
+
+
+
+ class GeoHopper;
class GeoPoint {
public:
- GeoPoint() {
+
+ GeoPoint() : _distance( -1 ), _exact( false )
+ {}
+
+ //// Distance not used ////
+
+ GeoPoint( const GeoKeyNode& node )
+ : _key( node._key ) , _loc( node.recordLoc ) , _o( node.recordLoc.obj() ), _distance( -1 ) , _exact( false ) {
}
- GeoPoint( const KeyNode& node , double distance )
- : _key( node.key ) , _loc( node.recordLoc ) , _o( node.recordLoc.obj() ) , _distance( distance ) {
+ //// Immediate initialization of distance ////
+
+ GeoPoint( const GeoKeyNode& node, double distance, bool exact )
+ : _key( node._key ) , _loc( node.recordLoc ) , _o( node.recordLoc.obj() ), _distance( distance ), _exact( exact ) {
}
- GeoPoint( const BSONObj& key , DiskLoc loc , double distance )
- : _key(key) , _loc(loc) , _o( loc.obj() ) , _distance( distance ) {
+ GeoPoint( const GeoPoint& pt, double distance, bool exact )
+ : _key( pt.key() ) , _loc( pt.loc() ) , _o( pt.obj() ), _distance( distance ), _exact( exact ) {
}
bool operator<( const GeoPoint& other ) const {
- return _distance < other._distance;
+ if( _distance != other._distance ) return _distance < other._distance;
+ if( _exact != other._exact ) return _exact < other._exact;
+ return _loc < other._loc;
}
- bool isEmpty() const {
+ double distance() const {
+ return _distance;
+ }
+
+ bool isExact() const {
+ return _exact;
+ }
+
+ BSONObj key() const {
+ return _key;
+ }
+
+ DiskLoc loc() const {
+ return _loc;
+ }
+
+ BSONObj obj() const {
+ return _o;
+ }
+
+ BSONObj pt() const {
+ return _pt;
+ }
+
+ bool isEmpty() {
return _o.isEmpty();
}
+ string toString() const {
+ return str::stream() << "Point from " << _o << " dist : " << _distance << ( _exact ? " (ex)" : " (app)" );
+ }
+
BSONObj _key;
DiskLoc _loc;
BSONObj _o;
+ BSONObj _pt;
+
double _distance;
+ bool _exact;
};
+ // GeoBrowse subclasses this
class GeoAccumulator {
public:
- GeoAccumulator( const Geo2dType * g , const BSONObj& filter )
- : _g(g) , _lookedAt(0) , _objectsLoaded(0) , _found(0) {
+ GeoAccumulator( const Geo2dType * g , const BSONObj& filter, bool uniqueDocs, bool needDistance )
+ : _g(g) ,
+ _keysChecked(0) ,
+ _lookedAt(0) ,
+ _matchesPerfd(0) ,
+ _objectsLoaded(0) ,
+ _pointsLoaded(0) ,
+ _found(0) ,
+ _uniqueDocs( uniqueDocs ) ,
+ _needDistance( needDistance )
+ {
if ( ! filter.isEmpty() ) {
_matcher.reset( new CoveredIndexMatcher( filter , g->keyPattern() ) );
+ GEODEBUG( "Matcher is now " << _matcher->docMatcher().toString() );
}
}
- virtual ~GeoAccumulator() {
- }
+ virtual ~GeoAccumulator() { }
+
+ /** Check whether we've already looked at a key. ALSO marks the key as seen, anticipating
+ a follow-up call to add(). This is broken out to avoid the work of extracting the key
+ BSON for points we've already seen.
+ */
+ private:
+ set< pair<DiskLoc,int> > _seen;
+ public:
+ bool seen(DiskLoc bucket, int pos) {
- virtual void add( const KeyNode& node ) {
- // when looking at other boxes, don't want to look at some object twice
- pair<set<DiskLoc>::iterator,bool> seenBefore = _seen.insert( node.recordLoc );
+ _keysChecked++;
+
+ pair< set<pair<DiskLoc,int> >::iterator, bool > seenBefore = _seen.insert( make_pair(bucket,pos) );
if ( ! seenBefore.second ) {
- GEODEBUG( "\t\t\t\t already seen : " << node.recordLoc.obj()["_id"] );
- return;
+ GEODEBUG( "\t\t\t\t already seen : " << bucket.toString() << ' ' << pos ); // node.key.toString() << " @ " << Point( _g, GeoHash( node.key.firstElement() ) ).toString() << " with " << node.recordLoc.obj()["_id"] );
+ return true;
}
+ return false;
+ }
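seen() leans on set::insert returning pair<iterator,bool>, so one lookup both tests and marks a (bucket, pos) slot. A minimal standalone sketch of the idiom:

    #include <cassert>
    #include <set>
    #include <utility>

    // Returns true if (bucket, pos) was already marked; marks it otherwise.
    bool markSeen( std::set< std::pair<long, int> >& seen, long bucket, int pos ) {
        return ! seen.insert( std::make_pair( bucket, pos ) ).second;
    }

    int main() {
        std::set< std::pair<long, int> > seen;
        assert( ! markSeen( seen, 42, 7 ) );   // first visit
        assert(   markSeen( seen, 42, 7 ) );   // duplicate detected
        return 0;
    }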
+
+ enum KeyResult { BAD, BORDER, GOOD };
+
+ virtual void add( const GeoKeyNode& node ) {
+
+ GEODEBUG( "\t\t\t\t checking key " << node._key.toString() )
+
_lookedAt++;
- // distance check
- double d = 0;
- if ( ! checkDistance( GeoHash( node.key.firstElement() ) , d ) ) {
- GEODEBUG( "\t\t\t\t bad distance : " << node.recordLoc.obj() << "\t" << d );
+ ////
+ // Approximate distance check using key data
+ ////
+ double keyD = 0;
+ Point keyP( _g, GeoHash( node._key.firstElement(), _g->_bits ) );
+ KeyResult keyOk = approxKeyCheck( keyP, keyD );
+ if ( keyOk == BAD ) {
+ GEODEBUG( "\t\t\t\t bad distance : " << node.recordLoc.obj() << "\t" << keyD );
return;
}
- GEODEBUG( "\t\t\t\t good distance : " << node.recordLoc.obj() << "\t" << d );
+ GEODEBUG( "\t\t\t\t good distance : " << node.recordLoc.obj() << "\t" << keyD );
- // matcher
- MatchDetails details;
- if ( _matcher.get() ) {
- bool good = _matcher->matches( node.key , node.recordLoc , &details );
- if ( details.loadedObject )
- _objectsLoaded++;
+ ////
+ // Check for match using other key (and potentially doc) criteria
+ ////
+ // Remember match results for each object
+ map<DiskLoc, bool>::iterator match = _matched.find( node.recordLoc );
+ bool newDoc = match == _matched.end();
+ if( newDoc ) {
+
+ GEODEBUG( "\t\t\t\t matching new doc with " << (_matcher ? _matcher->docMatcher().toString() : "(empty)" ) );
+
+ // matcher
+ MatchDetails details;
+ if ( _matcher.get() ) {
+ bool good = _matcher->matchesWithSingleKeyIndex( node._key , node.recordLoc , &details );
+
+ _matchesPerfd++;
- if ( ! good ) {
- GEODEBUG( "\t\t\t\t didn't match : " << node.recordLoc.obj()["_id"] );
- return;
+ if ( details._loadedObject )
+ _objectsLoaded++;
+
+ if ( ! good ) {
+ GEODEBUG( "\t\t\t\t didn't match : " << node.recordLoc.obj()["_id"] );
+ _matched[ node.recordLoc ] = false;
+ return;
+ }
}
+
+ _matched[ node.recordLoc ] = true;
+
+ if ( ! details._loadedObject ) // don't double count
+ _objectsLoaded++;
+
+ }
+ else if( !((*match).second) ) {
+ GEODEBUG( "\t\t\t\t previously didn't match : " << node.recordLoc.obj()["_id"] );
+ return;
}
- if ( ! details.loadedObject ) // dont double count
- _objectsLoaded++;
+ ////
+ // Exact check with particular data fields
+ ////
+ // Can add multiple points
+ int diff = addSpecific( node , keyP, keyOk == BORDER, keyD, newDoc );
+ if( diff > 0 ) _found += diff;
+ else _found -= -diff;
- addSpecific( node , d );
- _found++;
}
- virtual void addSpecific( const KeyNode& node , double d ) = 0;
- virtual bool checkDistance( const GeoHash& node , double& d ) = 0;
+ virtual void getPointsFor( const BSONObj& key, const BSONObj& obj, vector< BSONObj >& locsForNode, bool allPoints = false ){
- long long found() const {
- return _found;
- }
+ // Find all the location objects from the keys
+ vector< BSONObj > locs;
+ _g->getKeys( obj, allPoints ? locsForNode : locs );
+ _pointsLoaded++;
- const Geo2dType * _g;
- set<DiskLoc> _seen;
- auto_ptr<CoveredIndexMatcher> _matcher;
+ if( allPoints ) return;
+ if( locs.size() == 1 ){
+ locsForNode.push_back( locs[0] );
+ return;
+ }
- long long _lookedAt;
- long long _objectsLoaded;
- long long _found;
- };
+ // Find the particular location we want
+ GeoHash keyHash( key.firstElement(), _g->_bits );
- class GeoHopper : public GeoAccumulator {
- public:
- typedef multiset<GeoPoint> Holder;
+ // log() << "Hash: " << node.key << " and " << keyHash.getHash() << " unique " << _uniqueDocs << endl;
+ for( vector< BSONObj >::iterator i = locs.begin(); i != locs.end(); ++i ) {
- GeoHopper( const Geo2dType * g , unsigned max , const Point& n , const BSONObj& filter = BSONObj() , double maxDistance = numeric_limits<double>::max() , GeoDistType type=GEO_PLAIN)
- : GeoAccumulator( g , filter ) , _max( max ) , _near( n ), _maxDistance( maxDistance ), _type( type ), _farthest(-1)
- {}
+ // Ignore all locations not hashed to the key's hash, since we may see
+ // those later
+ if( _g->_hash( *i ) != keyHash ) continue;
+
+ locsForNode.push_back( *i );
- virtual bool checkDistance( const GeoHash& h , double& d ) {
- switch (_type) {
- case GEO_PLAIN:
- d = _near.distance( Point(_g, h) );
- break;
- case GEO_SPHERE:
- d = spheredist_deg(_near, Point(_g, h));
- break;
- default:
- assert(0);
}
- bool good = d < _maxDistance && ( _points.size() < _max || d < farthest() );
- GEODEBUG( "\t\t\t\t\t\t\t checkDistance " << _near.toString() << "\t" << h << "\t" << d
- << " ok: " << good << " farthest: " << farthest() );
- return good;
+
}
- virtual void addSpecific( const KeyNode& node , double d ) {
- GEODEBUG( "\t\t" << GeoHash( node.key.firstElement() ) << "\t" << node.recordLoc.obj() << "\t" << d );
- _points.insert( GeoPoint( node.key , node.recordLoc , d ) );
- if ( _points.size() > _max ) {
- _points.erase( --_points.end() );
+ virtual int addSpecific( const GeoKeyNode& node, const Point& p , bool inBounds, double d, bool newDoc ) = 0;
+ virtual KeyResult approxKeyCheck( const Point& p , double& keyD ) = 0;
+ virtual bool exactDocCheck( const Point& p , double& d ) = 0;
+ virtual bool expensiveExactCheck(){ return false; }
- Holder::iterator i = _points.end();
- i--;
- _farthest = i->_distance;
- }
- else {
- if (d > _farthest)
- _farthest = d;
- }
- }
- double farthest() const {
- return _farthest;
+ long long found() const {
+ return _found;
}
+ const Geo2dType * _g;
+ map<DiskLoc, bool> _matched;
+ shared_ptr<CoveredIndexMatcher> _matcher;
+
+ long long _keysChecked;
+ long long _lookedAt;
+ long long _matchesPerfd;
+ long long _objectsLoaded;
+ long long _pointsLoaded;
+ long long _found;
+
+ bool _uniqueDocs;
+ bool _needDistance;
- unsigned _max;
- Point _near;
- Holder _points;
- double _maxDistance;
- GeoDistType _type;
- double _farthest;
};
struct BtreeLocation {
+ BtreeLocation() : ii(0) { }
+ IndexInterface *ii;
int pos;
bool found;
DiskLoc bucket;
@@ -818,11 +1045,13 @@ namespace mongo {
BSONObj key() {
if ( bucket.isNull() )
return BSONObj();
- return bucket.btree()->keyNode( pos ).key;
+ return ii->keyAt(bucket, pos);
+ //return bucket.btree<V>()->keyNode( pos ).key.toBson();
}
bool hasPrefix( const GeoHash& hash ) {
- BSONElement e = key().firstElement();
+ BSONObj k = key();
+ BSONElement e = k.firstElement();
if ( e.eoo() )
return false;
return GeoHash( e ).hasPrefix( hash );
@@ -832,7 +1061,7 @@ namespace mongo {
if ( bucket.isNull() )
return false;
- bucket = bucket.btree()->advance( bucket , pos , direction , "btreelocation" );
+ bucket = ii->advance( bucket , pos , direction , "btreelocation" );
if ( all )
return checkCur( totalFound , all );
@@ -844,9 +1073,15 @@ namespace mongo {
if ( bucket.isNull() )
return false;
- if ( bucket.btree()->isUsed(pos) ) {
+ if ( ii->isUsed(bucket, pos) ) {
totalFound++;
- all->add( bucket.btree()->keyNode( pos ) );
+ if( !all->seen(bucket, pos) ) {
+ BSONObj o;
+ DiskLoc recLoc;
+ ii->keyAt(bucket, pos, o, recLoc);
+ GeoKeyNode n(recLoc, o);
+ all->add(n);
+ }
}
else {
GEODEBUG( "\t\t\t\t not used: " << key() );
@@ -861,6 +1096,9 @@ namespace mongo {
return ss.str();
}
+ // Returns the min and max keys which bound a particular location.
+ // The only time these may be equal is when a key exactly matches the location
+ // itself; otherwise our expanding algorithm will fail.
static bool initial( const IndexDetails& id , const Geo2dType * spec ,
BtreeLocation& min , BtreeLocation& max ,
GeoHash start ,
@@ -868,211 +1106,33 @@ namespace mongo {
Ordering ordering = Ordering::make(spec->_order);
- min.bucket = id.head.btree()->locate( id , id.head , start.wrap() ,
- ordering , min.pos , min.found , minDiskLoc );
- if (hopper) min.checkCur( found , hopper );
- max = min;
+ IndexInterface *ii = &id.idxInterface();
+ min.ii = ii;
+ max.ii = ii;
- if ( min.bucket.isNull() || ( hopper && !(hopper->found()) ) ) {
- min.bucket = id.head.btree()->locate( id , id.head , start.wrap() ,
- ordering , min.pos , min.found , minDiskLoc , -1 );
- if (hopper) min.checkCur( found , hopper );
- }
+ min.bucket = ii->locate( id , id.head , start.wrap() ,
+ ordering , min.pos , min.found , minDiskLoc, -1 );
- return ! min.bucket.isNull() || ! max.bucket.isNull();
- }
- };
-
- class GeoSearch {
- public:
- GeoSearch( const Geo2dType * g , const GeoHash& n , int numWanted=100 , BSONObj filter=BSONObj() , double maxDistance = numeric_limits<double>::max() , GeoDistType type=GEO_PLAIN)
- : _spec( g ) ,_startPt(g,n), _start( n ) ,
- _numWanted( numWanted ) , _filter( filter ) , _maxDistance( maxDistance ) ,
- _hopper( new GeoHopper( g , numWanted , _startPt , filter , maxDistance, type ) ), _type(type) {
- assert( g->getDetails() );
- _nscanned = 0;
- _found = 0;
-
- if (type == GEO_PLAIN) {
- _scanDistance = maxDistance;
- }
- else if (type == GEO_SPHERE) {
- if (maxDistance == numeric_limits<double>::max()) {
- _scanDistance = maxDistance;
- }
- else {
- //TODO: consider splitting into x and y scan distances
- _scanDistance = computeXScanDistance(_startPt._y, rad2deg(maxDistance));
- }
- }
- else {
- assert(0);
- }
- }
-
- void exec() {
- const IndexDetails& id = *_spec->getDetails();
-
- const BtreeBucket * head = id.head.btree();
- assert( head );
- /*
- * Search algorithm
- * 1) use geohash prefix to find X items
- * 2) compute max distance from want to an item
- * 3) find optimal set of boxes that complete circle
- * 4) use regular btree cursors to scan those boxes
- */
-
- GeoHopper * hopper = _hopper.get();
-
- _prefix = _start;
- BtreeLocation min,max;
- {
- // 1 regular geo hash algorithm
-
-
- if ( ! BtreeLocation::initial( id , _spec , min , max , _start , _found , NULL ) )
- return;
-
- while ( !_prefix.constrains() || // if next pass would cover universe, just keep going
- ( _hopper->found() < _numWanted && _spec->sizeEdge( _prefix ) <= _scanDistance)) {
- GEODEBUG( _prefix << "\t" << _found << "\t DESC" );
- while ( min.hasPrefix(_prefix) && min.checkCur(_found, hopper) && min.advance(-1, _found, NULL) )
- _nscanned++;
- GEODEBUG( _prefix << "\t" << _found << "\t ASC" );
- while ( max.hasPrefix(_prefix) && max.checkCur(_found, hopper) && max.advance(+1, _found, NULL) )
- _nscanned++;
-
- if ( ! _prefix.constrains() ) {
- GEODEBUG( "done search w/o part 2" )
- return;
- }
-
- _alreadyScanned = Box(_spec, _prefix);
- _prefix = _prefix.up();
- }
- }
- GEODEBUG( "done part 1" );
- {
- // 2
- double farthest = hopper->farthest();
- GEODEBUGPRINT(hopper->farthest());
- if (hopper->found() < _numWanted) {
- // Not enough found in Phase 1
- farthest = _scanDistance;
- }
- else if (_type == GEO_SPHERE) {
- farthest = std::min(_scanDistance, computeXScanDistance(_startPt._y, rad2deg(farthest)));
- }
- GEODEBUGPRINT(farthest);
-
- Box want( _startPt._x - farthest , _startPt._y - farthest , farthest * 2 );
- GEODEBUGPRINT(want.toString());
-
- _prefix = _start;
- while (_prefix.constrains() && _spec->sizeEdge( _prefix ) < farthest ) {
- _prefix = _prefix.up();
- }
-
- PREFIXDEBUG(_prefix, _spec);
-
- if (_prefix.getBits() <= 1) {
- // TODO consider walking in $natural order
-
- while ( min.checkCur(_found, hopper) && min.advance(-1, _found, NULL) )
- _nscanned++;
- while ( max.checkCur(_found, hopper) && max.advance(+1, _found, NULL) )
- _nscanned++;
-
- GEODEBUG( "done search after scanning whole collection" )
- return;
- }
-
- if ( logLevel > 0 ) {
- log(1) << "want: " << want << " found:" << _found << " nscanned: " << _nscanned << " hash size:" << _spec->sizeEdge( _prefix )
- << " farthest: " << farthest << " using box: " << Box( _spec , _prefix ).toString() << endl;
- }
-
- for ( int x=-1; x<=1; x++ ) {
- for ( int y=-1; y<=1; y++ ) {
- GeoHash toscan = _prefix;
- toscan.move( x , y );
-
- // 3 & 4
- doBox( id , want , toscan );
- }
- }
- }
- GEODEBUG( "done search" )
-
- }
-
- void doBox( const IndexDetails& id , const Box& want , const GeoHash& toscan , int depth = 0 ) {
- Box testBox( _spec , toscan );
- if ( logLevel > 2 ) {
- cout << "\t";
- for ( int i=0; i<depth; i++ )
- cout << "\t";
- cout << " doBox: " << testBox.toString() << "\t" << toscan.toString() << " scanned so far: " << _nscanned << endl;
- }
- else {
- GEODEBUGPRINT(testBox.toString());
- }
-
- if (_alreadyScanned.contains(testBox, _spec->_error)) {
- GEODEBUG("skipping box: already scanned");
- return; // been here, done this
- }
-
- double intPer = testBox.intersects( want );
-
- if ( intPer <= 0 ) {
- GEODEBUG("skipping box: not in want");
- return;
- }
-
- bool goDeeper = intPer < .5 && depth < 2;
+ if (hopper) min.checkCur( found , hopper );
- long long myscanned = 0;
+ // TODO: Might be able to avoid doing a full lookup in some cases here,
+ // but would add complexity and we're hitting pretty much the exact same data.
+ // Cannot set this = min in general, however.
+ max.bucket = ii->locate( id , id.head , start.wrap() ,
+ ordering , max.pos , max.found , minDiskLoc, 1 );
- BtreeLocation loc;
- loc.bucket = id.head.btree()->locate( id , id.head , toscan.wrap() , Ordering::make(_spec->_order) ,
- loc.pos , loc.found , minDiskLoc );
- loc.checkCur( _found , _hopper.get() );
- while ( loc.hasPrefix( toscan ) && loc.advance( 1 , _found , _hopper.get() ) ) {
- _nscanned++;
- if ( ++myscanned > 100 && goDeeper ) {
- doBox( id , want , toscan + "00" , depth + 1);
- doBox( id , want , toscan + "01" , depth + 1);
- doBox( id , want , toscan + "10" , depth + 1);
- doBox( id , want , toscan + "11" , depth + 1);
- return;
- }
- }
+ if (hopper) max.checkCur( found , hopper );
+ return ! min.bucket.isNull() || ! max.bucket.isNull();
}
-
-
- const Geo2dType * _spec;
-
- Point _startPt;
- GeoHash _start;
- GeoHash _prefix;
- int _numWanted;
- BSONObj _filter;
- double _maxDistance;
- double _scanDistance;
- shared_ptr<GeoHopper> _hopper;
-
- long long _nscanned;
- int _found;
- GeoDistType _type;
-
- Box _alreadyScanned;
};
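initial() seeds two cursors at the same start key, min walking backward and max walking forward, so together they sweep outward over every key sharing a prefix. A toy rendering of that outward sweep over a sorted array, values hypothetical:

    #include <cstdio>

    int main() {
        int keys[] = { 10, 20, 30, 40, 50 };
        const int n = 5;
        int start = 2;                   // locate() found keys[2]
        int lo = start, hi = start + 1;  // "min" walks down, "max" walks up
        while ( lo >= 0 || hi < n ) {
            if ( lo >= 0 ) std::printf( "min visits %d\n", keys[lo--] );
            if ( hi < n )  std::printf( "max visits %d\n", keys[hi++] );
        }
        return 0;
    }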
+
class GeoCursorBase : public Cursor {
public:
+
+ static const shared_ptr< CoveredIndexMatcher > emptyMatcher;
+
GeoCursorBase( const Geo2dType * spec )
: _spec( spec ), _id( _spec->getDetails() ) {
@@ -1106,68 +1166,34 @@ namespace mongo {
const IndexDetails * _id;
};
- class GeoSearchCursor : public GeoCursorBase {
- public:
- GeoSearchCursor( shared_ptr<GeoSearch> s )
- : GeoCursorBase( s->_spec ) ,
- _s( s ) , _cur( s->_hopper->_points.begin() ) , _end( s->_hopper->_points.end() ), _nscanned() {
- if ( _cur != _end ) {
- ++_nscanned;
- }
- }
-
- virtual ~GeoSearchCursor() {}
-
- virtual bool ok() {
- return _cur != _end;
- }
+ const shared_ptr< CoveredIndexMatcher > GeoCursorBase::emptyMatcher( new CoveredIndexMatcher( BSONObj(), BSONObj(), false ) );
- virtual Record* _current() { assert(ok()); return _cur->_loc.rec(); }
- virtual BSONObj current() { assert(ok()); return _cur->_o; }
- virtual DiskLoc currLoc() { assert(ok()); return _cur->_loc; }
- virtual bool advance() {
- if( ok() ){
- _cur++;
- incNscanned();
- return ok();
- }
- return false;
- }
- virtual BSONObj currKey() const { return _cur->_key; }
-
- virtual string toString() {
- return "GeoSearchCursor";
- }
-
-
- virtual BSONObj prettyStartKey() const {
- return BSON( _s->_spec->_geo << _s->_prefix.toString() );
- }
- virtual BSONObj prettyEndKey() const {
- GeoHash temp = _s->_prefix;
- temp.move( 1 , 1 );
- return BSON( _s->_spec->_geo << temp.toString() );
- }
+ // TODO: Pull out the cursor bit from the browse, have GeoBrowse as field of cursor to clean up
+ // this hierarchy a bit. Also probably useful to look at whether GeoAccumulator can be a member instead
+ // of a superclass.
+ class GeoBrowse : public GeoCursorBase , public GeoAccumulator {
+ public:
- virtual long long nscanned() { return _nscanned; }
+ // The max points which should be added to an expanding box
+ static const int maxPointsHeuristic = 300;
- virtual CoveredIndexMatcher *matcher() const {
- return _s->_hopper->_matcher.get();
- }
+ // Expand states
+ enum State {
+ START ,
+ DOING_EXPAND ,
+ DONE_NEIGHBOR ,
+ DONE
+ } _state;
- shared_ptr<GeoSearch> _s;
- GeoHopper::Holder::iterator _cur;
- GeoHopper::Holder::iterator _end;
+ GeoBrowse( const Geo2dType * g , string type , BSONObj filter = BSONObj(), bool uniqueDocs = true, bool needDistance = false )
+ : GeoCursorBase( g ), GeoAccumulator( g , filter, uniqueDocs, needDistance ) ,
+ _type( type ) , _filter( filter ) , _firstCall(true), _nscanned(), _centerPrefix(0, 0, 0) {
- void incNscanned() { if ( ok() ) { ++_nscanned; } }
- long long _nscanned;
- };
+ // Set up the initial expand state
+ _state = START;
+ _neighbor = -1;
+ _foundInExp = 0;
- class GeoBrowse : public GeoCursorBase , public GeoAccumulator {
- public:
- GeoBrowse( const Geo2dType * g , string type , BSONObj filter = BSONObj() )
- : GeoCursorBase( g ) ,GeoAccumulator( g , filter ) ,
- _type( type ) , _filter( filter ) , _firstCall(true), _nscanned() {
}
virtual string toString() {
@@ -1177,7 +1203,7 @@ namespace mongo {
virtual bool ok() {
bool first = _firstCall;
if ( _firstCall ) {
- fillStack();
+ fillStack( maxPointsHeuristic );
_firstCall = false;
}
if ( ! _cur.isEmpty() || _stack.size() ) {
@@ -1188,7 +1214,7 @@ namespace mongo {
}
while ( moreToDo() ) {
- fillStack();
+ fillStack( maxPointsHeuristic );
if ( ! _cur.isEmpty() ) {
if ( first ) {
++_nscanned;
@@ -1214,7 +1240,7 @@ namespace mongo {
return false;
while ( _cur.isEmpty() && moreToDo() )
- fillStack();
+ fillStack( maxPointsHeuristic );
return ! _cur.isEmpty() && ++_nscanned;
}
@@ -1223,18 +1249,308 @@ namespace mongo {
virtual DiskLoc currLoc() { assert(ok()); return _cur._loc; }
virtual BSONObj currKey() const { return _cur._key; }
- virtual CoveredIndexMatcher *matcher() const {
- return _matcher.get();
+ virtual CoveredIndexMatcher* matcher() const {
+ if( _matcher.get() ) return _matcher.get();
+ else return GeoCursorBase::emptyMatcher.get();
+ }
+
+ virtual shared_ptr< CoveredIndexMatcher > matcherPtr() const {
+ if( _matcher.get() ) return _matcher;
+ else return GeoCursorBase::emptyMatcher;
+ }
+
+ // Are we finished getting points?
+ virtual bool moreToDo() {
+ return _state != DONE;
}
- virtual bool moreToDo() = 0;
- virtual void fillStack() = 0;
+ virtual bool supportGetMore() { return true; }
+
+ // Fills the stack, but only checks a maximum number of maxToCheck points at a time.
+ // Further calls to this function will continue the expand/check neighbors algorithm.
+ virtual void fillStack( int maxToCheck, int maxToAdd = -1, bool onlyExpand = false ) {
+
+#ifdef GEODEBUGGING
+ log() << "Filling stack with maximum of " << maxToCheck << ", state : " << (int) _state << endl;
+#endif
+
+ if( maxToAdd < 0 ) maxToAdd = maxToCheck;
+ int maxFound = _foundInExp + maxToCheck;
+ assert( maxToCheck > 0 );
+ assert( maxFound > 0 );
+ assert( _found <= 0x7fffffff ); // conversion to int
+ int maxAdded = static_cast<int>(_found) + maxToAdd;
+ assert( maxAdded >= 0 ); // overflow check
+
+ bool isNeighbor = _centerPrefix.constrains();
+
+ // Starting a box expansion
+ if ( _state == START ) {
+
+ // Get the very first hash point, if required
+ if( ! isNeighbor )
+ _prefix = expandStartHash();
+
+ GEODEBUG( "initializing btree" );
+
+#ifdef GEODEBUGGING
+ log() << "Initializing from b-tree with hash of " << _prefix << " @ " << Box( _g, _prefix ) << endl;
+#endif
+
+ if ( ! BtreeLocation::initial( *_id , _spec , _min , _max , _prefix , _foundInExp , this ) )
+ _state = isNeighbor ? DONE_NEIGHBOR : DONE;
+ else {
+ _state = DOING_EXPAND;
+ _lastPrefix.reset();
+ }
+
+ GEODEBUG( (_state == DONE_NEIGHBOR || _state == DONE ? "not initialized" : "initialized") );
+
+ }
+
+ // Doing the actual box expansion
+ if ( _state == DOING_EXPAND ) {
+
+ while ( true ) {
+
+ GEODEBUG( "box prefix [" << _prefix << "]" );
+#ifdef GEODEBUGGING
+ if( _prefix.constrains() ) {
+ log() << "current expand box : " << Box( _g, _prefix ).toString() << endl;
+ }
+ else {
+ log() << "max expand box." << endl;
+ }
+#endif
+
+ GEODEBUG( "expanding box points... ");
+
+ // Record the prefix we're actively exploring...
+ _expPrefix.reset( new GeoHash( _prefix ) );
+
+ // Find points inside this prefix
+ while ( _min.hasPrefix( _prefix ) && _min.advance( -1 , _foundInExp , this ) && _foundInExp < maxFound && _found < maxAdded );
+ while ( _max.hasPrefix( _prefix ) && _max.advance( 1 , _foundInExp , this ) && _foundInExp < maxFound && _found < maxAdded );
+
+#ifdef GEODEBUGGING
+
+ log() << "finished expand, checked : " << ( maxToCheck - ( maxFound - _foundInExp ) )
+ << " found : " << ( maxToAdd - ( maxAdded - _found ) )
+ << " max : " << maxToCheck << " / " << maxToAdd << endl;
+
+#endif
+
+ GEODEBUG( "finished expand, found : " << ( maxToAdd - ( maxAdded - _found ) ) );
+ if( _foundInExp >= maxFound || _found >= maxAdded ) return;
+
+ // We've searched this prefix fully, remember
+ _lastPrefix.reset( new GeoHash( _prefix ));
+
+ // If we've searched the entire space, we're finished.
+ if ( ! _prefix.constrains() ) {
+ GEODEBUG( "box exhausted" );
+ _state = DONE;
+ notePrefix();
+ return;
+ }
+
+ // If we won't fit in the box, and we're not doing a sub-scan, increase the size
+ if ( ! fitsInBox( _g->sizeEdge( _prefix ) ) && _fringe.size() <= 1 ) {
+
+ // If we're still not expanded bigger than the box size, expand again
+ // TODO: Is there an advantage to scanning prior to expanding?
+ _prefix = _prefix.up();
+ continue;
+
+ }
+
+ // We're done and our size is large enough
+ _state = DONE_NEIGHBOR;
+
+ // Go to the next sub-box, if applicable
+ if( _fringe.size() > 0 ) _fringe.pop_back();
+ // Go to the next neighbor if this was the last sub-search
+ if( _fringe.size() == 0 ) _neighbor++;
+
+ break;
+
+ }
+
+ notePrefix();
+ }
+
+ // If we're only expanding the current box, don't check the neighbors
+ if( onlyExpand ) return;
+
+ // If we're done expanding the current box...
+ if( _state == DONE_NEIGHBOR ) {
+
+ // Iterate to the next neighbor
+ // The loop is useful for cases where we want to skip over boxes entirely;
+ // otherwise the recursion increments the neighbors.
+ for ( ; _neighbor < 9; _neighbor++ ) {
+
+ // If we have no fringe for the neighbor, make sure we have the default fringe
+ if( _fringe.size() == 0 ) _fringe.push_back( "" );
+
+ if( ! isNeighbor ) {
+ _centerPrefix = _prefix;
+ _centerBox = Box( _g, _centerPrefix );
+ isNeighbor = true;
+ }
+
+ int i = (_neighbor / 3) - 1;
+ int j = (_neighbor % 3) - 1;
+
+ if ( ( i == 0 && j == 0 ) ||
+ ( i < 0 && _centerBox._min._x <= _g->_min ) ||
+ ( j < 0 && _centerBox._min._y <= _g->_min ) ||
+ ( i > 0 && _centerBox._max._x >= _g->_max ) ||
+ ( j > 0 && _centerBox._max._y >= _g->_max ) ) {
+ continue; // main box or wrapped edge
+ // TODO: We may want to enable wrapping in future, probably best as layer on top of
+ // this search.
+ }
+
+ // Make sure we've got a reasonable center
+ assert( _centerPrefix.constrains() );
+
+ GeoHash _neighborPrefix = _centerPrefix;
+ _neighborPrefix.move( i, j );
+
+ GEODEBUG( "moving to " << i << " , " << j << " fringe : " << _fringe.size() );
+ PREFIXDEBUG( _centerPrefix, _g );
+ PREFIXDEBUG( _neighborPrefix , _g );
+ while( _fringe.size() > 0 ) {
+
+ _prefix = _neighborPrefix + _fringe.back();
+ Box cur( _g , _prefix );
+
+ PREFIXDEBUG( _prefix, _g );
+
+ double intAmt = intersectsBox( cur );
+
+ // No intersection
+ if( intAmt <= 0 ) {
+ GEODEBUG( "skipping box" << cur.toString() );
+ _fringe.pop_back();
+ continue;
+ }
+ // Large intersection, refine search
+ else if( intAmt > 0.5 && _prefix.canRefine() && _fringe.back().size() < 4 /* two bits */ ) {
+
+ GEODEBUG( "Adding to fringe: " << _fringe.back() << " curr prefix : " << _prefix << " bits : " << _prefix.getBits() );
+
+ // log() << "Diving to level : " << ( _fringe.back().size() / 2 + 1 ) << endl;
+
+ string lastSuffix = _fringe.back();
+ _fringe.pop_back();
+ _fringe.push_back( lastSuffix + "00" );
+ _fringe.push_back( lastSuffix + "01" );
+ _fringe.push_back( lastSuffix + "11" );
+ _fringe.push_back( lastSuffix + "10" );
+
+ continue;
+ }
+
+ // Restart our search from a diff box.
+ _state = START;
+
+ assert( ! onlyExpand );
+
+ assert( _found <= 0x7fffffff );
+ fillStack( maxFound - _foundInExp, maxAdded - static_cast<int>(_found) );
+
+ // When we return from the recursive fillStack call, we'll either have checked enough points or
+ // be entirely done. Max recurse depth is < 8 * 16.
+
+ // If we're maxed out on points, return
+ if( _foundInExp >= maxFound || _found >= maxAdded ) {
+ // Make sure we'll come back to add more points
+ assert( _state == DOING_EXPAND );
+ return;
+ }
+
+ // Otherwise we must be finished to return
+ assert( _state == DONE );
+ return;
+
+ }
+
+ }
+
+ // Finished with neighbors
+ _state = DONE;
+ }
- virtual void addSpecific( const KeyNode& node , double d ) {
- if ( _cur.isEmpty() )
- _cur = GeoPoint( node , d );
- else
- _stack.push_back( GeoPoint( node , d ) );
+ }
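The neighbor loop above maps _neighbor = 0..8 onto the 3x3 grid of box offsets via i = _neighbor / 3 - 1 and j = _neighbor % 3 - 1, with (0,0) skipped as the center box. A quick illustrative enumeration:

    #include <cstdio>

    int main() {
        for ( int n = 0; n < 9; n++ ) {
            int i = ( n / 3 ) - 1;   // -1, 0, +1 across rows
            int j = ( n % 3 ) - 1;   // -1, 0, +1 across columns
            std::printf( "neighbor %d -> offset (%+d,%+d)%s\n", n, i, j,
                         ( i == 0 && j == 0 ) ? " (center, skipped)" : "" );
        }
        return 0;
    }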
+
+ // The initial geo hash box for our first expansion
+ virtual GeoHash expandStartHash() = 0;
+
+ // Whether the current box width is big enough for our search area
+ virtual bool fitsInBox( double width ) = 0;
+
+ // The amount the current box overlaps our search area
+ virtual double intersectsBox( Box& cur ) = 0;
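These three hooks are the customization seam: a concrete browse steers the generic expand-then-visit-neighbors loop purely through its answers. A standalone toy of the pattern, using hypothetical names rather than the real GeoBrowse API, collapsed to one dimension:

    #include <cstdio>

    struct ToyBrowse {                       // cf. the three virtuals above
        virtual ~ToyBrowse() {}
        virtual double startEdge() = 0;
        virtual bool fitsInBox( double width ) = 0;
        virtual bool overlaps( double lo, double hi ) = 0;

        void run() {
            double edge = startEdge();
            while ( ! fitsInBox( edge ) ) edge *= 2;   // cf. _prefix.up()
            std::printf( "scan center cell [0, %g]\n", edge );
            // 1-D stand-ins for the neighbor cells of the center
            if ( overlaps( -edge, 0.0 ) )     std::printf( "scan left neighbor\n" );
            if ( overlaps( edge, 2 * edge ) ) std::printf( "scan right neighbor\n" );
        }
    };

    struct RadiusBrowse : public ToyBrowse {
        double _radius;
        explicit RadiusBrowse( double r ) : _radius( r ) {}
        virtual double startEdge() { return 1.0 / 1024; }
        virtual bool fitsInBox( double width ) { return width >= _radius; }
        virtual bool overlaps( double lo, double hi ) { return lo < _radius && hi > -_radius; }
    };

    int main() {
        RadiusBrowse( 0.01 ).run();
        return 0;
    }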
+
+ virtual int addSpecific( const GeoKeyNode& node , const Point& keyP , bool onBounds , double keyD , bool newDoc ) {
+
+ int found = 0;
+
+ // We need to handle every possible point in this method, even those not in the key value,
+ // so that we don't have to track which hashes we've already seen.
+ if( ! newDoc ){
+ // log() << "Already handled doc!" << endl;
+ return 0;
+ }
+
+ if( _uniqueDocs && ! onBounds ) {
+ // log() << "Added ind to " << _type << endl;
+ _stack.push_front( GeoPoint( node ) );
+ found++;
+ }
+ else {
+ // We now handle every possible point in the document, even those not in the key value,
+ // since we're iterating through them anyway; this saves us from having to track the hashes
+ // we've seen per-doc
+
+ // If we're filtering by hash, get the original
+ bool expensiveExact = expensiveExactCheck();
+
+ vector< BSONObj > locs;
+ getPointsFor( node._key, node.recordLoc.obj(), locs, true );
+ for( vector< BSONObj >::iterator i = locs.begin(); i != locs.end(); ++i ){
+
+ double d = -1;
+ Point p( *i );
+
+ // We can avoid exact document checks by redoing approx checks,
+ // if the exact checks are more expensive.
+ bool needExact = true;
+ if( expensiveExact ){
+ assert( false );
+ KeyResult result = approxKeyCheck( p, d );
+ if( result == BAD ) continue;
+ else if( result == GOOD ) needExact = false;
+ }
+
+ if( ! needExact || exactDocCheck( p, d ) ){
+ // log() << "Added mult to " << _type << endl;
+ _stack.push_front( GeoPoint( node ) );
+ found++;
+ // If returning unique, just exit after first point is added
+ if( _uniqueDocs ) break;
+ }
+ }
+ }
+
+ if ( _cur.isEmpty() && _stack.size() > 0 ){
+ _cur = _stack.front();
+ _stack.pop_front();
+ }
+
+ return found;
}
virtual long long nscanned() {
@@ -1244,6 +1560,35 @@ namespace mongo {
return _nscanned;
}
+ virtual void explainDetails( BSONObjBuilder& b ){
+ b << "keysChecked" << _keysChecked;
+ b << "lookedAt" << _lookedAt;
+ b << "matchesPerfd" << _matchesPerfd;
+ b << "objectsLoaded" << _objectsLoaded;
+ b << "pointsLoaded" << _pointsLoaded;
+ }
+
+ virtual BSONObj prettyIndexBounds() const {
+
+ vector<GeoHash>::const_iterator i = _expPrefixes.end();
+ if( _expPrefixes.size() > 0 && *(--i) != *( _expPrefix.get() ) )
+ _expPrefixes.push_back( *( _expPrefix.get() ) );
+
+ BSONObjBuilder bob;
+ BSONArrayBuilder bab;
+ for( i = _expPrefixes.begin(); i != _expPrefixes.end(); ++i ){
+ bab << Box( _g, *i ).toBSON();
+ }
+ bob << _g->_geo << bab.arr();
+
+ return bob.obj();
+
+ }
+
+ void notePrefix() {
+ _expPrefixes.push_back( _prefix );
+ }
+
string _type;
BSONObj _filter;
list<GeoPoint> _stack;
@@ -1253,189 +1598,695 @@ namespace mongo {
long long _nscanned;
+ // The current box we're expanding (-1 is first/center box)
+ int _neighbor;
+
+ // The points we've found so far
+ // TODO: Long long?
+ int _foundInExp;
+
+ // The current hash prefix we're expanding and the center-box hash prefix
+ GeoHash _prefix;
+ shared_ptr<GeoHash> _lastPrefix;
+ GeoHash _centerPrefix;
+ list<string> _fringe;
+ int recurseDepth;
+ Box _centerBox;
+
+ // Start and end of our search range in the current box
+ BtreeLocation _min;
+ BtreeLocation _max;
+
+ shared_ptr<GeoHash> _expPrefix;
+ mutable vector<GeoHash> _expPrefixes;
+
};
- class GeoCircleBrowse : public GeoBrowse {
+
+ class GeoHopper : public GeoBrowse {
public:
+ typedef multiset<GeoPoint> Holder;
- enum State {
- START ,
- DOING_EXPAND ,
- DOING_AROUND ,
- DONE
- } _state;
+ GeoHopper( const Geo2dType * g , unsigned max , const Point& n , const BSONObj& filter = BSONObj() , double maxDistance = numeric_limits<double>::max() , GeoDistType type=GEO_PLAIN, bool uniqueDocs = false, bool needDistance = true )
+ : GeoBrowse( g, "search", filter, uniqueDocs, needDistance ), _max( max ) , _near( n ), _maxDistance( maxDistance ), _type( type ), _distError( type == GEO_PLAIN ? g->_error : g->_errorSphere ), _farthest(0)
+ {}
- GeoCircleBrowse( const Geo2dType * g , const BSONObj& circle , BSONObj filter = BSONObj() , const string& type="$center")
- : GeoBrowse( g , "circle" , filter ) {
+ virtual KeyResult approxKeyCheck( const Point& p, double& d ) {
- uassert( 13060 , "$center needs 2 fields (middle,max distance)" , circle.nFields() == 2 );
- BSONObjIterator i(circle);
- BSONElement center = i.next();
- _start = g->_tohash(center);
- _startPt = Point(center);
- _prefix = _start;
- _maxDistance = i.next().numberDouble();
- uassert( 13061 , "need a max distance > 0 " , _maxDistance > 0 );
- _maxDistance += g->_error;
+ // Always check approximate distance, since it lets us avoid doing
+ // checks of the rest of the object if it succeeds
- _state = START;
- _found = 0;
+ switch (_type) {
+ case GEO_PLAIN:
+ d = _near.distance( p );
+ break;
+ case GEO_SPHERE:
+ checkEarthBounds( p );
+ d = spheredist_deg( _near, p );
+ break;
+ default: assert( false );
+ }
+ assert( d >= 0 );
- if (type == "$center") {
- _type = GEO_PLAIN;
- _xScanDistance = _maxDistance;
- _yScanDistance = _maxDistance;
+ GEODEBUG( "\t\t\t\t\t\t\t checkDistance " << _near.toString()
+ << "\t" << p.toString() << "\t" << d
+ << " farthest: " << farthest() );
+
+ // If we still need more points, the border is the max search distance; otherwise the current farthest
+ double borderDist = ( _points.size() < _max ? _maxDistance : farthest() );
+
+ if( d >= borderDist - 2 * _distError && d <= borderDist + 2 * _distError ) return BORDER;
+ else return d < borderDist ? GOOD : BAD;
+
+ }
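The BORDER band above reflects hash quantization: the unhashed key point may be off by up to the index error, so a distance within 2 * _distError of the cutoff cannot be classified from the key alone. A toy classifier with the same shape, values illustrative:

    #include <cstdio>

    enum ToyResult { TOY_BAD, TOY_BORDER, TOY_GOOD };

    // Classify an approximate distance d against a cutoff, treating the
    // +/- 2*err band around the cutoff as undecidable (cf. approxKeyCheck).
    ToyResult classify( double d, double cutoff, double err ) {
        if ( d >= cutoff - 2 * err && d <= cutoff + 2 * err ) return TOY_BORDER;
        return d < cutoff ? TOY_GOOD : TOY_BAD;
    }

    int main() {
        std::printf( "%d %d %d\n",
                     classify( 0.50, 1.0, 0.01 ),     // TOY_GOOD
                     classify( 0.99, 1.0, 0.01 ),     // TOY_BORDER
                     classify( 1.50, 1.0, 0.01 ) );   // TOY_BAD
        return 0;
    }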
+
+ virtual bool exactDocCheck( const Point& p, double& d ){
+
+ bool within = false;
+
+ // Get the appropriate distance for the type
+ switch ( _type ) {
+ case GEO_PLAIN:
+ d = _near.distance( p );
+ within = _near.distanceWithin( p, _maxDistance );
+ break;
+ case GEO_SPHERE:
+ checkEarthBounds( p );
+ d = spheredist_deg( _near, p );
+ within = ( d <= _maxDistance );
+ break;
+ default: assert( false );
}
- else if (type == "$centerSphere") {
- uassert(13461, "Spherical MaxDistance > PI. Are you sure you are using radians?", _maxDistance < M_PI);
- _type = GEO_SPHERE;
- _yScanDistance = rad2deg(_maxDistance);
- _xScanDistance = computeXScanDistance(_startPt._y, _yScanDistance);
+ return within;
+ }
- uassert(13462, "Spherical distance would require wrapping, which isn't implemented yet",
- (_startPt._x + _xScanDistance < 180) && (_startPt._x - _xScanDistance > -180) &&
- (_startPt._y + _yScanDistance < 90) && (_startPt._y - _yScanDistance > -90));
+ // Always in distance units, whether radians or normal
+ double farthest() const {
+ return _farthest;
+ }
+
+ virtual int addSpecific( const GeoKeyNode& node, const Point& keyP, bool onBounds, double keyD, bool newDoc ) {
- GEODEBUGPRINT(_maxDistance);
- GEODEBUGPRINT(_xScanDistance);
- GEODEBUGPRINT(_yScanDistance);
+ // Unique documents
+
+ GeoPoint newPoint( node, keyD, false );
+
+ int prevSize = _points.size();
+
+ // STEP 1 : Remove old duplicate points from the set if needed
+ if( _uniqueDocs ){
+
+ // Lookup old point with same doc
+ map< DiskLoc , Holder::iterator >::iterator oldPointIt = _seenPts.find( newPoint.loc() );
+
+ if( oldPointIt != _seenPts.end() ){
+ const GeoPoint& oldPoint = *(oldPointIt->second);
+ // We don't need to care if we've already seen this same approx pt or better,
+ // or we've already gone to disk once for the point
+ if( oldPoint < newPoint ){
+ GEODEBUG( "\t\tOld point closer than new point" );
+ return 0;
+ }
+ GEODEBUG( "\t\tErasing old point " << oldPointIt->first.obj() );
+ _points.erase( oldPointIt->second );
+ }
}
- else {
- uassert(13460, "invalid $center query type: " + type, false);
+
+ Holder::iterator newIt = _points.insert( newPoint );
+ if( _uniqueDocs ) _seenPts[ newPoint.loc() ] = newIt;
+
+ GEODEBUG( "\t\tInserted new point " << newPoint.toString() << " approx : " << keyD );
+
+ assert( _max > 0 );
+
+ Holder::iterator lastPtIt = _points.end();
+ lastPtIt--;
+ _farthest = lastPtIt->distance() + 2 * _distError;
+
+ return _points.size() - prevSize;
+
+ }
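The unique-docs path above keeps at most one stored point per document, evicting the stored point whenever a closer one for the same DiskLoc arrives. A standalone toy of that map-plus-multiset pairing, names illustrative:

    #include <map>
    #include <set>

    typedef std::multiset<double> Holder;          // sorted candidate distances

    struct BestPerDoc {                            // cf. _points / _seenPts
        Holder points;
        std::map<long, Holder::iterator> seenPts;  // doc id -> stored point

        void add( long doc, double dist ) {
            std::map<long, Holder::iterator>::iterator it = seenPts.find( doc );
            if ( it != seenPts.end() ) {
                if ( *it->second <= dist ) return; // stored point already better
                points.erase( it->second );        // evict the worse duplicate
            }
            seenPts[ doc ] = points.insert( dist );
        }
    };

    int main() {
        BestPerDoc b;
        b.add( 1, 5.0 );
        b.add( 1, 3.0 );                           // replaces the 5.0 entry
        return ( b.points.size() == 1 && *b.points.begin() == 3.0 ) ? 0 : 1;
    }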
+
+ // Removes extra points from end of _points set.
+ // Check can be a bit costly if we have lots of exact points near borders,
+ // so we'll only do this every once in a while.
+ void processExtraPoints(){
+
+ if( _points.size() == 0 ) return;
+
+ int prevSize = _points.size();
+
+ // Erase all points from the set with a position >= _max *and*
+ // whose distance isn't close to the _max - 1 position distance
+
+ int numToErase = _points.size() - _max;
+ if( numToErase < 0 ) numToErase = 0;
+
+ // Get the first point definitely in the _points array
+ Holder::iterator startErase = _points.end();
+ for( int i = 0; i < numToErase + 1; i++ ) startErase--;
+ _farthest = startErase->distance() + 2 * _distError;
+
+ GEODEBUG( "\t\tPotentially erasing " << numToErase << " points, " << " size : " << _points.size() << " max : " << _max << " dist : " << startErase->distance() << " farthest dist : " << _farthest << " from error : " << _distError );
+
+ startErase++;
+ while( numToErase > 0 && startErase->distance() <= _farthest ){
+ GEODEBUG( "\t\tNot erasing point " << startErase->toString() );
+ numToErase--;
+ startErase++;
+ assert( startErase != _points.end() || numToErase == 0 );
}
- ok();
+ if( _uniqueDocs ){
+ for( Holder::iterator i = startErase; i != _points.end(); ++i )
+ _seenPts.erase( i->loc() );
+ }
+
+ _points.erase( startErase, _points.end() );
+
+ int diff = _points.size() - prevSize;
+ if( diff > 0 ) _found += diff;
+ else _found -= -diff;
+
}
- virtual bool moreToDo() {
- return _state != DONE;
+ unsigned _max;
+ Point _near;
+ Holder _points;
+ double _maxDistance;
+ GeoDistType _type;
+ double _distError;
+ double _farthest;
+
+ map< DiskLoc , Holder::iterator > _seenPts;
+
+ };
+
+
+
+ class GeoSearch : public GeoHopper {
+ public:
+ GeoSearch( const Geo2dType * g , const Point& startPt , int numWanted=100 , BSONObj filter=BSONObj() , double maxDistance = numeric_limits<double>::max() , GeoDistType type=GEO_PLAIN, bool uniqueDocs = false, bool needDistance = false )
+ : GeoHopper( g , numWanted , startPt , filter , maxDistance, type, uniqueDocs, needDistance ),
+ _start( g->hash( startPt._x, startPt._y ) ),
+ // TODO: Remove numWanted...
+ _numWanted( numWanted ),
+ _type(type)
+ {
+
+ assert( g->getDetails() );
+ _nscanned = 0;
+ _found = 0;
+
+ if( _maxDistance < 0 ){
+ _scanDistance = numeric_limits<double>::max();
+ }
+ else if (type == GEO_PLAIN) {
+ _scanDistance = maxDistance + _spec->_error;
+ }
+ else if (type == GEO_SPHERE) {
+ checkEarthBounds( startPt );
+ // TODO: consider splitting into x and y scan distances
+ _scanDistance = computeXScanDistance( startPt._y, rad2deg( _maxDistance ) + _spec->_error );
+ }
+
+ assert( _scanDistance > 0 );
+
}
- virtual void fillStack() {
+ void exec() {
- if ( _state == START ) {
- if ( ! BtreeLocation::initial( *_id , _spec , _min , _max ,
- _prefix , _found , this ) ) {
- _state = DONE;
- return;
+ if( _numWanted == 0 ) return;
+
+ /*
+ * Search algorithm
+ * 1) use geohash prefix to find X items
+ * 2) compute max distance from want to an item
+ * 3) find optimal set of boxes that complete circle
+ * 4) use regular btree cursors to scan those boxes
+ */
+
+#ifdef GEODEBUGGING
+
+ log() << "start near search for " << _numWanted << " points near " << _near << " (max dist " << _maxDistance << ")" << endl;
+
+#endif
+
+ // Part 1
+ {
+ do {
+ long long f = found();
+ assert( f <= 0x7fffffff );
+ fillStack( maxPointsHeuristic, _numWanted - static_cast<int>(f) , true );
+ processExtraPoints();
+ } while( _state != DONE && _state != DONE_NEIGHBOR &&
+ found() < _numWanted &&
+ (! _prefix.constrains() || _g->sizeEdge( _prefix ) <= _scanDistance ) );
+
+ // If we couldn't scan or scanned everything, we're done
+ if( _state == DONE ){
+ expandEndPoints();
+ return;
+ }
+ }
+
+#ifdef GEODEBUGGING
+
+ log() << "part 1 of near search completed, found " << found() << " points (out of " << _foundInExp << " scanned)"
+ << " in expanded region " << _prefix << " @ " << Box( _g, _prefix )
+ << " with furthest distance " << farthest() << endl;
+
+#endif
+
+ // Part 2
+ {
+
+ // Find farthest distance for completion scan
+ double farDist = farthest();
+ if( found() < _numWanted ) {
+ // Not enough found in Phase 1
+ farDist = _scanDistance;
}
- _state = DOING_EXPAND;
- }
+ else if ( _type == GEO_PLAIN ) {
+ // Enough found, but need to search neighbor boxes
+ farDist += _spec->_error;
+ }
+ else if ( _type == GEO_SPHERE ) {
+ // Enough found, but need to search neighbor boxes
+ farDist = std::min( _scanDistance, computeXScanDistance( _near._y, rad2deg( farDist ) ) + 2 * _spec->_error );
+ }
+ assert( farDist >= 0 );
+ GEODEBUGPRINT( farDist );
+ // Find the box that includes all the points we need to return
+ _want = Box( _near._x - farDist , _near._y - farDist , farDist * 2 );
+ GEODEBUGPRINT( _want.toString() );
- if ( _state == DOING_AROUND ) {
- // TODO could rework and return rather than looping
- for (int i=-1; i<=1; i++) {
- for (int j=-1; j<=1; j++) {
- if (i == 0 && j == 0)
- continue; // main box
+ // log() << "Found : " << found() << " wanted : " << _numWanted << " Far distance : " << farDist << " box : " << _want << endl;
- GeoHash newBox = _prefix;
- newBox.move(i, j);
+ // Remember the far distance for further scans
+ _scanDistance = farDist;
- PREFIXDEBUG(newBox, _g);
- if (needToCheckBox(newBox)) {
- // TODO consider splitting into quadrants
- getPointsForPrefix(newBox);
- }
- else {
- GEODEBUG("skipping box");
- }
- }
+ // Reset the search, our distances have probably changed
+ if( _state == DONE_NEIGHBOR ){
+ _state = DOING_EXPAND;
+ _neighbor = -1;
}
- _state = DONE;
+#ifdef GEODEBUGGING
+
+ log() << "resetting search with start at " << _start << " (edge length " << _g->sizeEdge( _start ) << ")" << endl;
+
+#endif
+
+ // Do regular search in the full region
+ do {
+ fillStack( maxPointsHeuristic );
+ processExtraPoints();
+ }
+ while( _state != DONE );
+
+ }
+
+ GEODEBUG( "done near search with " << _points.size() << " points " );
+
+ expandEndPoints();
+
+ }
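A rough illustration of the two phases above, all values hypothetical: phase 1 buffers approximate candidates while expanding boxes, and phase 2 pads the farthest buffered distance by the index error to bound the square region that must still be rescanned for closer points:

    #include <algorithm>
    #include <cstdio>
    #include <vector>

    int main() {
        const int numWanted = 3;
        const double err = 0.01;           // stand-in for _spec->_error

        // Pretend phase 1 buffered these approximate distances:
        std::vector<double> dists;
        dists.push_back( 0.4 );
        dists.push_back( 0.1 );
        dists.push_back( 0.7 );
        std::sort( dists.begin(), dists.end() );

        // Phase 2: farthest kept candidate, padded by the error, bounds the
        // square that could still contain closer points.
        double farDist = dists[ numWanted - 1 ] + err;
        std::printf( "rescan [%.2f, %.2f] x [%.2f, %.2f] around the query point\n",
                     -farDist, farDist, -farDist, farDist );
        return 0;
    }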
+
+ void addExactPoints( const GeoPoint& pt, Holder& points, bool force ){
+ int before, after;
+ addExactPoints( pt, points, before, after, force );
+ }
+
+ void addExactPoints( const GeoPoint& pt, Holder& points, int& before, int& after, bool force ){
+
+ before = 0;
+ after = 0;
+
+ GEODEBUG( "Adding exact points for " << pt.toString() );
+
+ if( pt.isExact() ){
+ if( force ) points.insert( pt );
return;
}
- if (_state == DOING_EXPAND) {
- GEODEBUG( "circle prefix [" << _prefix << "]" );
- PREFIXDEBUG(_prefix, _g);
+ vector<BSONObj> locs;
+ getPointsFor( pt.key(), pt.obj(), locs, _uniqueDocs );
+
+ GeoPoint nearestPt( pt, -1, true );
+
+ for( vector<BSONObj>::iterator i = locs.begin(); i != locs.end(); i++ ){
- while ( _min.hasPrefix( _prefix ) && _min.advance( -1 , _found , this ) );
- while ( _max.hasPrefix( _prefix ) && _max.advance( 1 , _found , this ) );
+ Point loc( *i );
- if ( ! _prefix.constrains() ) {
- GEODEBUG( "\t exhausted the btree" );
- _state = DONE;
- return;
+ double d;
+ if( ! exactDocCheck( loc, d ) ) continue;
+
+ if( _uniqueDocs && ( nearestPt.distance() < 0 || d < nearestPt.distance() ) ){
+ nearestPt._distance = d;
+ nearestPt._pt = *i;
+ continue;
+ }
+ else if( ! _uniqueDocs ){
+ GeoPoint exactPt( pt, d, true );
+ exactPt._pt = *i;
+ GEODEBUG( "Inserting exact pt " << exactPt.toString() << " for " << pt.toString() << " exact : " << d << " is less? " << ( exactPt < pt ) << " bits : " << _g->_bits );
+ points.insert( exactPt );
+ exactPt < pt ? before++ : after++;
}
- Point ll (_g, _prefix);
- GeoHash trHash = _prefix;
- trHash.move( 1 , 1 );
- Point tr (_g, trHash);
- double sideLen = fabs(tr._x - ll._x);
+ }
+
+ if( _uniqueDocs && nearestPt.distance() >= 0 ){
+ GEODEBUG( "Inserting unique exact pt " << nearestPt.toString() << " for " << pt.toString() << " exact : " << nearestPt.distance() << " is less? " << ( nearestPt < pt ) << " bits : " << _g->_bits );
+ points.insert( nearestPt );
+ if( nearestPt < pt ) before++;
+ else after++;
+ }
+
+ }
+
+ // TODO: Refactor this back into the holder class; allow it to run periodically when we're seeing a lot of pts
+ void expandEndPoints( bool finish = true ){
+
+ processExtraPoints();
+
+ // All points in array *could* be in maxDistance
+
+ // Step 1 : Trim points to max size
+ // TODO: This check does little for now, but is a skeleton for future work on incremental
+ // $near searches
+ if( _max > 0 ){
+
+ int numToErase = _points.size() - _max;
+
+ if( numToErase > 0 ){
+
+ Holder tested;
+
+ // Work backward through all points we're not sure belong in the set
+ Holder::iterator maybePointIt = _points.end();
+ maybePointIt--;
+ double approxMin = maybePointIt->distance() - 2 * _distError;
+
+ GEODEBUG( "\t\tNeed to erase " << numToErase << " max : " << _max << " min dist " << approxMin << " error : " << _distError << " starting from : " << (*maybePointIt).toString() );
+
+ // Replace each uncertain point with its exact version(s) in the tested set
+ int erased = 0;
+ while( _points.size() > 0 && ( maybePointIt->distance() >= approxMin || erased < numToErase ) ){
+
+ Holder::iterator current = maybePointIt--;
+
+ addExactPoints( *current, tested, true );
+ _points.erase( current );
+ erased++;
+
+ if( tested.size() )
+ approxMin = tested.begin()->distance() - 2 * _distError;
- if (sideLen > std::max(_xScanDistance, _yScanDistance)) { // circle must be contained by surrounding squares
- if ( (ll._x + _xScanDistance < _startPt._x && ll._y + _yScanDistance < _startPt._y) &&
- (tr._x - _xScanDistance > _startPt._x && tr._y - _yScanDistance > _startPt._y) ) {
- GEODEBUG("square fully contains circle");
- _state = DONE;
}
- else if (_prefix.getBits() > 1) {
- GEODEBUG("checking surrounding squares");
- _state = DOING_AROUND;
+
+ GEODEBUG( "\t\tEnding search at point " << ( _points.size() == 0 ? "(beginning)" : maybePointIt->toString() ) );
+
+ int numToAddBack = erased - numToErase;
+ assert( numToAddBack >= 0 );
+
+ GEODEBUG( "\t\tNum tested valid : " << tested.size() << " erased : " << erased << " added back : " << numToAddBack );
+
+#ifdef GEODEBUGGING
+ for( Holder::iterator it = tested.begin(); it != tested.end(); it++ ){
+ log() << "Tested Point: " << *it << endl;
}
- else {
- GEODEBUG("using simple search");
- _prefix = _prefix.up();
+#endif
+ Holder::iterator testedIt = tested.begin();
+ for( int i = 0; i < numToAddBack && testedIt != tested.end(); i++ ){
+ _points.insert( *testedIt );
+ testedIt++;
}
}
- else {
- _prefix = _prefix.up();
+ }
+
+#ifdef GEODEBUGGING
+ for( Holder::iterator it = _points.begin(); it != _points.end(); it++ ){
+ log() << "Point: " << *it << endl;
+ }
+#endif
+ // We've now trimmed first set of unneeded points
+
+ GEODEBUG( "\t\t Start expanding, num points : " << _points.size() << " max : " << _max );
+
+ // Step 2: iterate through all points and add as needed
+
+ unsigned expandedPoints = 0;
+ Holder::iterator it = _points.begin();
+ double expandWindowEnd = -1;
+ while( it != _points.end() ){
+ const GeoPoint& currPt = *it;
+
+ // TODO: If one point is exact, maybe not 2 * _distError
+
+ // See if we're in an expand window
+ bool inWindow = currPt.distance() <= expandWindowEnd;
+ // If we're not, and we're done with points, break
+ if( ! inWindow && expandedPoints >= _max ) break;
+
+ bool expandApprox = ! currPt.isExact() && ( ! _uniqueDocs || ( finish && _needDistance ) || inWindow );
+
+ if( expandApprox ){
+
+ // Add new point(s)
+ // These will only be added in a radius of 2 * _distError around the current point,
+ // so should not affect previously valid points.
+ int before, after;
+ addExactPoints( currPt, _points, before, after, false );
+ expandedPoints += before;
+
+ if( _max > 0 && expandedPoints < _max )
+ expandWindowEnd = currPt.distance() + 2 * _distError;
+
+ // Iterate to the next point
+ Holder::iterator current = it++;
+ // Erase the current point
+ _points.erase( current );
+
+ }
+ else{
+ expandedPoints++;
+ it++;
}
+ }
- return;
+ GEODEBUG( "\t\tFinished expanding, num points : " << _points.size() << " max : " << _max );
+
+ // Finish
+ // TODO: Don't really need to trim?
+ for( ; expandedPoints > _max; expandedPoints-- ) it--;
+ _points.erase( it, _points.end() );
+
+#ifdef GEODEBUGGING
+ for( Holder::iterator it = _points.begin(); it != _points.end(); it++ ){
+ log() << "Point: " << *it << endl;
}
+#endif
+ }
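Reviewer note: the two-step trim/expand above is easier to follow in isolation. Below is a minimal standalone sketch of Step 1's windowed trim, with plain distances standing in for GeoPoints and made-up values playing the roles of _max and _distError (all names are illustrative, not from this change):

    #include <cstdio>
    #include <set>

    typedef std::multiset<double> Holder;      // distances stand in for GeoPoints

    int main() {
        const int maxPoints = 3;               // plays the role of _max
        const double distError = 0.05;         // plays the role of _distError
        Holder points;
        double init[] = { 1.0, 1.2, 1.21, 1.23, 2.0 };
        for ( int i = 0; i < 5; i++ ) points.insert( init[i] );

        int numToErase = (int)points.size() - maxPoints;
        Holder tested;                         // exact versions of borderline points
        int erased = 0;
        Holder::iterator it = --points.end();
        double approxMin = *it - 2 * distError;

        // Erase from the far end; anything >= approxMin could still belong in the
        // result after an exact geometry check, so keep testing past numToErase.
        while ( !points.empty() && ( *it >= approxMin || erased < numToErase ) ) {
            Holder::iterator cur = it;
            bool atBegin = ( it == points.begin() );
            if ( !atBegin ) --it;
            tested.insert( *cur );             // the real code re-checks exact geometry here
            points.erase( cur );
            erased++;
            approxMin = *tested.begin() - 2 * distError;
            if ( atBegin ) break;
        }

        // Add back the nearest tested points we over-erased.
        Holder::iterator t = tested.begin();
        for ( int i = 0; i < erased - numToErase && t != tested.end(); i++, ++t )
            points.insert( *t );

        printf( "kept %d points\n", (int)points.size() );  // kept 3 points
    }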
- /* Clients are expected to use moreToDo before calling
- * fillStack, so DONE is checked for there. If any more
- * State values are defined, you should handle them
- * here. */
- assert(0);
+ virtual GeoHash expandStartHash(){
+ return _start;
}
- bool needToCheckBox(const GeoHash& prefix) {
- Point ll (_g, prefix);
- if (fabs(ll._x - _startPt._x) <= _xScanDistance) return true;
- if (fabs(ll._y - _startPt._y) <= _yScanDistance) return true;
+ // Whether the current box width is big enough for our search area
+ virtual bool fitsInBox( double width ){
+ return width >= _scanDistance;
+ }
+
+ // Whether the current box overlaps our search area
+ virtual double intersectsBox( Box& cur ){
+ return cur.intersects( _want );
+ }
+
+ GeoHash _start;
+ int _numWanted;
+ double _scanDistance;
+
+ long long _nscanned;
+ int _found;
+ GeoDistType _type;
+
+ Box _want;
+ };
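Reviewer note: expandStartHash(), fitsInBox() and intersectsBox() are the whole contract a shape implements for the shared expansion logic (which lives in the GeoBrowse base class, outside this excerpt). A toy standalone version of that expansion, with illustrative names and values only:

    #include <cstdio>

    int main() {
        double scanDistance = 3.7;   // plays the role of _scanDistance
        double cellEdge = 1.0;       // edge length of the current geohash cell
        int levelsUp = 0;

        // fitsInBox( width ): grow the cell until width >= scanDistance.
        // GeoHash::up() halves the precision, doubling the cell edge.
        while ( cellEdge < scanDistance ) {
            cellEdge *= 2;
            levelsUp++;
        }
        printf( "expanded %d levels, cell edge %.1f\n", levelsUp, cellEdge );

        // Scan the center cell plus its 8 neighbors; a cell whose
        // intersectsBox() result is <= 0 would simply be skipped.
        for ( int dx = -1; dx <= 1; dx++ )
            for ( int dy = -1; dy <= 1; dy++ )
                printf( "scan cell offset (%d,%d)\n", dx, dy );
    }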
+
+ class GeoSearchCursor : public GeoCursorBase {
+ public:
+
+ GeoSearchCursor( shared_ptr<GeoSearch> s )
+ : GeoCursorBase( s->_spec ) ,
+ _s( s ) , _cur( s->_points.begin() ) , _end( s->_points.end() ), _nscanned() {
+ if ( _cur != _end ) {
+ ++_nscanned;
+ }
+ }
- GeoHash trHash = prefix;
- trHash.move( 1 , 1 );
- Point tr (_g, trHash);
+ virtual ~GeoSearchCursor() {}
- if (fabs(tr._x - _startPt._x) <= _xScanDistance) return true;
- if (fabs(tr._y - _startPt._y) <= _yScanDistance) return true;
+ virtual bool ok() {
+ return _cur != _end;
+ }
+ virtual Record* _current() { assert(ok()); return _cur->_loc.rec(); }
+ virtual BSONObj current() { assert(ok()); return _cur->_o; }
+ virtual DiskLoc currLoc() { assert(ok()); return _cur->_loc; }
+ virtual bool advance() {
+ if( ok() ){
+ _cur++;
+ incNscanned();
+ return ok();
+ }
return false;
}
+ virtual BSONObj currKey() const { return _cur->_key; }
- void getPointsForPrefix(const GeoHash& prefix) {
- if ( ! BtreeLocation::initial( *_id , _spec , _min , _max , prefix , _found , this ) ) {
- return;
+ virtual string toString() {
+ return "GeoSearchCursor";
+ }
+
+
+ virtual BSONObj prettyStartKey() const {
+ return BSON( _s->_g->_geo << _s->_prefix.toString() );
+ }
+ virtual BSONObj prettyEndKey() const {
+ GeoHash temp = _s->_prefix;
+ temp.move( 1 , 1 );
+ return BSON( _s->_g->_geo << temp.toString() );
+ }
+
+ virtual long long nscanned() { return _nscanned; }
+
+ virtual CoveredIndexMatcher* matcher() const {
+ if( _s->_matcher.get() ) return _s->_matcher.get();
+ else return emptyMatcher.get();
+ }
+
+ virtual shared_ptr< CoveredIndexMatcher > matcherPtr() const {
+ if( _s->_matcher.get() ) return _s->_matcher;
+ else return emptyMatcher;
+ }
+
+ shared_ptr<GeoSearch> _s;
+ GeoHopper::Holder::iterator _cur;
+ GeoHopper::Holder::iterator _end;
+
+ void incNscanned() { if ( ok() ) { ++_nscanned; } }
+ long long _nscanned;
+ };
+
+ class GeoCircleBrowse : public GeoBrowse {
+ public:
+
+ GeoCircleBrowse( const Geo2dType * g , const BSONObj& circle , BSONObj filter = BSONObj() , const string& type="$center", bool uniqueDocs = true )
+ : GeoBrowse( g , "circle" , filter, uniqueDocs ) {
+
+ uassert( 13060 , "$center needs 2 fields (middle,max distance)" , circle.nFields() == 2 );
+
+ BSONObjIterator i(circle);
+ BSONElement center = i.next();
+
+ uassert( 13656 , "the first field of $center object must be a location object" , center.isABSONObj() );
+
+ // Get geohash and exact center point
+ // TODO: For wrapping search, may be useful to allow center points outside-of-bounds here.
+ // Calculating the nearest point as a hash start inside the region would then be required.
+ _start = g->_tohash(center);
+ _startPt = Point(center);
+
+ _maxDistance = i.next().numberDouble();
+ uassert( 13061 , "need a max distance >= 0 " , _maxDistance >= 0 );
+
+ if (type == "$center") {
+ // Look in box with bounds of maxDistance in either direction
+ _type = GEO_PLAIN;
+ _xScanDistance = _maxDistance + _g->_error;
+ _yScanDistance = _maxDistance + _g->_error;
+ }
+ else if (type == "$centerSphere") {
+ // Same, but compute maxDistance using spherical transform
+
+ uassert(13461, "Spherical MaxDistance > PI. Are you sure you are using radians?", _maxDistance < M_PI);
+ checkEarthBounds( _startPt );
+
+ _type = GEO_SPHERE;
+ _yScanDistance = rad2deg( _maxDistance ) + _g->_error;
+ _xScanDistance = computeXScanDistance(_startPt._y, _yScanDistance);
+
+ uassert(13462, "Spherical distance would require wrapping, which isn't implemented yet",
+ (_startPt._x + _xScanDistance < 180) && (_startPt._x - _xScanDistance > -180) &&
+ (_startPt._y + _yScanDistance < 90) && (_startPt._y - _yScanDistance > -90));
+ }
+ else {
+ uassert(13460, "invalid $center query type: " + type, false);
}
- while ( _min.hasPrefix( prefix ) && _min.advance( -1 , _found , this ) );
- while ( _max.hasPrefix( prefix ) && _max.advance( 1 , _found , this ) );
+ // Bounding box includes fudge factor.
+ // TODO: Is this correct, since fudge factor may be spherically transformed?
+ _bBox._min = Point( _startPt._x - _xScanDistance, _startPt._y - _yScanDistance );
+ _bBox._max = Point( _startPt._x + _xScanDistance, _startPt._y + _yScanDistance );
+
+ GEODEBUG( "Bounding box for circle query : " << _bBox.toString() << " (max distance : " << _maxDistance << ")" << " starting from " << _startPt.toString() );
+
+ ok();
}
+ virtual GeoHash expandStartHash() {
+ return _start;
+ }
+
+ virtual bool fitsInBox( double width ) {
+ return width >= std::max(_xScanDistance, _yScanDistance);
+ }
- virtual bool checkDistance( const GeoHash& h , double& d ) {
+ virtual double intersectsBox( Box& cur ) {
+ return cur.intersects( _bBox );
+ }
+
+ virtual KeyResult approxKeyCheck( const Point& p, double& d ) {
+
+ // Inexact hash distance checks.
+ double error = 0;
switch (_type) {
case GEO_PLAIN:
- d = _g->distance( _start , h );
+ d = _startPt.distance( p );
+ error = _g->_error;
+ break;
+ case GEO_SPHERE: {
+ checkEarthBounds( p );
+ d = spheredist_deg( _startPt, p );
+ error = _g->_errorSphere;
+ break;
+ }
+ default: assert( false );
+ }
+
+ // If our distance is in the error bounds...
+ if( d >= _maxDistance - error && d <= _maxDistance + error ) return BORDER;
+ return d > _maxDistance ? BAD : GOOD;
+ }
+
+ virtual bool exactDocCheck( const Point& p, double& d ){
+
+ switch (_type) {
+ case GEO_PLAIN: {
+ if( _startPt.distanceWithin( p, _maxDistance ) ) return true;
break;
+ }
case GEO_SPHERE:
- d = spheredist_deg(_startPt, Point(_g, h));
+ checkEarthBounds( p );
+ if( spheredist_deg( _startPt , p ) <= _maxDistance ) return true;
break;
- default:
- assert(0);
+ default: assert( false );
}
- GEODEBUG( "\t " << h << "\t" << d );
- return d <= _maxDistance;
+ return false;
}
GeoDistType _type;
@@ -1444,153 +2295,158 @@ namespace mongo {
double _maxDistance; // user input
 double _xScanDistance; // affected by GeoDistType
 double _yScanDistance; // affected by GeoDistType
-
- int _found;
-
- GeoHash _prefix;
- BtreeLocation _min;
- BtreeLocation _max;
+ Box _bBox;
};
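Reviewer note: the GOOD/BORDER/BAD split above is the core of the two-phase matching: key-derived positions carry a known error, so only distances outside the uncertainty band can be decided without loading the document. Standalone illustration (values and the enum ordering here are made up):

    #include <cstdio>

    enum KeyResult { BAD, BORDER, GOOD };   // ordering illustrative only

    // Same shape as approxKeyCheck above: distances inside
    // [maxDistance - error, maxDistance + error] cannot be decided from
    // the key alone, so they are flagged BORDER for an exact doc check.
    KeyResult approxCheck( double d, double maxDistance, double error ) {
        if ( d >= maxDistance - error && d <= maxDistance + error ) return BORDER;
        return d > maxDistance ? BAD : GOOD;
    }

    int main() {
        double max = 10.0, err = 0.01;
        printf( "%d %d %d\n",
                approxCheck( 9.5, max, err ),      // GOOD   (2)
                approxCheck( 10.005, max, err ),   // BORDER (1)
                approxCheck( 10.5, max, err ) );   // BAD    (0)
    }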
class GeoBoxBrowse : public GeoBrowse {
public:
- enum State {
- START ,
- DOING_EXPAND ,
- DONE
- } _state;
-
- GeoBoxBrowse( const Geo2dType * g , const BSONObj& box , BSONObj filter = BSONObj() )
- : GeoBrowse( g , "box" , filter ) {
+ GeoBoxBrowse( const Geo2dType * g , const BSONObj& box , BSONObj filter = BSONObj(), bool uniqueDocs = true )
+ : GeoBrowse( g , "box" , filter, uniqueDocs ) {
uassert( 13063 , "$box needs 2 fields (bottomLeft,topRight)" , box.nFields() == 2 );
+
+ // Initialize an *exact* box from the given obj.
BSONObjIterator i(box);
- _bl = g->_tohash( i.next() );
- _tr = g->_tohash( i.next() );
+ _want._min = Point( i.next() );
+ _want._max = Point( i.next() );
- _want._min = Point( _g , _bl );
- _want._max = Point( _g , _tr );
+ _wantRegion = _want;
+ _wantRegion.fudge( g ); // Make sure we check regions within the error bounds of the box we want
+ fixBox( g, _wantRegion );
+ fixBox( g, _want );
uassert( 13064 , "need an area > 0 " , _want.area() > 0 );
- _state = START;
- _found = 0;
-
Point center = _want.center();
- _prefix = _g->hash( center._x , center._y );
+ _start = _g->hash( center._x , center._y );
GEODEBUG( "center : " << center.toString() << "\t" << _prefix );
- {
- GeoHash a(0LL,32);
- GeoHash b(0LL,32);
- b.move(1,1);
- _fudge = _g->distance(a,b);
- }
-
- _wantLen = _fudge + std::max((_want._max._x - _want._min._x), (_want._max._y - _want._min._y));
+ _fudge = _g->_error;
+ _wantLen = _fudge +
+ std::max( ( _want._max._x - _want._min._x ) ,
+ ( _want._max._y - _want._min._y ) ) / 2;
ok();
}
- virtual bool moreToDo() {
- return _state != DONE;
+ void fixBox( const Geo2dType* g, Box& box ) {
+ if( box._min._x > box._max._x )
+ swap( box._min._x, box._max._x );
+ if( box._min._y > box._max._y )
+ swap( box._min._y, box._max._y );
+
+ double gMin = g->_min;
+ double gMax = g->_max;
+
+ if( box._min._x < gMin ) box._min._x = gMin;
+ if( box._min._y < gMin ) box._min._y = gMin;
+ if( box._max._x > gMax) box._max._x = gMax;
+ if( box._max._y > gMax ) box._max._y = gMax;
}
- virtual void fillStack() {
- if ( _state == START ) {
+ void swap( double& a, double& b ) {
+ double swap = a;
+ a = b;
+ b = swap;
+ }
- if ( ! BtreeLocation::initial( *_id , _spec , _min , _max ,
- _prefix , _found , this ) ) {
- _state = DONE;
- return;
- }
- _state = DOING_EXPAND;
- }
+ virtual GeoHash expandStartHash() {
+ return _start;
+ }
- if ( _state == DOING_EXPAND ) {
- int started = _found;
- while ( started == _found || _state == DONE ) {
- GEODEBUG( "box prefix [" << _prefix << "]" );
- while ( _min.hasPrefix( _prefix ) && _min.advance( -1 , _found , this ) );
- while ( _max.hasPrefix( _prefix ) && _max.advance( 1 , _found , this ) );
+ virtual bool fitsInBox( double width ) {
+ return width >= _wantLen;
+ }
- if ( _state == DONE )
- return;
+ virtual double intersectsBox( Box& cur ) {
+ return cur.intersects( _wantRegion );
+ }
- if ( ! _prefix.constrains() ) {
- GEODEBUG( "box exhausted" );
- _state = DONE;
- return;
- }
+ virtual KeyResult approxKeyCheck( const Point& p, double& d ) {
+ if( _want.onBoundary( p, _fudge ) ) return BORDER;
+ else return _want.inside( p, _fudge ) ? GOOD : BAD;
- if (_g->sizeEdge(_prefix) < _wantLen) {
- _prefix = _prefix.up();
- }
- else {
- for (int i=-1; i<=1; i++) {
- for (int j=-1; j<=1; j++) {
+ }
+
+ virtual bool exactDocCheck( const Point& p, double& d ){
+ return _want.inside( p );
+ }
- if (i == 0 && j == 0)
- continue; // main box
+ Box _want;
+ Box _wantRegion;
+ double _wantLen;
+ double _fudge;
- GeoHash newBox = _prefix;
- newBox.move(i, j);
+ GeoHash _start;
- PREFIXDEBUG(newBox, _g);
+ };
- Box cur( _g , newBox );
- if (_want.intersects(cur)) {
- // TODO consider splitting into quadrants
- getPointsForPrefix(newBox);
- }
- else {
- GEODEBUG("skipping box");
- }
- }
- }
- _state = DONE;
- }
+ class GeoPolygonBrowse : public GeoBrowse {
+ public:
- }
- return;
+ GeoPolygonBrowse( const Geo2dType* g , const BSONObj& polyPoints ,
+ BSONObj filter = BSONObj(), bool uniqueDocs = true ) : GeoBrowse( g , "polygon" , filter, uniqueDocs ) {
+
+ GEODEBUG( "In Polygon" )
+
+ BSONObjIterator i( polyPoints );
+ BSONElement first = i.next();
+ _poly.add( Point( first ) );
+
+ while ( i.more() ) {
+ _poly.add( Point( i.next() ) );
}
+ uassert( 14030, "polygon must be defined by three points or more", _poly.size() >= 3 );
+
+ _bounds = _poly.bounds();
+ _bounds.fudge( g ); // Expand by the error margin so we check regions within error of the polygon bounds
+ _bounds.truncate( g ); // We don't need to look anywhere outside the space
+
+ _maxDim = _g->_error + _bounds.maxDim() / 2;
+
+ ok();
}
- void getPointsForPrefix(const GeoHash& prefix) {
- if ( ! BtreeLocation::initial( *_id , _spec , _min , _max , prefix , _found , this ) ) {
- return;
- }
+ // The initial geo hash box for our first expansion
+ virtual GeoHash expandStartHash() {
+ return _g->hash( _bounds.center() );
+ }
- while ( _min.hasPrefix( prefix ) && _min.advance( -1 , _found , this ) );
- while ( _max.hasPrefix( prefix ) && _max.advance( 1 , _found , this ) );
+ // Whether the current box width is big enough for our search area
+ virtual bool fitsInBox( double width ) {
+ return _maxDim <= width;
}
- virtual bool checkDistance( const GeoHash& h , double& d ) {
- bool res = _want.inside( Point( _g , h ) , _fudge );
- GEODEBUG( "\t want : " << _want.toString()
- << " point: " << Point( _g , h ).toString()
- << " in : " << res );
- return res;
+ // Whether the current box overlaps our search area
+ virtual double intersectsBox( Box& cur ) {
+ return cur.intersects( _bounds );
}
- GeoHash _bl;
- GeoHash _tr;
- Box _want;
- double _wantLen;
+ virtual KeyResult approxKeyCheck( const Point& p, double& d ) {
- int _found;
+ int in = _poly.contains( p, _g->_error );
- GeoHash _prefix;
- BtreeLocation _min;
- BtreeLocation _max;
+ if( in == 0 ) return BORDER;
+ else return in > 0 ? GOOD : BAD;
- double _fudge;
- };
+ }
+
+ virtual bool exactDocCheck( const Point& p, double& d ){
+ return _poly.contains( p );
+ }
+
+ private:
+ Polygon _poly;
+ Box _bounds;
+ double _maxDim;
+
+ GeoHash _start;
+ };
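Reviewer note: for reference, the query shapes these three browse classes serve, built with the BSON()/BSON_ARRAY() macros used throughout this tree; coordinates and the field name loc are made up. $polygon and the $uniqueDocs flag are the new pieces in this change:

    // $within : $center - center point plus radius
    BSONObj circle = BSON( "loc" << BSON( "$within" <<
        BSON( "$center" << BSON_ARRAY( BSON_ARRAY( 0 << 0 ) << 5 ) ) ) );

    // $within : $box - bottom-left and top-right corners
    BSONObj box = BSON( "loc" << BSON( "$within" <<
        BSON( "$box" << BSON_ARRAY( BSON_ARRAY( 0 << 0 ) << BSON_ARRAY( 10 << 10 ) ) ) ) );

    // $within : $polygon - three or more vertices, here with $uniqueDocs
    BSONObj poly = BSON( "loc" << BSON( "$within" <<
        BSON( "$polygon" << BSON_ARRAY( BSON_ARRAY( 0 << 0 )
                                      << BSON_ARRAY( 0 << 10 )
                                      << BSON_ARRAY( 10 << 10 ) )
           << "$uniqueDocs" << false ) ) );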
shared_ptr<Cursor> Geo2dType::newCursor( const BSONObj& query , const BSONObj& order , int numWanted ) const {
if ( numWanted < 0 )
@@ -1605,66 +2461,92 @@ namespace mongo {
if ( _geo != e.fieldName() )
continue;
- if ( e.type() != Object )
- continue;
+ if ( e.type() == Array ) {
+ // If we get an array query, assume it is a location, and do a $within { $center : [[x, y], 0] } search
+ shared_ptr<Cursor> c( new GeoCircleBrowse( this , BSON( "0" << e.embeddedObjectUserCheck() << "1" << 0 ), query.filterFieldsUndotted( BSON( _geo << "" ), false ), "$center", true ) );
+ return c;
+ }
+ else if ( e.type() == Object ) {
- switch ( e.embeddedObject().firstElement().getGtLtOp() ) {
- case BSONObj::opNEAR: {
- BSONObj n = e.embeddedObject();
- e = n.firstElement();
+ // TODO: Filter out _geo : { $special... } field so it doesn't get matched accidentally,
+ // if matcher changes
- const char* suffix = e.fieldName() + 5; // strlen("$near") == 5;
- GeoDistType type;
- if (suffix[0] == '\0') {
- type = GEO_PLAIN;
- }
- else if (strcmp(suffix, "Sphere") == 0) {
- type = GEO_SPHERE;
- }
- else {
- uassert(13464, string("invalid $near search type: ") + e.fieldName(), false);
- type = GEO_PLAIN; // prevents uninitialized warning
- }
+ switch ( e.embeddedObject().firstElement().getGtLtOp() ) {
+ case BSONObj::opNEAR: {
+ BSONObj n = e.embeddedObject();
+ e = n.firstElement();
- double maxDistance = numeric_limits<double>::max();
- if ( e.isABSONObj() && e.embeddedObject().nFields() > 2 ) {
- BSONObjIterator i(e.embeddedObject());
- i.next();
- i.next();
- BSONElement e = i.next();
- if ( e.isNumber() )
- maxDistance = e.numberDouble();
- }
- {
- BSONElement e = n["$maxDistance"];
- if ( e.isNumber() )
- maxDistance = e.numberDouble();
- }
- shared_ptr<GeoSearch> s( new GeoSearch( this , _tohash(e) , numWanted , query , maxDistance, type ) );
- s->exec();
- shared_ptr<Cursor> c;
- c.reset( new GeoSearchCursor( s ) );
- return c;
- }
- case BSONObj::opWITHIN: {
- e = e.embeddedObject().firstElement();
- uassert( 13057 , "$within has to take an object or array" , e.isABSONObj() );
- e = e.embeddedObject().firstElement();
- string type = e.fieldName();
- if ( startsWith(type, "$center") ) {
- uassert( 13059 , "$center has to take an object or array" , e.isABSONObj() );
- shared_ptr<Cursor> c( new GeoCircleBrowse( this , e.embeddedObjectUserCheck() , query , type) );
+ const char* suffix = e.fieldName() + 5; // strlen("$near") == 5;
+ GeoDistType type;
+ if (suffix[0] == '\0') {
+ type = GEO_PLAIN;
+ }
+ else if (strcmp(suffix, "Sphere") == 0) {
+ type = GEO_SPHERE;
+ }
+ else {
+ uassert(13464, string("invalid $near search type: ") + e.fieldName(), false);
+ type = GEO_PLAIN; // prevents uninitialized warning
+ }
+
+ double maxDistance = numeric_limits<double>::max();
+ if ( e.isABSONObj() && e.embeddedObject().nFields() > 2 ) {
+ BSONObjIterator i(e.embeddedObject());
+ i.next();
+ i.next();
+ BSONElement e = i.next();
+ if ( e.isNumber() )
+ maxDistance = e.numberDouble();
+ }
+ {
+ BSONElement e = n["$maxDistance"];
+ if ( e.isNumber() )
+ maxDistance = e.numberDouble();
+ }
+
+ bool uniqueDocs = false;
+ if( ! n["$uniqueDocs"].eoo() ) uniqueDocs = n["$uniqueDocs"].trueValue();
+
+ shared_ptr<GeoSearch> s( new GeoSearch( this , Point( e ) , numWanted , query , maxDistance, type, uniqueDocs ) );
+ s->exec();
+ shared_ptr<Cursor> c;
+ c.reset( new GeoSearchCursor( s ) );
return c;
}
- else if ( type == "$box" ) {
- uassert( 13065 , "$box has to take an object or array" , e.isABSONObj() );
- shared_ptr<Cursor> c( new GeoBoxBrowse( this , e.embeddedObjectUserCheck() , query ) );
+ case BSONObj::opWITHIN: {
+
+ e = e.embeddedObject().firstElement();
+ uassert( 13057 , "$within has to take an object or array" , e.isABSONObj() );
+
+ BSONObj context = e.embeddedObject();
+ e = e.embeddedObject().firstElement();
+ string type = e.fieldName();
+
+ bool uniqueDocs = true;
+ if( ! context["$uniqueDocs"].eoo() ) uniqueDocs = context["$uniqueDocs"].trueValue();
+
+ if ( startsWith(type, "$center") ) {
+ uassert( 13059 , "$center has to take an object or array" , e.isABSONObj() );
+ shared_ptr<Cursor> c( new GeoCircleBrowse( this , e.embeddedObjectUserCheck() , query , type, uniqueDocs ) );
+ return c;
+ }
+ else if ( type == "$box" ) {
+ uassert( 13065 , "$box has to take an object or array" , e.isABSONObj() );
+ shared_ptr<Cursor> c( new GeoBoxBrowse( this , e.embeddedObjectUserCheck() , query, uniqueDocs ) );
+ return c;
+ }
+ else if ( startsWith( type, "$poly" ) ) {
+ uassert( 14029 , "$polygon has to take an object or array" , e.isABSONObj() );
+ shared_ptr<Cursor> c( new GeoPolygonBrowse( this , e.embeddedObjectUserCheck() , query, uniqueDocs ) );
+ return c;
+ }
+ throw UserException( 13058 , (string)"unknown $within type: " + type );
+ }
+ default:
+ // Otherwise... assume the object defines a point, and we want to do a zero-radius $within $center
+ shared_ptr<Cursor> c( new GeoCircleBrowse( this , BSON( "0" << e.embeddedObjectUserCheck() << "1" << 0 ), query.filterFieldsUndotted( BSON( _geo << "" ), false ) ) );
return c;
}
- throw UserException( 13058 , (string)"unknown $with type: " + type );
- }
- default:
- break;
}
}
@@ -1682,7 +2564,7 @@ namespace mongo {
bool slaveOk() const { return true; }
void help(stringstream& h) const { h << "http://www.mongodb.org/display/DOCS/Geospatial+Indexing#GeospatialIndexing-geoNearCommand"; }
bool slaveOverrideOk() { return true; }
- bool run(const string& dbname, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ bool run(const string& dbname, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
string ns = dbname + "." + cmdObj.firstElement().valuestr();
NamespaceDetails * d = nsdetails( ns.c_str() );
@@ -1713,12 +2595,20 @@ namespace mongo {
assert( &id == g->getDetails() );
int numWanted = 100;
- if ( cmdObj["num"].isNumber() )
+ if ( cmdObj["num"].isNumber() ) {
numWanted = cmdObj["num"].numberInt();
+ assert( numWanted >= 0 );
+ }
+
+ bool uniqueDocs = false;
+ if( ! cmdObj["uniqueDocs"].eoo() ) uniqueDocs = cmdObj["uniqueDocs"].trueValue();
+
+ bool includeLocs = false;
+ if( ! cmdObj["includeLocs"].eoo() ) includeLocs = cmdObj["includeLocs"].trueValue();
uassert(13046, "'near' param missing/invalid", !cmdObj["near"].eoo());
- const GeoHash n = g->_tohash( cmdObj["near"] );
- result.append( "near" , n.toString() );
+ const Point n( cmdObj["near"] );
+ result.append( "near" , g->_tohash( cmdObj["near"] ).toString() );
BSONObj filter;
if ( cmdObj["query"].type() == Object )
@@ -1732,7 +2622,7 @@ namespace mongo {
if ( cmdObj["spherical"].trueValue() )
type = GEO_SPHERE;
- GeoSearch gs( g , n , numWanted , filter , maxDistance , type);
+ GeoSearch gs( g , n , numWanted , filter , maxDistance , type, uniqueDocs, true );
if ( cmdObj["start"].type() == String) {
GeoHash start ((string) cmdObj["start"].valuestr());
@@ -1747,17 +2637,17 @@ namespace mongo {
double totalDistance = 0;
-
BSONObjBuilder arr( result.subarrayStart( "results" ) );
int x = 0;
- for ( GeoHopper::Holder::iterator i=gs._hopper->_points.begin(); i!=gs._hopper->_points.end(); i++ ) {
- const GeoPoint& p = *i;
+ for ( GeoHopper::Holder::iterator i=gs._points.begin(); i!=gs._points.end(); i++ ) {
- double dis = distanceMultiplier * p._distance;
+ const GeoPoint& p = *i;
+ double dis = distanceMultiplier * p.distance();
totalDistance += dis;
BSONObjBuilder bb( arr.subobjStart( BSONObjBuilder::numStr( x++ ) ) );
bb.append( "dis" , dis );
+ if( includeLocs ) bb.append( "loc" , p._pt );
bb.append( "obj" , p._o );
bb.done();
}
@@ -1766,10 +2656,10 @@ namespace mongo {
BSONObjBuilder stats( result.subobjStart( "stats" ) );
stats.append( "time" , cc().curop()->elapsedMillis() );
stats.appendNumber( "btreelocs" , gs._nscanned );
- stats.appendNumber( "nscanned" , gs._hopper->_lookedAt );
- stats.appendNumber( "objectsLoaded" , gs._hopper->_objectsLoaded );
+ stats.appendNumber( "nscanned" , gs._lookedAt );
+ stats.appendNumber( "objectsLoaded" , gs._objectsLoaded );
stats.append( "avgDistance" , totalDistance / x );
- stats.append( "maxDistance" , gs._hopper->farthest() );
+ stats.append( "maxDistance" , gs.farthest() );
stats.done();
return true;
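Reviewer note: a command document exercising the options parsed above (num, uniqueDocs, includeLocs, spherical, query); the collection name and values are made up:

    BSONObj cmd = BSON( "geoNear" << "places"
                     << "near" << BSON_ARRAY( -73.97 << 40.77 )
                     << "num" << 50
                     << "spherical" << true
                     << "query" << BSON( "type" << "cafe" )
                     << "uniqueDocs" << true
                     << "includeLocs" << true );
    // Each result is { dis : ..., loc : ..., obj : ... } (loc only when
    // includeLocs is set); stats carries btreelocs, nscanned,
    // objectsLoaded, avgDistance and maxDistance as built above.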
@@ -1783,7 +2673,7 @@ namespace mongo {
virtual LockType locktype() const { return READ; }
bool slaveOk() const { return true; }
bool slaveOverrideOk() { return true; }
- bool run(const string& dbname, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ bool run(const string& dbname, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
string ns = dbname + "." + cmdObj.firstElement().valuestr();
NamespaceDetails * d = nsdetails( ns.c_str() );
@@ -1819,7 +2709,8 @@ namespace mongo {
int max = 100000;
- BtreeCursor c( d , geoIdx , id , BSONObj() , BSONObj() , true , 1 );
+ auto_ptr<BtreeCursor> bc( BtreeCursor::make( d , geoIdx , id , BSONObj() , BSONObj() , true , 1 ) );
+ BtreeCursor &c = *bc;
while ( c.ok() && max-- ) {
GeoHash h( c.currKey().firstElement() );
int len;
@@ -1837,4 +2728,248 @@ namespace mongo {
} geoWalkCmd;
+ struct GeoUnitTest : public UnitTest {
+
+ int round( double d ) {
+ return (int)(.5+(d*1000));
+ }
+
+#define GEOHEQ(a,b) if ( a.toString() != b ){ cout << "[" << a.toString() << "] != [" << b << "]" << endl; assert( a == GeoHash(b) ); }
+
+ void run() {
+ assert( ! GeoHash::isBitSet( 0 , 0 ) );
+ assert( ! GeoHash::isBitSet( 0 , 31 ) );
+ assert( GeoHash::isBitSet( 1 , 31 ) );
+
+ IndexSpec i( BSON( "loc" << "2d" ) );
+ Geo2dType g( &geo2dplugin , &i );
+ {
+ double x = 73.01212;
+ double y = 41.352964;
+ BSONObj in = BSON( "x" << x << "y" << y );
+ GeoHash h = g._hash( in );
+ BSONObj out = g._unhash( h );
+ assert( round(x) == round( out["x"].number() ) );
+ assert( round(y) == round( out["y"].number() ) );
+ assert( round( in["x"].number() ) == round( out["x"].number() ) );
+ assert( round( in["y"].number() ) == round( out["y"].number() ) );
+ }
+
+ {
+ double x = -73.01212;
+ double y = 41.352964;
+ BSONObj in = BSON( "x" << x << "y" << y );
+ GeoHash h = g._hash( in );
+ BSONObj out = g._unhash( h );
+ assert( round(x) == round( out["x"].number() ) );
+ assert( round(y) == round( out["y"].number() ) );
+ assert( round( in["x"].number() ) == round( out["x"].number() ) );
+ assert( round( in["y"].number() ) == round( out["y"].number() ) );
+ }
+
+ {
+ GeoHash h( "0000" );
+ h.move( 0 , 1 );
+ GEOHEQ( h , "0001" );
+ h.move( 0 , -1 );
+ GEOHEQ( h , "0000" );
+
+ h.init( "0001" );
+ h.move( 0 , 1 );
+ GEOHEQ( h , "0100" );
+ h.move( 0 , -1 );
+ GEOHEQ( h , "0001" );
+
+
+ h.init( "0000" );
+ h.move( 1 , 0 );
+ GEOHEQ( h , "0010" );
+ }
+
+ {
+ Box b( 5 , 5 , 2 );
+ assert( "(5,5) -->> (7,7)" == b.toString() );
+ }
+
+ {
+ GeoHash a = g.hash( 1 , 1 );
+ GeoHash b = g.hash( 4 , 5 );
+ assert( 5 == (int)(g.distance( a , b ) ) );
+ a = g.hash( 50 , 50 );
+ b = g.hash( 42 , 44 );
+ assert( round(10) == round(g.distance( a , b )) );
+ }
+
+ {
+ GeoHash x("0000");
+ assert( 0 == x.getHash() );
+ x.init( 0 , 1 , 32 );
+ GEOHEQ( x , "0000000000000000000000000000000000000000000000000000000000000001" )
+
+ assert( GeoHash( "1100").hasPrefix( GeoHash( "11" ) ) );
+ assert( ! GeoHash( "1000").hasPrefix( GeoHash( "11" ) ) );
+ }
+
+ {
+ GeoHash x("1010");
+ GEOHEQ( x , "1010" );
+ GeoHash y = x + "01";
+ GEOHEQ( y , "101001" );
+ }
+
+ {
+
+ GeoHash a = g.hash( 5 , 5 );
+ GeoHash b = g.hash( 5 , 7 );
+ GeoHash c = g.hash( 100 , 100 );
+ /*
+ cout << "a: " << a << endl;
+ cout << "b: " << b << endl;
+ cout << "c: " << c << endl;
+
+ cout << "a: " << a.toStringHex1() << endl;
+ cout << "b: " << b.toStringHex1() << endl;
+ cout << "c: " << c.toStringHex1() << endl;
+ */
+ BSONObj oa = a.wrap();
+ BSONObj ob = b.wrap();
+ BSONObj oc = c.wrap();
+ /*
+ cout << "a: " << oa.hexDump() << endl;
+ cout << "b: " << ob.hexDump() << endl;
+ cout << "c: " << oc.hexDump() << endl;
+ */
+ assert( oa.woCompare( ob ) < 0 );
+ assert( oa.woCompare( oc ) < 0 );
+
+ }
+
+ {
+ GeoHash x( "000000" );
+ x.move( -1 , 0 );
+ GEOHEQ( x , "101010" );
+ x.move( 1 , -1 );
+ GEOHEQ( x , "010101" );
+ x.move( 0 , 1 );
+ GEOHEQ( x , "000000" );
+ }
+
+ {
+ GeoHash prefix( "110011000000" );
+ GeoHash entry( "1100110000011100000111000001110000011100000111000001000000000000" );
+ assert( ! entry.hasPrefix( prefix ) );
+
+ entry = GeoHash("1100110000001100000111000001110000011100000111000001000000000000");
+ assert( entry.toString().find( prefix.toString() ) == 0 );
+ assert( entry.hasPrefix( GeoHash( "1100" ) ) );
+ assert( entry.hasPrefix( prefix ) );
+ }
+
+ {
+ GeoHash a = g.hash( 50 , 50 );
+ GeoHash b = g.hash( 48 , 54 );
+ assert( round( 4.47214 ) == round( g.distance( a , b ) ) );
+ }
+
+
+ {
+ Box b( Point( 29.762283 , -95.364271 ) , Point( 29.764283000000002 , -95.36227099999999 ) );
+ assert( b.inside( 29.763 , -95.363 ) );
+ assert( ! b.inside( 32.9570255 , -96.1082497 ) );
+ assert( ! b.inside( 32.9570255 , -96.1082497 , .01 ) );
+ }
+
+ {
+ GeoHash a( "11001111" );
+ assert( GeoHash( "11" ) == a.commonPrefix( GeoHash("11") ) );
+ assert( GeoHash( "11" ) == a.commonPrefix( GeoHash("11110000") ) );
+ }
+
+ {
+ int N = 10000;
+ {
+ Timer t;
+ for ( int i=0; i<N; i++ ) {
+ unsigned x = (unsigned)rand();
+ unsigned y = (unsigned)rand();
+ GeoHash h( x , y );
+ unsigned a,b;
+ h.unhash_slow( a,b );
+ assert( a == x );
+ assert( b == y );
+ }
+ //cout << "slow: " << t.millis() << endl;
+ }
+
+ {
+ Timer t;
+ for ( int i=0; i<N; i++ ) {
+ unsigned x = (unsigned)rand();
+ unsigned y = (unsigned)rand();
+ GeoHash h( x , y );
+ unsigned a,b;
+ h.unhash_fast( a,b );
+ assert( a == x );
+ assert( b == y );
+ }
+ //cout << "fast: " << t.millis() << endl;
+ }
+
+ }
+
+ {
+ // see http://en.wikipedia.org/wiki/Great-circle_distance#Worked_example
+
+ {
+ Point BNA (-86.67, 36.12);
+ Point LAX (-118.40, 33.94);
+
+ double dist1 = spheredist_deg(BNA, LAX);
+ double dist2 = spheredist_deg(LAX, BNA);
+
+ // target is 0.45306
+ assert( 0.45305 <= dist1 && dist1 <= 0.45307 );
+ assert( 0.45305 <= dist2 && dist2 <= 0.45307 );
+ }
+ {
+ Point BNA (-1.5127, 0.6304);
+ Point LAX (-2.0665, 0.5924);
+
+ double dist1 = spheredist_rad(BNA, LAX);
+ double dist2 = spheredist_rad(LAX, BNA);
+
+ // target is 0.45306
+ assert( 0.45305 <= dist1 && dist1 <= 0.45307 );
+ assert( 0.45305 <= dist2 && dist2 <= 0.45307 );
+ }
+ {
+ Point JFK (-73.77694444, 40.63861111 );
+ Point LAX (-118.40, 33.94);
+
+ double dist = spheredist_deg(JFK, LAX) * EARTH_RADIUS_MILES;
+ assert( dist > 2469 && dist < 2470 );
+ }
+
+ {
+ Point BNA (-86.67, 36.12);
+ Point LAX (-118.40, 33.94);
+ Point JFK (-73.77694444, 40.63861111 );
+ assert( spheredist_deg(BNA, BNA) < 1e-6);
+ assert( spheredist_deg(LAX, LAX) < 1e-6);
+ assert( spheredist_deg(JFK, JFK) < 1e-6);
+
+ Point zero (0, 0);
+ Point antizero (0,-180);
+
+ // these were known to cause NaN
+ assert( spheredist_deg(zero, zero) < 1e-6);
+ assert( fabs(M_PI-spheredist_deg(zero, antizero)) < 1e-6);
+ assert( fabs(M_PI-spheredist_deg(antizero, zero)) < 1e-6);
+ }
+ }
+ }
+ } geoUnitTest;
+
+
}
+
diff --git a/db/geo/core.h b/db/geo/core.h
index 602b513..b779978 100644
--- a/db/geo/core.h
+++ b/db/geo/core.h
@@ -59,6 +59,7 @@ namespace mongo {
class GeoHash {
public:
+
GeoHash()
: _hash(0),_bits(0) {
}
@@ -71,6 +72,14 @@ namespace mongo {
init( hash );
}
+ static GeoHash makeFromBinData(const char *bindata, unsigned bits) {
+ GeoHash h;
+ h._bits = bits;
+ h._copy( (char*)&h._hash , bindata );
+ h._fix();
+ return h;
+ }
+
explicit GeoHash( const BSONElement& e , unsigned bits=32 ) {
_bits = bits;
if ( e.type() == BinData ) {
@@ -80,7 +89,7 @@ namespace mongo {
_bits = bits;
}
else {
- cout << "GeoHash cons e : " << e << endl;
+ cout << "GeoHash bad element: " << e << endl;
uassert(13047,"wrong type for geo index. if you're using a pre-release version, need to rebuild index",0);
}
_fix();
@@ -214,6 +223,10 @@ namespace mongo {
return _bits > 0;
}
+ bool canRefine() const {
+ return _bits < 32;
+ }
+
void move( int x , int y ) {
assert( _bits );
_move( 0 , x );
@@ -265,10 +278,19 @@ namespace mongo {
return *this;
}
- bool operator==(const GeoHash& h ) {
+ bool operator==(const GeoHash& h ) const {
return _hash == h._hash && _bits == h._bits;
}
+ bool operator!=(const GeoHash& h ) const {
+ return !( *this == h );
+ }
+
+ bool operator<(const GeoHash& h ) const {
+ if( _hash != h._hash ) return _hash < h._hash;
+ return _bits < h._bits;
+ }
+
GeoHash& operator+=( const char * s ) {
unsigned pos = _bits * 2;
_bits += strlen(s) / 2;
@@ -289,6 +311,10 @@ namespace mongo {
return n;
}
+ GeoHash operator+( string s ) const {
+ return operator+( s.c_str() );
+ }
+
void _fix() {
static long long FULL = 0xFFFFFFFFFFFFFFFFLL;
long long mask = FULL << ( 64 - ( _bits * 2 ) );
@@ -322,7 +348,7 @@ namespace mongo {
private:
- void _copy( char * dst , const char * src ) const {
+ static void _copy( char * dst , const char * src ) {
for ( unsigned a=0; a<8; a++ ) {
dst[a] = src[7-a];
}
@@ -378,9 +404,61 @@ namespace mongo {
double distance( const Point& p ) const {
double a = _x - p._x;
double b = _y - p._y;
+
+ // Avoid numerical error if possible...
+ if( a == 0 ) return abs( _y - p._y );
+ if( b == 0 ) return abs( _x - p._x );
+
return sqrt( ( a * a ) + ( b * b ) );
}
+ /**
+ * Distance check that compares x or y coords directly when the other direction is zero;
+ * this avoids numerical error when distances are very close to the radius but axis-aligned.
+ *
+ * An example of the problem is:
+ * (52.0 - 51.9999) - 0.0001 = 3.31965e-15 and 52.0 - 51.9999 > 0.0001 in double arithmetic
+ * but:
+ * 51.9999 + 0.0001 <= 52.0
+ *
+ * This avoids some (but not all!) surprising results in $center queries where points are
+ * ( radius + center.x, center.y ) or vice-versa.
+ */
+ bool distanceWithin( const Point& p, double radius ) const {
+ double a = _x - p._x;
+ double b = _y - p._y;
+
+ if( a == 0 ) {
+ //
+ // Note: For some unknown reason, when a 32-bit g++ optimizes this call, the sum is
+ // calculated imprecisely. We need to force the compiler to always evaluate it correctly,
+ // hence the weirdness.
+ //
+ // On some 32-bit linux machines, removing the volatile keyword or calculating the sum inline
+ // will make certain geo tests fail. Of course this check will force volatile for all 32-bit systems,
+ // not just affected systems.
+ if( sizeof(void*) <= 4 ){
+ volatile double sum = _y > p._y ? p._y + radius : _y + radius;
+ return _y > p._y ? sum >= _y : sum >= p._y;
+ }
+ else {
+ // Original math, correct for most systems
+ return _y > p._y ? p._y + radius >= _y : _y + radius >= p._y;
+ }
+ }
+ if( b == 0 ) {
+ if( sizeof(void*) <= 4 ){
+ volatile double sum = _x > p._x ? p._x + radius : _x + radius;
+ return _x > p._x ? sum >= _x : sum >= p._x;
+ }
+ else {
+ return _x > p._x ? p._x + radius >= _x : _x + radius >= p._x;
+ }
+ }
+
+ return sqrt( ( a * a ) + ( b * b ) ) <= radius;
+ }
+
string toString() const {
StringBuilder buf(32);
buf << "(" << _x << "," << _y << ")";
@@ -396,6 +474,12 @@ namespace mongo {
extern const double EARTH_RADIUS_KM;
extern const double EARTH_RADIUS_MILES;
+ // Technically lat/long bounds, not really tied to earth radius.
+ inline void checkEarthBounds( Point p ) {
+ uassert( 14808, str::stream() << "point " << p.toString() << " must be in earth-like bounds of long : [-180, 180), lat : [-90, 90] ",
+ p._x >= -180 && p._x < 180 && p._y >= -90 && p._y <= 90 );
+ }
+
inline double deg2rad(double deg) { return deg * (M_PI/180); }
inline double rad2deg(double rad) { return rad * (180/M_PI); }
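Reviewer note: the failure mode distanceWithin() guards against is easy to reproduce. A standalone demo of the worked example from its comment (standard IEEE-754 doubles; a 32-bit x87 build may behave differently, which is what the volatile workaround addresses):

    #include <cstdio>

    int main() {
        double center = 51.9999, pt = 52.0, radius = 0.0001;

        // Naive check: the subtraction overshoots the radius by ~3.3e-15,
        // so a point sitting exactly on the circle is reported outside.
        printf( "naive:      pt - center > radius  -> %d\n", (pt - center) > radius );

        // Rearranged check used by distanceWithin(): the sum rounds to
        // exactly 52.0, so the on-circle point is accepted.
        printf( "rearranged: center + radius >= pt -> %d\n", (center + radius) >= pt );
        // Prints 1 then 1.
    }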
diff --git a/db/geo/haystack.cpp b/db/geo/haystack.cpp
index 7f278ca..a5dd478 100644
--- a/db/geo/haystack.cpp
+++ b/db/geo/haystack.cpp
@@ -119,7 +119,7 @@ namespace mongo {
return ss.str();
}
- void _add( const BSONObj& obj, const string& root , const BSONElement& e , BSONObjSetDefaultOrder& keys ) const {
+ void _add( const BSONObj& obj, const string& root , const BSONElement& e , BSONObjSet& keys ) const {
BSONObjBuilder buf;
buf.append( "" , root );
if ( e.eoo() )
@@ -132,7 +132,7 @@ namespace mongo {
keys.insert( key );
}
- void getKeys( const BSONObj &obj, BSONObjSetDefaultOrder &keys ) const {
+ void getKeys( const BSONObj &obj, BSONObjSet &keys ) const {
BSONElement loc = obj.getFieldDotted( _geo );
if ( loc.eoo() )
@@ -207,15 +207,15 @@ namespace mongo {
GEOQUADDEBUG( "KEY: " << key );
set<DiskLoc> thisPass;
- BtreeCursor cursor( nsd , idxNo , *getDetails() , key , key , true , 1 );
- while ( cursor.ok() ) {
- pair<set<DiskLoc>::iterator, bool> p = thisPass.insert( cursor.currLoc() );
+ scoped_ptr<BtreeCursor> cursor( BtreeCursor::make( nsd , idxNo , *getDetails() , key , key , true , 1 ) );
+ while ( cursor->ok() ) {
+ pair<set<DiskLoc>::iterator, bool> p = thisPass.insert( cursor->currLoc() );
if ( p.second ) {
- hopper.got( cursor.currLoc() );
- GEOQUADDEBUG( "\t" << cursor.current() );
+ hopper.got( cursor->currLoc() );
+ GEOQUADDEBUG( "\t" << cursor->current() );
btreeMatches++;
}
- cursor.advance();
+ cursor->advance();
}
}
@@ -264,7 +264,7 @@ namespace mongo {
virtual LockType locktype() const { return READ; }
bool slaveOk() const { return true; }
bool slaveOverrideOk() const { return true; }
- bool run(const string& dbname , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ bool run(const string& dbname , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
string ns = dbname + "." + cmdObj.firstElement().valuestr();
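Reviewer note: the thisPass set above is the standard insert-and-test dedup idiom: set::insert reports through .second whether the element was new, so each DiskLoc is processed once per key. Standalone, with ints standing in for DiskLocs:

    #include <cstdio>
    #include <set>

    int main() {
        int hits[] = { 7, 3, 7, 9, 3 };     // btree cursor results for one key
        std::set<int> thisPass;             // per-key dedup, like set<DiskLoc>
        int matches = 0;
        for ( int i = 0; i < 5; i++ ) {
            // .second is true only when the element was not already present
            if ( thisPass.insert( hits[i] ).second )
                matches++;                  // hopper.got( loc ) in the real code
        }
        printf( "%d unique of 5 hits\n", matches );  // 3 unique of 5 hits
    }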
diff --git a/db/index.cpp b/db/index.cpp
index 218ecb3..67a0d44 100644
--- a/db/index.cpp
+++ b/db/index.cpp
@@ -1,4 +1,4 @@
-// index.cpp
+/** @file index.cpp */
/**
* Copyright (C) 2008 10gen Inc.
@@ -20,12 +20,86 @@
#include "namespace-inl.h"
#include "index.h"
#include "btree.h"
-#include "query.h"
#include "background.h"
#include "repl/rs.h"
+#include "ops/delete.h"
+
namespace mongo {
+ template< class V >
+ class IndexInterfaceImpl : public IndexInterface {
+ public:
+ typedef typename V::KeyOwned KeyOwned;
+ virtual int keyCompare(const BSONObj& l,const BSONObj& r, const Ordering &ordering);
+
+/* virtual DiskLoc locate(const IndexDetails &idx , const DiskLoc& thisLoc, const BSONObj& key, const Ordering &order,
+ int& pos, bool& found, const DiskLoc &recordLoc, int direction) {
+ return thisLoc.btree<V>()->locate(idx, thisLoc, key, order, pos, found, recordLoc, direction);
+ }
+ */
+ virtual long long fullValidate(const DiskLoc& thisLoc, const BSONObj &order) {
+ return thisLoc.btree<V>()->fullValidate(thisLoc, order);
+ }
+ virtual DiskLoc findSingle(const IndexDetails &indexdetails , const DiskLoc& thisLoc, const BSONObj& key) const {
+ return thisLoc.btree<V>()->findSingle(indexdetails,thisLoc,key);
+ }
+ virtual bool unindex(const DiskLoc thisLoc, IndexDetails& id, const BSONObj& key, const DiskLoc recordLoc) const {
+ return thisLoc.btree<V>()->unindex(thisLoc, id, key, recordLoc);
+ }
+ virtual int bt_insert(const DiskLoc thisLoc, const DiskLoc recordLoc,
+ const BSONObj& key, const Ordering &order, bool dupsAllowed,
+ IndexDetails& idx, bool toplevel = true) const {
+ return thisLoc.btree<V>()->bt_insert(thisLoc, recordLoc, key, order, dupsAllowed, idx, toplevel);
+ }
+ virtual DiskLoc addBucket(const IndexDetails& id) {
+ return BtreeBucket<V>::addBucket(id);
+ }
+ virtual void uassertIfDups(IndexDetails& idx, vector<BSONObj*>& addedKeys, DiskLoc head, DiskLoc self, const Ordering& ordering) {
+ const BtreeBucket<V> *h = head.btree<V>();
+ for( vector<BSONObj*>::iterator i = addedKeys.begin(); i != addedKeys.end(); i++ ) {
+ KeyOwned k(**i);
+ bool dup = h->wouldCreateDup(idx, head, k, ordering, self);
+ uassert( 11001 , h->dupKeyError( idx , k ) , !dup);
+ }
+ }
+
+ // for geo:
+ virtual bool isUsed(DiskLoc thisLoc, int pos) { return thisLoc.btree<V>()->isUsed(pos); }
+ virtual void keyAt(DiskLoc thisLoc, int pos, BSONObj& key, DiskLoc& recordLoc) {
+ typename BtreeBucket<V>::KeyNode kn = thisLoc.btree<V>()->keyNode(pos);
+ key = kn.key.toBson();
+ recordLoc = kn.recordLoc;
+ }
+ virtual BSONObj keyAt(DiskLoc thisLoc, int pos) {
+ return thisLoc.btree<V>()->keyAt(pos).toBson();
+ }
+ virtual DiskLoc locate(const IndexDetails &idx , const DiskLoc& thisLoc, const BSONObj& key, const Ordering &order,
+ int& pos, bool& found, const DiskLoc &recordLoc, int direction=1) {
+ return thisLoc.btree<V>()->locate(idx, thisLoc, key, order, pos, found, recordLoc, direction);
+ }
+ virtual DiskLoc advance(const DiskLoc& thisLoc, int& keyOfs, int direction, const char *caller) {
+ return thisLoc.btree<V>()->advance(thisLoc,keyOfs,direction,caller);
+ }
+ };
+
+ int oldCompare(const BSONObj& l,const BSONObj& r, const Ordering &o); // key.cpp
+
+ template <>
+ int IndexInterfaceImpl< V0 >::keyCompare(const BSONObj& l, const BSONObj& r, const Ordering &ordering) {
+ return oldCompare(l, r, ordering);
+ }
+
+ template <>
+ int IndexInterfaceImpl< V1 >::keyCompare(const BSONObj& l, const BSONObj& r, const Ordering &ordering) {
+ return l.woCompare(r, ordering, /*considerfieldname*/false);
+ }
+
+ IndexInterfaceImpl<V0> iii_v0;
+ IndexInterfaceImpl<V1> iii_v1;
+
+ IndexInterface *IndexDetails::iis[] = { &iii_v0, &iii_v1 };
+
int removeFromSysIndexes(const char *ns, const char *idxName) {
string system_indexes = cc().database()->name + ".system.indexes";
BSONObjBuilder b;
@@ -66,7 +140,7 @@ namespace mongo {
}
const IndexSpec& IndexDetails::getSpec() const {
- scoped_lock lk(NamespaceDetailsTransient::_qcMutex);
+ SimpleMutex::scoped_lock lk(NamespaceDetailsTransient::_qcMutex);
return NamespaceDetailsTransient::get_inlock( info.obj()["ns"].valuestr() ).getIndexSpec( this );
}
@@ -104,13 +178,15 @@ namespace mongo {
}
}
- void IndexDetails::getKeysFromObject( const BSONObj& obj, BSONObjSetDefaultOrder& keys) const {
+ void IndexDetails::getKeysFromObject( const BSONObj& obj, BSONObjSet& keys) const {
getSpec().getKeys( obj, keys );
}
- void setDifference(BSONObjSetDefaultOrder &l, BSONObjSetDefaultOrder &r, vector<BSONObj*> &diff) {
- BSONObjSetDefaultOrder::iterator i = l.begin();
- BSONObjSetDefaultOrder::iterator j = r.begin();
+ void setDifference(BSONObjSet &l, BSONObjSet &r, vector<BSONObj*> &diff) {
+ // l and r must use the same ordering spec.
+ verify( 14819, l.key_comp().order() == r.key_comp().order() );
+ BSONObjSet::iterator i = l.begin();
+ BSONObjSet::iterator j = r.begin();
while ( 1 ) {
if ( i == l.end() )
break;
@@ -189,7 +265,6 @@ namespace mongo {
uassert(10097, "bad table to index name on add index attempt",
cc().database()->name == nsToDatabase(sourceNS.c_str()));
-
BSONObj key = io.getObjectField("key");
uassert(12524, "index key pattern too large", key.objsize() <= 2048);
if( !validKeyPattern(key) ) {
@@ -260,30 +335,53 @@ namespace mongo {
string pluginName = IndexPlugin::findPluginName( key );
IndexPlugin * plugin = pluginName.size() ? IndexPlugin::get( pluginName ) : 0;
- if ( plugin ) {
- fixedIndexObject = plugin->adjustIndexSpec( io );
- }
- else if ( io["v"].eoo() ) {
- // add "v" if it doesn't exist
- // if it does - leave whatever value was there
- // this is for testing and replication
- BSONObjBuilder b( io.objsize() + 32 );
- b.appendElements( io );
- b.append( "v" , 0 );
+
+ {
+ BSONObj o = io;
+ if ( plugin ) {
+ o = plugin->adjustIndexSpec(o);
+ }
+ BSONObjBuilder b;
+ int v = DefaultIndexVersionNumber;
+ if( !o["v"].eoo() ) {
+ double vv = o["v"].Number();
+ // note: one day we may be able to fresh-build fewer index versions than we can use;
+ // isASupportedIndexVersionNumber() is what we can use
+ uassert(14803, str::stream() << "this version of mongod cannot build new indexes of version number " << vv,
+ vv == 0 || vv == 1);
+ v = (int) vv;
+ }
+ // idea is to put the fields we use most often earlier in the object
+ b.append("v", v);
+ b.append(o["key"]);
+ if( o["unique"].trueValue() )
+ b.appendBool("unique", true); // normalize to bool true in case was int 1 or something...
+ b.append(o["ns"]);
+
+ {
+ // strip _id and the fields already appended above
+ BSONObjIterator i(o);
+ while ( i.more() ) {
+ BSONElement e = i.next();
+ string s = e.fieldName();
+ if( s != "_id" && s != "v" && s != "ns" && s != "unique" && s != "key" )
+ b.append(e);
+ }
+ }
+
fixedIndexObject = b.obj();
}
return true;
}
-
void IndexSpec::reset( const IndexDetails * details ) {
_details = details;
reset( details->info );
}
- void IndexSpec::reset( const DiskLoc& loc ) {
- info = loc.obj();
+ void IndexSpec::reset( const BSONObj& _info ) {
+ info = _info;
keyPattern = info["key"].embeddedObjectUserCheck();
if ( keyPattern.objsize() == 0 ) {
out() << info.toString() << endl;
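Reviewer note: the iis[] table above plus idxInterface() (see index.h below) is a plain version-indexed dispatch: v0 keys compare with the legacy oldCompare, v1 keys with field-name-blind woCompare. A toy with function pointers standing in for the IndexInterfaceImpl instances:

    #include <cstdio>

    typedef int (*KeyCompare)( int l, int r );

    static int oldCompareToy( int l, int r ) { return l - r; }              // v0 stand-in
    static int newCompareToy( int l, int r ) { return (l > r) - (l < r); }  // v1 stand-in

    static KeyCompare iisToy[] = { oldCompareToy, newCompareToy };

    int main() {
        int v = 1;                               // from the index's "v" field
        // idxInterface()-style dispatch; the real code dasserts
        // isASupportedIndexVersionNumber(v) before indexing with v & 1.
        printf( "%d\n", iisToy[v & 1]( 3, 5 ) ); // -1
    }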
diff --git a/db/index.h b/db/index.h
index d13bd1d..54b0639 100644
--- a/db/index.h
+++ b/db/index.h
@@ -22,9 +22,34 @@
#include "diskloc.h"
#include "jsobj.h"
#include "indexkey.h"
+#include "key.h"
namespace mongo {
+ class IndexInterface {
+ protected:
+ virtual ~IndexInterface() { }
+ public:
+ virtual int keyCompare(const BSONObj& l,const BSONObj& r, const Ordering &ordering) = 0;
+ virtual long long fullValidate(const DiskLoc& thisLoc, const BSONObj &order) = 0;
+ virtual DiskLoc findSingle(const IndexDetails &indexdetails , const DiskLoc& thisLoc, const BSONObj& key) const = 0;
+ virtual bool unindex(const DiskLoc thisLoc, IndexDetails& id, const BSONObj& key, const DiskLoc recordLoc) const = 0;
+ virtual int bt_insert(const DiskLoc thisLoc, const DiskLoc recordLoc,
+ const BSONObj& key, const Ordering &order, bool dupsAllowed,
+ IndexDetails& idx, bool toplevel = true) const = 0;
+ virtual DiskLoc addBucket(const IndexDetails&) = 0;
+ virtual void uassertIfDups(IndexDetails& idx, vector<BSONObj*>& addedKeys, DiskLoc head,
+ DiskLoc self, const Ordering& ordering) = 0;
+
+ // these are for geo
+ virtual bool isUsed(DiskLoc thisLoc, int pos) = 0;
+ virtual void keyAt(DiskLoc thisLoc, int pos, BSONObj&, DiskLoc& recordLoc) = 0;
+ virtual BSONObj keyAt(DiskLoc thisLoc, int pos) = 0;
+ virtual DiskLoc locate(const IndexDetails &idx , const DiskLoc& thisLoc, const BSONObj& key, const Ordering &order,
+ int& pos, bool& found, const DiskLoc &recordLoc, int direction=1) = 0;
+ virtual DiskLoc advance(const DiskLoc& thisLoc, int& keyOfs, int direction, const char *caller) = 0;
+ };
+
/* Details about a particular index. There is one of these effectively for each object in
system.namespaces (although this also includes the head pointer, which is not in that
collection).
@@ -45,7 +70,7 @@ namespace mongo {
/* Location of index info object. Format:
{ name:"nameofindex", ns:"parentnsname", key: {keypattobject}
- [, unique: <bool>, background: <bool>]
+ [, unique: <bool>, background: <bool>, v:<version>]
}
This object is in the system.indexes collection. Note that since we
@@ -68,7 +93,7 @@ namespace mongo {
only when it's a "multikey" array.
keys will be left empty if key not found in the object.
*/
- void getKeysFromObject( const BSONObj& obj, BSONObjSetDefaultOrder& keys) const;
+ void getKeysFromObject( const BSONObj& obj, BSONObjSet& keys) const;
/* get the key pattern for this object.
e.g., { lastname:1, firstname:1 }
@@ -86,7 +111,6 @@ namespace mongo {
/* true if the specified key is in the index */
bool hasKey(const BSONObj& key);
- bool wouldCreateDup(const BSONObj& key, DiskLoc self);
// returns name of this index's storage area
// database.table.$index
@@ -126,6 +150,21 @@ namespace mongo {
return io.getStringField("ns");
}
+ static int versionForIndexObj( const BSONObj &obj ) {
+ BSONElement e = obj["v"];
+ if( e.type() == NumberInt )
+ return e._numberInt();
+ // should normally be an int. this is for backward compatibility
+ int v = e.numberInt();
+ uassert(14802, "index v field should be Integer type", v == 0);
+ return v;
+ }
+
+ int version() const {
+ return versionForIndexObj( info.obj() );
+ }
+
+ /** @return true if index has unique constraint */
bool unique() const {
BSONObj io = info.obj();
return io["unique"].trueValue() ||
@@ -133,33 +172,43 @@ namespace mongo {
isIdIndex();
}
- /* if set, when building index, if any duplicates, drop the duplicating object */
+ /** return true if dropDups was set when building index (if any duplicates, dropdups drops the duplicating objects) */
bool dropDups() const {
return info.obj().getBoolField( "dropDups" );
}
- /* delete this index. does NOT clean up the system catalog
- (system.indexes or system.namespaces) -- only NamespaceIndex.
+ /** delete this index. does NOT clean up the system catalog
+ (system.indexes or system.namespaces) -- only NamespaceIndex.
*/
void kill_idx();
const IndexSpec& getSpec() const;
- void checkVersion() const {
- // TODO: cache?
- massert( 13658 ,
- str::stream() << "using a newer index version: " << info.obj() << " v: " << info.obj().getIntField("v" ) ,
- info.obj().getIntField("v") <= 0 );
- }
-
string toString() const {
return info.obj().toString();
}
+
+ /** @return true if supported. supported means we can use the index, including adding new keys.
+ it may not mean we can build the index version in question: we may not maintain building
+ of indexes in old formats in the future.
+ */
+ static bool isASupportedIndexVersionNumber(int v) { return (v&1)==v; } // v == 0 || v == 1
+
+ /** @return the interface for this index, which varies with the index version.
+ used for backward compatibility of index versions/formats.
+ */
+ IndexInterface& idxInterface() const {
+ int v = version();
+ dassert( isASupportedIndexVersionNumber(v) );
+ return *iis[v&1];
+ }
+
+ static IndexInterface *iis[];
};
struct IndexChanges { /*on an update*/
- BSONObjSetDefaultOrder oldkeys;
- BSONObjSetDefaultOrder newkeys;
+ BSONObjSet oldkeys;
+ BSONObjSet newkeys;
vector<BSONObj*> removed; // these keys were removed as part of the change
vector<BSONObj*> added; // these keys were added as part of the change
@@ -169,10 +218,8 @@ namespace mongo {
void dupCheck(IndexDetails& idx, DiskLoc curObjLoc) {
if( added.empty() || !idx.unique() )
return;
- for( vector<BSONObj*>::iterator i = added.begin(); i != added.end(); i++ ) {
- bool dup = idx.wouldCreateDup(**i, curObjLoc);
- uassert( 11001 , "E11001 duplicate key on update", !dup);
- }
+ const Ordering ordering = Ordering::make(idx.keyPattern());
+ idx.idxInterface().uassertIfDups(idx, added, idx.head, curObjLoc, ordering); // "E11001 duplicate key on update"
}
};
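Reviewer note: the bit test in isASupportedIndexVersionNumber() holds exactly for v == 0 and v == 1, since masking with 1 keeps only the low bit. Quick standalone check:

    #include <cstdio>

    // (v & 1) == v  <=>  v has no bits outside the lowest one and is
    // non-negative, i.e. v is 0 or 1.
    static bool isASupportedIndexVersionNumber( int v ) { return (v & 1) == v; }

    int main() {
        for ( int v = -1; v <= 3; v++ )
            printf( "v=%2d supported=%d\n", v, isASupportedIndexVersionNumber( v ) );
        // supported=1 only for v=0 and v=1
    }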
diff --git a/db/indexkey.cpp b/db/indexkey.cpp
index 34f30fa..6d6fcc5 100644
--- a/db/indexkey.cpp
+++ b/db/indexkey.cpp
@@ -20,11 +20,17 @@
#include "namespace-inl.h"
#include "index.h"
#include "btree.h"
-#include "query.h"
+#include "ops/query.h"
#include "background.h"
+#include "../util/text.h"
namespace mongo {
+ /** index version: 0 is the old (<= v1.8) format; 1 is the new format */
+ const int DefaultIndexVersionNumber = 1;
+
map<string,IndexPlugin*> * IndexPlugin::_plugins;
IndexType::IndexType( const IndexPlugin * plugin , const IndexSpec * spec )
@@ -100,6 +106,14 @@ namespace mongo {
}
{
+ // _undefinedElt
+ BSONObjBuilder b;
+ b.appendUndefined( "" );
+ _undefinedObj = b.obj();
+ _undefinedElt = _undefinedObj.firstElement();
+ }
+
+ {
// handle plugins
string pluginName = IndexPlugin::findPluginName( keyPattern );
if ( pluginName.size() ) {
@@ -116,131 +130,289 @@ namespace mongo {
_finishedInit = true;
}
-
- void IndexSpec::getKeys( const BSONObj &obj, BSONObjSetDefaultOrder &keys ) const {
- if ( _indexType.get() ) {
- _indexType->getKeys( obj , keys );
- return;
- }
- vector<const char*> fieldNames( _fieldNames );
- vector<BSONElement> fixed( _fixed );
- _getKeys( fieldNames , fixed , obj, keys );
- if ( keys.empty() && ! _sparse )
- keys.insert( _nullKey );
+ void assertParallelArrays( const char *first, const char *second ) {
+ stringstream ss;
+ ss << "cannot index parallel arrays [" << first << "] [" << second << "]";
+ uasserted( 10088 , ss.str() );
}
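Reviewer note: for context, assertParallelArrays() fires when a compound index would need two independent array expansions. A hypothetical document that trips it for an index on { a : 1, b : 1 }:

    // Expanding both arrays would require a cross product of keys,
    // which the key generators refuse to build.
    BSONObj doc = BSON( "a" << BSON_ARRAY( 1 << 2 )
                     << "b" << BSON_ARRAY( 3 << 4 ) );
    // getKeys() on this doc throws assertion 10088:
    //   "cannot index parallel arrays [b] [a]"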
-
- void IndexSpec::_getKeys( vector<const char*> fieldNames , vector<BSONElement> fixed , const BSONObj &obj, BSONObjSetDefaultOrder &keys ) const {
- BSONElement arrElt;
- unsigned arrIdx = ~0;
- int numNotFound = 0;
-
- for( unsigned i = 0; i < fieldNames.size(); ++i ) {
- if ( *fieldNames[ i ] == '\0' )
- continue;
-
- BSONElement e = obj.getFieldDottedOrArray( fieldNames[ i ] );
-
- if ( e.eoo() ) {
- e = _nullElt; // no matching field
- numNotFound++;
+
+ class KeyGeneratorV0 {
+ public:
+ KeyGeneratorV0( const IndexSpec &spec ) : _spec( spec ) {}
+
+ void getKeys( const BSONObj &obj, BSONObjSet &keys ) const {
+ if ( _spec._indexType.get() ) { //plugin (eg geo)
+ _spec._indexType->getKeys( obj , keys );
+ return;
}
-
- if ( e.type() != Array )
- fieldNames[ i ] = ""; // no matching field or non-array match
-
- if ( *fieldNames[ i ] == '\0' )
- fixed[ i ] = e; // no need for further object expansion (though array expansion still possible)
-
- if ( e.type() == Array && arrElt.eoo() ) { // we only expand arrays on a single path -- track the path here
- arrIdx = i;
- arrElt = e;
+ vector<const char*> fieldNames( _spec._fieldNames );
+ vector<BSONElement> fixed( _spec._fixed );
+ _getKeys( fieldNames , fixed , obj, keys );
+ if ( keys.empty() && ! _spec._sparse )
+ keys.insert( _spec._nullKey );
+ }
+
+ private:
+ void _getKeys( vector<const char*> fieldNames , vector<BSONElement> fixed , const BSONObj &obj, BSONObjSet &keys ) const {
+ BSONElement arrElt;
+ unsigned arrIdx = ~0;
+ int numNotFound = 0;
+
+ for( unsigned i = 0; i < fieldNames.size(); ++i ) {
+ if ( *fieldNames[ i ] == '\0' )
+ continue;
+
+ BSONElement e = obj.getFieldDottedOrArray( fieldNames[ i ] );
+
+ if ( e.eoo() ) {
+ e = _spec._nullElt; // no matching field
+ numNotFound++;
+ }
+
+ if ( e.type() != Array )
+ fieldNames[ i ] = ""; // no matching field or non-array match
+
+ if ( *fieldNames[ i ] == '\0' )
+ fixed[ i ] = e; // no need for further object expansion (though array expansion still possible)
+
+ if ( e.type() == Array && arrElt.eoo() ) { // we only expand arrays on a single path -- track the path here
+ arrIdx = i;
+ arrElt = e;
+ }
+
+ // enforce single array path here
+ if ( e.type() == Array && e.rawdata() != arrElt.rawdata() ) {
+ assertParallelArrays( e.fieldName(), arrElt.fieldName() );
+ }
}
-
- // enforce single array path here
- if ( e.type() == Array && e.rawdata() != arrElt.rawdata() ) {
- stringstream ss;
- ss << "cannot index parallel arrays [" << e.fieldName() << "] [" << arrElt.fieldName() << "]";
- uasserted( 10088 , ss.str() );
+
+ bool allFound = true; // have we found elements for all field names in the key spec?
+ for( vector<const char*>::const_iterator i = fieldNames.begin(); i != fieldNames.end(); ++i ) {
+ if ( **i != '\0' ) {
+ allFound = false;
+ break;
+ }
}
- }
-
- bool allFound = true; // have we found elements for all field names in the key spec?
- for( vector<const char*>::const_iterator i = fieldNames.begin(); i != fieldNames.end(); ++i ) {
- if ( **i != '\0' ) {
- allFound = false;
- break;
+
+ if ( _spec._sparse && numNotFound == _spec._nFields ) {
+ // we didn't find any fields
+ // so we're not going to index this document
+ return;
}
- }
-
- if ( _sparse && numNotFound == _nFields ) {
- // we didn't find any fields
- // so we're not going to index this document
- return;
- }
-
- bool insertArrayNull = false;
-
- if ( allFound ) {
- if ( arrElt.eoo() ) {
- // no terminal array element to expand
- BSONObjBuilder b(_sizeTracker);
- for( vector< BSONElement >::iterator i = fixed.begin(); i != fixed.end(); ++i )
- b.appendAs( *i, "" );
- keys.insert( b.obj() );
+
+ bool insertArrayNull = false;
+
+ if ( allFound ) {
+ if ( arrElt.eoo() ) {
+ // no terminal array element to expand
+ BSONObjBuilder b(_spec._sizeTracker);
+ for( vector< BSONElement >::iterator i = fixed.begin(); i != fixed.end(); ++i )
+ b.appendAs( *i, "" );
+ keys.insert( b.obj() );
+ }
+ else {
+ // terminal array element to expand, so generate all keys
+ BSONObjIterator i( arrElt.embeddedObject() );
+ if ( i.more() ) {
+ while( i.more() ) {
+ BSONObjBuilder b(_spec._sizeTracker);
+ for( unsigned j = 0; j < fixed.size(); ++j ) {
+ if ( j == arrIdx )
+ b.appendAs( i.next(), "" );
+ else
+ b.appendAs( fixed[ j ], "" );
+ }
+ keys.insert( b.obj() );
+ }
+ }
+ else if ( fixed.size() > 1 ) {
+ insertArrayNull = true;
+ }
+ }
}
else {
- // terminal array element to expand, so generate all keys
+ // nonterminal array element to expand, so recurse
+ assert( !arrElt.eoo() );
BSONObjIterator i( arrElt.embeddedObject() );
if ( i.more() ) {
while( i.more() ) {
- BSONObjBuilder b(_sizeTracker);
- for( unsigned j = 0; j < fixed.size(); ++j ) {
- if ( j == arrIdx )
- b.appendAs( i.next(), "" );
- else
- b.appendAs( fixed[ j ], "" );
+ BSONElement e = i.next();
+ if ( e.type() == Object ) {
+ _getKeys( fieldNames, fixed, e.embeddedObject(), keys );
}
- keys.insert( b.obj() );
}
}
- else if ( fixed.size() > 1 ) {
+ else {
insertArrayNull = true;
}
}
- }
- else {
- // nonterminal array element to expand, so recurse
- assert( !arrElt.eoo() );
- BSONObjIterator i( arrElt.embeddedObject() );
- if ( i.more() ) {
- while( i.more() ) {
- BSONElement e = i.next();
- if ( e.type() == Object ) {
- _getKeys( fieldNames, fixed, e.embeddedObject(), keys );
+
+ if ( insertArrayNull ) {
+ // x : [] - need to insert undefined
+ BSONObjBuilder b(_spec._sizeTracker);
+ for( unsigned j = 0; j < fixed.size(); ++j ) {
+ if ( j == arrIdx ) {
+ b.appendUndefined( "" );
+ }
+ else {
+ BSONElement e = fixed[j];
+ if ( e.eoo() )
+ b.appendNull( "" );
+ else
+ b.appendAs( e , "" );
}
}
+ keys.insert( b.obj() );
}
- else {
- insertArrayNull = true;
+ }
+
+ const IndexSpec &_spec;
+ };
+
+ class KeyGeneratorV1 {
+ public:
+ KeyGeneratorV1( const IndexSpec &spec ) : _spec( spec ) {}
+
+ void getKeys( const BSONObj &obj, BSONObjSet &keys ) const {
+ if ( _spec._indexType.get() ) { //plugin (eg geo)
+ _spec._indexType->getKeys( obj , keys );
+ return;
+ }
+ vector<const char*> fieldNames( _spec._fieldNames );
+ vector<BSONElement> fixed( _spec._fixed );
+ _getKeys( fieldNames , fixed , obj, keys );
+ if ( keys.empty() && ! _spec._sparse )
+ keys.insert( _spec._nullKey );
+ }
+
+ private:
+ /**
+ * @param arrayNestedArray - set if the returned element is an array nested directly within arr.
+ */
+ BSONElement extractNextElement( const BSONObj &obj, const BSONObj &arr, const char *&field, bool &arrayNestedArray ) const {
+ string firstField = mongoutils::str::before( field, '.' );
+ bool haveObjField = !obj.getField( firstField ).eoo();
+ BSONElement arrField = arr.getField( firstField );
+ bool haveArrField = !arrField.eoo();
+
+ // An index component field name cannot exist in both a document array and one of that array's children.
+ uassert( 15855 , "Parallel references while expanding indexed field in array", !haveObjField || !haveArrField );
+
+ arrayNestedArray = false;
+ if ( haveObjField ) {
+ return obj.getFieldDottedOrArray( field );
+ }
+ else if ( haveArrField ) {
+ if ( arrField.type() == Array ) {
+ arrayNestedArray = true;
+ }
+ return arr.getFieldDottedOrArray( field );
}
+ return BSONElement();
}
-
- if ( insertArrayNull ) {
- // x : [] - need to insert undefined
- BSONObjBuilder b(_sizeTracker);
- for( unsigned j = 0; j < fixed.size(); ++j ) {
- if ( j == arrIdx ) {
- b.appendUndefined( "" );
+
+ void _getKeysArrEltFixed( vector<const char*> &fieldNames , vector<BSONElement> &fixed , const BSONElement &arrEntry, BSONObjSet &keys, int numNotFound, const BSONElement &arrObjElt, const set< unsigned > &arrIdxs, bool mayExpandArrayUnembedded ) const {
+ // set up any terminal array values
+ for( set<unsigned>::const_iterator j = arrIdxs.begin(); j != arrIdxs.end(); ++j ) {
+ if ( *fieldNames[ *j ] == '\0' ) {
+ fixed[ *j ] = mayExpandArrayUnembedded ? arrEntry : arrObjElt;
+ }
+ }
+ // recurse
+ _getKeys( fieldNames, fixed, ( arrEntry.type() == Object ) ? arrEntry.embeddedObject() : BSONObj(), keys, numNotFound, arrObjElt.embeddedObject() );
+ }
+
+ /**
+ * @param fieldNames - fields to index, may be postfixes in recursive calls
+ * @param fixed - values that have already been identified for their index fields
+ * @param obj - object from which keys should be extracted, based on names in fieldNames
+ * @param keys - set where index keys are written
+ * @param numNotFound - number of index fields that have already been identified as missing
+ * @param array - array from which keys should be extracted, based on names in fieldNames
+ * If obj and array are both nonempty, obj will be one of the elements of array.
+ */
+ void _getKeys( vector<const char*> fieldNames , vector<BSONElement> fixed , const BSONObj &obj, BSONObjSet &keys, int numNotFound = 0, const BSONObj &array = BSONObj() ) const {
+ BSONElement arrElt;
+ set<unsigned> arrIdxs;
+ bool mayExpandArrayUnembedded = true;
+ for( unsigned i = 0; i < fieldNames.size(); ++i ) {
+ if ( *fieldNames[ i ] == '\0' ) {
+ continue;
+ }
+
+ bool arrayNestedArray;
+ // Extract element matching fieldName[ i ] from object xor array.
+ BSONElement e = extractNextElement( obj, array, fieldNames[ i ], arrayNestedArray );
+
+ if ( e.eoo() ) {
+ // if field not present, set to null
+ fixed[ i ] = _spec._nullElt;
+ // done expanding this field name
+ fieldNames[ i ] = "";
+ numNotFound++;
+ }
+ else if ( e.type() == Array ) {
+ arrIdxs.insert( i );
+ if ( arrElt.eoo() ) {
+ // we only expand arrays on a single path -- track the path here
+ arrElt = e;
+ }
+ else if ( e.rawdata() != arrElt.rawdata() ) {
+ // enforce single array path here
+ assertParallelArrays( e.fieldName(), arrElt.fieldName() );
+ }
+ if ( arrayNestedArray ) {
+ mayExpandArrayUnembedded = false;
+ }
}
else {
- BSONElement e = fixed[j];
- if ( e.eoo() )
- b.appendNull( "" );
- else
- b.appendAs( e , "" );
+ // not an array - no need for further expansion
+ fixed[ i ] = e;
}
}
- keys.insert( b.obj() );
+
+ if ( arrElt.eoo() ) {
+ // No array, so generate a single key.
+ if ( _spec._sparse && numNotFound == _spec._nFields ) {
+ return;
+ }
+ BSONObjBuilder b(_spec._sizeTracker);
+ for( vector< BSONElement >::iterator i = fixed.begin(); i != fixed.end(); ++i ) {
+ b.appendAs( *i, "" );
+ }
+ keys.insert( b.obj() );
+ }
+ else if ( arrElt.embeddedObject().firstElement().eoo() ) {
+ // Empty array, so set matching fields to undefined.
+ _getKeysArrEltFixed( fieldNames, fixed, _spec._undefinedElt, keys, numNotFound, arrElt, arrIdxs, true );
+ }
+ else {
+                // Non-empty array that can be expanded, so generate a key for each member.
+ BSONObj arrObj = arrElt.embeddedObject();
+ BSONObjIterator i( arrObj );
+ while( i.more() ) {
+ _getKeysArrEltFixed( fieldNames, fixed, i.next(), keys, numNotFound, arrElt, arrIdxs, mayExpandArrayUnembedded );
+ }
+ }
+ }
+
+ const IndexSpec &_spec;
+ };
+
+ void IndexSpec::getKeys( const BSONObj &obj, BSONObjSet &keys ) const {
+ switch( indexVersion() ) {
+ case 0: {
+ KeyGeneratorV0 g( *this );
+ g.getKeys( obj, keys );
+ break;
+ }
+ case 1: {
+ KeyGeneratorV1 g( *this );
+ g.getKeys( obj, keys );
+ break;
+ }
+ default:
+ massert( 15869, "Invalid index version for key generation.", false );
}
}
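
Both generators above honor the same multikey contract: a single indexed array path expands to one index key per array element, and an empty array indexes as a single undefined placeholder (the insertArrayNull path). A standalone sketch of that behavior, in illustrative C++ rather than the mongo types:

    #include <cstddef>
    #include <iostream>
    #include <string>
    #include <vector>

    // expandKeys() stands in for _getKeys(); values are plain strings here.
    std::vector<std::string> expandKeys(const std::vector<std::string>& arrayValue) {
        std::vector<std::string> keys;
        if (arrayValue.empty()) {
            keys.push_back("undefined");          // x: [] indexes as undefined
            return keys;
        }
        for (std::size_t i = 0; i < arrayValue.size(); ++i)
            keys.push_back(arrayValue[i]);        // one index key per element
        return keys;
    }

    int main() {
        std::vector<std::string> doc;             // stands in for {a: [1, 2, 3]}
        doc.push_back("1"); doc.push_back("2"); doc.push_back("3");
        std::vector<std::string> keys = expandKeys(doc);
        for (std::size_t i = 0; i < keys.size(); ++i)
            std::cout << keys[i] << '\n';         // three keys for one document
        return 0;
    }
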
@@ -275,6 +447,13 @@ namespace mongo {
IndexSuitability IndexType::suitability( const BSONObj& query , const BSONObj& order ) const {
return _spec->_suitability( query , order );
}
+
+ int IndexSpec::indexVersion() const {
+ if ( !info.hasField( "v" ) ) {
+ return DefaultIndexVersionNumber;
+ }
+ return IndexDetails::versionForIndexObj( info );
+ }
bool IndexType::scanAndOrderRequired( const BSONObj& query , const BSONObj& order ) const {
return ! order.isEmpty();
diff --git a/db/indexkey.h b/db/indexkey.h
index be73171..c04cd63 100644
--- a/db/indexkey.h
+++ b/db/indexkey.h
@@ -25,6 +25,8 @@
namespace mongo {
+ extern const int DefaultIndexVersionNumber;
+
class Cursor;
class IndexSpec;
class IndexType; // TODO: this name sucks
@@ -44,7 +46,7 @@ namespace mongo {
IndexType( const IndexPlugin * plugin , const IndexSpec * spec );
virtual ~IndexType();
- virtual void getKeys( const BSONObj &obj, BSONObjSetDefaultOrder &keys ) const = 0;
+ virtual void getKeys( const BSONObj &obj, BSONObjSet &keys ) const = 0;
virtual shared_ptr<Cursor> newCursor( const BSONObj& query , const BSONObj& order , int numWanted ) const = 0;
/** optional op : changes query to match what's in the index */
@@ -122,7 +124,7 @@ namespace mongo {
: _details(0) , _finishedInit(false) {
}
- IndexSpec( const BSONObj& k , const BSONObj& m = BSONObj() )
+ explicit IndexSpec( const BSONObj& k , const BSONObj& m = BSONObj() )
: keyPattern(k) , info(m) , _details(0) , _finishedInit(false) {
_init();
}
@@ -131,14 +133,15 @@ namespace mongo {
this is a DiscLoc of an IndexDetails info
should have a key field
*/
- IndexSpec( const DiskLoc& loc ) {
+ explicit IndexSpec( const DiskLoc& loc ) {
reset( loc );
}
- void reset( const DiskLoc& loc );
+ void reset( const BSONObj& info );
+ void reset( const DiskLoc& infoLoc ) { reset(infoLoc.obj()); }
void reset( const IndexDetails * details );
- void getKeys( const BSONObj &obj, BSONObjSetDefaultOrder &keys ) const;
+ void getKeys( const BSONObj &obj, BSONObjSet &keys ) const;
BSONElement missingField() const { return _nullElt; }
@@ -160,33 +163,33 @@ namespace mongo {
protected:
+ int indexVersion() const;
+
IndexSuitability _suitability( const BSONObj& query , const BSONObj& order ) const ;
- void _getKeys( vector<const char*> fieldNames , vector<BSONElement> fixed , const BSONObj &obj, BSONObjSetDefaultOrder &keys ) const;
-
BSONSizeTracker _sizeTracker;
-
vector<const char*> _fieldNames;
vector<BSONElement> _fixed;
BSONObj _nullKey; // a full key with all fields null
-
BSONObj _nullObj; // only used for _nullElt
BSONElement _nullElt; // jstNull
+ BSONObj _undefinedObj; // only used for _undefinedElt
+ BSONElement _undefinedElt; // undefined
+
int _nFields; // number of fields in the index
bool _sparse; // if the index is sparse
-
shared_ptr<IndexType> _indexType;
-
const IndexDetails * _details;
void _init();
+ friend class IndexType;
+ friend class KeyGeneratorV0;
+ friend class KeyGeneratorV1;
public:
bool _finishedInit;
-
- friend class IndexType;
};
diff --git a/db/instance.cpp b/db/instance.cpp
index bb2d9a5..6727867 100644
--- a/db/instance.cpp
+++ b/db/instance.cpp
@@ -19,7 +19,6 @@
#include "pch.h"
#include "db.h"
-#include "query.h"
#include "introspect.h"
#include "repl.h"
#include "dbmessage.h"
@@ -27,7 +26,7 @@
#include "lasterror.h"
#include "security.h"
#include "json.h"
-#include "replpair.h"
+#include "replutil.h"
#include "../s/d_logic.h"
#include "../util/file_allocator.h"
#include "../util/goodies.h"
@@ -39,6 +38,9 @@
#include "background.h"
#include "dur_journal.h"
#include "dur_recover.h"
+#include "ops/update.h"
+#include "ops/delete.h"
+#include "ops/query.h"
namespace mongo {
@@ -56,8 +58,6 @@ namespace mongo {
string dbExecCommand;
- char *appsrvPath = NULL;
-
DiagLog _diaglog;
bool useCursors = true;
@@ -73,14 +73,12 @@ namespace mongo {
KillCurrentOp killCurrentOp;
int lockFile = 0;
-#ifdef WIN32
+#ifdef _WIN32
HANDLE lockFileHandle;
#endif
// see FSyncCommand:
- unsigned lockedForWriting;
- mongo::mutex lockedForWritingMutex("lockedForWriting");
- bool unlockRequested = false;
+ extern bool lockedForWriting;
void inProgCmd( Message &m, DbResponse &dbresponse ) {
BSONObjBuilder b;
@@ -113,7 +111,7 @@ namespace mongo {
unsigned x = lockedForWriting;
if( x ) {
b.append("fsyncLock", x);
- b.append("info", "use db.$cmd.sys.unlock.findOne() to terminate the fsync write/snapshot lock");
+ b.append("info", "use db.fsyncUnlock() to terminate the fsync write/snapshot lock");
}
}
@@ -144,16 +142,20 @@ namespace mongo {
replyToQuery(0, m, dbresponse, obj);
}
+ void unlockFsyncAndWait();
void unlockFsync(const char *ns, Message& m, DbResponse &dbresponse) {
BSONObj obj;
- if( ! cc().isAdmin() || strncmp(ns, "admin.", 6) != 0 ) {
+ if ( ! cc().isAdmin() ) { // checks auth
obj = fromjson("{\"err\":\"unauthorized\"}");
}
+ else if (strncmp(ns, "admin.", 6) != 0 ) {
+ obj = fromjson("{\"err\":\"unauthorized - this command must be run against the admin DB\"}");
+ }
else {
if( lockedForWriting ) {
log() << "command: unlock requested" << endl;
- obj = fromjson("{ok:1,\"info\":\"unlock requested\"}");
- unlockRequested = true;
+ obj = fromjson("{ok:1,\"info\":\"unlock completed\"}");
+ unlockFsyncAndWait();
}
else {
obj = fromjson("{ok:0,\"errmsg\":\"not locked\"}");
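
The rewrite above replaces the old polled unlockRequested flag with a blocking unlockFsyncAndWait(), so the "unlock completed" reply is only sent once the lock is really gone. A rough sketch of that handshake with a mutex and condition variable (names and structure are assumptions, not the actual fsync command code):

    #include <condition_variable>
    #include <mutex>

    static std::mutex fsyncMutex;
    static std::condition_variable fsyncCond;
    static bool lockedForWriting = true;   // set when fsync+lock was taken
    static bool unlockRequested = false;

    void fsyncHolderThread() {             // runs for the duration of the lock
        std::unique_lock<std::mutex> lk(fsyncMutex);
        while (!unlockRequested)
            fsyncCond.wait(lk);
        lockedForWriting = false;          // actually release the lock here
        fsyncCond.notify_all();
    }

    void unlockFsyncAndWait() {            // command side
        std::unique_lock<std::mutex> lk(fsyncMutex);
        unlockRequested = true;
        fsyncCond.notify_all();
        while (lockedForWriting)           // don't reply until released
            fsyncCond.wait(lk);
    }
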
@@ -178,7 +180,7 @@ namespace mongo {
}
catch ( AssertionException& e ) {
ok = false;
- op.debug().str << " exception ";
+ op.debug().exceptionInfo = e.getInfo();
LOGSOME {
log() << "assertion " << e.toString() << " ns:" << q.ns << " query:" <<
(q.query.valid() ? q.query.toString() : "query object is corrupt") << endl;
@@ -210,9 +212,7 @@ namespace mongo {
resp->setData( msgdata, true );
}
- if ( op.shouldDBProfile( 0 ) ) {
- op.debug().str << " bytes:" << resp->header()->dataLen();
- }
+ op.debug().responseLength = resp->header()->dataLen();
dbresponse.response = resp.release();
dbresponse.responseTo = responseTo;
@@ -220,8 +220,17 @@ namespace mongo {
return ok;
}
+ void (*reportEventToSystem)(const char *msg) = 0;
+
+ void mongoAbort(const char *msg) {
+ if( reportEventToSystem )
+ reportEventToSystem(msg);
+ rawOut(msg);
+ ::abort();
+ }
+
// Returns false when request includes 'end'
- void assembleResponse( Message &m, DbResponse &dbresponse, const SockAddr &client ) {
+ void assembleResponse( Message &m, DbResponse &dbresponse, const HostAndPort& remote ) {
// before we lock...
int op = m.operation();
@@ -268,11 +277,10 @@ namespace mongo {
currentOpP = nestedOp.get();
}
CurOp& currentOp = *currentOpP;
- currentOp.reset(client,op);
+ currentOp.reset(remote,op);
OpDebug& debug = currentOp.debug();
- StringBuilder& ss = debug.str;
- ss << opToString( op ) << " ";
+ debug.op = op;
int logThreshold = cmdLine.slowMS;
bool log = logLevel >= 1;
@@ -291,7 +299,7 @@ namespace mongo {
char *p = m.singleData()->_data;
int len = strlen(p);
if ( len > 400 )
- out() << curTimeMillis() % 10000 <<
+ out() << curTimeMillis64() % 10000 <<
" long msg received, len:" << len << endl;
Message *resp = new Message();
@@ -324,7 +332,6 @@ namespace mongo {
else if ( op == dbKillCursors ) {
currentOp.ensureStarted();
logThreshold = 10;
- ss << "killcursors ";
receivedKillCursors(m);
}
else {
@@ -335,11 +342,11 @@ namespace mongo {
}
catch ( UserException& ue ) {
tlog(3) << " Caught Assertion in " << opToString(op) << ", continuing " << ue.toString() << endl;
- ss << " exception " << ue.toString();
+ debug.exceptionInfo = ue.getInfo();
}
catch ( AssertionException& e ) {
tlog(3) << " Caught Assertion in " << opToString(op) << ", continuing " << e.toString() << endl;
- ss << " exception " << e.toString();
+ debug.exceptionInfo = e.getInfo();
log = true;
}
}
@@ -350,12 +357,12 @@ namespace mongo {
//DEV log = true;
if ( log || ms > logThreshold ) {
- if( logLevel < 3 && op == dbGetMore && strstr(ns, ".oplog.") && ms < 3000 && !log ) {
+ if( logLevel < 3 && op == dbGetMore && strstr(ns, ".oplog.") && ms < 4300 && !log ) {
/* it's normal for getMore on the oplog to be slow because of use of awaitdata flag. */
}
else {
- ss << ' ' << ms << "ms";
- mongo::tlog() << ss.str() << endl;
+ debug.executionTime = ms;
+ mongo::tlog() << debug << endl;
}
}
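
Condensed restatement of the logging decision above, as a hypothetical helper (the 4300ms carve-out matches the new constant; getMore on the oplog is expected to block because of the awaitData flag):

    // Hypothetical restatement, not the actual function.
    bool shouldLogOp(bool forcedLog, int ms, int logThreshold,
                     int logLevel, bool isOplogGetMore) {
        if (!forcedLog && ms <= logThreshold)
            return false;                  // fast op and nothing forced logging
        if (!forcedLog && logLevel < 3 && isOplogGetMore && ms < 4300)
            return false;                  // slow oplog getMore is expected
        return true;                       // log it, with debug.executionTime set
    }
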
@@ -367,15 +374,16 @@ namespace mongo {
else {
writelock lk;
if ( dbHolder.isLoaded( nsToDatabase( currentOp.getNS() ) , dbpath ) ) {
- Client::Context c( currentOp.getNS() );
- profile(ss.str().c_str(), ms);
+ Client::Context cx( currentOp.getNS() );
+ profile(c , currentOp );
}
else {
mongo::log() << "note: not profiling because db went away - probably a close on: " << currentOp.getNS() << endl;
}
}
}
-
+
+ debug.reset();
} /* assembleResponse() */
void receivedKillCursors(Message& m) {
@@ -383,9 +391,10 @@ namespace mongo {
x++; // reserved
int n = *x++;
- assert( m.dataSize() == 8 + ( 8 * n ) );
+ uassert( 13659 , "sent 0 cursors to kill" , n != 0 );
+ massert( 13658 , str::stream() << "bad kill cursors size: " << m.dataSize() , m.dataSize() == 8 + ( 8 * n ) );
+ uassert( 13004 , str::stream() << "sent negative cursors to kill: " << n , n >= 1 );
- uassert( 13004 , "sent 0 cursors to kill" , n >= 1 );
if ( n > 2000 ) {
log( n < 30000 ? LL_WARNING : LL_ERROR ) << "receivedKillCursors, n=" << n << endl;
assert( n < 30000 );
@@ -432,9 +441,7 @@ namespace mongo {
void receivedUpdate(Message& m, CurOp& op) {
DbMessage d(m);
const char *ns = d.getns();
- assert(*ns);
- uassert( 10054 , "not master", isMasterNs( ns ) );
- op.debug().str << ns << ' ';
+ op.debug().ns = ns;
int flags = d.pullInt();
BSONObj query = d.nextJsObj();
@@ -447,18 +454,15 @@ namespace mongo {
bool upsert = flags & UpdateOption_Upsert;
bool multi = flags & UpdateOption_Multi;
bool broadcast = flags & UpdateOption_Broadcast;
- {
- string s = query.toString();
- /* todo: we shouldn't do all this ss stuff when we don't need it, it will slow us down.
- instead, let's just story the query BSON in the debug object, and it can toString()
- lazily
- */
- op.debug().str << " query: " << s;
- op.setQuery(query);
- }
+
+ op.debug().query = query;
+ op.setQuery(query);
writelock lk;
+ // writelock is used to synchronize stepdowns w/ writes
+ uassert( 10054 , "not master", isMasterNs( ns ) );
+
// if this ever moves to outside of lock, need to adjust check Client::Context::_finishInit
if ( ! broadcast && handlePossibleShardedMessage( m , 0 ) )
return;
@@ -472,21 +476,21 @@ namespace mongo {
void receivedDelete(Message& m, CurOp& op) {
DbMessage d(m);
const char *ns = d.getns();
- assert(*ns);
- uassert( 10056 , "not master", isMasterNs( ns ) );
- op.debug().str << ns << ' ';
+ op.debug().ns = ns;
int flags = d.pullInt();
bool justOne = flags & RemoveOption_JustOne;
bool broadcast = flags & RemoveOption_Broadcast;
assert( d.moreJSObjs() );
BSONObj pattern = d.nextJsObj();
- {
- string s = pattern.toString();
- op.debug().str << " query: " << s;
- op.setQuery(pattern);
- }
+
+ op.debug().query = pattern;
+ op.setQuery(pattern);
writelock lk(ns);
+
+ // writelock is used to synchronize stepdowns w/ writes
+ uassert( 10056 , "not master", isMasterNs( ns ) );
+
// if this ever moves to outside of lock, need to adjust check Client::Context::_finishInit
if ( ! broadcast && handlePossibleShardedMessage( m , 0 ) )
return;
@@ -500,7 +504,6 @@ namespace mongo {
QueryResult* emptyMoreResult(long long);
bool receivedGetMore(DbResponse& dbresponse, Message& m, CurOp& curop ) {
- StringBuilder& ss = curop.debug().str;
bool ok = true;
DbMessage d(m);
@@ -509,9 +512,9 @@ namespace mongo {
int ntoreturn = d.pullInt();
long long cursorid = d.pullInt64();
- ss << ns << " cid:" << cursorid;
- if( ntoreturn )
- ss << " ntoreturn:" << ntoreturn;
+ curop.debug().ns = ns;
+ curop.debug().ntoreturn = ntoreturn;
+ curop.debug().cursorid = cursorid;
time_t start = 0;
int pass = 0;
@@ -523,7 +526,13 @@ namespace mongo {
Client::Context ctx(ns);
msgdata = processGetMore(ns, ntoreturn, cursorid, curop, pass, exhaust);
}
- catch ( GetMoreWaitException& ) {
+ catch ( AssertionException& e ) {
+ exhaust = false;
+ curop.debug().exceptionInfo = e.getInfo();
+ msgdata = emptyMoreResult(cursorid);
+ ok = false;
+ }
+ if (msgdata == 0) {
exhaust = false;
massert(13073, "shutting down", !inShutdown() );
if( pass == 0 ) {
@@ -544,64 +553,89 @@ namespace mongo {
sleepmillis(2);
continue;
}
- catch ( AssertionException& e ) {
- exhaust = false;
- ss << " exception " << e.toString();
- msgdata = emptyMoreResult(cursorid);
- ok = false;
- }
break;
};
Message *resp = new Message();
resp->setData(msgdata, true);
- ss << " bytes:" << resp->header()->dataLen();
- ss << " nreturned:" << msgdata->nReturned;
+ curop.debug().responseLength = resp->header()->dataLen();
+ curop.debug().nreturned = msgdata->nReturned;
+
dbresponse.response = resp;
dbresponse.responseTo = m.header()->id;
+
if( exhaust ) {
- ss << " exhaust ";
+ curop.debug().exhaust = true;
dbresponse.exhaust = ns;
}
+
return ok;
}
+ void checkAndInsert(const char *ns, /*modifies*/BSONObj& js) {
+ uassert( 10059 , "object to insert too large", js.objsize() <= BSONObjMaxUserSize);
+ {
+ // check no $ modifiers. note we only check top level. (scanning deep would be quite expensive)
+ BSONObjIterator i( js );
+ while ( i.more() ) {
+ BSONElement e = i.next();
+ uassert( 13511 , "document to insert can't have $ fields" , e.fieldName()[0] != '$' );
+ }
+ }
+ theDataFileMgr.insertWithObjMod(ns, js, false); // js may be modified in the call to add an _id field.
+ logOp("i", ns, js);
+ }
+
+ NOINLINE_DECL void insertMulti(DbMessage& d, const char *ns, const BSONObj& _js) {
+ const bool keepGoing = d.reservedField() & InsertOption_ContinueOnError;
+ int n = 0;
+ BSONObj js(_js);
+ while( 1 ) {
+ try {
+ checkAndInsert(ns, js);
+ ++n;
+ getDur().commitIfNeeded();
+ } catch (const UserException&) {
+ if (!keepGoing || !d.moreJSObjs()){
+ globalOpCounters.incInsertInWriteLock(n);
+ throw;
+ }
+ // otherwise ignore and keep going
+ }
+ if( !d.moreJSObjs() )
+ break;
+ js = d.nextJsObj(); // TODO: refactor to do objcheck outside of writelock
+ }
+ }
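
insertMulti above implements the InsertOption_ContinueOnError contract: by default the first failing document ends the batch, with the flag set the bad document is skipped and the rest are still attempted, and the successful-insert count is reported either way. A condensed sketch of that control flow with a generic insert callback:

    #include <cstddef>
    #include <vector>

    // insertOne is a stand-in for checkAndInsert(); returns false on failure.
    template <typename Doc, typename InsertFn>
    int insertBatch(const std::vector<Doc>& docs, bool continueOnError,
                    InsertFn insertOne) {
        int n = 0;
        for (std::size_t i = 0; i < docs.size(); ++i) {
            if (insertOne(docs[i])) {
                ++n;                       // success: count it, move on
                continue;
            }
            if (!continueOnError)
                break;                     // default: first error ends the batch
            // flag set: skip the bad document, keep inserting the rest
        }
        return n;                          // successful inserts, as counted above
    }
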
+
void receivedInsert(Message& m, CurOp& op) {
DbMessage d(m);
const char *ns = d.getns();
- assert(*ns);
- uassert( 10058 , "not master", isMasterNs( ns ) );
- op.debug().str << ns;
+ op.debug().ns = ns;
+
+ if( !d.moreJSObjs() ) {
+ // strange. should we complain?
+ return;
+ }
+ BSONObj js = d.nextJsObj();
writelock lk(ns);
+ // writelock is used to synchronize stepdowns w/ writes
+ uassert( 10058 , "not master", isMasterNs(ns) );
+
if ( handlePossibleShardedMessage( m , 0 ) )
return;
Client::Context ctx(ns);
- int n = 0;
- while ( d.moreJSObjs() ) {
- BSONObj js = d.nextJsObj();
- uassert( 10059 , "object to insert too large", js.objsize() <= BSONObjMaxUserSize);
- {
- // check no $ modifiers
- BSONObjIterator i( js );
- while ( i.more() ) {
- BSONElement e = i.next();
- uassert( 13511 , "object to insert can't have $ modifiers" , e.fieldName()[0] != '$' );
- }
- }
-
- theDataFileMgr.insertWithObjMod(ns, js, false);
- logOp("i", ns, js);
-
- if( ++n % 4 == 0 ) {
- // if we are inserting quite a few, we may need to commit along the way
- getDur().commitIfNeeded();
- }
+ if( d.moreJSObjs() ) {
+ insertMulti(d, ns, js);
+ return;
}
- globalOpCounters.incInsertInWriteLock(n);
+
+ checkAndInsert(ns, js);
+ globalOpCounters.incInsertInWriteLock(1);
}
void getDatabaseNames( vector< string > &names , const string& usePath ) {
@@ -648,7 +682,7 @@ namespace mongo {
if ( lastError._get() )
lastError.startRequest( toSend, lastError._get() );
DbResponse dbResponse;
- assembleResponse( toSend, dbResponse );
+ assembleResponse( toSend, dbResponse , _clientHost );
assert( dbResponse.response );
dbResponse.response->concat(); // can get rid of this if we make response handling smarter
response = *dbResponse.response;
@@ -656,11 +690,11 @@ namespace mongo {
return true;
}
- void DBDirectClient::say( Message &toSend ) {
+ void DBDirectClient::say( Message &toSend, bool isRetry ) {
if ( lastError._get() )
lastError.startRequest( toSend, lastError._get() );
DbResponse dbResponse;
- assembleResponse( toSend, dbResponse );
+ assembleResponse( toSend, dbResponse , _clientHost );
getDur().commitIfNeeded();
}
@@ -678,6 +712,8 @@ namespace mongo {
ClientCursor::erase( id );
}
+ HostAndPort DBDirectClient::_clientHost = HostAndPort( "0.0.0.0" , 0 );
+
unsigned long long DBDirectClient::count(const string &ns, const BSONObj& query, int options, int limit, int skip ) {
readlock lk( ns );
string errmsg;
@@ -749,7 +785,7 @@ namespace mongo {
}
if( --n <= 0 ) {
log() << "shutdown: couldn't acquire write lock, aborting" << endl;
- abort();
+ mongoAbort("couldn't acquire write lock");
}
log() << "shutdown: waiting for write lock..." << endl;
}
@@ -760,11 +796,10 @@ namespace mongo {
log() << "shutdown: closing all files..." << endl;
stringstream ss3;
MemoryMappedFile::closeAllFiles( ss3 );
- rawOut( ss3.str() );
+ log() << ss3.str() << endl;
if( cmdLine.dur ) {
- log() << "shutdown: journalCleanup..." << endl;
- dur::journalCleanup();
+ dur::journalCleanup(true);
}
#if !defined(__sunos__)
@@ -773,9 +808,9 @@ namespace mongo {
/* This ought to be an unlink(), but Eliot says the last
time that was attempted, there was a race condition
with acquirePathLock(). */
-#ifdef WIN32
+#ifdef _WIN32
if( _chsize( lockFile , 0 ) )
- log() << "couldn't remove fs lock " << getLastError() << endl;
+ log() << "couldn't remove fs lock " << WSAGetLastError() << endl;
CloseHandle(lockFileHandle);
#else
if( ftruncate( lockFile , 0 ) )
@@ -786,8 +821,17 @@ namespace mongo {
#endif
}
+ void exitCleanly( ExitCode code ) {
+ killCurrentOp.killAll();
+ {
+ dblock lk;
+ log() << "now exiting" << endl;
+ dbexit( code );
+ }
+ }
+
/* not using log() herein in case we are already locked */
- void dbexit( ExitCode rc, const char *why, bool tryToGetLock ) {
+ NOINLINE_DECL void dbexit( ExitCode rc, const char *why, bool tryToGetLock ) {
auto_ptr<writelocktry> wlt;
if ( tryToGetLock ) {
@@ -840,14 +884,14 @@ namespace mongo {
ss << getpid() << endl;
string s = ss.str();
const char * data = s.c_str();
-#ifdef WIN32
+#ifdef _WIN32
assert ( _write( fd, data, strlen( data ) ) );
#else
assert ( write( fd, data, strlen( data ) ) );
#endif
}
- void acquirePathLock() {
+ void acquirePathLock(bool doingRepair) {
string name = ( boost::filesystem::path( dbpath ) / "mongod.lock" ).native_file_string();
bool oldFile = false;
@@ -856,7 +900,7 @@ namespace mongo {
oldFile = true;
}
-#ifdef WIN32
+#ifdef _WIN32
lockFileHandle = CreateFileA( name.c_str(), GENERIC_READ | GENERIC_WRITE,
0 /* do not allow anyone else access */, NULL,
OPEN_ALWAYS /* success if fh can open */, 0, NULL );
@@ -867,13 +911,15 @@ namespace mongo {
FormatMessageA(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM,
NULL, code, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT),
(LPSTR)&msg, 0, NULL);
- uasserted( 13627 , msg );
+ string m = msg;
+ str::stripTrailing(m, "\r\n");
+ uasserted( 13627 , str::stream() << "Unable to create/open lock file: " << name << ' ' << m << " Is a mongod instance already running?" );
}
lockFile = _open_osfhandle((intptr_t)lockFileHandle, 0);
#else
lockFile = open( name.c_str(), O_RDWR | O_CREAT , S_IRWXU | S_IRWXG | S_IRWXO );
if( lockFile <= 0 ) {
- uasserted( 10309 , str::stream() << "Unable to create / open lock file for lockfilepath: " << name << ' ' << errnoWithDescription());
+ uasserted( 10309 , str::stream() << "Unable to create/open lock file: " << name << ' ' << errnoWithDescription() << " Is a mongod instance already running?" );
}
if (flock( lockFile, LOCK_EX | LOCK_NB ) != 0) {
close ( lockFile );
@@ -913,17 +959,18 @@ namespace mongo {
}
}
else {
- errmsg = str::stream()
- << "************** \n"
- << "old lock file: " << name << ". probably means unclean shutdown\n"
- << "recommend removing file and running --repair\n"
- << "see: http://dochub.mongodb.org/core/repair for more information\n"
- << "*************";
+ if (!dur::haveJournalFiles() && !doingRepair) {
+ errmsg = str::stream()
+ << "************** \n"
+ << "Unclean shutdown detected.\n"
+ << "Please visit http://dochub.mongodb.org/core/repair for recovery instructions.\n"
+ << "*************";
+ }
}
if (!errmsg.empty()) {
cout << errmsg << endl;
-#ifdef WIN32
+#ifdef _WIN32
CloseHandle( lockFileHandle );
#else
close ( lockFile );
@@ -936,14 +983,13 @@ namespace mongo {
// Not related to lock file, but this is where we handle unclean shutdown
if( !cmdLine.dur && dur::haveJournalFiles() ) {
cout << "**************" << endl;
- cout << "Error: journal files are present in journal directory, yet starting without --dur enabled." << endl;
+ cout << "Error: journal files are present in journal directory, yet starting without --journal enabled." << endl;
cout << "It is recommended that you start with journaling enabled so that recovery may occur." << endl;
- cout << "Alternatively (not recommended), you can backup everything, then delete the journal files, and run --repair" << endl;
cout << "**************" << endl;
- uasserted(13597, "can't start without --dur enabled when journal/ files are present");
+ uasserted(13597, "can't start without --journal enabled when journal/ files are present");
}
-#ifdef WIN32
+#ifdef _WIN32
uassert( 13625, "Unable to truncate lock file", _chsize(lockFile, 0) == 0);
writePid( lockFile );
_commit( lockFile );
@@ -951,20 +997,21 @@ namespace mongo {
uassert( 13342, "Unable to truncate lock file", ftruncate(lockFile, 0) == 0);
writePid( lockFile );
fsync( lockFile );
+ flushMyDirectory(name);
#endif
}
#else
- void acquirePathLock() {
+ void acquirePathLock(bool) {
// TODO - it is very bad that the code above is not running here.
// Not related to lock file, but this is where we handle unclean shutdown
if( !cmdLine.dur && dur::haveJournalFiles() ) {
cout << "**************" << endl;
- cout << "Error: journal files are present in journal directory, yet starting without --dur enabled." << endl;
+ cout << "Error: journal files are present in journal directory, yet starting without --journal enabled." << endl;
cout << "It is recommended that you start with journaling enabled so that recovery may occur." << endl;
cout << "Alternatively (not recommended), you can backup everything, then delete the journal files, and run --repair" << endl;
cout << "**************" << endl;
- uasserted(13618, "can't start without --dur enabled when journal/ files are present");
+ uasserted(13618, "can't start without --journal enabled when journal/ files are present");
}
}
#endif
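
A simplified sketch of the POSIX branch of acquirePathLock() above (error text, the old-file and journal checks, and flushMyDirectory are omitted): create-or-open mongod.lock, take a non-blocking exclusive flock so a second mongod on the same dbpath fails fast, then record the pid. Because a clean shutdown truncates the file, non-empty content at the next startup is what signals an unclean shutdown.

    #include <fcntl.h>
    #include <stdio.h>
    #include <sys/file.h>
    #include <unistd.h>

    int acquireDirLock(const char* lockPath) {    // e.g. "/data/db/mongod.lock"
        int fd = open(lockPath, O_RDWR | O_CREAT, 0644);
        if (fd < 0)
            return -1;                            // cannot create/open lock file
        if (flock(fd, LOCK_EX | LOCK_NB) != 0) {  // held by a running mongod
            close(fd);
            return -1;
        }
        // The real code inspects old content first; leftovers mean the
        // previous run did not truncate the file on clean shutdown.
        if (ftruncate(fd, 0) != 0) { close(fd); return -1; }
        char buf[32];
        int len = snprintf(buf, sizeof(buf), "%d\n", (int)getpid());
        if (write(fd, buf, len) != len) { close(fd); return -1; }
        fsync(fd);
        return fd;                                // keep open for process lifetime
    }
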
diff --git a/db/instance.h b/db/instance.h
index 2516aec..422c77d 100644
--- a/db/instance.h
+++ b/db/instance.h
@@ -103,7 +103,7 @@ namespace mongo {
~DbResponse() { delete response; }
};
- void assembleResponse( Message &m, DbResponse &dbresponse, const SockAddr &client = unknownAddress );
+ void assembleResponse( Message &m, DbResponse &dbresponse, const HostAndPort &client );
void getDatabaseNames( vector< string > &names , const string& usePath = dbpath );
@@ -130,7 +130,7 @@ namespace mongo {
return "localhost"; // TODO: should this have the port?
}
virtual bool call( Message &toSend, Message &response, bool assertOk=true , string * actualServer = 0 );
- virtual void say( Message &toSend );
+ virtual void say( Message &toSend, bool isRetry = false );
virtual void sayPiggyBack( Message &toSend ) {
// don't need to piggy back when connected locally
return say( toSend );
@@ -145,13 +145,19 @@ namespace mongo {
virtual unsigned long long count(const string &ns, const BSONObj& query = BSONObj(), int options=0, int limit=0, int skip=0 );
virtual ConnectionString::ConnectionType type() const { return ConnectionString::MASTER; }
+
+ double getSoTimeout() const { return 0; }
+
+ virtual bool lazySupported() const { return true; }
+ private:
+ static HostAndPort _clientHost;
};
extern int lockFile;
-#ifdef WIN32
+#ifdef _WIN32
extern HANDLE lockFileHandle;
#endif
- void acquirePathLock();
+ void acquirePathLock(bool doingRepair=false); // if doingRepair=true don't consider unclean shutdown an error
void maybeCreatePidFile();
} // namespace mongo
diff --git a/db/introspect.cpp b/db/introspect.cpp
index cee0da8..7e1d19c 100644
--- a/db/introspect.cpp
+++ b/db/introspect.cpp
@@ -23,17 +23,66 @@
#include "pdfile.h"
#include "jsobj.h"
#include "pdfile.h"
+#include "curop.h"
namespace mongo {
- void profile( const char *str, int millis) {
- BSONObjBuilder b;
+ BufBuilder profileBufBuilder; // reused, instead of allocated every time - avoids a malloc/free cycle
+
+ void profile( const Client& c , CurOp& currentOp ) {
+ assertInWriteLock();
+
+ Database *db = c.database();
+ DEV assert( db );
+ const char *ns = db->profileName.c_str();
+
+ // build object
+ profileBufBuilder.reset();
+ BSONObjBuilder b(profileBufBuilder);
b.appendDate("ts", jsTime());
- b.append("info", str);
- b.append("millis", (double) millis);
+ currentOp.debug().append( currentOp , b );
+
+ b.append("client", c.clientAddress() );
+
+ if ( c.getAuthenticationInfo() )
+ b.append( "user" , c.getAuthenticationInfo()->getUser( nsToDatabase( ns ) ) );
+
BSONObj p = b.done();
- theDataFileMgr.insert(cc().database()->profileName.c_str(),
- p.objdata(), p.objsize(), true);
+
+ if (p.objsize() > 100*1024){
+ string small = p.toString(/*isArray*/false, /*full*/false);
+
+ warning() << "can't add full line to system.profile: " << small;
+
+ // rebuild with limited info
+ BSONObjBuilder b(profileBufBuilder);
+ b.appendDate("ts", jsTime());
+ b.append("client", c.clientAddress() );
+ if ( c.getAuthenticationInfo() )
+ b.append( "user" , c.getAuthenticationInfo()->getUser( nsToDatabase( ns ) ) );
+
+ b.append("err", "profile line too large (max is 100KB)");
+ if (small.size() < 100*1024){ // should be much smaller but if not don't break anything
+ b.append("abbreviated", small);
+ }
+
+ p = b.done();
+ }
+
+ // write: not replicated
+ NamespaceDetails *d = db->namespaceIndex.details(ns);
+ if( d ) {
+ int len = p.objsize();
+ Record *r = theDataFileMgr.fast_oplog_insert(d, ns, len);
+ memcpy(getDur().writingPtr(r->data, len), p.objdata(), len);
+ }
+ else {
+ static time_t last;
+ if( time(0) > last+10 ) {
+ log() << "profile: warning ns " << ns << " does not exist" << endl;
+ last = time(0);
+ }
+ }
}
} // namespace mongo
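
The rewritten profile() caps entries at 100KB and falls back to an abbreviated record with an "err" field rather than silently dropping the op. Sketch of that fallback decision with illustrative types (the real code rebuilds a BSONObj in the shared buffer):

    #include <string>

    struct ProfileEntry {
        std::string err;
        std::string body;                             // full or abbreviated op info
    };

    ProfileEntry capEntry(const std::string& fullBody) {
        const std::string::size_type kMaxBytes = 100 * 1024;
        ProfileEntry e;
        if (fullBody.size() <= kMaxBytes) {
            e.body = fullBody;                        // fits: store as-is
            return e;
        }
        e.err = "profile line too large (max is 100KB)";
        std::string small = fullBody.substr(0, 4096); // hypothetical abbreviation
        if (small.size() < kMaxBytes)                 // mirrors the guard above
            e.body = small;                           // keep something readable
        return e;
    }
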
diff --git a/db/introspect.h b/db/introspect.h
index 3f6ef60..209eeac 100644
--- a/db/introspect.h
+++ b/db/introspect.h
@@ -29,7 +29,6 @@ namespace mongo {
do when database->profile is set
*/
- void profile(const char *str,
- int millis);
+ void profile( const Client& c , CurOp& currentOp );
} // namespace mongo
diff --git a/db/jsobj.cpp b/db/jsobj.cpp
index 25ab8a8..dcb7744 100644
--- a/db/jsobj.cpp
+++ b/db/jsobj.cpp
@@ -27,6 +27,7 @@
#include <limits>
#include "../util/unittest.h"
#include "../util/embedded_builder.h"
+#include "../util/stringutils.h"
#include "json.h"
#include "jsobjmanipulator.h"
#include "../util/optime.h"
@@ -44,7 +45,7 @@ BOOST_STATIC_ASSERT( sizeof(mongo::OID) == 12 );
namespace mongo {
- BSONElement nullElement;
+ BSONElement eooElement;
GENOIDLabeler GENOID;
@@ -53,52 +54,11 @@ namespace mongo {
MinKeyLabeler MINKEY;
MaxKeyLabeler MAXKEY;
- string escape( string s , bool escape_slash=false) {
- StringBuilder ret;
- for ( string::iterator i = s.begin(); i != s.end(); ++i ) {
- switch ( *i ) {
- case '"':
- ret << "\\\"";
- break;
- case '\\':
- ret << "\\\\";
- break;
- case '/':
- ret << (escape_slash ? "\\/" : "/");
- break;
- case '\b':
- ret << "\\b";
- break;
- case '\f':
- ret << "\\f";
- break;
- case '\n':
- ret << "\\n";
- break;
- case '\r':
- ret << "\\r";
- break;
- case '\t':
- ret << "\\t";
- break;
- default:
- if ( *i >= 0 && *i <= 0x1f ) {
- //TODO: these should be utf16 code-units not bytes
- char c = *i;
- ret << "\\u00" << toHexLower(&c, 1);
- }
- else {
- ret << *i;
- }
- }
- }
- return ret.str();
- }
-
- string BSONElement::jsonString( JsonStringFormat format, bool includeFieldNames, int pretty ) const {
+ // need to move to bson/, but has dependency on base64 so move that to bson/util/ first.
+ inline string BSONElement::jsonString( JsonStringFormat format, bool includeFieldNames, int pretty ) const {
BSONType t = type();
if ( t == Undefined )
- return "";
+ return "undefined";
stringstream s;
if ( includeFieldNames )
@@ -142,19 +102,28 @@ namespace mongo {
s << "[ ";
BSONObjIterator i( embeddedObject() );
BSONElement e = i.next();
- if ( !e.eoo() )
+ if ( !e.eoo() ) {
+ int count = 0;
while ( 1 ) {
if( pretty ) {
s << '\n';
for( int x = 0; x < pretty; x++ )
s << " ";
}
- s << e.jsonString( format, false, pretty?pretty+1:0 );
- e = i.next();
+
+ if (strtol(e.fieldName(), 0, 10) > count) {
+ s << "undefined";
+ }
+ else {
+ s << e.jsonString( format, false, pretty?pretty+1:0 );
+ e = i.next();
+ }
+ count++;
if ( e.eoo() )
break;
s << ", ";
}
+ }
s << " ]";
break;
}
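
The array branch above exploits the fact that BSON array field names are decimal indices: when the next element's name is greater than the running count, the missing index is printed as "undefined". A self-contained illustration of the same loop:

    #include <cstddef>
    #include <cstdlib>
    #include <iostream>
    #include <string>
    #include <utility>
    #include <vector>

    // An element is (fieldName, rendered value); sparse arrays skip names.
    typedef std::pair<std::string, std::string> Elem;

    void printSparseArray(const std::vector<Elem>& elems) {
        std::cout << "[ ";
        long count = 0;
        std::size_t i = 0;
        while (i < elems.size()) {
            if (std::strtol(elems[i].first.c_str(), 0, 10) > count) {
                std::cout << "undefined";    // hole: this index was skipped
            }
            else {
                std::cout << elems[i].second; // real element; consume it
                ++i;
            }
            ++count;
            if (i < elems.size())
                std::cout << ", ";
        }
        std::cout << " ]" << std::endl;
    }

    int main() {
        std::vector<Elem> a;
        a.push_back(Elem("0", "1"));
        a.push_back(Elem("2", "3"));          // index 1 is missing
        printSparseArray(a);                  // prints: [ 1, undefined, 3 ]
        return 0;
    }
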
@@ -250,7 +219,6 @@ namespace mongo {
}
}
-
case Code:
s << _asCode();
break;
@@ -328,124 +296,6 @@ namespace mongo {
return def;
}
- /* wo = "well ordered" */
- int BSONElement::woCompare( const BSONElement &e,
- bool considerFieldName ) const {
- int lt = (int) canonicalType();
- int rt = (int) e.canonicalType();
- int x = lt - rt;
- if( x != 0 && (!isNumber() || !e.isNumber()) )
- return x;
- if ( considerFieldName ) {
- x = strcmp(fieldName(), e.fieldName());
- if ( x != 0 )
- return x;
- }
- x = compareElementValues(*this, e);
- return x;
- }
-
- /* must be same type when called, unless both sides are #s
- */
- int compareElementValues(const BSONElement& l, const BSONElement& r) {
- int f;
- double x;
-
- switch ( l.type() ) {
- case EOO:
- case Undefined:
- case jstNULL:
- case MaxKey:
- case MinKey:
- f = l.canonicalType() - r.canonicalType();
- if ( f<0 ) return -1;
- return f==0 ? 0 : 1;
- case Bool:
- return *l.value() - *r.value();
- case Timestamp:
- case Date:
- if ( l.date() < r.date() )
- return -1;
- return l.date() == r.date() ? 0 : 1;
- case NumberLong:
- if( r.type() == NumberLong ) {
- long long L = l._numberLong();
- long long R = r._numberLong();
- if( L < R ) return -1;
- if( L == R ) return 0;
- return 1;
- }
- // else fall through
- case NumberInt:
- case NumberDouble: {
- double left = l.number();
- double right = r.number();
- bool lNan = !( left <= numeric_limits< double >::max() &&
- left >= -numeric_limits< double >::max() );
- bool rNan = !( right <= numeric_limits< double >::max() &&
- right >= -numeric_limits< double >::max() );
- if ( lNan ) {
- if ( rNan ) {
- return 0;
- }
- else {
- return -1;
- }
- }
- else if ( rNan ) {
- return 1;
- }
- x = left - right;
- if ( x < 0 ) return -1;
- return x == 0 ? 0 : 1;
- }
- case jstOID:
- return memcmp(l.value(), r.value(), 12);
- case Code:
- case Symbol:
- case String:
- /* todo: utf version */
- return strcmp(l.valuestr(), r.valuestr());
- case Object:
- case Array:
- return l.embeddedObject().woCompare( r.embeddedObject() );
- case DBRef: {
- int lsz = l.valuesize();
- int rsz = r.valuesize();
- if ( lsz - rsz != 0 ) return lsz - rsz;
- return memcmp(l.value(), r.value(), lsz);
- }
- case BinData: {
- int lsz = l.objsize(); // our bin data size in bytes, not including the subtype byte
- int rsz = r.objsize();
- if ( lsz - rsz != 0 ) return lsz - rsz;
- return memcmp(l.value()+4, r.value()+4, lsz+1);
- }
- case RegEx: {
- int c = strcmp(l.regex(), r.regex());
- if ( c )
- return c;
- return strcmp(l.regexFlags(), r.regexFlags());
- }
- case CodeWScope : {
- f = l.canonicalType() - r.canonicalType();
- if ( f )
- return f;
- f = strcmp( l.codeWScopeCode() , r.codeWScopeCode() );
- if ( f )
- return f;
- f = strcmp( l.codeWScopeScopeData() , r.codeWScopeScopeData() );
- if ( f )
- return f;
- return 0;
- }
- default:
- out() << "compareElementValues: bad type " << (int) l.type() << endl;
- assert(false);
- }
- return -1;
- }
-
/* Matcher --------------------------------------*/
// If the element is something like:
@@ -658,6 +508,12 @@ namespace mongo {
}
BSONObj staticNull = fromjson( "{'':null}" );
+ BSONObj makeUndefined() {
+ BSONObjBuilder b;
+ b.appendUndefined( "" );
+ return b.obj();
+ }
+ BSONObj staticUndefined = makeUndefined();
/* well ordered compare */
int BSONObj::woSortOrder(const BSONObj& other, const BSONObj& sortKey , bool useDotted ) const {
@@ -690,17 +546,19 @@ namespace mongo {
return -1;
}
- void BSONObj::getFieldsDotted(const StringData& name, BSONElementSet &ret ) const {
- BSONElement e = getField( name );
+ template <typename BSONElementColl>
+ void _getFieldsDotted( const BSONObj* obj, const StringData& name, BSONElementColl &ret, bool expandLastArray ) {
+ BSONElement e = obj->getField( name );
+
if ( e.eoo() ) {
const char *p = strchr(name.data(), '.');
if ( p ) {
string left(name.data(), p-name.data());
const char* next = p+1;
- BSONElement e = getField( left.c_str() );
+ BSONElement e = obj->getField( left.c_str() );
if (e.type() == Object) {
- e.embeddedObject().getFieldsDotted(next, ret);
+ e.embeddedObject().getFieldsDotted(next, ret, expandLastArray );
}
else if (e.type() == Array) {
bool allDigits = false;
@@ -711,14 +569,14 @@ namespace mongo {
allDigits = (*temp == '.' || *temp == '\0');
}
if (allDigits) {
- e.embeddedObject().getFieldsDotted(next, ret);
+ e.embeddedObject().getFieldsDotted(next, ret, expandLastArray );
}
else {
BSONObjIterator i(e.embeddedObject());
while ( i.more() ) {
BSONElement e2 = i.next();
if (e2.type() == Object || e2.type() == Array)
- e2.embeddedObject().getFieldsDotted(next, ret);
+ e2.embeddedObject().getFieldsDotted(next, ret, expandLastArray );
}
}
}
@@ -728,7 +586,7 @@ namespace mongo {
}
}
else {
- if (e.type() == Array) {
+ if (e.type() == Array && expandLastArray) {
BSONObjIterator i(e.embeddedObject());
while ( i.more() )
ret.insert(i.next());
@@ -739,9 +597,16 @@ namespace mongo {
}
}
+ void BSONObj::getFieldsDotted(const StringData& name, BSONElementSet &ret, bool expandLastArray ) const {
+ _getFieldsDotted( this, name, ret, expandLastArray );
+ }
+ void BSONObj::getFieldsDotted(const StringData& name, BSONElementMSet &ret, bool expandLastArray ) const {
+ _getFieldsDotted( this, name, ret, expandLastArray );
+ }
+
BSONElement BSONObj::getFieldDottedOrArray(const char *&name) const {
const char *p = strchr(name, '.');
-
+
BSONElement sub;
if ( p ) {
@@ -754,13 +619,13 @@ namespace mongo {
}
if ( sub.eoo() )
- return nullElement;
- else if ( sub.type() == Array || name[0] == '\0')
+ return eooElement;
+ else if ( sub.type() == Array || name[0] == '\0' )
return sub;
else if ( sub.type() == Object )
return sub.embeddedObject().getFieldDottedOrArray( name );
else
- return nullElement;
+ return eooElement;
}
/**
@@ -837,21 +702,6 @@ namespace mongo {
return BSONElement();
}
- int BSONObj::getIntField(const char *name) const {
- BSONElement e = getField(name);
- return e.isNumber() ? (int) e.number() : INT_MIN;
- }
-
- bool BSONObj::getBoolField(const char *name) const {
- BSONElement e = getField(name);
- return e.type() == Bool ? e.boolean() : false;
- }
-
- const char * BSONObj::getStringField(const char *name) const {
- BSONElement e = getField(name);
- return e.type() == String ? e.valuestr() : "";
- }
-
/* grab names of all the fields in this object */
int BSONObj::getFieldNames(set<string>& fields) const {
int n = 0;
@@ -897,8 +747,7 @@ namespace mongo {
}
if ( n ) {
- int len;
- init( b.decouple(len), true );
+ *this = b.obj();
}
return n;
@@ -997,22 +846,6 @@ namespace mongo {
}
}
- string BSONObj::hexDump() const {
- stringstream ss;
- const char *d = objdata();
- int size = objsize();
- for( int i = 0; i < size; ++i ) {
- ss.width( 2 );
- ss.fill( '0' );
- ss << hex << (unsigned)(unsigned char)( d[ i ] ) << dec;
- if ( ( d[ i ] >= '0' && d[ i ] <= '9' ) || ( d[ i ] >= 'A' && d[ i ] <= 'z' ) )
- ss << '\'' << d[ i ] << '\'';
- if ( i != size - 1 )
- ss << ' ';
- }
- return ss.str();
- }
-
void nested2dotted(BSONObjBuilder& b, const BSONObj& obj, const string& base) {
BSONObjIterator it(obj);
while (it.more()) {
@@ -1092,7 +925,7 @@ namespace mongo {
c.appendRegex("x", "goo");
BSONObj p = c.done();
- assert( !o.woEqual( p ) );
+ assert( !o.binaryEqual( p ) );
assert( o.woCompare( p ) < 0 );
}
@@ -1197,7 +1030,7 @@ namespace mongo {
BSONObj a = A.done();
BSONObj b = B.done();
BSONObj c = C.done();
- assert( !a.woEqual( b ) ); // comments on operator==
+ assert( !a.binaryEqual( b ) ); // comments on operator==
int cmp = a.woCompare(b);
assert( cmp == 0 );
cmp = a.woCompare(c);
@@ -1215,105 +1048,154 @@ namespace mongo {
Labeler::Label NE( "$ne" );
Labeler::Label SIZE( "$size" );
- void BSONElementManipulator::initTimestamp() {
- massert( 10332 , "Expected CurrentTime type", _element.type() == Timestamp );
- unsigned long long &timestamp = *( reinterpret_cast< unsigned long long* >( value() ) );
- if ( timestamp == 0 )
- timestamp = OpTime::now().asDate();
- }
-
void BSONObjBuilder::appendMinForType( const StringData& fieldName , int t ) {
switch ( t ) {
- case MinKey: appendMinKey( fieldName ); return;
- case MaxKey: appendMinKey( fieldName ); return;
+
+ // Shared canonical types
case NumberInt:
case NumberDouble:
case NumberLong:
append( fieldName , - numeric_limits<double>::max() ); return;
+ case Symbol:
+ case String:
+ append( fieldName , "" ); return;
+ case Date:
+ // min varies with V0 and V1 indexes, so we go one type lower.
+ appendBool(fieldName, true);
+ //appendDate( fieldName , numeric_limits<long long>::min() );
+ return;
+ case Timestamp: // TODO integrate with Date SERVER-3304
+ appendTimestamp( fieldName , 0 ); return;
+ case Undefined: // shared with EOO
+ appendUndefined( fieldName ); return;
+
+ // Separate canonical types
+ case MinKey:
+ appendMinKey( fieldName ); return;
+ case MaxKey:
+ appendMaxKey( fieldName ); return;
case jstOID: {
OID o;
memset(&o, 0, sizeof(o));
appendOID( fieldName , &o);
return;
}
- case Bool: appendBool( fieldName , false); return;
- case Date: appendDate( fieldName , 0); return;
- case jstNULL: appendNull( fieldName ); return;
- case Symbol:
- case String: append( fieldName , "" ); return;
- case Object: append( fieldName , BSONObj() ); return;
+ case Bool:
+ appendBool( fieldName , false); return;
+ case jstNULL:
+ appendNull( fieldName ); return;
+ case Object:
+ append( fieldName , BSONObj() ); return;
case Array:
appendArray( fieldName , BSONObj() ); return;
case BinData:
- appendBinData( fieldName , 0 , Function , (const char *) 0 ); return;
- case Undefined:
- appendUndefined( fieldName ); return;
- case RegEx: appendRegex( fieldName , "" ); return;
+ appendBinData( fieldName , 0 , BinDataGeneral , (const char *) 0 ); return;
+ case RegEx:
+ appendRegex( fieldName , "" ); return;
case DBRef: {
OID o;
memset(&o, 0, sizeof(o));
appendDBRef( fieldName , "" , o );
return;
}
- case Code: appendCode( fieldName , "" ); return;
- case CodeWScope: appendCodeWScope( fieldName , "" , BSONObj() ); return;
- case Timestamp: appendTimestamp( fieldName , 0); return;
-
+ case Code:
+ appendCode( fieldName , "" ); return;
+ case CodeWScope:
+ appendCodeWScope( fieldName , "" , BSONObj() ); return;
};
- log() << "type not support for appendMinElementForType: " << t << endl;
+ log() << "type not supported for appendMinElementForType: " << t << endl;
uassert( 10061 , "type not supported for appendMinElementForType" , false );
}
void BSONObjBuilder::appendMaxForType( const StringData& fieldName , int t ) {
switch ( t ) {
- case MinKey: appendMaxKey( fieldName ); break;
- case MaxKey: appendMaxKey( fieldName ); break;
+
+ // Shared canonical types
case NumberInt:
case NumberDouble:
case NumberLong:
- append( fieldName , numeric_limits<double>::max() );
- break;
- case BinData:
- appendMinForType( fieldName , jstOID );
- break;
+ append( fieldName , numeric_limits<double>::max() ); return;
+ case Symbol:
+ case String:
+ appendMinForType( fieldName, Object ); return;
+ case Date:
+ appendDate( fieldName , numeric_limits<long long>::max() ); return;
+ case Timestamp: // TODO integrate with Date SERVER-3304
+ appendTimestamp( fieldName , numeric_limits<unsigned long long>::max() ); return;
+ case Undefined: // shared with EOO
+ appendUndefined( fieldName ); return;
+
+ // Separate canonical types
+ case MinKey:
+ appendMinKey( fieldName ); return;
+ case MaxKey:
+ appendMaxKey( fieldName ); return;
case jstOID: {
OID o;
memset(&o, 0xFF, sizeof(o));
appendOID( fieldName , &o);
- break;
+ return;
}
- case Undefined:
+ case Bool:
+ appendBool( fieldName , true ); return;
case jstNULL:
- appendMinForType( fieldName , NumberInt );
- case Bool: appendBool( fieldName , true); break;
- case Date: appendDate( fieldName , 0xFFFFFFFFFFFFFFFFLL ); break;
- case Symbol:
- case String: append( fieldName , BSONObj() ); break;
+ appendNull( fieldName ); return;
+ case Object:
+ appendMinForType( fieldName, Array ); return;
+ case Array:
+ appendMinForType( fieldName, BinData ); return;
+ case BinData:
+ appendMinForType( fieldName, jstOID ); return;
+ case RegEx:
+ appendMinForType( fieldName, DBRef ); return;
+ case DBRef:
+ appendMinForType( fieldName, Code ); return;
case Code:
+ appendMinForType( fieldName, CodeWScope ); return;
case CodeWScope:
- appendCodeWScope( fieldName , "ZZZ" , BSONObj() ); break;
- case Timestamp:
- appendTimestamp( fieldName , numeric_limits<unsigned long long>::max() ); break;
- default:
- appendMinForType( fieldName , t + 1 );
+ // This upper bound may change if a new bson type is added.
+ appendMinForType( fieldName , MaxKey ); return;
}
+ log() << "type not supported for appendMaxElementForType: " << t << endl;
+ uassert( 14853 , "type not supported for appendMaxElementForType" , false );
+ }
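
The rewritten appendMaxForType makes an invariant explicit: the maximum of one canonical type is defined as the minimum of the next type in canonical order, so per-type key ranges tile the key space without gaps. A toy restatement (the enum order here is illustrative, not the full BSON canonical order):

    #include <cassert>

    // Toy canonical order; the real ordering lives in canonicalType().
    enum Canon { CNull, CNumber, CString, CObject, CArray, CBinData, CMaxKey };

    Canon minFor(Canon t) { return t; }      // each type owns its own minimum
    Canon maxFor(Canon t) {                  // ...and borrows the next type's min
        return t == CMaxKey ? CMaxKey
                            : minFor(static_cast<Canon>(static_cast<int>(t) + 1));
    }

    int main() {
        assert(maxFor(CObject) == minFor(CArray));   // cf. appendMinForType(fieldName, Array)
        assert(maxFor(CArray) == minFor(CBinData));  // cf. appendMinForType(fieldName, BinData)
        return 0;
    }
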
+
+ int BSONElementFieldSorter( const void * a , const void * b ) {
+ const char * x = *((const char**)a);
+ const char * y = *((const char**)b);
+ x++; y++;
+ return lexNumCmp( x , y );
}
- const string BSONObjBuilder::numStrs[] = {
- "0", "1", "2", "3", "4", "5", "6", "7", "8", "9",
- "10", "11", "12", "13", "14", "15", "16", "17", "18", "19",
- "20", "21", "22", "23", "24", "25", "26", "27", "28", "29",
- "30", "31", "32", "33", "34", "35", "36", "37", "38", "39",
- "40", "41", "42", "43", "44", "45", "46", "47", "48", "49",
- "50", "51", "52", "53", "54", "55", "56", "57", "58", "59",
- "60", "61", "62", "63", "64", "65", "66", "67", "68", "69",
- "70", "71", "72", "73", "74", "75", "76", "77", "78", "79",
- "80", "81", "82", "83", "84", "85", "86", "87", "88", "89",
- "90", "91", "92", "93", "94", "95", "96", "97", "98", "99",
- };
+ bool fieldsMatch(const BSONObj& lhs, const BSONObj& rhs) {
+ BSONObjIterator l(lhs);
+ BSONObjIterator r(rhs);
+
+ while (l.more() && r.more()){
+ if (strcmp(l.next().fieldName(), r.next().fieldName())) {
+ return false;
+ }
+ }
+
+ return !(l.more() || r.more()); // false if lhs and rhs have diff nFields()
+ }
+
+ BSONObjIteratorSorted::BSONObjIteratorSorted( const BSONObj& o ) {
+ _nfields = o.nFields();
+ _fields = new const char*[_nfields];
+ int x = 0;
+ BSONObjIterator i( o );
+ while ( i.more() ) {
+ _fields[x++] = i.next().rawdata();
+ assert( _fields[x-1] );
+ }
+ assert( x == _nfields );
+ qsort( _fields , _nfields , sizeof(char*) , BSONElementFieldSorter );
+ _cur = 0;
+ }
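
BSONObjIteratorSorted above sorts raw field pointers with lexNumCmp, which orders embedded digit runs numerically so that "2" sorts before "10". A heavily simplified comparator in the same spirit (the real lexNumCmp compares mixed text/number segments in place; zero-padding here is just an easy way to get a valid ordering):

    #include <algorithm>
    #include <cstddef>
    #include <iostream>
    #include <string>
    #include <vector>

    // Left-pad all-digit names so lexicographic order matches numeric order:
    // "2" -> "0000000002" sorts before "10" -> "0000000010".
    std::string sortKey(const std::string& s) {
        bool allDigits = !s.empty() &&
            s.find_first_not_of("0123456789") == std::string::npos;
        if (allDigits && s.size() < 10)
            return std::string(10 - s.size(), '0') + s;
        return s;
    }

    bool lexNumLess(const std::string& a, const std::string& b) {
        return sortKey(a) < sortKey(b);
    }

    int main() {
        std::vector<std::string> names;
        names.push_back("10"); names.push_back("2"); names.push_back("a");
        std::sort(names.begin(), names.end(), lexNumLess);
        for (std::size_t i = 0; i < names.size(); ++i)
            std::cout << names[i] << ' ';    // prints: 2 10 a
        return 0;
    }
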
bool BSONObjBuilder::appendAsNumber( const StringData& fieldName , const string& data ) {
- if ( data.size() == 0 || data == "-")
+ if ( data.size() == 0 || data == "-" || data == ".")
return false;
unsigned int pos=0;
@@ -1355,63 +1237,6 @@ namespace mongo {
catch(bad_lexical_cast &) {
return false;
}
-
- }
-
- void BSONObjBuilder::appendKeys( const BSONObj& keyPattern , const BSONObj& values ) {
- BSONObjIterator i(keyPattern);
- BSONObjIterator j(values);
-
- while ( i.more() && j.more() ) {
- appendAs( j.next() , i.next().fieldName() );
- }
-
- assert( ! i.more() );
- assert( ! j.more() );
- }
-
- int BSONElementFieldSorter( const void * a , const void * b ) {
- const char * x = *((const char**)a);
- const char * y = *((const char**)b);
- x++; y++;
- return lexNumCmp( x , y );
- }
-
- BSONObjIteratorSorted::BSONObjIteratorSorted( const BSONObj& o ) {
- _nfields = o.nFields();
- _fields = new const char*[_nfields];
- int x = 0;
- BSONObjIterator i( o );
- while ( i.more() ) {
- _fields[x++] = i.next().rawdata();
- assert( _fields[x-1] );
- }
- assert( x == _nfields );
- qsort( _fields , _nfields , sizeof(char*) , BSONElementFieldSorter );
- _cur = 0;
- }
-
- /** transform a BSON array into a vector of BSONElements.
- we match array # positions with their vector position, and ignore
- any fields with non-numeric field names.
- */
- vector<BSONElement> BSONElement::Array() const {
- chk(mongo::Array);
- vector<BSONElement> v;
- BSONObjIterator i(Obj());
- while( i.more() ) {
- BSONElement e = i.next();
- const char *f = e.fieldName();
- try {
- unsigned u = stringToNum(f);
- assert( u < 1000000 );
- if( u >= v.size() )
- v.resize(u+1);
- v[u] = e;
- }
- catch(unsigned) { }
- }
- return v;
}
} // namespace mongo
diff --git a/db/json.cpp b/db/json.cpp
index 4a6fad8..b89ff32 100644
--- a/db/json.cpp
+++ b/db/json.cpp
@@ -258,8 +258,12 @@ namespace mongo {
struct numberValue {
numberValue( ObjectBuilder &_b ) : b( _b ) {}
- void operator() ( double d ) const {
- b.back()->append( b.fieldName(), d );
+ void operator() ( const char *start, const char *end ) const {
+ // We re-parse the numeric string here because spirit parsing of strings
+ // to doubles produces different results from strtod in some cases and
+ // we want to use strtod to ensure consistency with other string to
+ // double conversions in our code.
+ b.back()->append( b.fieldName(), strtod( start, 0 ) );
}
ObjectBuilder &b;
};
@@ -315,6 +319,14 @@ namespace mongo {
ObjectBuilder &b;
};
+ struct undefinedValue {
+ undefinedValue( ObjectBuilder &_b ) : b( _b ) {}
+ void operator() ( const char *start, const char *end ) const {
+ b.back()->appendUndefined( b.fieldName() );
+ }
+ ObjectBuilder &b;
+ };
+
struct dbrefNS {
dbrefNS( ObjectBuilder &_b ) : b( _b ) {}
void operator() ( const char *start, const char *end ) const {
@@ -454,12 +466,13 @@ namespace mongo {
elements = list_p(value, ch_p(',')[arrayNext( self.b )]);
value =
str[ stringEnd( self.b ) ] |
- number |
+ number[ numberValue( self.b ) ] |
integer |
array[ arrayEnd( self.b ) ] |
lexeme_d[ str_p( "true" ) ][ trueValue( self.b ) ] |
lexeme_d[ str_p( "false" ) ][ falseValue( self.b ) ] |
lexeme_d[ str_p( "null" ) ][ nullValue( self.b ) ] |
+ lexeme_d[ str_p( "undefined" ) ][ undefinedValue( self.b ) ] |
singleQuoteStr[ stringEnd( self.b ) ] |
date[ dateEnd( self.b ) ] |
oid[ oidEnd( self.b ) ] |
@@ -501,7 +514,7 @@ namespace mongo {
// real_p accepts numbers with nonsignificant zero prefixes, which
// aren't allowed in JSON. Oh well.
- number = strict_real_p[ numberValue( self.b ) ];
+ number = strict_real_p;
static int_parser<long long, 10, 1, numeric_limits<long long>::digits10 + 1> long_long_p;
integer = long_long_p[ intValue(self.b) ];
diff --git a/db/key.cpp b/db/key.cpp
new file mode 100644
index 0000000..011eea1
--- /dev/null
+++ b/db/key.cpp
@@ -0,0 +1,671 @@
+// @file key.cpp
+
+/**
+* Copyright (C) 2011 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+#include "key.h"
+#include "../util/unittest.h"
+
+namespace mongo {
+
+ extern const Ordering nullOrdering = Ordering::make(BSONObj());
+
+ // KeyBson is for V0 (version #0) indexes
+
+ int oldCompare(const BSONObj& l,const BSONObj& r, const Ordering &o);
+
+ // "old" = pre signed dates & such; i.e. btree V0
+ /* must be same canon type when called */
+ int oldCompareElementValues(const BSONElement& l, const BSONElement& r) {
+ dassert( l.canonicalType() == r.canonicalType() );
+ int f;
+ double x;
+
+ switch ( l.type() ) {
+ case EOO:
+ case Undefined: // EOO and Undefined are same canonicalType
+ case jstNULL:
+ case MaxKey:
+ case MinKey:
+ return 0;
+ case Bool:
+ return *l.value() - *r.value();
+ case Timestamp:
+ case Date:
+ // unsigned dates for old version
+ if ( l.date() < r.date() )
+ return -1;
+ return l.date() == r.date() ? 0 : 1;
+ case NumberLong:
+ if( r.type() == NumberLong ) {
+ long long L = l._numberLong();
+ long long R = r._numberLong();
+ if( L < R ) return -1;
+ if( L == R ) return 0;
+ return 1;
+ }
+ // else fall through
+ case NumberInt:
+ case NumberDouble: {
+ double left = l.number();
+ double right = r.number();
+ bool lNan = !( left <= numeric_limits< double >::max() &&
+ left >= -numeric_limits< double >::max() );
+ bool rNan = !( right <= numeric_limits< double >::max() &&
+ right >= -numeric_limits< double >::max() );
+ if ( lNan ) {
+ if ( rNan ) {
+ return 0;
+ }
+ else {
+ return -1;
+ }
+ }
+ else if ( rNan ) {
+ return 1;
+ }
+ x = left - right;
+ if ( x < 0 ) return -1;
+ return x == 0 ? 0 : 1;
+ }
+ case jstOID:
+ return memcmp(l.value(), r.value(), 12);
+ case Code:
+ case Symbol:
+ case String:
+ // nulls not allowed in the middle of strings in the old version
+ return strcmp(l.valuestr(), r.valuestr());
+ case Object:
+ case Array:
+ return oldCompare(l.embeddedObject(), r.embeddedObject(), nullOrdering);
+ case DBRef: {
+ int lsz = l.valuesize();
+ int rsz = r.valuesize();
+ if ( lsz - rsz != 0 ) return lsz - rsz;
+ return memcmp(l.value(), r.value(), lsz);
+ }
+ case BinData: {
+ int lsz = l.objsize(); // our bin data size in bytes, not including the subtype byte
+ int rsz = r.objsize();
+ if ( lsz - rsz != 0 ) return lsz - rsz;
+ return memcmp(l.value()+4, r.value()+4, lsz+1);
+ }
+ case RegEx: {
+ int c = strcmp(l.regex(), r.regex());
+ if ( c )
+ return c;
+ return strcmp(l.regexFlags(), r.regexFlags());
+ }
+ case CodeWScope : {
+ f = l.canonicalType() - r.canonicalType();
+ if ( f )
+ return f;
+ f = strcmp( l.codeWScopeCode() , r.codeWScopeCode() );
+ if ( f )
+ return f;
+ f = strcmp( l.codeWScopeScopeData() , r.codeWScopeScopeData() );
+ if ( f )
+ return f;
+ return 0;
+ }
+ default:
+ out() << "oldCompareElementValues: bad type " << (int) l.type() << endl;
+ assert(false);
+ }
+ return -1;
+ }
+
+ int oldElemCompare(const BSONElement&l , const BSONElement& r) {
+ int lt = (int) l.canonicalType();
+ int rt = (int) r.canonicalType();
+ int x = lt - rt;
+ if( x )
+ return x;
+ return oldCompareElementValues(l, r);
+ }
+
+ // pre signed dates & such
+ int oldCompare(const BSONObj& l,const BSONObj& r, const Ordering &o) {
+ BSONObjIterator i(l);
+ BSONObjIterator j(r);
+ unsigned mask = 1;
+ while ( 1 ) {
+ // so far, equal...
+
+ BSONElement l = i.next();
+ BSONElement r = j.next();
+ if ( l.eoo() )
+ return r.eoo() ? 0 : -1;
+ if ( r.eoo() )
+ return 1;
+
+ int x;
+ {
+ x = oldElemCompare(l, r);
+ if( o.descending(mask) )
+ x = -x;
+ }
+ if ( x != 0 )
+ return x;
+ mask <<= 1;
+ }
+ return -1;
+ }
+
+ /* old style compares:
+ - dates are unsigned
+       - strings are compared with strcmp, so embedded nulls are not handled
+ */
+ int KeyBson::woCompare(const KeyBson& r, const Ordering &o) const {
+ return oldCompare(_o, r._o, o);
+ }
+
+ // woEqual could be made faster than woCompare but this is for backward compatibility so not worth a big effort
+ bool KeyBson::woEqual(const KeyBson& r) const {
+ return oldCompare(_o, r._o, nullOrdering) == 0;
+ }
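
One subtlety preserved in oldCompareElementValues above: the lNan/rNan range test catches every non-finite double, NaN and both infinities alike; all such values compare equal to each other and sort before any finite number. A standalone restatement of just that branch:

    #include <cassert>
    #include <limits>

    int compareNumbersOldStyle(double l, double r) {
        bool lNan = !(l <= std::numeric_limits<double>::max() &&
                      l >= -std::numeric_limits<double>::max());
        bool rNan = !(r <= std::numeric_limits<double>::max() &&
                      r >= -std::numeric_limits<double>::max());
        if (lNan)
            return rNan ? 0 : -1;    // non-finite == non-finite, < any finite
        if (rNan)
            return 1;
        double x = l - r;
        return x < 0 ? -1 : (x == 0 ? 0 : 1);
    }

    int main() {
        double nan = std::numeric_limits<double>::quiet_NaN();
        double inf = std::numeric_limits<double>::infinity();
        assert(compareNumbersOldStyle(nan, nan) == 0);
        assert(compareNumbersOldStyle(nan, inf) == 0);  // +inf fails the range test too
        assert(compareNumbersOldStyle(nan, -1e308) == -1);
        return 0;
    }
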
+
+ // [ ][HASMORE][x][y][canontype_4bits]
+ enum CanonicalsEtc {
+ cminkey=1,
+ cnull=2,
+ cdouble=4,
+ cstring=6,
+ cbindata=7,
+ coid=8,
+ cfalse=10,
+ ctrue=11,
+ cdate=12,
+ cmaxkey=14,
+ cCANONTYPEMASK = 0xf,
+ cY = 0x10,
+ cint = cY | cdouble,
+ cX = 0x20,
+ clong = cX | cdouble,
+ cHASMORE = 0x40,
+ cNOTUSED = 0x80 // but see IsBSON sentinel - this bit not usable without great care
+ };
+
+ // bindata bson type
+ const unsigned BinDataLenMask = 0xf0; // lengths are powers of 2 of this value
+ const unsigned BinDataTypeMask = 0x0f; // 0-7 as you would expect, 8-15 are 128+value. see BinDataType.
+ const int BinDataLenMax = 32;
+ const int BinDataLengthToCode[] = {
+ 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70,
+ 0x80, -1/*9*/, 0x90/*10*/, -1/*11*/, 0xa0/*12*/, -1/*13*/, 0xb0/*14*/, -1/*15*/,
+ 0xc0/*16*/, -1, -1, -1, 0xd0/*20*/, -1, -1, -1,
+ 0xe0/*24*/, -1, -1, -1, -1, -1, -1, -1,
+ 0xf0/*32*/
+ };
+ const int BinDataCodeToLength[] = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 10, 12, 14, 16, 20, 24, 32
+ };
+
+ int binDataCodeToLength(int codeByte) {
+ return BinDataCodeToLength[codeByte >> 4];
+ }
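
The tables above pack a BinData value's length and subtype into one byte: a 4-bit length code in the high nibble (only the listed lengths are representable) and the subtype in the low nibble, with subtypes 128-135 folded down into codes 8-15. A sketch of the packing and its inverse:

    #include <cassert>

    // Length codes live in the high nibble; this table inverts them.
    const int kCodeToLength[16] = { 0, 1, 2, 3, 4, 5, 6, 7,
                                    8, 10, 12, 14, 16, 20, 24, 32 };

    unsigned packBinData(unsigned lenCode, unsigned subtype) { // both 0-15 already
        return (lenCode << 4) | subtype;
    }
    int unpackLength(unsigned codeByte) {
        return kCodeToLength[codeByte >> 4];
    }
    int unpackSubtype(unsigned codeByte) {
        unsigned t = codeByte & 0x0f;
        return t >= 8 ? (int)(128 + (t - 8)) : (int)t;  // undoes the (t-128)|0x08 fold
    }

    int main() {
        unsigned byte = packBinData(9, 2);        // length code 9 => 10 bytes, subtype 2
        assert(unpackLength(byte) == 10);
        assert(unpackSubtype(byte) == 2);
        unsigned userByte = packBinData(4, 0x08); // low nibble 8 encodes subtype 128
        assert(unpackSubtype(userByte) == 128);
        return 0;
    }
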
+
+ /** object cannot be represented in compact format. so store in traditional bson format
+ with a leading sentinel byte IsBSON to indicate it's in that format.
+
+ Given that the KeyV1Owned constructor already grabbed a bufbuilder, we reuse it here
+ so that we don't have to do an extra malloc.
+ */
+ void KeyV1Owned::traditional(const BSONObj& obj) {
+ b.reset();
+ b.appendUChar(IsBSON);
+ b.appendBuf(obj.objdata(), obj.objsize());
+ _keyData = (const unsigned char *) b.buf();
+ }
+
+ // fromBSON to Key format
+ KeyV1Owned::KeyV1Owned(const BSONObj& obj) {
+ BSONObj::iterator i(obj);
+ unsigned char bits = 0;
+ while( 1 ) {
+ BSONElement e = i.next();
+ if( i.more() )
+ bits |= cHASMORE;
+ switch( e.type() ) {
+ case MinKey:
+ b.appendUChar(cminkey|bits);
+ break;
+ case jstNULL:
+ b.appendUChar(cnull|bits);
+ break;
+ case MaxKey:
+ b.appendUChar(cmaxkey|bits);
+ break;
+ case Bool:
+ b.appendUChar( (e.boolean()?ctrue:cfalse) | bits );
+ break;
+ case jstOID:
+ b.appendUChar(coid|bits);
+ b.appendBuf(&e.__oid(), sizeof(OID));
+ break;
+ case BinData:
+ {
+ int t = e.binDataType();
+ // 0-7 and 0x80 to 0x87 are supported by Key
+ if( (t & 0x78) == 0 && t != ByteArrayDeprecated ) {
+ int len;
+ const char * d = e.binData(len);
+ if( len <= BinDataLenMax ) {
+ int code = BinDataLengthToCode[len];
+ if( code >= 0 ) {
+ if( t >= 128 )
+ t = (t-128) | 0x08;
+ dassert( (code&t) == 0 );
+ b.appendUChar( cbindata|bits );
+ b.appendUChar( code | t );
+ b.appendBuf(d, len);
+ break;
+ }
+ }
+ }
+ traditional(obj);
+ return;
+ }
+ case Date:
+ b.appendUChar(cdate|bits);
+ b.appendStruct(e.date());
+ break;
+ case String:
+ {
+ b.appendUChar(cstring|bits);
+ // note we do not store the terminating null, to save space.
+ unsigned x = (unsigned) e.valuestrsize() - 1;
+ if( x > 255 ) {
+ traditional(obj);
+ return;
+ }
+ b.appendUChar(x);
+ b.appendBuf(e.valuestr(), x);
+ break;
+ }
+ case NumberInt:
+ b.appendUChar(cint|bits);
+ b.appendNum((double) e._numberInt());
+ break;
+ case NumberLong:
+ {
+ long long n = e._numberLong();
+                long long m = 2LL << 52; // 2^53: doubles lose integer precision at this magnitude
+ DEV {
+ long long d = m-1;
+ assert( ((long long) ((double) -d)) == -d );
+ }
+ if( n >= m || n <= -m ) {
+ // can't represent exactly as a double
+ traditional(obj);
+ return;
+ }
+ b.appendUChar(clong|bits);
+ b.appendNum((double) n);
+ break;
+ }
+ case NumberDouble:
+ {
+ double d = e._numberDouble();
+ if( isNaN(d) ) {
+ traditional(obj);
+ return;
+ }
+ b.appendUChar(cdouble|bits);
+ b.appendNum(d);
+ break;
+ }
+ default:
+ // if other types involved, store as traditional BSON
+ traditional(obj);
+ return;
+ }
+ if( !i.more() )
+ break;
+ bits = 0;
+ }
+ _keyData = (const unsigned char *) b.buf();
+        dassert( b.len() == dataSize() ); // check that the dataSize() method agrees with what we wrote
+ dassert( (*_keyData & cNOTUSED) == 0 );
+ }
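+
+    // Minimal usage sketch (illustrative only -- not called this way in this file):
+    //
+    //     KeyV1Owned a( BSON( "" << 1 ) );
+    //     KeyV1Owned b( BSON( "" << 2 ) );
+    //     Ordering asc = Ordering::make( BSON( "x" << 1 ) );
+    //     assert( a.woCompare( b, asc ) < 0 );
+    //     assert( a.toBson().firstElement().numberInt() == 1 ); // decodes losslessly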
+
+ BSONObj KeyV1::toBson() const {
+ assert( _keyData != 0 );
+ if( !isCompactFormat() )
+ return bson();
+
+ BSONObjBuilder b(512);
+ const unsigned char *p = _keyData;
+ while( 1 ) {
+ unsigned bits = *p++;
+
+            switch( bits & 0x3f ) { // mask off cHASMORE and cNOTUSED, keeping the type and cX/cY bits
+ case cminkey: b.appendMinKey(""); break;
+ case cnull: b.appendNull(""); break;
+ case cfalse: b.appendBool("", false); break;
+ case ctrue: b.appendBool("", true); break;
+ case cmaxkey:
+ b.appendMaxKey("");
+ break;
+ case cstring:
+ {
+ unsigned sz = *p++;
+                // we build the element ourselves as we have to re-append the terminating null
+ BufBuilder &bb = b.bb();
+ bb.appendNum((char) String);
+ bb.appendUChar(0); // fieldname ""
+ bb.appendNum(sz+1);
+ bb.appendBuf(p, sz);
+ bb.appendUChar(0); // null char at end of string
+ p += sz;
+ break;
+ }
+ case coid:
+ b.appendOID("", (OID *) p);
+ p += sizeof(OID);
+ break;
+ case cbindata:
+ {
+ int len = binDataCodeToLength(*p);
+ int subtype = (*p) & BinDataTypeMask;
+ if( subtype & 0x8 ) {
+ subtype = (subtype & 0x7) | 0x80;
+ }
+                b.appendBinData("", len, (BinDataType) subtype, ++p); // ++p steps past the length/subtype code byte
+ p += len;
+ break;
+ }
+ case cdate:
+ b.appendDate("", (Date_t&) *p);
+ p += 8;
+ break;
+ case cdouble:
+ b.append("", (double&) *p);
+ p += sizeof(double);
+ break;
+ case cint:
+ b.append("", (int) ((double&) *p));
+ p += sizeof(double);
+ break;
+ case clong:
+ b.append("", (long long) ((double&) *p));
+ p += sizeof(double);
+ break;
+ default:
+ assert(false);
+ }
+
+ if( (bits & cHASMORE) == 0 )
+ break;
+ }
+ return b.obj();
+ }
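+    // e.g. decoding the compact form of the worked example above yields
+    // { "" : 3, "" : true } -- field names are not stored in compact keys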
+
+ static int compare(const unsigned char *&l, const unsigned char *&r) {
+ int lt = (*l & cCANONTYPEMASK);
+ int rt = (*r & cCANONTYPEMASK);
+ int x = lt - rt;
+ if( x )
+ return x;
+
+ l++; r++;
+
+ // same type
+ switch( lt ) {
+ case cdouble:
+ {
+ double L = *((double *) l);
+ double R = *((double *) r);
+ if( L < R )
+ return -1;
+ if( L != R )
+ return 1;
+ l += 8; r += 8;
+ break;
+ }
+ case cstring:
+ {
+ int lsz = *l;
+ int rsz = *r;
+ int common = min(lsz, rsz);
+ l++; r++; // skip the size byte
+ // use memcmp as we (will) allow zeros in UTF8 strings
+ int res = memcmp(l, r, common);
+ if( res )
+ return res;
+ // longer string is the greater one
+ int diff = lsz-rsz;
+ if( diff )
+ return diff;
+ l += lsz; r += lsz;
+ break;
+ }
+ case cbindata:
+ {
+ int L = *l;
+ int R = *r;
+ int llen = binDataCodeToLength(L);
+ int diff = L-R; // checks length and subtype simultaneously
+ if( diff ) {
+                // unfortunately the nibbles are ordered the wrong way round to check subtype and length in one subtraction (we could bit swap...)
+ int rlen = binDataCodeToLength(R);
+ if( llen != rlen )
+ return llen - rlen;
+ return diff;
+ }
+ // same length, same type
+ l++; r++;
+ int res = memcmp(l, r, llen);
+ if( res )
+ return res;
+ l += llen; r += llen;
+ break;
+ }
+ case cdate:
+ {
+ long long L = *((long long *) l);
+ long long R = *((long long *) r);
+ if( L < R )
+ return -1;
+ if( L > R )
+ return 1;
+ l += 8; r += 8;
+ break;
+ }
+ case coid:
+ {
+ int res = memcmp(l, r, sizeof(OID));
+ if( res )
+ return res;
+ l += 12; r += 12;
+ break;
+ }
+ default:
+ // all the others are a match -- e.g. null == null
+ ;
+ }
+
+ return 0;
+ }
+
+    // at least one of this and right is in traditional BSON format
+ int NOINLINE_DECL KeyV1::compareHybrid(const KeyV1& right, const Ordering& order) const {
+ BSONObj L = toBson();
+ BSONObj R = right.toBson();
+ return L.woCompare(R, order, /*considerfieldname*/false);
+ }
+
+ int KeyV1::woCompare(const KeyV1& right, const Ordering &order) const {
+ const unsigned char *l = _keyData;
+ const unsigned char *r = right._keyData;
+
+        if( (*l|*r) == IsBSON ) // ORing works only because cNOTUSED (0x80) is never set in compact keys, so 0xff implies an IsBSON sentinel
+ return compareHybrid(right, order);
+
+        unsigned mask = 1; // bit i of the Ordering marks whether field i is descending
+ while( 1 ) {
+ char lval = *l;
+ char rval = *r;
+ {
+ int x = compare(l, r); // updates l and r pointers
+ if( x ) {
+ if( order.descending(mask) )
+ x = -x;
+ return x;
+ }
+ }
+
+ {
+ int x = ((int)(lval & cHASMORE)) - ((int)(rval & cHASMORE));
+ if( x )
+ return x;
+ if( (lval & cHASMORE) == 0 )
+ break;
+ }
+
+ mask <<= 1;
+ }
+
+ return 0;
+ }
+
+ static unsigned sizes[] = {
+ 0,
+ 1, //cminkey=1,
+ 1, //cnull=2,
+ 0,
+ 9, //cdouble=4,
+ 0,
+ 0, //cstring=6,
+ 0,
+ 13, //coid=8,
+ 0,
+ 1, //cfalse=10,
+ 1, //ctrue=11,
+ 9, //cdate=12,
+ 0,
+ 1, //cmaxkey=14,
+ 0
+ };
+
+ inline unsigned sizeOfElement(const unsigned char *p) {
+ unsigned type = *p & cCANONTYPEMASK;
+ unsigned sz = sizes[type];
+ if( sz == 0 ) {
+ if( type == cstring ) {
+                sz = ((unsigned) p[1]) + 2; // string data plus the type byte and the length byte
+ }
+ else {
+ assert( type == cbindata );
+ sz = binDataCodeToLength(p[1]) + 2;
+ }
+ }
+ return sz;
+ }
+
+ int KeyV1::dataSize() const {
+ const unsigned char *p = _keyData;
+ if( !isCompactFormat() ) {
+            return bson().objsize() + 1; // +1 for the leading IsBSON sentinel byte
+ }
+
+ bool more;
+ do {
+ unsigned z = sizeOfElement(p);
+ more = (*p & cHASMORE) != 0;
+ p += z;
+ } while( more );
+ return p - _keyData;
+ }
+
+ bool KeyV1::woEqual(const KeyV1& right) const {
+ const unsigned char *l = _keyData;
+ const unsigned char *r = right._keyData;
+
+        if( (*l|*r) == IsBSON ) { // same sentinel trick as in woCompare()
+ return toBson().equal(right.toBson());
+ }
+
+ while( 1 ) {
+ char lval = *l;
+ char rval = *r;
+ if( (lval&(cCANONTYPEMASK|cHASMORE)) != (rval&(cCANONTYPEMASK|cHASMORE)) )
+ return false;
+ l++; r++;
+ switch( lval&cCANONTYPEMASK ) {
+ case coid:
+ if( *((unsigned*) l) != *((unsigned*) r) )
+ return false;
+ l += 4; r += 4;
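+                // intentional fall through: cdate compares the remaining 8 bytes of the OID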
+ case cdate:
+ if( *((unsigned long long *) l) != *((unsigned long long *) r) )
+ return false;
+ l += 8; r += 8;
+ break;
+ case cdouble:
+ if( *((double *) l) != *((double *) r) )
+ return false;
+ l += 8; r += 8;
+ break;
+ case cstring:
+ {
+ if( *l != *r )
+ return false; // not same length
+                unsigned sz = ((unsigned) *l) + 1; // string bytes plus the leading size byte
+ if( memcmp(l, r, sz) )
+ return false;
+ l += sz; r += sz;
+ break;
+ }
+ case cbindata:
+ {
+ if( *l != *r )
+ return false; // len or subtype mismatch
+                int len = binDataCodeToLength(*l) + 1; // data bytes plus the code byte
+ if( memcmp(l, r, len) )
+ return false;
+ l += len; r += len;
+ break;
+ }
+ case cminkey:
+ case cnull:
+ case cfalse:
+ case ctrue:
+ case cmaxkey:
+ break;
+ default:
+ assert(false);
+ }
+ if( (lval&cHASMORE) == 0 )
+ break;
+ }
+ return true;
+ }
+
+ struct CmpUnitTest : public UnitTest {
+ void run() {
+ char a[2];
+ char b[2];
+ a[0] = -3;
+ a[1] = 0;
+ b[0] = 3;
+ b[1] = 0;
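+            // strcmp and memcmp must both see 0xfd as greater than 0x03 (i.e.
+            // compare bytes as unsigned) for the key routines above to agree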
+ assert( strcmp(a,b)>0 && memcmp(a,b,2)>0 );
+ }
+ } cunittest;
+
+}
diff --git a/db/key.h b/db/key.h
new file mode 100644
index 0000000..9a3495f
--- /dev/null
+++ b/db/key.h
@@ -0,0 +1,112 @@
+// @file key.h class(es) representing individual keys in a btree
+
+/**
+* Copyright (C) 2011 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+#include "jsobj.h"
+
+namespace mongo {
+
+ /** Key class for precomputing a small format index key that is denser than a traditional BSONObj.
+
+ KeyBson is a legacy wrapper implementation for old BSONObj style keys for v:0 indexes.
+
+ KeyV1 is the new implementation.
+ */
+ class KeyBson /* "KeyV0" */ {
+ public:
+ KeyBson() { }
+ explicit KeyBson(const char *keyData) : _o(keyData) { }
+ explicit KeyBson(const BSONObj& obj) : _o(obj) { }
+ int woCompare(const KeyBson& r, const Ordering &o) const;
+ BSONObj toBson() const { return _o; }
+ string toString() const { return _o.toString(); }
+ int dataSize() const { return _o.objsize(); }
+ const char * data() const { return _o.objdata(); }
+ BSONElement _firstElement() const { return _o.firstElement(); }
+ bool isCompactFormat() const { return false; }
+ bool woEqual(const KeyBson& r) const;
+ void assign(const KeyBson& rhs) { *this = rhs; }
+ private:
+ BSONObj _o;
+ };
+
+ class KeyV1Owned;
+
+ // corresponding to BtreeData_V1
+ class KeyV1 {
+        void operator=(const KeyV1&); // disallowed just to make people be careful, as we don't own the buffer
+        KeyV1(const KeyV1Owned&);     // disallowed: the KeyV1Owned's buffer would likely go out of scope while we still point into it
+ public:
+ KeyV1() { _keyData = 0; }
+ ~KeyV1() { DEV _keyData = (const unsigned char *) 1; }
+
+ KeyV1(const KeyV1& rhs) : _keyData(rhs._keyData) {
+ dassert( _keyData > (const unsigned char *) 1 );
+ }
+
+ // explicit version of operator= to be safe
+ void assign(const KeyV1& rhs) {
+ _keyData = rhs._keyData;
+ }
+
+ /** @param keyData can be a buffer containing data in either BSON format, OR in KeyV1 format.
+ when BSON, we are just a wrapper
+ */
+ explicit KeyV1(const char *keyData) : _keyData((unsigned char *) keyData) { }
+
+ int woCompare(const KeyV1& r, const Ordering &o) const;
+ bool woEqual(const KeyV1& r) const;
+ BSONObj toBson() const;
+ string toString() const { return toBson().toString(); }
+
+ /** get the key data we want to store in the btree bucket */
+ const char * data() const { return (const char *) _keyData; }
+
+ /** @return size of data() */
+ int dataSize() const;
+
+ /** only used by geo, which always has bson keys */
+ BSONElement _firstElement() const { return bson().firstElement(); }
+ bool isCompactFormat() const { return *_keyData != IsBSON; }
+ protected:
+        enum { IsBSON = 0xff }; // sentinel: compact keys never begin with 0xff since cNOTUSED stays clear
+ const unsigned char *_keyData;
+ BSONObj bson() const {
+ dassert( !isCompactFormat() );
+ return BSONObj((const char *) _keyData+1);
+ }
+ private:
+ int compareHybrid(const KeyV1& right, const Ordering& order) const;
+ };
+
+ class KeyV1Owned : public KeyV1 {
+ KeyV1Owned(const KeyV1Owned&); // not copyable -- StackBufBuilder is not copyable and that owns our buffer
+ void operator=(const KeyV1Owned&);
+ public:
+        /** @param obj a BSON object to be translated to KeyV1 format. If the object isn't
+ representable in KeyV1 format (which happens, intentionally, at times)
+ it will stay as bson herein.
+ */
+ KeyV1Owned(const BSONObj& obj);
+ private:
+ StackBufBuilder b;
+ void traditional(const BSONObj& obj); // store as traditional bson not as compact format
+ };
+
+};
diff --git a/db/lasterror.cpp b/db/lasterror.cpp
index 240c84b..4ed4dfb 100644
--- a/db/lasterror.cpp
+++ b/db/lasterror.cpp
@@ -18,7 +18,7 @@
#include "pch.h"
#include "../util/unittest.h"
-#include "../util/message.h"
+#include "../util/net/message.h"
#include "lasterror.h"
@@ -85,7 +85,7 @@ namespace mongo {
LastError * LastErrorHolder::disableForCommand() {
LastError *le = _get();
- assert( le );
+ uassert(13649, "no operation yet", le);
le->disabled = true;
le->nPrev--; // caller is a command that shouldn't count as an operation
return le;
diff --git a/db/matcher.cpp b/db/matcher.cpp
index 38e8e05..2b92d57 100644
--- a/db/matcher.cpp
+++ b/db/matcher.cpp
@@ -25,6 +25,7 @@
#include "diskloc.h"
#include "../scripting/engine.h"
#include "db.h"
+#include "queryutil.h"
#include "client.h"
#include "pdfile.h"
@@ -40,6 +41,8 @@ namespace {
options.set_multiline(true);
else if ( *flags == 'x' )
options.set_extended(true);
+ else if ( *flags == 's' )
+                    options.set_dotall(true); // 's': let '.' match newlines
flags++;
}
return options;
@@ -61,8 +64,14 @@ namespace mongo {
}
~Where() {
- if ( scope.get() )
- scope->execSetup( "_mongo.readOnly = false;" , "make not read only" );
+ if ( scope.get() ){
+ try {
+ scope->execSetup( "_mongo.readOnly = false;" , "make not read only" );
+ }
+ catch( DBException& e ){
+ warning() << "javascript scope cleanup interrupted" << causedBy( e ) << endl;
+ }
+ }
if ( jsScope ) {
delete jsScope;
@@ -83,74 +92,77 @@ namespace mongo {
};
Matcher::~Matcher() {
- delete where;
- where = 0;
+ delete _where;
+ _where = 0;
}
- ElementMatcher::ElementMatcher( BSONElement _e , int _op, bool _isNot )
- : toMatch( _e ) , compareOp( _op ), isNot( _isNot ), subMatcherOnPrimitives(false) {
- if ( _op == BSONObj::opMOD ) {
- BSONObj o = _e.embeddedObject();
- mod = o["0"].numberInt();
- modm = o["1"].numberInt();
+ ElementMatcher::ElementMatcher( BSONElement e , int op, bool isNot )
+ : _toMatch( e ) , _compareOp( op ), _isNot( isNot ), _subMatcherOnPrimitives(false) {
+ if ( op == BSONObj::opMOD ) {
+ BSONObj o = e.embeddedObject();
+ _mod = o["0"].numberInt();
+ _modm = o["1"].numberInt();
- uassert( 10073 , "mod can't be 0" , mod );
+ uassert( 10073 , "mod can't be 0" , _mod );
}
- else if ( _op == BSONObj::opTYPE ) {
- type = (BSONType)(_e.numberInt());
+ else if ( op == BSONObj::opTYPE ) {
+ _type = (BSONType)(e.numberInt());
}
- else if ( _op == BSONObj::opELEM_MATCH ) {
- BSONElement m = _e;
+ else if ( op == BSONObj::opELEM_MATCH ) {
+ BSONElement m = e;
uassert( 12517 , "$elemMatch needs an Object" , m.type() == Object );
BSONObj x = m.embeddedObject();
if ( x.firstElement().getGtLtOp() == 0 ) {
- subMatcher.reset( new Matcher( x ) );
- subMatcherOnPrimitives = false;
+ _subMatcher.reset( new Matcher( x ) );
+ _subMatcherOnPrimitives = false;
}
else {
// meant to act on primitives
- subMatcher.reset( new Matcher( BSON( "" << x ) ) );
- subMatcherOnPrimitives = true;
+ _subMatcher.reset( new Matcher( BSON( "" << x ) ) );
+ _subMatcherOnPrimitives = true;
}
}
}
- ElementMatcher::ElementMatcher( BSONElement _e , int _op , const BSONObj& array, bool _isNot )
- : toMatch( _e ) , compareOp( _op ), isNot( _isNot ), subMatcherOnPrimitives(false) {
+ ElementMatcher::ElementMatcher( BSONElement e , int op , const BSONObj& array, bool isNot )
+ : _toMatch( e ) , _compareOp( op ), _isNot( isNot ), _subMatcherOnPrimitives(false) {
- myset.reset( new set<BSONElement,element_lt>() );
+ _myset.reset( new set<BSONElement,element_lt>() );
BSONObjIterator i( array );
while ( i.more() ) {
BSONElement ie = i.next();
- if ( _op == BSONObj::opALL && ie.type() == Object && ie.embeddedObject().firstElement().getGtLtOp() == BSONObj::opELEM_MATCH ) {
+ if ( op == BSONObj::opALL && ie.type() == Object && ie.embeddedObject().firstElement().getGtLtOp() == BSONObj::opELEM_MATCH ) {
shared_ptr<Matcher> s;
s.reset( new Matcher( ie.embeddedObject().firstElement().embeddedObjectUserCheck() ) );
- allMatchers.push_back( s );
+ _allMatchers.push_back( s );
}
else if ( ie.type() == RegEx ) {
- if ( !myregex.get() ) {
- myregex.reset( new vector< RegexMatcher >() );
+ if ( !_myregex.get() ) {
+ _myregex.reset( new vector< RegexMatcher >() );
}
- myregex->push_back( RegexMatcher() );
- RegexMatcher &rm = myregex->back();
- rm.re.reset( new pcrecpp::RE( ie.regex(), flags2options( ie.regexFlags() ) ) );
- rm.fieldName = 0; // no need for field name
- rm.regex = ie.regex();
- rm.flags = ie.regexFlags();
- rm.isNot = false;
+ _myregex->push_back( RegexMatcher() );
+ RegexMatcher &rm = _myregex->back();
+ rm._re.reset( new pcrecpp::RE( ie.regex(), flags2options( ie.regexFlags() ) ) );
+ rm._fieldName = 0; // no need for field name
+ rm._regex = ie.regex();
+ rm._flags = ie.regexFlags();
+ rm._isNot = false;
bool purePrefix;
- string prefix = simpleRegex(rm.regex, rm.flags, &purePrefix);
+ string prefix = simpleRegex(rm._regex, rm._flags, &purePrefix);
if (purePrefix)
- rm.prefix = prefix;
+ rm._prefix = prefix;
}
else {
- myset->insert(ie);
+ uassert( 15882, "$elemMatch not allowed within $in",
+ ie.type() != Object ||
+ ie.embeddedObject().firstElement().getGtLtOp() != BSONObj::opELEM_MATCH );
+ _myset->insert(ie);
}
}
- if ( allMatchers.size() ) {
- uassert( 13020 , "with $all, can't mix $elemMatch and others" , myset->size() == 0 && !myregex.get());
+ if ( _allMatchers.size() ) {
+ uassert( 13020 , "with $all, can't mix $elemMatch and others" , _myset->size() == 0 && !_myregex.get());
}
}
@@ -158,23 +170,23 @@ namespace mongo {
void Matcher::addRegex(const char *fieldName, const char *regex, const char *flags, bool isNot) {
- if ( nRegex >= 4 ) {
+ if ( _nRegex >= 4 ) {
out() << "ERROR: too many regexes in query" << endl;
}
else {
- RegexMatcher& rm = regexs[nRegex];
- rm.re.reset( new pcrecpp::RE(regex, flags2options(flags)) );
- rm.fieldName = fieldName;
- rm.regex = regex;
- rm.flags = flags;
- rm.isNot = isNot;
- nRegex++;
+ RegexMatcher& rm = _regexs[_nRegex];
+ rm._re.reset( new pcrecpp::RE(regex, flags2options(flags)) );
+ rm._fieldName = fieldName;
+ rm._regex = regex;
+ rm._flags = flags;
+ rm._isNot = isNot;
+ _nRegex++;
if (!isNot) { //TODO something smarter
bool purePrefix;
string prefix = simpleRegex(regex, flags, &purePrefix);
if (purePrefix)
- rm.prefix = prefix;
+ rm._prefix = prefix;
}
}
}
@@ -201,7 +213,7 @@ namespace mongo {
break;
}
case BSONObj::NE: {
- haveNeg = true;
+ _haveNeg = true;
shared_ptr< BSONObjBuilder > b( new BSONObjBuilder() );
_builders.push_back( b );
b->appendAs(fe, e.fieldName());
@@ -209,15 +221,22 @@ namespace mongo {
break;
}
case BSONObj::opALL:
- all = true;
- case BSONObj::opIN:
+ _all = true;
+ case BSONObj::opIN: {
uassert( 13276 , "$in needs an array" , fe.isABSONObj() );
- basics.push_back( ElementMatcher( e , op , fe.embeddedObject(), isNot ) );
+ _basics.push_back( ElementMatcher( e , op , fe.embeddedObject(), isNot ) );
+ BSONObjIterator i( fe.embeddedObject() );
+ while( i.more() ) {
+ if ( i.next().type() == Array ) {
+ _hasArray = true;
+ }
+ }
break;
+ }
case BSONObj::NIN:
uassert( 13277 , "$nin needs an array" , fe.isABSONObj() );
- haveNeg = true;
- basics.push_back( ElementMatcher( e , op , fe.embeddedObject(), isNot ) );
+ _haveNeg = true;
+ _basics.push_back( ElementMatcher( e , op , fe.embeddedObject(), isNot ) );
break;
case BSONObj::opMOD:
case BSONObj::opTYPE:
@@ -226,7 +245,7 @@ namespace mongo {
_builders.push_back( b );
b->appendAs(fe, e.fieldName());
// these are types where ElementMatcher has all the info
- basics.push_back( ElementMatcher( b->done().firstElement() , op, isNot ) );
+ _basics.push_back( ElementMatcher( b->done().firstElement() , op, isNot ) );
break;
}
case BSONObj::opSIZE: {
@@ -234,7 +253,7 @@ namespace mongo {
_builders.push_back( b );
b->appendAs(fe, e.fieldName());
addBasic(b->done().firstElement(), BSONObj::opSIZE, isNot);
- haveSize = true;
+ _haveSize = true;
break;
}
case BSONObj::opEXISTS: {
@@ -270,99 +289,93 @@ namespace mongo {
return true;
}
- void Matcher::parseOr( const BSONElement &e, bool subMatcher, list< shared_ptr< Matcher > > &matchers ) {
- uassert( 13090, "nested $or/$nor not allowed", !subMatcher );
- uassert( 13086, "$or/$nor must be a nonempty array", e.type() == Array && e.embeddedObject().nFields() > 0 );
+ void Matcher::parseExtractedClause( const BSONElement &e, list< shared_ptr< Matcher > > &matchers ) {
+ uassert( 13086, "$and/$or/$nor must be a nonempty array", e.type() == Array && e.embeddedObject().nFields() > 0 );
BSONObjIterator j( e.embeddedObject() );
while( j.more() ) {
BSONElement f = j.next();
- uassert( 13087, "$or/$nor match element must be an object", f.type() == Object );
- // until SERVER-109 this is never a covered index match, so don't constrain index key for $or matchers
+ uassert( 13087, "$and/$or/$nor match element must be an object", f.type() == Object );
matchers.push_back( shared_ptr< Matcher >( new Matcher( f.embeddedObject(), true ) ) );
}
}
- bool Matcher::parseOrNor( const BSONElement &e, bool subMatcher ) {
+ bool Matcher::parseClause( const BSONElement &e ) {
const char *ef = e.fieldName();
if ( ef[ 0 ] != '$' )
return false;
- if ( ef[ 1 ] == 'o' && ef[ 2 ] == 'r' && ef[ 3 ] == 0 ) {
- parseOr( e, subMatcher, _orMatchers );
+        if ( ef[ 1 ] == 'a' && ef[ 2 ] == 'n' && ef[ 3 ] == 'd' && ef[ 4 ] == 0 ) {
+ parseExtractedClause( e, _andMatchers );
+ }
+ else if ( ef[ 1 ] == 'o' && ef[ 2 ] == 'r' && ef[ 3 ] == 0 ) {
+ parseExtractedClause( e, _orMatchers );
}
else if ( ef[ 1 ] == 'n' && ef[ 2 ] == 'o' && ef[ 3 ] == 'r' && ef[ 4 ] == 0 ) {
- parseOr( e, subMatcher, _norMatchers );
+ parseExtractedClause( e, _norMatchers );
}
else {
return false;
}
return true;
}
-
- /* _jsobj - the query pattern
- */
- Matcher::Matcher(const BSONObj &_jsobj, bool subMatcher) :
- where(0), jsobj(_jsobj), haveSize(), all(), hasArray(0), haveNeg(), _atomic(false), nRegex(0) {
-
- BSONObjIterator i(jsobj);
- while ( i.more() ) {
- BSONElement e = i.next();
+
+ void Matcher::parseMatchExpressionElement( const BSONElement &e, bool nested ) {
+
+ uassert( 13629 , "can't have undefined in a query expression" , e.type() != Undefined );
+
+ if ( parseClause( e ) ) {
+ return;
+ }
+
+ if ( ( e.type() == CodeWScope || e.type() == Code || e.type() == String ) && strcmp(e.fieldName(), "$where")==0 ) {
+ // $where: function()...
+ uassert( 10066 , "$where may only appear once in query", _where == 0 );
+ uassert( 10067 , "$where query, but no script engine", globalScriptEngine );
+ massert( 13089 , "no current client needed for $where" , haveClient() );
+ _where = new Where();
+ _where->scope = globalScriptEngine->getPooledScope( cc().ns() );
+ _where->scope->localConnect( cc().database()->name.c_str() );
- uassert( 13629 , "can't have undefined in a query expression" , e.type() != Undefined );
-
- if ( parseOrNor( e, subMatcher ) ) {
- continue;
- }
-
- if ( ( e.type() == CodeWScope || e.type() == Code || e.type() == String ) && strcmp(e.fieldName(), "$where")==0 ) {
- // $where: function()...
- uassert( 10066 , "$where occurs twice?", where == 0 );
- uassert( 10067 , "$where query, but no script engine", globalScriptEngine );
- massert( 13089 , "no current client needed for $where" , haveClient() );
- where = new Where();
- where->scope = globalScriptEngine->getPooledScope( cc().ns() );
- where->scope->localConnect( cc().database()->name.c_str() );
-
- if ( e.type() == CodeWScope ) {
- where->setFunc( e.codeWScopeCode() );
- where->jsScope = new BSONObj( e.codeWScopeScopeData() , 0 );
- }
- else {
- const char *code = e.valuestr();
- where->setFunc(code);
- }
-
- where->scope->execSetup( "_mongo.readOnly = true;" , "make read only" );
-
- continue;
+ if ( e.type() == CodeWScope ) {
+ _where->setFunc( e.codeWScopeCode() );
+ _where->jsScope = new BSONObj( e.codeWScopeScopeData() );
}
-
- if ( e.type() == RegEx ) {
- addRegex( e.fieldName(), e.regex(), e.regexFlags() );
- continue;
+ else {
+ const char *code = e.valuestr();
+ _where->setFunc(code);
}
-
- // greater than / less than...
- // e.g., e == { a : { $gt : 3 } }
- // or
- // { a : { $in : [1,2,3] } }
- if ( e.type() == Object ) {
- // support {$regex:"a|b", $options:"imx"}
- const char* regex = NULL;
- const char* flags = "";
-
- // e.g., fe == { $gt : 3 }
- BSONObjIterator j(e.embeddedObject());
- bool isOperator = false;
- while ( j.more() ) {
- BSONElement fe = j.next();
- const char *fn = fe.fieldName();
-
- if ( fn[0] == '$' && fn[1] ) {
- isOperator = true;
-
- if ( fn[1] == 'n' && fn[2] == 'o' && fn[3] == 't' && fn[4] == 0 ) {
- haveNeg = true;
- switch( fe.type() ) {
+
+ _where->scope->execSetup( "_mongo.readOnly = true;" , "make read only" );
+
+ return;
+ }
+
+ if ( e.type() == RegEx ) {
+ addRegex( e.fieldName(), e.regex(), e.regexFlags() );
+ return;
+ }
+
+ // greater than / less than...
+ // e.g., e == { a : { $gt : 3 } }
+ // or
+ // { a : { $in : [1,2,3] } }
+ if ( e.type() == Object ) {
+ // support {$regex:"a|b", $options:"imx"}
+ const char* regex = NULL;
+ const char* flags = "";
+
+ // e.g., fe == { $gt : 3 }
+ BSONObjIterator j(e.embeddedObject());
+ bool isOperator = false;
+ while ( j.more() ) {
+ BSONElement fe = j.next();
+ const char *fn = fe.fieldName();
+
+ if ( fn[0] == '$' && fn[1] ) {
+ isOperator = true;
+
+ if ( fn[1] == 'n' && fn[2] == 'o' && fn[3] == 't' && fn[4] == 0 ) {
+ _haveNeg = true;
+ switch( fe.type() ) {
case Object: {
BSONObjIterator k( fe.embeddedObject() );
uassert( 13030, "$not cannot be empty", k.more() );
@@ -376,65 +389,98 @@ namespace mongo {
break;
default:
uassert( 13031, "invalid use of $not", false );
- }
- }
- else {
- if ( !addOp( e, fe, false, regex, flags ) ) {
- isOperator = false;
- break;
- }
}
}
else {
- isOperator = false;
- break;
+ if ( !addOp( e, fe, false, regex, flags ) ) {
+ isOperator = false;
+ break;
+ }
}
}
- if (regex) {
- addRegex(e.fieldName(), regex, flags);
+ else {
+ isOperator = false;
+ break;
}
- if ( isOperator )
- continue;
- }
-
- if ( e.type() == Array ) {
- hasArray = true;
}
- else if( strcmp(e.fieldName(), "$atomic") == 0 ) {
- _atomic = e.trueValue();
- continue;
+ if (regex) {
+ addRegex(e.fieldName(), regex, flags);
}
+ if ( isOperator )
+ return;
+ }
+
+ if ( e.type() == Array ) {
+ _hasArray = true;
+ }
+ else if( strcmp(e.fieldName(), "$atomic") == 0 ) {
+ uassert( 14844, "$atomic specifier must be a top level field", !nested );
+ _atomic = e.trueValue();
+ return;
+ }
+
+ // normal, simple case e.g. { a : "foo" }
+ addBasic(e, BSONObj::Equality, false);
+ }
+
+ /* _jsobj - the query pattern
+ */
+ Matcher::Matcher(const BSONObj &jsobj, bool nested) :
+ _where(0), _jsobj(jsobj), _haveSize(), _all(), _hasArray(0), _haveNeg(), _atomic(false), _nRegex(0) {
- // normal, simple case e.g. { a : "foo" }
- addBasic(e, BSONObj::Equality, false);
+ BSONObjIterator i(_jsobj);
+ while ( i.more() ) {
+ parseMatchExpressionElement( i.next(), nested );
}
}
- Matcher::Matcher( const Matcher &other, const BSONObj &key ) :
- where(0), constrainIndexKey_( key ), haveSize(), all(), hasArray(0), haveNeg(), _atomic(false), nRegex(0) {
- // do not include fields which would make keyMatch() false
- for( vector< ElementMatcher >::const_iterator i = other.basics.begin(); i != other.basics.end(); ++i ) {
- if ( key.hasField( i->toMatch.fieldName() ) ) {
- switch( i->compareOp ) {
+ Matcher::Matcher( const Matcher &docMatcher, const BSONObj &key ) :
+ _where(0), _constrainIndexKey( key ), _haveSize(), _all(), _hasArray(0), _haveNeg(), _atomic(false), _nRegex(0) {
+ // Filter out match components that will provide an incorrect result
+ // given a key from a single key index.
+ for( vector< ElementMatcher >::const_iterator i = docMatcher._basics.begin(); i != docMatcher._basics.end(); ++i ) {
+ if ( key.hasField( i->_toMatch.fieldName() ) ) {
+ switch( i->_compareOp ) {
case BSONObj::opSIZE:
case BSONObj::opALL:
case BSONObj::NE:
case BSONObj::NIN:
+ case BSONObj::opEXISTS: // We can't match on index in this case.
+ case BSONObj::opTYPE: // For $type:10 (null), a null key could be a missing field or a null value field.
+ break;
+ case BSONObj::opIN: {
+ bool inContainsArray = false;
+ for( set<BSONElement,element_lt>::const_iterator j = i->_myset->begin(); j != i->_myset->end(); ++j ) {
+ if ( j->type() == Array ) {
+ inContainsArray = true;
+ break;
+ }
+ }
+ // Can't match an array to its first indexed element.
+ if ( !i->_isNot && !inContainsArray ) {
+ _basics.push_back( *i );
+ }
break;
+ }
default: {
- if ( !i->isNot && i->toMatch.type() != Array ) {
- basics.push_back( *i );
+ // Can't match an array to its first indexed element.
+ if ( !i->_isNot && i->_toMatch.type() != Array ) {
+ _basics.push_back( *i );
}
}
}
}
}
- for( int i = 0; i < other.nRegex; ++i ) {
- if ( !other.regexs[ i ].isNot && key.hasField( other.regexs[ i ].fieldName ) ) {
- regexs[ nRegex++ ] = other.regexs[ i ];
+ for( int i = 0; i < docMatcher._nRegex; ++i ) {
+ if ( !docMatcher._regexs[ i ]._isNot && key.hasField( docMatcher._regexs[ i ]._fieldName ) ) {
+ _regexs[ _nRegex++ ] = docMatcher._regexs[ i ];
}
}
- for( list< shared_ptr< Matcher > >::const_iterator i = other._orMatchers.begin(); i != other._orMatchers.end(); ++i ) {
+ // Recursively filter match components for and and or matchers.
+ for( list< shared_ptr< Matcher > >::const_iterator i = docMatcher._andMatchers.begin(); i != docMatcher._andMatchers.end(); ++i ) {
+ _andMatchers.push_back( shared_ptr< Matcher >( new Matcher( **i, key ) ) );
+ }
+ for( list< shared_ptr< Matcher > >::const_iterator i = docMatcher._orMatchers.begin(); i != docMatcher._orMatchers.end(); ++i ) {
_orMatchers.push_back( shared_ptr< Matcher >( new Matcher( **i, key ) ) );
}
}
@@ -443,12 +489,12 @@ namespace mongo {
switch (e.type()) {
case String:
case Symbol:
- if (rm.prefix.empty())
- return rm.re->PartialMatch(e.valuestr());
+ if (rm._prefix.empty())
+ return rm._re->PartialMatch(e.valuestr());
else
- return !strncmp(e.valuestr(), rm.prefix.c_str(), rm.prefix.size());
+ return !strncmp(e.valuestr(), rm._prefix.c_str(), rm._prefix.size());
case RegEx:
- return !strcmp(rm.regex, e.regex()) && !strcmp(rm.flags, e.regexFlags());
+ return !strcmp(rm._regex, e.regex()) && !strcmp(rm._flags, e.regexFlags());
default:
return false;
}
@@ -463,11 +509,11 @@ namespace mongo {
if ( op == BSONObj::opIN ) {
// { $in : [1,2,3] }
- int count = bm.myset->count(l);
+ int count = bm._myset->count(l);
if ( count )
return count;
- if ( bm.myregex.get() ) {
- for( vector<RegexMatcher>::const_iterator i = bm.myregex->begin(); i != bm.myregex->end(); ++i ) {
+ if ( bm._myregex.get() ) {
+ for( vector<RegexMatcher>::const_iterator i = bm._myregex->begin(); i != bm._myregex->end(); ++i ) {
if ( regexMatches( *i, l ) ) {
return true;
}
@@ -493,11 +539,11 @@ namespace mongo {
if ( ! l.isNumber() )
return false;
- return l.numberLong() % bm.mod == bm.modm;
+ return l.numberLong() % bm._mod == bm._modm;
}
if ( op == BSONObj::opTYPE ) {
- return bm.type == l.type();
+ return bm._type == l.type();
}
/* check LT, GTE, ... */
@@ -512,16 +558,14 @@ namespace mongo {
int Matcher::matchesNe(const char *fieldName, const BSONElement &toMatch, const BSONObj &obj, const ElementMatcher& bm , MatchDetails * details ) {
int ret = matchesDotted( fieldName, toMatch, obj, BSONObj::Equality, bm , false , details );
- if ( bm.toMatch.type() != jstNULL )
+ if ( bm._toMatch.type() != jstNULL )
return ( ret <= 0 ) ? 1 : 0;
else
return -ret;
}
- int retMissing( const ElementMatcher &bm ) {
- if ( bm.compareOp != BSONObj::opEXISTS )
- return 0;
- return bm.toMatch.boolean() ? -1 : 1;
+ int retExistsFound( const ElementMatcher &bm ) {
+ return bm._toMatch.trueValue() ? 1 : -1;
}
/* Check if a particular field matches.
@@ -547,11 +591,11 @@ namespace mongo {
DEBUGMATCHER( "\t matchesDotted : " << fieldName << " hasDetails: " << ( details ? "yes" : "no" ) );
if ( compareOp == BSONObj::opALL ) {
- if ( em.allMatchers.size() ) {
+ if ( em._allMatchers.size() ) {
BSONElement e = obj.getFieldDotted( fieldName );
uassert( 13021 , "$all/$elemMatch needs to be applied to array" , e.type() == Array );
- for ( unsigned i=0; i<em.allMatchers.size(); i++ ) {
+ for ( unsigned i=0; i<em._allMatchers.size(); i++ ) {
bool found = false;
BSONObjIterator x( e.embeddedObject() );
while ( x.more() ) {
@@ -559,7 +603,7 @@ namespace mongo {
if ( f.type() != Object )
continue;
- if ( em.allMatchers[i]->matches( f.embeddedObject() ) ) {
+ if ( em._allMatchers[i]->matches( f.embeddedObject() ) ) {
found = true;
break;
}
@@ -572,13 +616,13 @@ namespace mongo {
return 1;
}
- if ( em.myset->size() == 0 && !em.myregex.get() )
+ if ( em._myset->size() == 0 && !em._myregex.get() )
return -1; // is this desired?
BSONElementSet myValues;
obj.getFieldsDotted( fieldName , myValues );
- for( set< BSONElement, element_lt >::const_iterator i = em.myset->begin(); i != em.myset->end(); ++i ) {
+ for( set< BSONElement, element_lt >::const_iterator i = em._myset->begin(); i != em._myset->end(); ++i ) {
// ignore nulls
if ( i->type() == jstNULL )
continue;
@@ -587,10 +631,10 @@ namespace mongo {
return -1;
}
- if ( !em.myregex.get() )
+ if ( !em._myregex.get() )
return 1;
- for( vector< RegexMatcher >::const_iterator i = em.myregex->begin(); i != em.myregex->end(); ++i ) {
+ for( vector< RegexMatcher >::const_iterator i = em._myregex->begin(); i != em._myregex->end(); ++i ) {
bool match = false;
for( BSONElementSet::const_iterator j = myValues.begin(); j != myValues.end(); ++j ) {
if ( regexMatches( *i, *j ) ) {
@@ -608,15 +652,15 @@ namespace mongo {
if ( compareOp == BSONObj::NE )
return matchesNe( fieldName, toMatch, obj, em , details );
if ( compareOp == BSONObj::NIN ) {
- for( set<BSONElement,element_lt>::const_iterator i = em.myset->begin(); i != em.myset->end(); ++i ) {
+ for( set<BSONElement,element_lt>::const_iterator i = em._myset->begin(); i != em._myset->end(); ++i ) {
int ret = matchesNe( fieldName, *i, obj, em , details );
if ( ret != 1 )
return ret;
}
- if ( em.myregex.get() ) {
+ if ( em._myregex.get() ) {
BSONElementSet s;
obj.getFieldsDotted( fieldName, s );
- for( vector<RegexMatcher>::const_iterator i = em.myregex->begin(); i != em.myregex->end(); ++i ) {
+ for( vector<RegexMatcher>::const_iterator i = em._myregex->begin(); i != em._myregex->end(); ++i ) {
for( BSONElementSet::const_iterator j = s.begin(); j != s.end(); ++j ) {
if ( regexMatches( *i, *j ) ) {
return -1;
@@ -628,13 +672,13 @@ namespace mongo {
}
BSONElement e;
- bool indexed = !constrainIndexKey_.isEmpty();
+ bool indexed = !_constrainIndexKey.isEmpty();
if ( indexed ) {
- e = obj.getFieldUsingIndexNames(fieldName, constrainIndexKey_);
+ e = obj.getFieldUsingIndexNames(fieldName, _constrainIndexKey);
if( e.eoo() ) {
cout << "obj: " << obj << endl;
cout << "fieldName: " << fieldName << endl;
- cout << "constrainIndexKey_: " << constrainIndexKey_ << endl;
+ cout << "_constrainIndexKey: " << _constrainIndexKey << endl;
assert( !e.eoo() );
}
}
@@ -655,6 +699,7 @@ namespace mongo {
}
}
+ // An array was encountered while scanning for components of the field name.
if ( isArr ) {
DEBUGMATCHER( "\t\t isArr 1 : obj : " << obj );
BSONObjIterator ai(obj);
@@ -662,11 +707,16 @@ namespace mongo {
while ( ai.moreWithEOO() ) {
BSONElement z = ai.next();
- if( strcmp(z.fieldName(),fieldName) == 0 && valuesMatch(z, toMatch, compareOp, em) ) {
- // "field.<n>" array notation was used
- if ( details )
- details->elemMatchKey = z.fieldName();
- return 1;
+ if( strcmp(z.fieldName(),fieldName) == 0 ) {
+ if ( compareOp == BSONObj::opEXISTS ) {
+ return retExistsFound( em );
+ }
+ if (valuesMatch(z, toMatch, compareOp, em) ) {
+ // "field.<n>" array notation was used
+ if ( details )
+ details->_elemMatchKey = z.fieldName();
+ return 1;
+ }
}
if ( z.type() == Object ) {
@@ -674,7 +724,7 @@ namespace mongo {
int cmp = matchesDotted(fieldName, toMatch, eo, compareOp, em, false, details );
if ( cmp > 0 ) {
if ( details )
- details->elemMatchKey = z.fieldName();
+ details->_elemMatchKey = z.fieldName();
return 1;
}
else if ( cmp < 0 ) {
@@ -682,11 +732,12 @@ namespace mongo {
}
}
}
- return found ? -1 : retMissing( em );
+ return found ? -1 : 0;
}
if( p ) {
- return retMissing( em );
+ // Left portion of field name was not found or wrong type.
+ return 0;
}
else {
e = obj.getField(fieldName);
@@ -694,7 +745,11 @@ namespace mongo {
}
if ( compareOp == BSONObj::opEXISTS ) {
- return ( e.eoo() ^ ( toMatch.boolean() ^ em.isNot ) ) ? 1 : -1;
+ if( e.eoo() ) {
+ return 0;
+ } else {
+ return retExistsFound( em );
+ }
}
else if ( ( e.type() != Array || indexed || compareOp == BSONObj::opSIZE ) &&
valuesMatch(e, toMatch, compareOp, em ) ) {
@@ -708,16 +763,16 @@ namespace mongo {
if ( compareOp == BSONObj::opELEM_MATCH ) {
if ( z.type() == Object ) {
- if ( em.subMatcher->matches( z.embeddedObject() ) ) {
+ if ( em._subMatcher->matches( z.embeddedObject() ) ) {
if ( details )
- details->elemMatchKey = z.fieldName();
+ details->_elemMatchKey = z.fieldName();
return 1;
}
}
- else if ( em.subMatcherOnPrimitives ) {
- if ( z.type() && em.subMatcher->matches( z.wrap( "" ) ) ) {
+ else if ( em._subMatcherOnPrimitives ) {
+ if ( z.type() && em._subMatcher->matches( z.wrap( "" ) ) ) {
if ( details )
- details->elemMatchKey = z.fieldName();
+ details->_elemMatchKey = z.fieldName();
return 1;
}
}
@@ -725,21 +780,22 @@ namespace mongo {
else {
if ( valuesMatch( z, toMatch, compareOp, em) ) {
if ( details )
- details->elemMatchKey = z.fieldName();
+ details->_elemMatchKey = z.fieldName();
return 1;
}
}
}
+ // match an entire array to itself
if ( compareOp == BSONObj::Equality && e.woCompare( toMatch , false ) == 0 ) {
- // match an entire array to itself
return 1;
}
-
+ if ( compareOp == BSONObj::opIN && valuesMatch( e, toMatch, compareOp, em ) ) {
+ return 1;
+ }
}
else if ( e.eoo() ) {
- // 0 indicates "missing element"
return 0;
}
return -1;
@@ -754,56 +810,89 @@ namespace mongo {
could be slow sometimes. */
// check normal non-regex cases:
- for ( unsigned i = 0; i < basics.size(); i++ ) {
- ElementMatcher& bm = basics[i];
- BSONElement& m = bm.toMatch;
+ for ( unsigned i = 0; i < _basics.size(); i++ ) {
+ ElementMatcher& bm = _basics[i];
+ BSONElement& m = bm._toMatch;
// -1=mismatch. 0=missing element. 1=match
- int cmp = matchesDotted(m.fieldName(), m, jsobj, bm.compareOp, bm , false , details );
- if ( bm.compareOp != BSONObj::opEXISTS && bm.isNot )
+ int cmp = matchesDotted(m.fieldName(), m, jsobj, bm._compareOp, bm , false , details );
+ if ( cmp == 0 && bm._compareOp == BSONObj::opEXISTS ) {
+ // If missing, match cmp is opposite of $exists spec.
+ cmp = -retExistsFound(bm);
+ }
+ if ( bm._isNot )
cmp = -cmp;
if ( cmp < 0 )
return false;
if ( cmp == 0 ) {
/* missing is ok iff we were looking for null */
- if ( m.type() == jstNULL || m.type() == Undefined || ( bm.compareOp == BSONObj::opIN && bm.myset->count( staticNull.firstElement() ) > 0 ) ) {
- if ( ( bm.compareOp == BSONObj::NE ) ^ bm.isNot ) {
+ if ( m.type() == jstNULL || m.type() == Undefined || ( bm._compareOp == BSONObj::opIN && bm._myset->count( staticNull.firstElement() ) > 0 ) ) {
+ if ( ( bm._compareOp == BSONObj::NE ) ^ bm._isNot ) {
return false;
}
}
else {
- if ( !bm.isNot ) {
+ if ( !bm._isNot ) {
return false;
}
}
}
}
- for ( int r = 0; r < nRegex; r++ ) {
- RegexMatcher& rm = regexs[r];
+ for ( int r = 0; r < _nRegex; r++ ) {
+ RegexMatcher& rm = _regexs[r];
BSONElementSet s;
- if ( !constrainIndexKey_.isEmpty() ) {
- BSONElement e = jsobj.getFieldUsingIndexNames(rm.fieldName, constrainIndexKey_);
- if ( !e.eoo() )
+ if ( !_constrainIndexKey.isEmpty() ) {
+ BSONElement e = jsobj.getFieldUsingIndexNames(rm._fieldName, _constrainIndexKey);
+
+ // Should only have keys nested one deep here, for geo-indices
+ // TODO: future indices may nest deeper?
+ if( e.type() == Array ){
+ BSONObjIterator i( e.Obj() );
+ while( i.more() ){
+ s.insert( i.next() );
+ }
+ }
+ else if ( !e.eoo() )
s.insert( e );
+
}
else {
- jsobj.getFieldsDotted( rm.fieldName, s );
+ jsobj.getFieldsDotted( rm._fieldName, s );
}
bool match = false;
for( BSONElementSet::const_iterator i = s.begin(); i != s.end(); ++i )
if ( regexMatches(rm, *i) )
match = true;
- if ( !match ^ rm.isNot )
+ if ( !match ^ rm._isNot )
return false;
}
+ if ( _orDedupConstraints.size() > 0 ) {
+ for( vector< shared_ptr< FieldRangeVector > >::const_iterator i = _orDedupConstraints.begin();
+ i != _orDedupConstraints.end(); ++i ) {
+ if ( (*i)->matches( jsobj ) ) {
+ return false;
+ }
+ }
+ }
+
+ if ( _andMatchers.size() > 0 ) {
+ for( list< shared_ptr< Matcher > >::const_iterator i = _andMatchers.begin();
+ i != _andMatchers.end(); ++i ) {
+ // SERVER-3192 Track field matched using details the same as for
+ // top level fields, at least for now.
+ if ( !(*i)->matches( jsobj, details ) ) {
+ return false;
+ }
+ }
+ }
+
if ( _orMatchers.size() > 0 ) {
bool match = false;
for( list< shared_ptr< Matcher > >::const_iterator i = _orMatchers.begin();
i != _orMatchers.end(); ++i ) {
// SERVER-205 don't submit details - we don't want to track field
- // matched within $or, and at this point we've already loaded the
- // whole document
+ // matched within $or
if ( (*i)->matches( jsobj ) ) {
match = true;
break;
@@ -818,40 +907,30 @@ namespace mongo {
for( list< shared_ptr< Matcher > >::const_iterator i = _norMatchers.begin();
i != _norMatchers.end(); ++i ) {
// SERVER-205 don't submit details - we don't want to track field
- // matched within $nor, and at this point we've already loaded the
- // whole document
+ // matched within $nor
if ( (*i)->matches( jsobj ) ) {
return false;
}
}
}
- for( vector< shared_ptr< FieldRangeVector > >::const_iterator i = _orConstraints.begin();
- i != _orConstraints.end(); ++i ) {
- if ( (*i)->matches( jsobj ) ) {
- return false;
- }
- }
-
- if ( where ) {
- if ( where->func == 0 ) {
+ if ( _where ) {
+ if ( _where->func == 0 ) {
uassert( 10070 , "$where compile error", false);
return false; // didn't compile
}
- if ( where->jsScope ) {
- where->scope->init( where->jsScope );
+ if ( _where->jsScope ) {
+ _where->scope->init( _where->jsScope );
}
- where->scope->setThis( const_cast< BSONObj * >( &jsobj ) );
- where->scope->setObject( "obj", const_cast< BSONObj & >( jsobj ) );
- where->scope->setBoolean( "fullObject" , true ); // this is a hack b/c fullObject used to be relevant
+ _where->scope->setObject( "obj", const_cast< BSONObj & >( jsobj ) );
+ _where->scope->setBoolean( "fullObject" , true ); // this is a hack b/c fullObject used to be relevant
- int err = where->scope->invoke( where->func , BSONObj() , 1000 * 60 , false );
- where->scope->setThis( 0 );
+ int err = _where->scope->invoke( _where->func , 0, &jsobj , 1000 * 60 , false );
if ( err == -3 ) { // INVOKE_ERROR
stringstream ss;
ss << "error on invocation of $where function:\n"
- << where->scope->getError();
+ << _where->scope->getError();
uassert( 10071 , ss.str(), false);
return false;
}
@@ -859,38 +938,45 @@ namespace mongo {
uassert( 10072 , "unknown error in invocation of $where function", false);
return false;
}
- return where->scope->getBoolean( "return" ) != 0;
+ return _where->scope->getBoolean( "return" ) != 0;
}
return true;
}
- bool Matcher::hasType( BSONObj::MatchType type ) const {
- for ( unsigned i=0; i<basics.size() ; i++ )
- if ( basics[i].compareOp == type )
- return true;
- return false;
- }
-
- bool Matcher::sameCriteriaCount( const Matcher &other ) const {
- if ( !( basics.size() == other.basics.size() && nRegex == other.nRegex && !where == !other.where ) ) {
+ bool Matcher::keyMatch( const Matcher &docMatcher ) const {
+ // Quick check certain non key match cases.
+ if ( docMatcher._all
+ || docMatcher._haveSize
+ || docMatcher._hasArray // We can't match an array to its first indexed element using keymatch
+ || docMatcher._haveNeg ) {
+ return false;
+ }
+
+ // Check that all match components are available in the index matcher.
+ if ( !( _basics.size() == docMatcher._basics.size() && _nRegex == docMatcher._nRegex && !docMatcher._where ) ) {
+ return false;
+ }
+ if ( _andMatchers.size() != docMatcher._andMatchers.size() ) {
return false;
}
- if ( _norMatchers.size() != other._norMatchers.size() ) {
+ if ( _orMatchers.size() != docMatcher._orMatchers.size() ) {
return false;
}
- if ( _orMatchers.size() != other._orMatchers.size() ) {
+ if ( docMatcher._norMatchers.size() > 0 ) {
return false;
}
- if ( _orConstraints.size() != other._orConstraints.size() ) {
+ if ( docMatcher._orDedupConstraints.size() > 0 ) {
return false;
}
+
+ // Recursively check that all submatchers support key match.
{
- list< shared_ptr< Matcher > >::const_iterator i = _norMatchers.begin();
- list< shared_ptr< Matcher > >::const_iterator j = other._norMatchers.begin();
- while( i != _norMatchers.end() ) {
- if ( !(*i)->sameCriteriaCount( **j ) ) {
+ list< shared_ptr< Matcher > >::const_iterator i = _andMatchers.begin();
+ list< shared_ptr< Matcher > >::const_iterator j = docMatcher._andMatchers.begin();
+ while( i != _andMatchers.end() ) {
+ if ( !(*i)->keyMatch( **j ) ) {
return false;
}
++i; ++j;
@@ -898,14 +984,16 @@ namespace mongo {
}
{
list< shared_ptr< Matcher > >::const_iterator i = _orMatchers.begin();
- list< shared_ptr< Matcher > >::const_iterator j = other._orMatchers.begin();
+ list< shared_ptr< Matcher > >::const_iterator j = docMatcher._orMatchers.begin();
while( i != _orMatchers.end() ) {
- if ( !(*i)->sameCriteriaCount( **j ) ) {
+ if ( !(*i)->keyMatch( **j ) ) {
return false;
}
++i; ++j;
}
}
+ // Nor matchers and or dedup constraints aren't created for index matchers,
+ // so no need to check those here.
return true;
}
diff --git a/db/matcher.h b/db/matcher.h
index d242df6..82ef5cc 100644
--- a/db/matcher.h
+++ b/db/matcher.h
@@ -21,7 +21,7 @@
#pragma once
#include "jsobj.h"
-#include <pcrecpp.h>
+#include "pcrecpp.h"
namespace mongo {
@@ -32,13 +32,13 @@ namespace mongo {
class RegexMatcher {
public:
- const char *fieldName;
- const char *regex;
- const char *flags;
- string prefix;
- shared_ptr< pcrecpp::RE > re;
- bool isNot;
- RegexMatcher() : isNot() {}
+ const char *_fieldName;
+ const char *_regex;
+ const char *_flags;
+ string _prefix;
+ shared_ptr< pcrecpp::RE > _re;
+ bool _isNot;
+ RegexMatcher() : _isNot() {}
};
struct element_lt {
@@ -57,27 +57,27 @@ namespace mongo {
ElementMatcher() {
}
- ElementMatcher( BSONElement _e , int _op, bool _isNot );
+ ElementMatcher( BSONElement e , int op, bool isNot );
- ElementMatcher( BSONElement _e , int _op , const BSONObj& array, bool _isNot );
+ ElementMatcher( BSONElement e , int op , const BSONObj& array, bool isNot );
~ElementMatcher() { }
- BSONElement toMatch;
- int compareOp;
- bool isNot;
- shared_ptr< set<BSONElement,element_lt> > myset;
- shared_ptr< vector<RegexMatcher> > myregex;
+ BSONElement _toMatch;
+ int _compareOp;
+ bool _isNot;
+ shared_ptr< set<BSONElement,element_lt> > _myset;
+ shared_ptr< vector<RegexMatcher> > _myregex;
// these are for specific operators
- int mod;
- int modm;
- BSONType type;
+ int _mod;
+ int _modm;
+ BSONType _type;
- shared_ptr<Matcher> subMatcher;
- bool subMatcherOnPrimitives ;
+ shared_ptr<Matcher> _subMatcher;
+ bool _subMatcherOnPrimitives ;
- vector< shared_ptr<Matcher> > allMatchers;
+ vector< shared_ptr<Matcher> > _allMatchers;
};
class Where; // used for $where javascript eval
@@ -89,19 +89,19 @@ namespace mongo {
}
void reset() {
- loadedObject = false;
- elemMatchKey = 0;
+ _loadedObject = false;
+ _elemMatchKey = 0;
}
string toString() const {
stringstream ss;
- ss << "loadedObject: " << loadedObject << " ";
- ss << "elemMatchKey: " << ( elemMatchKey ? elemMatchKey : "NULL" ) << " ";
+ ss << "loadedObject: " << _loadedObject << " ";
+ ss << "elemMatchKey: " << ( _elemMatchKey ? _elemMatchKey : "NULL" ) << " ";
return ss.str();
}
- bool loadedObject;
- const char * elemMatchKey; // warning, this may go out of scope if matched object does
+ bool _loadedObject;
+ const char * _elemMatchKey; // warning, this may go out of scope if matched object does
};
/* Match BSON objects against a query pattern.
@@ -134,45 +134,44 @@ namespace mongo {
return op <= BSONObj::LTE ? -1 : 1;
}
- Matcher(const BSONObj &pattern, bool subMatcher = false);
+ Matcher(const BSONObj &pattern, bool nested=false);
~Matcher();
bool matches(const BSONObj& j, MatchDetails * details = 0 );
- // fast rough check to see if we must load the real doc - we also
- // compare field counts against covereed index matcher; for $or clauses
- // we just compare field counts
- bool keyMatch() const { return !all && !haveSize && !hasArray && !haveNeg; }
-
bool atomic() const { return _atomic; }
- bool hasType( BSONObj::MatchType type ) const;
-
string toString() const {
- return jsobj.toString();
+ return _jsobj.toString();
}
- void addOrConstraint( const shared_ptr< FieldRangeVector > &frv ) {
- _orConstraints.push_back( frv );
+ void addOrDedupConstraint( const shared_ptr< FieldRangeVector > &frv ) {
+ _orDedupConstraints.push_back( frv );
}
void popOrClause() {
_orMatchers.pop_front();
}
- bool sameCriteriaCount( const Matcher &other ) const;
-
+ /**
+ * @return true if this key matcher will return the same true/false
+ * value as the provided doc matcher.
+ */
+ bool keyMatch( const Matcher &docMatcher ) const;
+
private:
- // Only specify constrainIndexKey if matches() will be called with
- // index keys having empty string field names.
- Matcher( const Matcher &other, const BSONObj &constrainIndexKey );
+ /**
+ * Generate a matcher for the provided index key format using the
+ * provided full doc matcher.
+ */
+ Matcher( const Matcher &docMatcher, const BSONObj &constrainIndexKey );
void addBasic(const BSONElement &e, int c, bool isNot) {
// TODO May want to selectively ignore these element types based on op type.
if ( e.type() == MinKey || e.type() == MaxKey )
return;
- basics.push_back( ElementMatcher( e , c, isNot ) );
+ _basics.push_back( ElementMatcher( e , c, isNot ) );
}
void addRegex(const char *fieldName, const char *regex, const char *flags, bool isNot = false);
@@ -180,17 +179,19 @@ namespace mongo {
int valuesMatch(const BSONElement& l, const BSONElement& r, int op, const ElementMatcher& bm);
- bool parseOrNor( const BSONElement &e, bool subMatcher );
- void parseOr( const BSONElement &e, bool subMatcher, list< shared_ptr< Matcher > > &matchers );
+ bool parseClause( const BSONElement &e );
+ void parseExtractedClause( const BSONElement &e, list< shared_ptr< Matcher > > &matchers );
- Where *where; // set if query uses $where
- BSONObj jsobj; // the query pattern. e.g., { name: "joe" }
- BSONObj constrainIndexKey_;
- vector<ElementMatcher> basics;
- bool haveSize;
- bool all;
- bool hasArray;
- bool haveNeg;
+ void parseMatchExpressionElement( const BSONElement &e, bool nested );
+
+ Where *_where; // set if query uses $where
+ BSONObj _jsobj; // the query pattern. e.g., { name: "joe" }
+ BSONObj _constrainIndexKey;
+ vector<ElementMatcher> _basics;
+ bool _haveSize;
+ bool _all;
+ bool _hasArray;
+ bool _haveNeg;
/* $atomic - if true, a multi document operation (some removes, updates)
should be done atomically. in that case, we do not yield -
@@ -199,14 +200,15 @@ namespace mongo {
*/
bool _atomic;
- RegexMatcher regexs[4];
- int nRegex;
+ RegexMatcher _regexs[4];
+ int _nRegex;
// so we delete the mem when we're done:
vector< shared_ptr< BSONObjBuilder > > _builders;
+ list< shared_ptr< Matcher > > _andMatchers;
list< shared_ptr< Matcher > > _orMatchers;
list< shared_ptr< Matcher > > _norMatchers;
- vector< shared_ptr< FieldRangeVector > > _orConstraints;
+ vector< shared_ptr< FieldRangeVector > > _orDedupConstraints;
friend class CoveredIndexMatcher;
};
@@ -216,7 +218,13 @@ namespace mongo {
public:
CoveredIndexMatcher(const BSONObj &pattern, const BSONObj &indexKeyPattern , bool alwaysUseRecord=false );
bool matches(const BSONObj &o) { return _docMatcher->matches( o ); }
- bool matches(const BSONObj &key, const DiskLoc &recLoc , MatchDetails * details = 0 , bool keyUsable = true );
+ bool matchesWithSingleKeyIndex(const BSONObj &key, const DiskLoc &recLoc , MatchDetails * details = 0 ) {
+ return matches( key, recLoc, details, true );
+ }
+ /**
+ * This is the preferred method for matching against a cursor, as it
+ * can handle both multi and single key cursors.
+ */
bool matchesCurrent( Cursor * cursor , MatchDetails * details = 0 );
bool needRecord() { return _needRecord; }
@@ -224,7 +232,7 @@ namespace mongo {
// once this is called, shouldn't use this matcher for matching any more
void advanceOrClause( const shared_ptr< FieldRangeVector > &frv ) {
- _docMatcher->addOrConstraint( frv );
+ _docMatcher->addOrDedupConstraint( frv );
// TODO this is not yet optimal. Since we could skip an entire
// or clause (if a match is impossible) between calls to advanceOrClause()
// we may not pop all the clauses we can.
@@ -234,15 +242,17 @@ namespace mongo {
CoveredIndexMatcher *nextClauseMatcher( const BSONObj &indexKeyPattern, bool alwaysUseRecord=false ) {
return new CoveredIndexMatcher( _docMatcher, indexKeyPattern, alwaysUseRecord );
}
+
+ string toString() const;
+
private:
+ bool matches(const BSONObj &key, const DiskLoc &recLoc , MatchDetails * details = 0 , bool keyUsable = true );
CoveredIndexMatcher(const shared_ptr< Matcher > &docMatcher, const BSONObj &indexKeyPattern , bool alwaysUseRecord=false );
void init( bool alwaysUseRecord );
shared_ptr< Matcher > _docMatcher;
Matcher _keyMatcher;
bool _needRecord; // if the key itself isn't good enough to determine a positive match
- bool _needRecordReject; // if the key itself isn't good enough to determine a negative match
- bool _useRecordOnly;
};
} // namespace mongo
diff --git a/db/matcher_covered.cpp b/db/matcher_covered.cpp
index 18892be..52164f5 100644
--- a/db/matcher_covered.cpp
+++ b/db/matcher_covered.cpp
@@ -46,22 +46,24 @@ namespace mongo {
void CoveredIndexMatcher::init( bool alwaysUseRecord ) {
_needRecord =
alwaysUseRecord ||
- ! ( _docMatcher->keyMatch() &&
- _keyMatcher.sameCriteriaCount( *_docMatcher ) );
-
- _needRecordReject = _keyMatcher.hasType( BSONObj::opEXISTS );
+ !_keyMatcher.keyMatch( *_docMatcher );
}
bool CoveredIndexMatcher::matchesCurrent( Cursor * cursor , MatchDetails * details ) {
// bool keyUsable = ! cursor->isMultiKey() && check for $orish like conditions in matcher SERVER-1264
- return matches( cursor->currKey() , cursor->currLoc() , details );
+ return matches( cursor->currKey() , cursor->currLoc() , details ,
+                        !cursor->indexKeyPattern().isEmpty() // key is unusable if the cursor is unindexed
+                        && !cursor->isMultiKey() // ... or multikey
+ );
}
bool CoveredIndexMatcher::matches(const BSONObj &key, const DiskLoc &recLoc , MatchDetails * details , bool keyUsable ) {
+ dassert( key.isValid() );
+
if ( details )
details->reset();
- if ( _needRecordReject == false && keyUsable ) {
+ if ( keyUsable ) {
if ( !_keyMatcher.matches(key, details ) ) {
return false;
@@ -74,10 +76,24 @@ namespace mongo {
}
if ( details )
- details->loadedObject = true;
+ details->_loadedObject = true;
return _docMatcher->matches(recLoc.obj() , details );
}
-
+ string CoveredIndexMatcher::toString() const {
+ StringBuilder buf;
+ buf << "(CoveredIndexMatcher ";
+
+ if ( _needRecord )
+ buf << "needRecord ";
+
+ buf << "keyMatcher: " << _keyMatcher.toString() << " ";
+
+ if ( _docMatcher )
+ buf << "docMatcher: " << _docMatcher->toString() << " ";
+
+ buf << ")";
+ return buf.str();
+ }
}
diff --git a/db/modules/mms.cpp b/db/modules/mms.cpp
index b180262..40abb39 100644
--- a/db/modules/mms.cpp
+++ b/db/modules/mms.cpp
@@ -20,7 +20,7 @@
#include "../db.h"
#include "../instance.h"
#include "../module.h"
-#include "../../util/httpclient.h"
+#include "../../util/net/httpclient.h"
#include "../../util/background.h"
#include "../commands.h"
@@ -142,7 +142,7 @@ namespace mongo {
string errmsg;
BSONObjBuilder sub;
- if ( ! c->run( "admin.$cmd" , co , errmsg , sub , false ) )
+ if ( ! c->run( "admin.$cmd" , co , 0 , errmsg , sub , false ) )
postData.append( cmd , errmsg );
else
postData.append( cmd , sub.obj() );
diff --git a/db/mongommf.cpp b/db/mongommf.cpp
index 5ae573d..7c77ef8 100644
--- a/db/mongommf.cpp
+++ b/db/mongommf.cpp
@@ -53,7 +53,7 @@ namespace mongo {
break;
size_t viewStart = (size_t) x.first;
- size_t viewEnd = viewStart + mmf->length();
+ size_t viewEnd = (size_t) (viewStart + mmf->length());
if( viewEnd <= chunkStart )
break;
@@ -68,7 +68,7 @@ namespace mongo {
bool ok = VirtualProtect((void*)protectStart, protectSize, PAGE_WRITECOPY, &old);
if( !ok ) {
DWORD e = GetLastError();
- log() << "VirtualProtect failed " << chunkno << hex << protectStart << ' ' << protectSize << ' ' << errnoWithDescription(e) << endl;
+ log() << "VirtualProtect failed (mcw) " << mmf->filename() << ' ' << chunkno << hex << protectStart << ' ' << protectSize << ' ' << errnoWithDescription(e) << endl;
assert(false);
}
}
@@ -76,95 +76,16 @@ namespace mongo {
writable.set(chunkno);
}
- __declspec(noinline) void makeChunkWritableOld(size_t chunkno) {
- scoped_lock lk(mapViewMutex);
-
- if( writable.get(chunkno) )
- return;
-
- size_t loc = chunkno * MemoryMappedFile::ChunkSize;
- void *Loc = (void*) loc;
- size_t ofs;
- MongoMMF *mmf = privateViews.find( (void *) (loc), ofs );
- MemoryMappedFile *f = (MemoryMappedFile*) mmf;
- assert(f);
-
- size_t len = MemoryMappedFile::ChunkSize;
- assert( mmf->getView() <= Loc );
- if( ofs + len > f->length() ) {
- // at the very end of the map
- len = f->length() - ofs;
- }
- else {
- ;
- }
-
- // todo: check this goes away on remap
- DWORD old;
- bool ok = VirtualProtect(Loc, len, PAGE_WRITECOPY, &old);
- if( !ok ) {
- DWORD e = GetLastError();
- log() << "VirtualProtect failed " << Loc << ' ' << len << ' ' << errnoWithDescription(e) << endl;
- assert(false);
- }
-
- writable.set(chunkno);
- }
-
- // align so that there is only one map per chunksize so our bitset works right
- void* mapaligned(HANDLE h, unsigned long long _len) {
- void *loc = 0;
- int n = 0;
- while( 1 ) {
- n++;
- void *m = MapViewOfFileEx(h, FILE_MAP_READ, 0, 0, 0, loc);
- if( m == 0 ) {
- DWORD e = GetLastError();
- if( n == 0 ) {
- // if first fails, it isn't going to work
- log() << "mapaligned errno: " << e << endl;
- break;
- }
- if( debug && n == 1 ) {
- log() << "mapaligned info e:" << e << " at n=1" << endl;
- }
- if( n > 98 ) {
- log() << "couldn't align mapped view of file len:" << _len/1024.0/1024.0 << "MB errno:" << e << endl;
- break;
- }
- loc = (void*) (((size_t)loc)+MemoryMappedFile::ChunkSize);
- continue;
- }
-
- size_t x = (size_t) m;
- if( x % MemoryMappedFile::ChunkSize == 0 ) {
- void *end = (void*) (x+_len);
- DEV log() << "mapaligned " << m << '-' << end << " len:" << _len << endl;
- return m;
- }
-
- UnmapViewOfFile(m);
- x = ((x+MemoryMappedFile::ChunkSize-1) / MemoryMappedFile::ChunkSize) * MemoryMappedFile::ChunkSize;
- loc = (void*) x;
- if( n % 20 == 0 ) {
- log() << "warning mapaligned n=20" << endl;
- }
- if( n > 100 ) {
- log() << "couldn't align mapped view of file len:" << _len/1024.0/1024.0 << "MB" << endl;
- break;
- }
- }
- return 0;
- }
-
void* MemoryMappedFile::createPrivateMap() {
assert( maphandle );
scoped_lock lk(mapViewMutex);
- //void *p = mapaligned(maphandle, len);
void *p = MapViewOfFile(maphandle, FILE_MAP_READ, 0, 0, 0);
if ( p == 0 ) {
DWORD e = GetLastError();
- log() << "createPrivateMap failed " << filename() << " " << errnoWithDescription(e) << endl;
+ log() << "createPrivateMap failed " << filename() << " " <<
+ errnoWithDescription(e) << " filelen:" << len <<
+ ((sizeof(void*) == 4 ) ? " (32 bit build)" : "") <<
+ endl;
}
else {
clearWritableBits(p);
@@ -180,7 +101,17 @@ namespace mongo {
scoped_lock lk(mapViewMutex);
clearWritableBits(oldPrivateAddr);
-
+#if 1
+ // https://jira.mongodb.org/browse/SERVER-2942
+ DWORD old;
+ bool ok = VirtualProtect(oldPrivateAddr, (SIZE_T) len, PAGE_READONLY, &old);
+ if( !ok ) {
+ DWORD e = GetLastError();
+ log() << "VirtualProtect failed in remapPrivateView " << filename() << hex << oldPrivateAddr << ' ' << len << ' ' << errnoWithDescription(e) << endl;
+ assert(false);
+ }
+ return oldPrivateAddr;
+#else
if( !UnmapViewOfFile(oldPrivateAddr) ) {
DWORD e = GetLastError();
log() << "UnMapViewOfFile failed " << filename() << ' ' << errnoWithDescription(e) << endl;
@@ -199,6 +130,7 @@ namespace mongo {
}
assert(p == oldPrivateAddr);
return p;
+#endif
}
#endif
@@ -351,7 +283,7 @@ namespace mongo {
if( cmdLine.dur ) {
_view_private = createPrivateMap();
if( _view_private == 0 ) {
- massert( 13636 , "createPrivateMap failed (look in log for error)" , false );
+ msgasserted(13636, str::stream() << "file " << filename() << " open/create failed in createPrivateMap (look in log for more information)");
}
privateViews.add(_view_private, this); // note that testIntent builds use this, even though it points to view_write then...
}
@@ -376,14 +308,12 @@ namespace mongo {
}
/*virtual*/ void MongoMMF::close() {
- {
- if( cmdLine.dur && _view_write/*actually was opened*/ ) {
- if( debug )
- log() << "closingFileNotication:" << filename() << endl;
- dur::closingFileNotification();
- }
- privateViews.remove(_view_private);
+ if( cmdLine.dur && _view_write/*actually was opened*/ ) {
+ dur::closingFileNotification();
}
+
+ RWLockRecursive::Exclusive lk(mmmutex);
+ privateViews.remove(_view_private);
_view_write = _view_private = 0;
MemoryMappedFile::close();
}
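
A minimal Windows-only sketch of the SERVER-2942 workaround adopted in remapPrivateView() above: rather than unmapping the private view and racing other allocations for the same address range, the existing mapping is re-protected in place, so the address is guaranteed to survive. Assumes a view created read-only as in createPrivateMap():

#include <windows.h>
#include <cassert>

void* resetPrivateView(void* viewAddr, SIZE_T viewLen) {
    DWORD oldProtection;
    // flip the pages back to read-only in place; the real code logs
    // GetLastError() and aborts if this fails
    BOOL ok = VirtualProtect(viewAddr, viewLen, PAGE_READONLY, &oldProtection);
    assert(ok);
    return viewAddr;   // same address, unlike the UnmapViewOfFile/MapViewOfFileEx path
}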
diff --git a/db/mongommf.h b/db/mongommf.h
index 5da46fc..0c4e8e4 100644
--- a/db/mongommf.h
+++ b/db/mongommf.h
@@ -27,6 +27,9 @@ namespace mongo {
not this.
*/
class MongoMMF : private MemoryMappedFile {
+ protected:
+ virtual void* viewForFlushing() { return _view_write; }
+
public:
MongoMMF();
virtual ~MongoMMF();
@@ -72,7 +75,7 @@ namespace mongo {
fileSuffixNo() is 3
if the suffix is "ns", fileSuffixNo -1
*/
- RelativePath relativePath() const {
+ const RelativePath& relativePath() const {
DEV assert( !_p._p.empty() );
return _p;
}
diff --git a/db/namespace-inl.h b/db/namespace-inl.h
index a777ff8..a621a22 100644
--- a/db/namespace-inl.h
+++ b/db/namespace-inl.h
@@ -71,21 +71,23 @@ namespace mongo {
}
inline IndexDetails& NamespaceDetails::idx(int idxNo, bool missingExpected ) {
- if( idxNo < NIndexesBase )
- return _indexes[idxNo];
+ if( idxNo < NIndexesBase ) {
+ IndexDetails& id = _indexes[idxNo];
+ return id;
+ }
Extra *e = extra();
if ( ! e ) {
if ( missingExpected )
throw MsgAssertionException( 13283 , "Missing Extra" );
- massert(13282, "missing Extra", e);
+ massert(14045, "missing Extra", e);
}
int i = idxNo - NIndexesBase;
if( i >= NIndexesExtra ) {
e = e->next(this);
if ( ! e ) {
if ( missingExpected )
- throw MsgAssertionException( 13283 , "missing extra" );
- massert(13283, "missing Extra", e);
+ throw MsgAssertionException( 14823 , "missing extra" );
+ massert(14824, "missing Extra", e);
}
i -= NIndexesExtra;
}
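
For reference, idx() above is a two-tier lookup: the first NIndexesBase slots live inline in NamespaceDetails, and later slots live in chained Extra blocks of NIndexesExtra entries each. A standalone sketch of that walk, with plain containers standing in for IndexDetails storage:

#include <cassert>
#include <vector>

const int NIndexesBase  = 10;   // inline slots (sizes here are illustrative)
const int NIndexesExtra = 30;   // slots per chained Extra block

struct Extra {
    std::vector<int> slots;     // stands in for the IndexDetails array
    Extra* next;
};

int lookupIdx(const std::vector<int>& base, Extra* e, int idxNo) {
    if (idxNo < NIndexesBase)
        return base[idxNo];             // tier 1: inline array
    int i = idxNo - NIndexesBase;
    while (i >= NIndexesExtra) {        // tier 2+: hop along the Extra chain
        assert(e);                      // the real code masserts "missing Extra"
        e = e->next;
        i -= NIndexesExtra;
    }
    assert(e);
    return e->slots[i];
}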
diff --git a/db/namespace.cpp b/db/namespace.cpp
index 0cb0e74..2bc7409 100644
--- a/db/namespace.cpp
+++ b/db/namespace.cpp
@@ -25,9 +25,11 @@
#include "btree.h"
#include <algorithm>
#include <list>
-#include "query.h"
#include "queryutil.h"
#include "json.h"
+#include "ops/delete.h"
+#include "ops/query.h"
+
namespace mongo {
@@ -91,7 +93,7 @@ namespace mongo {
boost::filesystem::path dir( dir_ );
dir /= database_;
if ( !boost::filesystem::exists( dir ) )
- BOOST_CHECK_EXCEPTION( boost::filesystem::create_directory( dir ) );
+ MONGO_BOOST_CHECK_EXCEPTION_WITH_MSG( boost::filesystem::create_directory( dir ), "create dir for db " );
}
unsigned lenForNewNsFiles = 16 * 1024 * 1024;
@@ -99,7 +101,7 @@ namespace mongo {
#if defined(_DEBUG)
void NamespaceDetails::dump(const Namespace& k) {
if( !cmdLine.dur )
- cout << "ns offsets which follow will not display correctly with --dur disabled" << endl;
+ cout << "ns offsets which follow will not display correctly with --journal disabled" << endl;
size_t ofs = 1; // 1 is sentinel that the find call below failed
privateViews.find(this, /*out*/ofs);
@@ -253,7 +255,11 @@ namespace mongo {
}
}
- // lenToAlloc is WITH header
+ /** allocate space for a new record from deleted lists.
+ @param lenToAlloc is WITH header
+ @param extentLoc OUT returns the extent location
+ @return null diskloc if no room - allocate a new extent then
+ */
DiskLoc NamespaceDetails::alloc(const char *ns, int lenToAlloc, DiskLoc& extentLoc) {
lenToAlloc = (lenToAlloc + 3) & 0xfffffffc;
DiskLoc loc = _alloc(ns, lenToAlloc);
@@ -568,8 +574,8 @@ namespace mongo {
/* ------------------------------------------------------------------------- */
- mongo::mutex NamespaceDetailsTransient::_qcMutex("qc");
- mongo::mutex NamespaceDetailsTransient::_isMutex("is");
+ SimpleMutex NamespaceDetailsTransient::_qcMutex("qc");
+ SimpleMutex NamespaceDetailsTransient::_isMutex("is");
map< string, shared_ptr< NamespaceDetailsTransient > > NamespaceDetailsTransient::_map;
typedef map< string, shared_ptr< NamespaceDetailsTransient > >::iterator ouriter;
@@ -627,7 +633,7 @@ namespace mongo {
options: { capped : ..., size : ... }
*/
void addNewNamespaceToCatalog(const char *ns, const BSONObj *options = 0) {
- log(1) << "New namespace: " << ns << '\n';
+ LOG(1) << "New namespace: " << ns << endl;
if ( strstr(ns, "system.namespaces") ) {
// system.namespaces holds all the others, so it is not explicitly listed in the catalog.
// TODO: fix above should not be strstr!
@@ -643,6 +649,9 @@ namespace mongo {
char database[256];
nsToDatabase(ns, database);
string s = database;
+ if( cmdLine.configsvr && (s != "config" && s != "admin") ) {
+ uasserted(14037, "can't create user databases on a --configsvr instance");
+ }
s += ".system.namespaces";
theDataFileMgr.insert(s.c_str(), j.objdata(), j.objsize(), true);
}
@@ -711,14 +720,14 @@ namespace mongo {
newIndexSpecB << "ns" << to;
}
BSONObj newIndexSpec = newIndexSpecB.done();
- DiskLoc newIndexSpecLoc = theDataFileMgr.insert( s.c_str(), newIndexSpec.objdata(), newIndexSpec.objsize(), true, BSONElement(), false );
+ DiskLoc newIndexSpecLoc = theDataFileMgr.insert( s.c_str(), newIndexSpec.objdata(), newIndexSpec.objsize(), true, false );
int indexI = details->findIndexByName( oldIndexSpec.getStringField( "name" ) );
IndexDetails &indexDetails = details->idx(indexI);
string oldIndexNs = indexDetails.indexNamespace();
indexDetails.info = newIndexSpecLoc;
string newIndexNs = indexDetails.indexNamespace();
- BtreeBucket::renameIndexNamespace( oldIndexNs.c_str(), newIndexNs.c_str() );
+ renameIndexNamespace( oldIndexNs.c_str(), newIndexNs.c_str() );
deleteObjects( s.c_str(), oldIndexSpec.getOwned(), true, false, true );
}
}
diff --git a/db/namespace.h b/db/namespace.h
index ef3d04e..3dfb3f3 100644
--- a/db/namespace.h
+++ b/db/namespace.h
@@ -20,7 +20,7 @@
#include "../pch.h"
#include "jsobj.h"
-#include "queryutil.h"
+#include "querypattern.h"
#include "diskloc.h"
#include "../util/hashtab.h"
#include "mongommf.h"
@@ -44,6 +44,21 @@ namespace mongo {
NamespaceString( const string& ns ) { init(ns.c_str()); }
string ns() const { return db + '.' + coll; }
bool isSystem() const { return strncmp(coll.c_str(), "system.", 7) == 0; }
+
+ /**
+ * @return true if ns is 'normal'. A '$' is used in namespaces holding index data, which do not contain BSON objects in their records.
+ * local.oplog.$main is special-cased as normal -- naming it with a '$' was a mistake.
+ */
+ static bool normal(const char* ns) {
+ const char *p = strchr(ns, '$');
+ if( p == 0 )
+ return true;
+ return strcmp( ns, "local.oplog.$main" ) == 0;
+ }
+
+ static bool special(const char *ns) {
+ return !normal(ns) || strstr(ns, ".system.");
+ }
private:
void init(const char *ns) {
const char *p = strchr(ns, '.');
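
The classification helpers added above, reduced to a runnable sketch with sample namespaces (the logic is copied from normal()/special()):

#include <cstring>
#include <iostream>

bool normal(const char* ns) {
    if (strchr(ns, '$') == 0)
        return true;
    return strcmp(ns, "local.oplog.$main") == 0;   // the grandfathered exception
}

bool special(const char* ns) {
    return !normal(ns) || strstr(ns, ".system.");
}

int main() {
    std::cout << normal("test.foo")                // 1: plain collection
              << normal("test.foo.$x_1")           // 0: index data namespace
              << normal("local.oplog.$main")       // 1: special-cased despite the $
              << special("admin.system.users")     // 1: system collection
              << std::endl;
}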
@@ -67,6 +82,9 @@ namespace mongo {
bool operator==(const char *r) const { return strcmp(buf, r) == 0; }
bool operator==(const Namespace& r) const { return strcmp(buf, r.buf) == 0; }
int hash() const; // value returned is always > 0
+
+ size_t size() const { return strlen( buf ); }
+
string toString() const { return (string) buf; }
operator string() const { return (string) buf; }
@@ -93,8 +111,8 @@ namespace mongo {
namespace mongo {
- /** @return true if a client can modify this namespace
- things like *.system.users
+ /** @return true if a client can modify this namespace even though it is under ".system."
+ For example <dbname>.system.users is ok for regular clients to update.
@param write used when .system.js
*/
bool legalClientSystemNS( const string& ns , bool write );
@@ -154,7 +172,7 @@ namespace mongo {
unsigned long long reservedA;
long long extraOffset; // where the $extra info is located (bytes relative to this)
public:
- int indexBuildInProgress; // 1 if in prog
+ int indexBuildInProgress; // 1 if in prog
unsigned reservedB;
// ofs 424 (8)
struct Capped2 {
@@ -302,13 +320,17 @@ namespace mongo {
void paddingFits() {
double x = paddingFactor - 0.01;
- if ( x >= 1.0 )
- getDur().setNoJournal(&paddingFactor, &x, sizeof(x));
+ if ( x >= 1.0 ) {
+ *getDur().writing(&paddingFactor) = x;
+ //getDur().setNoJournal(&paddingFactor, &x, sizeof(x));
+ }
}
void paddingTooSmall() {
double x = paddingFactor + 0.6;
- if ( x <= 2.0 )
- getDur().setNoJournal(&paddingFactor, &x, sizeof(x));
+ if ( x <= 2.0 ) {
+ *getDur().writing(&paddingFactor) = x;
+ //getDur().setNoJournal(&paddingFactor, &x, sizeof(x));
+ }
}
// @return offset in indexes[]
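
A tiny simulation of the padding heuristic above (constants from the source: +0.6 when an update forced a record move, -0.01 when an update fit in place, clamped to [1.0, 2.0]); the change in this commit only swaps the unjournaled write for a journaled *getDur().writing(&paddingFactor) = x:

#include <cstdio>

static double paddingFactor = 1.0;

static void paddingTooSmall() { double x = paddingFactor + 0.6;  if (x <= 2.0) paddingFactor = x; }
static void paddingFits()     { double x = paddingFactor - 0.01; if (x >= 1.0) paddingFactor = x; }

int main() {
    paddingTooSmall();                           // one forced move: 1.0 -> 1.6
    for (int i = 0; i < 20; i++) paddingFits();  // twenty in-place updates: 1.6 -> 1.4
    printf("%.2f\n", paddingFactor);             // prints 1.40
}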
@@ -337,6 +359,10 @@ namespace mongo {
return -1;
}
+ bool haveIdIndex() {
+ return (flags & NamespaceDetails::Flag_HaveIdIndex) || findIdIndex() >= 0;
+ }
+
/* return which "deleted bucket" for this size object */
static int bucket(int n) {
for ( int i = 0; i < Buckets; i++ )
@@ -412,9 +438,12 @@ namespace mongo {
static std::map< string, shared_ptr< NamespaceDetailsTransient > > _map;
public:
NamespaceDetailsTransient(const char *ns) : _ns(ns), _keysComputed(false), _qcWriteCount() { }
+ private:
/* _get() is not threadsafe -- see get_inlock() comments */
static NamespaceDetailsTransient& _get(const char *ns);
- /* use get_w() when doing write operations */
+ public:
+ /* use get_w() when doing write operations. this is safe as there is only one write op at a time and it is exclusive of everything else.
+ for reads you must lock and then use get_inlock() instead. */
static NamespaceDetailsTransient& get_w(const char *ns) {
DEV assertInWriteLock();
return _get(ns);
@@ -427,6 +456,26 @@ namespace mongo {
static void clearForPrefix(const char *prefix);
static void eraseForPrefix(const char *prefix);
+ /**
+ * @return a cursor interface to the query optimizer. The implementation may
+ * utilize a single query plan or interleave results from multiple query
+ * plans before settling on a single query plan. Note that the schema of
+ * currKey() documents, the matcher(), and the isMultiKey() nature of the
+ * cursor may change over the course of iteration.
+ *
+ * @param order - If no index exists that satisfies this sort order, an
+ * empty shared_ptr will be returned.
+ *
+ * The returned cursor may @throw inside of advance() or recoverFromYield() in
+ * certain error cases, for example if a capped overrun occurred during a yield.
+ * This indicates that the cursor was unable to perform a complete scan.
+ *
+ * This is a work in progress. Partial list of features not yet implemented:
+ * - modification of scanned documents
+ * - covered indexes
+ */
+ static shared_ptr<Cursor> getCursor( const char *ns, const BSONObj &query, const BSONObj &order = BSONObj() );
+
/* indexKeys() cache ---------------------------------------------------- */
/* assumed to be in write lock for this */
private:
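
A hypothetical call site for the getCursor() entry point documented above; the collection name and query are illustrative. Per the comment, the returned pointer is empty when the requested order cannot be satisfied by any index, and advance()/recoverFromYield() may throw, e.g. on a capped overrun during a yield:

shared_ptr<Cursor> c = NamespaceDetailsTransient::getCursor(
        "test.people",                    // ns
        BSON( "age" << GT << 21 ),        // query
        BSON( "name" << 1 ) );            // order
if ( c ) {
    for ( ; c->ok(); c->advance() ) {
        // the key schema, matcher() and isMultiKey() may change while
        // iterating, so consult them per document rather than caching them
        if ( !c->matcher() || c->matcher()->matchesCurrent( c.get() ) ) {
            BSONObj doc = c->current();
            // ...
        }
    }
}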
@@ -447,12 +496,12 @@ namespace mongo {
/* IndexSpec caching */
private:
map<const IndexDetails*,IndexSpec> _indexSpecs;
- static mongo::mutex _isMutex;
+ static SimpleMutex _isMutex;
public:
const IndexSpec& getIndexSpec( const IndexDetails * details ) {
IndexSpec& spec = _indexSpecs[details];
if ( ! spec._finishedInit ) {
- scoped_lock lk(_isMutex);
+ SimpleMutex::scoped_lock lk(_isMutex);
if ( ! spec._finishedInit ) {
spec.reset( details );
assert( spec._finishedInit );
@@ -466,7 +515,7 @@ namespace mongo {
int _qcWriteCount;
map< QueryPattern, pair< BSONObj, long long > > _qcCache;
public:
- static mongo::mutex _qcMutex;
+ static SimpleMutex _qcMutex;
/* you must be in the qcMutex when calling this (and using the returned val): */
static NamespaceDetailsTransient& get_inlock(const char *ns) {
return _get(ns);
@@ -479,7 +528,7 @@ namespace mongo {
void notifyOfWriteOp() {
if ( _qcCache.empty() )
return;
- if ( ++_qcWriteCount >= 100 )
+ if ( ++_qcWriteCount >= 1000 )
clearQueryCache();
}
BSONObj indexForPattern( const QueryPattern &pattern ) {
@@ -564,6 +613,8 @@ namespace mongo {
boost::filesystem::path path() const;
+ unsigned long long fileLength() const { return f.length(); }
+
private:
void maybeMkdir() const;
diff --git a/db/nonce.cpp b/db/nonce.cpp
index 6f35c79..379e88f 100644
--- a/db/nonce.cpp
+++ b/db/nonce.cpp
@@ -23,7 +23,9 @@ extern int do_md5_test(void);
namespace mongo {
- BOOST_STATIC_ASSERT( sizeof(nonce) == 8 );
+ BOOST_STATIC_ASSERT( sizeof(nonce64) == 8 );
+
+ static Security security; // needs to be static so _initialized is preset to false (see initsafe below)
Security::Security() {
static int n;
@@ -31,7 +33,7 @@ namespace mongo {
init();
}
- void Security::init() {
+ NOINLINE_DECL void Security::init() {
if( _initialized ) return;
_initialized = true;
@@ -39,7 +41,7 @@ namespace mongo {
_devrandom = new ifstream("/dev/urandom", ios::binary|ios::in);
massert( 10353 , "can't open dev/urandom", _devrandom->is_open() );
#elif defined(_WIN32)
- srand(curTimeMicros());
+ srand(curTimeMicros()); // perhaps not relevant for rand_s, but we might want it elsewhere anyway
#else
srandomdev();
#endif
@@ -50,21 +52,12 @@ namespace mongo {
#endif
}
- nonce Security::getNonce() {
- static mongo::mutex m("getNonce");
- scoped_lock lk(m);
-
- if ( ! _initialized )
- init();
-
- /* question/todo: /dev/random works on OS X. is it better
- to use that than random() / srandom()?
- */
-
- nonce n;
+ nonce64 Security::__getNonce() {
+ dassert( _initialized );
+ nonce64 n;
#if defined(__linux__) || defined(__sunos__) || defined(__APPLE__)
_devrandom->read((char*)&n, sizeof(n));
- massert( 10355 , "devrandom failed", !_devrandom->fail());
+ massert(10355 , "devrandom failed", !_devrandom->fail());
#elif defined(_WIN32)
unsigned a=0, b=0;
assert( rand_s(&a) == 0 );
@@ -75,9 +68,28 @@ namespace mongo {
#endif
return n;
}
- unsigned getRandomNumber() { return (unsigned) security.getNonce(); }
- bool Security::_initialized;
- Security security;
+ SimpleMutex nonceMutex("nonce");
+ nonce64 Security::_getNonce() {
+ // the mutex lives at namespace scope: a function-local static would make gcc emit an implicit guard mutex around its initialization, which costs time
+ SimpleMutex::scoped_lock lk(nonceMutex);
+ if( !_initialized )
+ init();
+ return __getNonce();
+ }
+
+ nonce64 Security::getNonceDuringInit() {
+ // the mutex might not be inited yet. init phase should be one thread anyway (hopefully we don't spawn threads therein)
+ if( !security._initialized )
+ security.init();
+ return security.__getNonce();
+ }
+
+ nonce64 Security::getNonce() {
+ return security._getNonce();
+ }
+
+ // the name warns callers this might be a little slow (see code above)
+ unsigned goodRandomNumberSlow() { return (unsigned) Security::getNonce(); }
} // namespace mongo
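
The shape of the initialization-order workaround above, as a standalone sketch: ordinary callers lock and lazily init, while code running inside global constructors (where the namespace-scope mutex may not be constructed yet, but only one thread exists) takes the unlocked path:

#include <mutex>

struct Rng {
    bool initialized;
    Rng() : initialized(false) {}
    void init() { if (!initialized) { initialized = true; /* seed, open /dev/urandom, ... */ } }
    unsigned long long getUnlocked() { /* assumes init() already ran */ return 42; }
};

static Rng rng;
static std::mutex rngMutex;

unsigned long long getNonce() {            // normal path: lock, then lazily init
    std::lock_guard<std::mutex> lk(rngMutex);
    rng.init();
    return rng.getUnlocked();
}

unsigned long long getNonceDuringInit() {  // global-constructor path: single threaded, no lock
    rng.init();
    return rng.getUnlocked();
}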
diff --git a/db/nonce.h b/db/nonce.h
index 21592ab..d6a147a 100644
--- a/db/nonce.h
+++ b/db/nonce.h
@@ -1,4 +1,4 @@
-// nonce.h
+// @file nonce.h
/* Copyright 2009 10gen Inc.
*
@@ -19,24 +19,18 @@
namespace mongo {
- typedef unsigned long long nonce;
+ typedef unsigned long long nonce64;
struct Security {
Security();
-
- nonce getNonce();
-
- /** safe during global var initialization */
- nonce getNonceInitSafe() {
- init();
- return getNonce();
- }
+ static nonce64 getNonce();
+ static nonce64 getNonceDuringInit(); // use this version during global var constructors
private:
+ nonce64 _getNonce();
+ nonce64 __getNonce();
ifstream *_devrandom;
- static bool _initialized;
+ bool _initialized;
void init(); // can call more than once
};
- extern Security security;
-
} // namespace mongo
diff --git a/db/oplog.cpp b/db/oplog.cpp
index 1557cbd..dc9db76 100644
--- a/db/oplog.cpp
+++ b/db/oplog.cpp
@@ -23,6 +23,12 @@
#include "commands.h"
#include "repl/rs.h"
#include "stats/counters.h"
+#include "../util/file.h"
+#include "../util/unittest.h"
+#include "queryoptimizer.h"
+#include "ops/update.h"
+#include "ops/delete.h"
+#include "ops/query.h"
namespace mongo {
@@ -113,10 +119,12 @@ namespace mongo {
*b = EOO;
}
+ // global is safe as we are in write lock. we put the static outside the function to avoid the implicit mutex
+ // the compiler would use if inside the function. the reason this is static is to avoid a malloc/free for this
+ // on every logop call.
+ static BufBuilder logopbufbuilder(8*1024);
static void _logOpRS(const char *opstr, const char *ns, const char *logNS, const BSONObj& obj, BSONObj *o2, bool *bb ) {
DEV assertInWriteLock();
- // ^- static is safe as we are in write lock
- static BufBuilder bufbuilder(8*1024);
if ( strncmp(ns, "local.", 6) == 0 ) {
if ( strncmp(ns, "local.slaves", 12) == 0 )
@@ -125,7 +133,6 @@ namespace mongo {
}
const OpTime ts = OpTime::now();
-
long long hashNew;
if( theReplSet ) {
massert(13312, "replSet error : logOp() but not primary?", theReplSet->box.getState().primary());
@@ -141,12 +148,10 @@ namespace mongo {
instead we do a single copy to the destination position in the memory mapped file.
*/
- bufbuilder.reset();
- BSONObjBuilder b(bufbuilder);
-
+ logopbufbuilder.reset();
+ BSONObjBuilder b(logopbufbuilder);
b.appendTimestamp("ts", ts.asDate());
b.append("h", hashNew);
-
b.append("op", opstr);
b.append("ns", ns);
if ( bb )
@@ -361,7 +366,7 @@ namespace mongo {
sz = (256-64) * 1000 * 1000;
#else
sz = 990.0 * 1000 * 1000;
- boost::intmax_t free = freeSpace(); //-1 if call not supported.
+ boost::intmax_t free = File::freeSpace(dbpath); //-1 if call not supported.
double fivePct = free * 0.05;
if ( fivePct > sz )
sz = fivePct;
@@ -389,11 +394,151 @@ namespace mongo {
// -------------------------------------
- struct TestOpTime {
- TestOpTime() {
+ FindingStartCursor::FindingStartCursor( const QueryPlan & qp ) :
+ _qp( qp ),
+ _findingStart( true ),
+ _findingStartMode(),
+ _findingStartTimer( 0 )
+ { init(); }
+
+ void FindingStartCursor::next() {
+ if ( !_findingStartCursor || !_findingStartCursor->ok() ) {
+ _findingStart = false;
+ _c = _qp.newCursor(); // on error, start from beginning
+ destroyClientCursor();
+ return;
+ }
+ switch( _findingStartMode ) {
+ // Initial mode: scan backwards from end of collection
+ case Initial: {
+ if ( !_matcher->matchesCurrent( _findingStartCursor->c() ) ) {
+ _findingStart = false; // found first record out of query range, so scan normally
+ _c = _qp.newCursor( _findingStartCursor->currLoc() );
+ destroyClientCursor();
+ return;
+ }
+ _findingStartCursor->advance();
+ RARELY {
+ if ( _findingStartTimer.seconds() >= __findingStartInitialTimeout ) {
+ // If we've scanned enough, switch to find extent mode.
+ createClientCursor( extentFirstLoc( _findingStartCursor->currLoc() ) );
+ _findingStartMode = FindExtent;
+ return;
+ }
+ }
+ return;
+ }
+ // FindExtent mode: moving backwards through extents, check first
+ // document of each extent.
+ case FindExtent: {
+ if ( !_matcher->matchesCurrent( _findingStartCursor->c() ) ) {
+ _findingStartMode = InExtent;
+ return;
+ }
+ DiskLoc prev = prevExtentFirstLoc( _findingStartCursor->currLoc() );
+ if ( prev.isNull() ) { // hit beginning, so start scanning from here
+ createClientCursor();
+ _findingStartMode = InExtent;
+ return;
+ }
+ // There might be a more efficient implementation than creating new cursor & client cursor each time,
+ // not worrying about that for now
+ createClientCursor( prev );
+ return;
+ }
+ // InExtent mode: once an extent is chosen, find starting doc in the extent.
+ case InExtent: {
+ if ( _matcher->matchesCurrent( _findingStartCursor->c() ) ) {
+ _findingStart = false; // found first record in query range, so scan normally
+ _c = _qp.newCursor( _findingStartCursor->currLoc() );
+ destroyClientCursor();
+ return;
+ }
+ _findingStartCursor->advance();
+ return;
+ }
+ default: {
+ massert( 14038, "invalid _findingStartMode", false );
+ }
+ }
+ }
+
+ DiskLoc FindingStartCursor::extentFirstLoc( const DiskLoc &rec ) {
+ Extent *e = rec.rec()->myExtent( rec );
+ if ( !_qp.nsd()->capLooped() || ( e->myLoc != _qp.nsd()->capExtent ) )
+ return e->firstRecord;
+ // Likely we are on the fresh side of capExtent, so return first fresh record.
+ // If we are on the stale side of capExtent, then the collection is small and it
+ // doesn't matter if we start the extent scan with capFirstNewRecord.
+ return _qp.nsd()->capFirstNewRecord;
+ }
+
+ void wassertExtentNonempty( const Extent *e ) {
+ // TODO ensure this requirement is clearly enforced, or fix.
+ wassert( !e->firstRecord.isNull() );
+ }
+
+ DiskLoc FindingStartCursor::prevExtentFirstLoc( const DiskLoc &rec ) {
+ Extent *e = rec.rec()->myExtent( rec );
+ if ( _qp.nsd()->capLooped() ) {
+ if ( e->xprev.isNull() ) {
+ e = _qp.nsd()->lastExtent.ext();
+ }
+ else {
+ e = e->xprev.ext();
+ }
+ if ( e->myLoc != _qp.nsd()->capExtent ) {
+ wassertExtentNonempty( e );
+ return e->firstRecord;
+ }
+ }
+ else {
+ if ( !e->xprev.isNull() ) {
+ e = e->xprev.ext();
+ wassertExtentNonempty( e );
+ return e->firstRecord;
+ }
+ }
+ return DiskLoc(); // reached beginning of collection
+ }
+
+ void FindingStartCursor::createClientCursor( const DiskLoc &startLoc ) {
+ shared_ptr<Cursor> c = _qp.newCursor( startLoc );
+ _findingStartCursor.reset( new ClientCursor(QueryOption_NoCursorTimeout, c, _qp.ns()) );
+ }
+
+ bool FindingStartCursor::firstDocMatchesOrEmpty() const {
+ shared_ptr<Cursor> c = _qp.newCursor();
+ return !c->ok() || _matcher->matchesCurrent( c.get() );
+ }
+
+ void FindingStartCursor::init() {
+ BSONElement tsElt = _qp.originalQuery()[ "ts" ];
+ massert( 13044, "no ts field in query", !tsElt.eoo() );
+ BSONObjBuilder b;
+ b.append( tsElt );
+ BSONObj tsQuery = b.obj();
+ _matcher.reset(new CoveredIndexMatcher(tsQuery, _qp.indexKey()));
+ if ( firstDocMatchesOrEmpty() ) {
+ _c = _qp.newCursor();
+ _findingStart = false;
+ return;
+ }
+ // Use a ClientCursor here so we can release db mutex while scanning
+ // oplog (can take quite a while with large oplogs).
+ shared_ptr<Cursor> c = _qp.newReverseCursor();
+ _findingStartCursor.reset( new ClientCursor(QueryOption_NoCursorTimeout, c, _qp.ns(), BSONObj()) );
+ _findingStartTimer.reset();
+ _findingStartMode = Initial;
+ }
+
+ // -------------------------------------
+
+ struct TestOpTime : public UnitTest {
+ void run() {
OpTime t;
for ( int i = 0; i < 10; i++ ) {
- OpTime s = OpTime::now();
+ OpTime s = OpTime::now_inlock();
assert( s != t );
t = s;
}
@@ -481,18 +626,23 @@ namespace mongo {
}
void applyOperation_inlock(const BSONObj& op , bool fromRepl ) {
+ assertInWriteLock();
+ LOG(6) << "applying op: " << op << endl;
+
OpCounters * opCounters = fromRepl ? &replOpCounters : &globalOpCounters;
- if( logLevel >= 6 )
- log() << "applying op: " << op << endl;
+ const char *names[] = { "o", "ns", "op", "b" };
+ BSONElement fields[4];
+ op.getFields(4, names, fields);
- assertInWriteLock();
+ BSONObj o;
+ if( fields[0].isABSONObj() )
+ o = fields[0].embeddedObject();
+
+ const char *ns = fields[1].valuestrsafe();
- OpDebug debug;
- BSONObj o = op.getObjectField("o");
- const char *ns = op.getStringField("ns");
// operation type -- see logOp() comments for types
- const char *opType = op.getStringField("op");
+ const char *opType = fields[2].valuestrsafe();
if ( *opType == 'i' ) {
opCounters->gotInsert();
@@ -505,57 +655,53 @@ namespace mongo {
}
else {
// do upserts for inserts as we might get replayed more than once
+ OpDebug debug;
BSONElement _id;
if( !o.getObjectID(_id) ) {
/* No _id. This will be very slow. */
Timer t;
- updateObjects(ns, o, o, true, false, false , debug );
+ updateObjects(ns, o, o, true, false, false, debug );
if( t.millis() >= 2 ) {
RARELY OCCASIONALLY log() << "warning, repl doing slow updates (no _id field) for " << ns << endl;
}
}
else {
- BSONObjBuilder b;
- b.append(_id);
-
/* erh 10/16/2009 - this is probably not relevant any more since its auto-created, but not worth removing */
RARELY ensureHaveIdIndex(ns); // otherwise updates will be slow
/* todo : it may be better to do an insert here, and then catch the dup key exception and do update
then. very few upserts will not be inserts...
*/
+ BSONObjBuilder b;
+ b.append(_id);
updateObjects(ns, o, b.done(), true, false, false , debug );
}
}
}
else if ( *opType == 'u' ) {
opCounters->gotUpdate();
-
RARELY ensureHaveIdIndex(ns); // otherwise updates will be super slow
- updateObjects(ns, o, op.getObjectField("o2"), /*upsert*/ op.getBoolField("b"), /*multi*/ false, /*logop*/ false , debug );
+ OpDebug debug;
+ updateObjects(ns, o, op.getObjectField("o2"), /*upsert*/ fields[3].booleanSafe(), /*multi*/ false, /*logop*/ false , debug );
}
else if ( *opType == 'd' ) {
opCounters->gotDelete();
-
if ( opType[1] == 0 )
- deleteObjects(ns, o, op.getBoolField("b"));
+ deleteObjects(ns, o, /*justOne*/ fields[3].booleanSafe());
else
assert( opType[1] == 'b' ); // "db" advertisement
}
- else if ( *opType == 'n' ) {
- // no op
- }
else if ( *opType == 'c' ) {
opCounters->gotCommand();
-
BufBuilder bb;
BSONObjBuilder ob;
_runCommands(ns, o, bb, ob, true, 0);
}
+ else if ( *opType == 'n' ) {
+ // no op
+ }
else {
- stringstream ss;
- ss << "unknown opType [" << opType << "]";
- throw MsgAssertionException( 13141 , ss.str() );
+ throw MsgAssertionException( 14825 , ErrorMsg("error in applyOperation : unknown opType ", *opType) );
}
}
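
For reference, the shape of the entries dispatched above, sketched with fromjson() from db/json.h; logOp() is the authoritative writer, and the ts/h fields are elided here. "b" carries the upsert flag for op:"u" and justOne for op:"d":

BSONObj ins  = fromjson( "{ op: 'i', ns: 'test.foo', o: { _id: 1, x: 5 } }" );
BSONObj upd  = fromjson( "{ op: 'u', ns: 'test.foo', o: { $set: { x: 6 } }, o2: { _id: 1 }, b: true }" );
BSONObj del  = fromjson( "{ op: 'd', ns: 'test.foo', o: { _id: 1 }, b: true }" );
BSONObj noop = fromjson( "{ op: 'n', ns: '', o: {} }" );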
@@ -566,9 +712,9 @@ namespace mongo {
virtual LockType locktype() const { return WRITE; }
ApplyOpsCmd() : Command( "applyOps" ) {}
virtual void help( stringstream &help ) const {
- help << "examples: { applyOps : [ ] , preCondition : [ { ns : ... , q : ... , res : ... } ] }";
+ help << "internal (sharding)\n{ applyOps : [ ] , preCondition : [ { ns : ... , q : ... , res : ... } ] }";
}
- virtual bool run(const string& dbname, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ virtual bool run(const string& dbname, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
if ( cmdObj.firstElement().type() != Array ) {
errmsg = "ops has to be an array";
diff --git a/db/oplog.h b/db/oplog.h
index d9073ab..2f2b286 100644
--- a/db/oplog.h
+++ b/db/oplog.h
@@ -26,8 +26,7 @@
#include "pdfile.h"
#include "db.h"
#include "dbhelpers.h"
-#include "query.h"
-#include "queryoptimizer.h"
+#include "clientcursor.h"
#include "../client/dbclient.h"
#include "../util/optime.h"
#include "../util/timer.h"
@@ -64,82 +63,41 @@ namespace mongo {
extern int __findingStartInitialTimeout; // configurable for testing
+ class QueryPlan;
+
+ /** Implements an optimized procedure for finding the first op in the oplog. */
class FindingStartCursor {
public:
- FindingStartCursor( const QueryPlan & qp ) :
- _qp( qp ),
- _findingStart( true ),
- _findingStartMode(),
- _findingStartTimer( 0 )
- { init(); }
+
+ /**
+ * The cursor will attempt to find the first op in the oplog matching the
+ * 'ts' field of the qp's query.
+ */
+ FindingStartCursor( const QueryPlan & qp );
+
+ /** @return true if the first matching op in the oplog has been found. */
bool done() const { return !_findingStart; }
- shared_ptr<Cursor> cRelease() { return _c; }
- void next() {
- if ( !_findingStartCursor || !_findingStartCursor->ok() ) {
- _findingStart = false;
- _c = _qp.newCursor(); // on error, start from beginning
- destroyClientCursor();
- return;
- }
- switch( _findingStartMode ) {
- case Initial: {
- if ( !_matcher->matches( _findingStartCursor->currKey(), _findingStartCursor->currLoc() ) ) {
- _findingStart = false; // found first record out of query range, so scan normally
- _c = _qp.newCursor( _findingStartCursor->currLoc() );
- destroyClientCursor();
- return;
- }
- _findingStartCursor->advance();
- RARELY {
- if ( _findingStartTimer.seconds() >= __findingStartInitialTimeout ) {
- createClientCursor( startLoc( _findingStartCursor->currLoc() ) );
- _findingStartMode = FindExtent;
- return;
- }
- }
- return;
- }
- case FindExtent: {
- if ( !_matcher->matches( _findingStartCursor->currKey(), _findingStartCursor->currLoc() ) ) {
- _findingStartMode = InExtent;
- return;
- }
- DiskLoc prev = prevLoc( _findingStartCursor->currLoc() );
- if ( prev.isNull() ) { // hit beginning, so start scanning from here
- createClientCursor();
- _findingStartMode = InExtent;
- return;
- }
- // There might be a more efficient implementation than creating new cursor & client cursor each time,
- // not worrying about that for now
- createClientCursor( prev );
- return;
- }
- case InExtent: {
- if ( _matcher->matches( _findingStartCursor->currKey(), _findingStartCursor->currLoc() ) ) {
- _findingStart = false; // found first record in query range, so scan normally
- _c = _qp.newCursor( _findingStartCursor->currLoc() );
- destroyClientCursor();
- return;
- }
- _findingStartCursor->advance();
- return;
- }
- default: {
- massert( 12600, "invalid _findingStartMode", false );
- }
- }
- }
+
+ /** @return cursor pointing to the first matching op, if done(). */
+ shared_ptr<Cursor> cursor() { verify( 14835, done() ); return _c; }
+
+ /** Iterate the cursor, to continue trying to find matching op. */
+ void next();
+
+ /** Yield cursor, if not done(). */
bool prepareToYield() {
if ( _findingStartCursor ) {
return _findingStartCursor->prepareToYield( _yieldData );
}
- return true;
+ return false;
}
+
+ /** Recover from cursor yield. */
void recoverFromYield() {
if ( _findingStartCursor ) {
if ( !ClientCursor::recoverFromYield( _yieldData ) ) {
_findingStartCursor.reset( 0 );
+ msgassertedNoTrace( 15889, "FindingStartCursor::recoverFromYield() failed to recover" );
}
}
}
@@ -153,56 +111,15 @@ namespace mongo {
ClientCursor::CleanupPointer _findingStartCursor;
shared_ptr<Cursor> _c;
ClientCursor::YieldData _yieldData;
- DiskLoc startLoc( const DiskLoc &rec ) {
- Extent *e = rec.rec()->myExtent( rec );
- if ( !_qp.nsd()->capLooped() || ( e->myLoc != _qp.nsd()->capExtent ) )
- return e->firstRecord;
- // Likely we are on the fresh side of capExtent, so return first fresh record.
- // If we are on the stale side of capExtent, then the collection is small and it
- // doesn't matter if we start the extent scan with capFirstNewRecord.
- return _qp.nsd()->capFirstNewRecord;
- }
+ DiskLoc extentFirstLoc( const DiskLoc &rec );
- // should never have an empty extent in the oplog, so don't worry about that case
- DiskLoc prevLoc( const DiskLoc &rec ) {
- Extent *e = rec.rec()->myExtent( rec );
- if ( _qp.nsd()->capLooped() ) {
- if ( e->xprev.isNull() )
- e = _qp.nsd()->lastExtent.ext();
- else
- e = e->xprev.ext();
- if ( e->myLoc != _qp.nsd()->capExtent )
- return e->firstRecord;
- }
- else {
- if ( !e->xprev.isNull() ) {
- e = e->xprev.ext();
- return e->firstRecord;
- }
- }
- return DiskLoc(); // reached beginning of collection
- }
- void createClientCursor( const DiskLoc &startLoc = DiskLoc() ) {
- shared_ptr<Cursor> c = _qp.newCursor( startLoc );
- _findingStartCursor.reset( new ClientCursor(QueryOption_NoCursorTimeout, c, _qp.ns()) );
- }
+ DiskLoc prevExtentFirstLoc( const DiskLoc &rec );
+ void createClientCursor( const DiskLoc &startLoc = DiskLoc() );
void destroyClientCursor() {
_findingStartCursor.reset( 0 );
}
- void init() {
- // Use a ClientCursor here so we can release db mutex while scanning
- // oplog (can take quite a while with large oplogs).
- shared_ptr<Cursor> c = _qp.newReverseCursor();
- _findingStartCursor.reset( new ClientCursor(QueryOption_NoCursorTimeout, c, _qp.ns(), BSONObj()) );
- _findingStartTimer.reset();
- _findingStartMode = Initial;
- BSONElement tsElt = _qp.originalQuery()[ "ts" ];
- massert( 13044, "no ts field in query", !tsElt.eoo() );
- BSONObjBuilder b;
- b.append( tsElt );
- BSONObj tsQuery = b.obj();
- _matcher.reset(new CoveredIndexMatcher(tsQuery, _qp.indexKey()));
- }
+ void init();
+ bool firstDocMatchesOrEmpty() const;
};
void pretouchOperation(const BSONObj& op);
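
The three-phase strategy behind FindingStartCursor above, reduced to a runnable toy: records are grouped into "extents", the search hops backwards over whole extents comparing only each extent's first timestamp (FindExtent), then scans forward inside the chosen extent (InExtent). The record-by-record Initial phase and the timer that triggers the switch to FindExtent are elided:

#include <iostream>
#include <vector>

// Returns the first "op" with timestamp >= ts, or -1 if none.
int findStart(const std::vector<std::vector<int> >& extents, int ts) {
    size_t e = extents.size();
    while (e > 0 && extents[e - 1].front() >= ts)    // FindExtent: hop backwards,
        --e;                                         // probing first records only
    if (e > 0)
        --e;                                         // first extent starting before ts
    for (size_t i = e; i < extents.size(); ++i)      // InExtent: scan forward
        for (size_t j = 0; j < extents[i].size(); ++j)
            if (extents[i][j] >= ts)
                return extents[i][j];
    return -1;
}

int main() {
    std::vector<std::vector<int> > oplog;
    int a[] = {1,3,5}, b[] = {7,9}, c[] = {11,13,15};
    oplog.push_back(std::vector<int>(a, a+3));
    oplog.push_back(std::vector<int>(b, b+2));
    oplog.push_back(std::vector<int>(c, c+3));
    std::cout << findStart(oplog, 8) << std::endl;   // prints 9
}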
diff --git a/db/oplogreader.h b/db/oplogreader.h
index 54c90d9..01f76f4 100644
--- a/db/oplogreader.h
+++ b/db/oplogreader.h
@@ -12,8 +12,8 @@ namespace mongo {
still fairly awkward but a start.
*/
class OplogReader {
- auto_ptr<DBClientConnection> _conn;
- auto_ptr<DBClientCursor> cursor;
+ shared_ptr<DBClientConnection> _conn;
+ shared_ptr<DBClientCursor> cursor;
public:
OplogReader() {
@@ -40,6 +40,9 @@ namespace mongo {
/* ok to call if already connected */
bool connect(string hostname);
+ bool connect(const BSONObj& rid, const int from, const string& to);
+
+
void tailCheck() {
if( cursor.get() && cursor->isDead() ) {
log() << "repl: old cursor isDead, will initiate a new one" << endl;
@@ -51,25 +54,39 @@ namespace mongo {
void query(const char *ns, const BSONObj& query) {
assert( !haveCursor() );
- cursor = _conn->query(ns, query, 0, 0, 0, QueryOption_SlaveOk);
+ cursor.reset( _conn->query(ns, query, 0, 0, 0, QueryOption_SlaveOk).release() );
+ }
+
+ void queryGTE(const char *ns, OpTime t) {
+ BSONObjBuilder q;
+ q.appendDate("$gte", t.asDate());
+ BSONObjBuilder q2;
+ q2.append("ts", q.done());
+ query(ns, q2.done());
}
- void tailingQuery(const char *ns, const BSONObj& query) {
+ void tailingQuery(const char *ns, const BSONObj& query, const BSONObj* fields=0) {
assert( !haveCursor() );
log(2) << "repl: " << ns << ".find(" << query.toString() << ')' << endl;
- cursor = _conn->query( ns, query, 0, 0, 0,
- QueryOption_CursorTailable | QueryOption_SlaveOk | QueryOption_OplogReplay |
- /* TODO: slaveok maybe shouldn't use? */
- QueryOption_AwaitData
- );
+ cursor.reset( _conn->query( ns, query, 0, 0, fields,
+ QueryOption_CursorTailable | QueryOption_SlaveOk | QueryOption_OplogReplay |
+ /* TODO: slaveok maybe shouldn't use? */
+ QueryOption_AwaitData
+ ).release() );
}
- void tailingQueryGTE(const char *ns, OpTime t) {
+ void tailingQueryGTE(const char *ns, OpTime t, const BSONObj* fields=0) {
BSONObjBuilder q;
q.appendDate("$gte", t.asDate());
BSONObjBuilder query;
query.append("ts", q.done());
- tailingQuery(ns, query.done());
+ tailingQuery(ns, query.done(), fields);
+ }
+
+ /* Do a tailing query, but only send the ts field back. */
+ void ghostQueryGTE(const char *ns, OpTime t) {
+ const BSONObj fields = BSON("ts" << 1 << "_id" << 0);
+ return tailingQueryGTE(ns, t, &fields);
}
bool more() {
@@ -93,13 +110,13 @@ namespace mongo {
BSONObj nextSafe() { return cursor->nextSafe(); }
- BSONObj next() {
- return cursor->next();
- }
+ BSONObj next() { return cursor->next(); }
- void putBack(BSONObj op) {
- cursor->putBack(op);
- }
+ void putBack(BSONObj op) { cursor->putBack(op); }
+
+ private:
+ bool commonConnect(const string& hostName);
+ bool passthroughHandshake(const BSONObj& rid, const int f);
};
}
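
A hypothetical consumer of the tailing helpers above (host and resume point are illustrative). tailingQueryGTE() sends { ts: { $gte: <t> } } with the tailable/await-data options, and ghostQueryGTE() additionally projects { ts: 1, _id: 0 } so only timestamps cross the wire:

OplogReader r;
if ( r.connect( "replicahost:27017" ) ) {
    OpTime resumePoint;                          // default-constructed: start from the oplog's beginning
    r.tailingQueryGTE( "local.oplog.rs", resumePoint );
    while ( r.more() ) {
        BSONObj op = r.nextSafe();
        // apply or inspect op; putBack(op) re-queues it if it cannot be consumed yet
    }
}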
diff --git a/db/ops/delete.cpp b/db/ops/delete.cpp
new file mode 100644
index 0000000..3009047
--- /dev/null
+++ b/db/ops/delete.cpp
@@ -0,0 +1,242 @@
+// delete.cpp
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+#include "delete.h"
+#include "../queryoptimizer.h"
+#include "../oplog.h"
+
+namespace mongo {
+
+ // Just try to identify best plan.
+ class DeleteOp : public MultiCursor::CursorOp {
+ public:
+ DeleteOp( bool justOne, int& bestCount, int orClauseIndex = -1 ) :
+ justOne_( justOne ),
+ count_(),
+ bestCount_( bestCount ),
+ _nscanned(),
+ _orClauseIndex( orClauseIndex ) {
+ }
+ virtual void _init() {
+ c_ = qp().newCursor();
+ }
+ virtual bool prepareToYield() {
+ if ( _orClauseIndex > 0 ) {
+ return false;
+ }
+ if ( ! _cc ) {
+ _cc.reset( new ClientCursor( QueryOption_NoCursorTimeout , c_ , qp().ns() ) );
+ }
+ return _cc->prepareToYield( _yieldData );
+ }
+ virtual void recoverFromYield() {
+ if ( !ClientCursor::recoverFromYield( _yieldData ) ) {
+ _cc.reset();
+ c_.reset();
+ massert( 13340, "cursor dropped during delete", false );
+ }
+ }
+ virtual long long nscanned() {
+ return c_.get() ? c_->nscanned() : _nscanned;
+ }
+ virtual void next() {
+ if ( !c_->ok() ) {
+ setComplete();
+ return;
+ }
+
+ DiskLoc rloc = c_->currLoc();
+
+ if ( matcher( c_ )->matchesCurrent(c_.get()) ) {
+ if ( !c_->getsetdup(rloc) )
+ ++count_;
+ }
+
+ c_->advance();
+ _nscanned = c_->nscanned();
+
+ if ( _orClauseIndex > 0 && _nscanned >= 100 ) {
+ setComplete();
+ return;
+ }
+
+ if ( count_ > bestCount_ )
+ bestCount_ = count_;
+
+ if ( count_ > 0 ) {
+ if ( justOne_ )
+ setComplete();
+ else if ( _nscanned >= 100 && count_ == bestCount_ )
+ setComplete();
+ }
+ }
+ virtual bool mayRecordPlan() const { return !justOne_; }
+ virtual QueryOp *_createChild() const {
+ bestCount_ = 0; // should be safe to reset this in contexts where createChild() is called
+ return new DeleteOp( justOne_, bestCount_, _orClauseIndex + 1 );
+ }
+ virtual shared_ptr<Cursor> newCursor() const { return qp().newCursor(); }
+ private:
+ bool justOne_;
+ int count_;
+ int &bestCount_;
+ long long _nscanned;
+ shared_ptr<Cursor> c_;
+ ClientCursor::CleanupPointer _cc;
+ ClientCursor::YieldData _yieldData;
+ // Avoid yielding in the MultiPlanScanner when not the first $or clause - just a temporary implementation for now. SERVER-3555
+ int _orClauseIndex;
+ };
+
+ /* ns: namespace, e.g. <database>.<collection>
+ pattern: the "where" clause / criteria
+ justOne: stop after 1 match
+ god: allow access to system namespaces, and don't yield
+ */
+ long long deleteObjects(const char *ns, BSONObj pattern, bool justOneOrig, bool logop, bool god, RemoveSaver * rs ) {
+ if( !god ) {
+ if ( strstr(ns, ".system.") ) {
+ /* note a delete from system.indexes would corrupt the db
+ if done here, as there are pointers into those objects in
+ NamespaceDetails.
+ */
+ uassert(12050, "cannot delete from system namespace", legalClientSystemNS( ns , true ) );
+ }
+ if ( strchr( ns , '$' ) ) {
+ log() << "cannot delete from collection with reserved $ in name: " << ns << endl;
+ uassert( 10100 , "cannot delete from collection with reserved $ in name", strchr(ns, '$') == 0 );
+ }
+ }
+
+ {
+ NamespaceDetails *d = nsdetails( ns );
+ if ( ! d )
+ return 0;
+ uassert( 10101 , "can't remove from a capped collection" , ! d->capped );
+ }
+
+ long long nDeleted = 0;
+
+ int best = 0;
+ shared_ptr< MultiCursor::CursorOp > opPtr( new DeleteOp( justOneOrig, best ) );
+ shared_ptr< MultiCursor > creal( new MultiCursor( ns, pattern, BSONObj(), opPtr, !god ) );
+
+ if( !creal->ok() )
+ return nDeleted;
+
+ shared_ptr< Cursor > cPtr = creal;
+ auto_ptr<ClientCursor> cc( new ClientCursor( QueryOption_NoCursorTimeout, cPtr, ns) );
+ cc->setDoingDeletes( true );
+
+ CursorId id = cc->cursorid();
+
+ bool justOne = justOneOrig;
+ bool canYield = !god && !creal->matcher()->docMatcher().atomic();
+
+ do {
+ // TODO: we can generalize this I believe
+ //
+ bool willNeedRecord = creal->matcher()->needRecord() || pattern.isEmpty() || isSimpleIdQuery( pattern );
+ if ( ! willNeedRecord ) {
+ // TODO: this is a total hack right now
+ // check whether the index fully encompasses the query
+
+ if ( pattern.nFields() == 1 &&
+ str::equals( pattern.firstElement().fieldName() , creal->indexKeyPattern().firstElement().fieldName() ) )
+ willNeedRecord = true;
+ }
+
+ if ( canYield && ! cc->yieldSometimes( willNeedRecord ? ClientCursor::WillNeed : ClientCursor::MaybeCovered ) ) {
+ cc.release(); // has already been deleted elsewhere
+ // TODO should we assert or something?
+ break;
+ }
+ if ( !cc->ok() ) {
+ break; // if we yielded, could have hit the end
+ }
+
+ // this way we can avoid calling updateLocation() every time (expensive)
+ // and it handles some other nuances as well
+ cc->setDoingDeletes( true );
+
+ DiskLoc rloc = cc->currLoc();
+ BSONObj key = cc->currKey();
+
+ // NOTE Calling advance() may change the matcher, so it's important
+ // to try to match first.
+ bool match = creal->matcher()->matchesCurrent(creal.get());
+
+ if ( ! cc->advance() )
+ justOne = true;
+
+ if ( ! match )
+ continue;
+
+ assert( !cc->c()->getsetdup(rloc) ); // can't be a dup, we deleted it!
+
+ if ( !justOne ) {
+ /* NOTE: this is SLOW. this is not good, noteLocation() was designed to be called across getMore
+ blocks. here we might call millions of times which would be bad.
+ */
+ cc->c()->noteLocation();
+ }
+
+ if ( logop ) {
+ BSONElement e;
+ if( BSONObj( rloc.rec() ).getObjectID( e ) ) {
+ BSONObjBuilder b;
+ b.append( e );
+ bool replJustOne = true;
+ logOp( "d", ns, b.done(), 0, &replJustOne );
+ }
+ else {
+ problem() << "deleted object without id, not logging" << endl;
+ }
+ }
+
+ if ( rs )
+ rs->goingToDelete( rloc.obj() /*cc->c->current()*/ );
+
+ theDataFileMgr.deleteRecord(ns, rloc.rec(), rloc);
+ nDeleted++;
+ if ( justOne ) {
+ break;
+ }
+ cc->c()->checkLocation();
+
+ if( !god )
+ getDur().commitIfNeeded();
+
+ if( debug && god && nDeleted == 100 )
+ log() << "warning high number of deletes with god=true which could use significant memory" << endl;
+ }
+ while ( cc->ok() );
+
+ if ( cc.get() && ClientCursor::find( id , false ) == 0 ) {
+ // TODO: remove this and the id declaration above if this doesn't trigger
+ // if it does, then i'm very confused (ERH 06/2011)
+ error() << "this should be impossible" << endl;
+ printStackTrace();
+ cc.release();
+ }
+
+ return nDeleted;
+ }
+
+}
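
A hypothetical call into the relocated function (the caller is assumed to hold the write lock, as the remove code path does):

long long n = deleteObjects( "test.foo",
                             BSON( "status" << "stale" ),
                             /*justOne*/ false,
                             /*logop*/   true,     // record each delete in the oplog
                             /*god*/     false );  // normal namespace and yielding rules apply
log() << "removed " << n << " documents" << endl;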
diff --git a/db/ops/delete.h b/db/ops/delete.h
new file mode 100644
index 0000000..a74b7a6
--- /dev/null
+++ b/db/ops/delete.h
@@ -0,0 +1,33 @@
+// delete.h
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+#include "../../pch.h"
+#include "../jsobj.h"
+#include "../clientcursor.h"
+
+namespace mongo {
+
+ class RemoveSaver;
+
+ // Deletes documents in ns matching pattern and returns the number deleted. If justOne is true, at most one document is deleted.
+ long long deleteObjects(const char *ns, BSONObj pattern, bool justOne, bool logop = false, bool god=false, RemoveSaver * rs=0);
+
+
+}
diff --git a/db/query.cpp b/db/ops/query.cpp
index 671e714..cf4dc98 100644
--- a/db/query.cpp
+++ b/db/ops/query.cpp
@@ -18,24 +18,25 @@
#include "pch.h"
#include "query.h"
-#include "pdfile.h"
-#include "jsobjmanipulator.h"
-#include "../bson/util/builder.h"
+#include "../pdfile.h"
+#include "../jsobjmanipulator.h"
+#include "../../bson/util/builder.h"
#include <time.h>
-#include "introspect.h"
-#include "btree.h"
-#include "../util/lruishmap.h"
-#include "json.h"
-#include "repl.h"
-#include "replpair.h"
-#include "scanandorder.h"
-#include "security.h"
-#include "curop-inl.h"
-#include "commands.h"
-#include "queryoptimizer.h"
-#include "lasterror.h"
-#include "../s/d_logic.h"
-#include "repl_block.h"
+#include "../introspect.h"
+#include "../btree.h"
+#include "../../util/lruishmap.h"
+#include "../json.h"
+#include "../repl.h"
+#include "../replutil.h"
+#include "../scanandorder.h"
+#include "../security.h"
+#include "../curop-inl.h"
+#include "../commands.h"
+#include "../queryoptimizer.h"
+#include "../lasterror.h"
+#include "../../s/d_logic.h"
+#include "../repl_block.h"
+#include "../../server.h"
namespace mongo {
@@ -50,204 +51,14 @@ namespace mongo {
extern bool useCursors;
extern bool useHints;
- // Just try to identify best plan.
- class DeleteOp : public MultiCursor::CursorOp {
- public:
- DeleteOp( bool justOne, int& bestCount ) :
- justOne_( justOne ),
- count_(),
- bestCount_( bestCount ),
- _nscanned() {
- }
- virtual void _init() {
- c_ = qp().newCursor();
- }
- virtual bool prepareToYield() {
- if ( ! _cc ) {
- _cc.reset( new ClientCursor( QueryOption_NoCursorTimeout , c_ , qp().ns() ) );
- }
- return _cc->prepareToYield( _yieldData );
- }
- virtual void recoverFromYield() {
- if ( !ClientCursor::recoverFromYield( _yieldData ) ) {
- _cc.reset();
- c_.reset();
- massert( 13340, "cursor dropped during delete", false );
- }
- }
- virtual long long nscanned() {
- return c_.get() ? c_->nscanned() : _nscanned;
- }
- virtual void next() {
- if ( !c_->ok() ) {
- setComplete();
- return;
- }
-
- DiskLoc rloc = c_->currLoc();
-
- if ( matcher()->matches(c_->currKey(), rloc ) ) {
- if ( !c_->getsetdup(rloc) )
- ++count_;
- }
-
- c_->advance();
- _nscanned = c_->nscanned();
- if ( count_ > bestCount_ )
- bestCount_ = count_;
-
- if ( count_ > 0 ) {
- if ( justOne_ )
- setComplete();
- else if ( _nscanned >= 100 && count_ == bestCount_ )
- setComplete();
- }
- }
- virtual bool mayRecordPlan() const { return !justOne_; }
- virtual QueryOp *_createChild() const {
- bestCount_ = 0; // should be safe to reset this in contexts where createChild() is called
- return new DeleteOp( justOne_, bestCount_ );
- }
- virtual shared_ptr<Cursor> newCursor() const { return qp().newCursor(); }
- private:
- bool justOne_;
- int count_;
- int &bestCount_;
- long long _nscanned;
- shared_ptr<Cursor> c_;
- ClientCursor::CleanupPointer _cc;
- ClientCursor::YieldData _yieldData;
- };
-
- /* ns: namespace, e.g. <database>.<collection>
- pattern: the "where" clause / criteria
- justOne: stop after 1 match
- god: allow access to system namespaces, and don't yield
- */
- long long deleteObjects(const char *ns, BSONObj pattern, bool justOneOrig, bool logop, bool god, RemoveSaver * rs ) {
- if( !god ) {
- if ( strstr(ns, ".system.") ) {
- /* note a delete from system.indexes would corrupt the db
- if done here, as there are pointers into those objects in
- NamespaceDetails.
- */
- uassert(12050, "cannot delete from system namespace", legalClientSystemNS( ns , true ) );
- }
- if ( strchr( ns , '$' ) ) {
- log() << "cannot delete from collection with reserved $ in name: " << ns << endl;
- uassert( 10100 , "cannot delete from collection with reserved $ in name", strchr(ns, '$') == 0 );
- }
- }
-
- NamespaceDetails *d = nsdetails( ns );
- if ( ! d )
- return 0;
- uassert( 10101 , "can't remove from a capped collection" , ! d->capped );
-
- long long nDeleted = 0;
-
- int best = 0;
- shared_ptr< MultiCursor::CursorOp > opPtr( new DeleteOp( justOneOrig, best ) );
- shared_ptr< MultiCursor > creal( new MultiCursor( ns, pattern, BSONObj(), opPtr, !god ) );
-
- if( !creal->ok() )
- return nDeleted;
-
- shared_ptr< Cursor > cPtr = creal;
- auto_ptr<ClientCursor> cc( new ClientCursor( QueryOption_NoCursorTimeout, cPtr, ns) );
- cc->setDoingDeletes( true );
-
- CursorId id = cc->cursorid();
-
- bool justOne = justOneOrig;
- bool canYield = !god && !creal->matcher()->docMatcher().atomic();
-
- do {
- if ( canYield && ! cc->yieldSometimes() ) {
- cc.release(); // has already been deleted elsewhere
- // TODO should we assert or something?
- break;
- }
- if ( !cc->ok() ) {
- break; // if we yielded, could have hit the end
- }
-
- // this way we can avoid calling updateLocation() every time (expensive)
- // as well as some other nuances handled
- cc->setDoingDeletes( true );
-
- DiskLoc rloc = cc->currLoc();
- BSONObj key = cc->currKey();
-
- // NOTE Calling advance() may change the matcher, so it's important
- // to try to match first.
- bool match = creal->matcher()->matches( key , rloc );
-
- if ( ! cc->advance() )
- justOne = true;
-
- if ( ! match )
- continue;
-
- assert( !cc->c()->getsetdup(rloc) ); // can't be a dup, we deleted it!
-
- if ( !justOne ) {
- /* NOTE: this is SLOW. this is not good, noteLocation() was designed to be called across getMore
- blocks. here we might call millions of times which would be bad.
- */
- cc->c()->noteLocation();
- }
-
- if ( logop ) {
- BSONElement e;
- if( BSONObj( rloc.rec() ).getObjectID( e ) ) {
- BSONObjBuilder b;
- b.append( e );
- bool replJustOne = true;
- logOp( "d", ns, b.done(), 0, &replJustOne );
- }
- else {
- problem() << "deleted object without id, not logging" << endl;
- }
- }
-
- if ( rs )
- rs->goingToDelete( rloc.obj() /*cc->c->current()*/ );
-
- theDataFileMgr.deleteRecord(ns, rloc.rec(), rloc);
- nDeleted++;
- if ( justOne ) {
- break;
- }
- cc->c()->checkLocation();
-
- if( !god )
- getDur().commitIfNeeded();
-
- if( debug && god && nDeleted == 100 )
- log() << "warning high number of deletes with god=true which could use significant memory" << endl;
- }
- while ( cc->ok() );
-
- if ( cc.get() && ClientCursor::find( id , false ) == 0 ) {
- cc.release();
- }
-
- return nDeleted;
- }
-
- int otherTraceLevel = 0;
-
- int initialExtentSize(int len);
-
bool runCommands(const char *ns, BSONObj& jsobj, CurOp& curop, BufBuilder &b, BSONObjBuilder& anObjBuilder, bool fromRepl, int queryOptions) {
try {
return _runCommands(ns, jsobj, b, anObjBuilder, fromRepl, queryOptions);
}
catch ( AssertionException& e ) {
e.getInfo().append( anObjBuilder , "assertion" , "assertionCode" );
+ curop.debug().exceptionInfo = e.getInfo();
}
- curop.debug().str << " assertion ";
anObjBuilder.append("errmsg", "db assertion failure");
anObjBuilder.append("ok", 0.0);
BSONObj x = anObjBuilder.done();
@@ -255,8 +66,6 @@ namespace mongo {
return true;
}
- int nCaught = 0;
-
BSONObj id_obj = fromjson("{\"_id\":1}");
BSONObj empty_obj = fromjson("{}");
@@ -273,6 +82,7 @@ namespace mongo {
qr->startingFrom = 0;
qr->len = b.len();
qr->setOperation(opReply);
+ qr->initializeResultFlags();
qr->nReturned = 0;
b.decouple();
return qr;
@@ -283,35 +93,29 @@ namespace mongo {
ClientCursor::Pointer p(cursorid);
ClientCursor *cc = p.c();
- int bufSize = 512;
- if ( cc ) {
- bufSize += sizeof( QueryResult );
- bufSize += MaxBytesToReturnToClientAtOnce;
- }
+ int bufSize = 512 + sizeof( QueryResult ) + MaxBytesToReturnToClientAtOnce;
BufBuilder b( bufSize );
-
b.skip(sizeof(QueryResult));
-
int resultFlags = ResultFlag_AwaitCapable;
int start = 0;
int n = 0;
- if ( !cc ) {
+ if ( unlikely(!cc) ) {
log() << "getMore: cursorid not found " << ns << " " << cursorid << endl;
cursorid = 0;
resultFlags = ResultFlag_CursorNotFound;
}
else {
+ // guard against ns spoofing: the ns sent with this getMore must match the ns the cursor was created with
+ uassert(14833, "auth error", str::equals(ns, cc->ns().c_str()));
+
if ( pass == 0 )
cc->updateSlaveLocation( curop );
int queryOptions = cc->queryOptions();
-
- if( pass == 0 ) {
- StringBuilder& ss = curop.debug().str;
- ss << " getMore: " << cc->query().toString() << " ";
- }
+
+ curop.debug().query = cc->query();
start = cc->pos();
Cursor *c = cc->c();
@@ -322,6 +126,9 @@ namespace mongo {
if ( cc->modifiedKeys() == false && cc->isMultiKey() == false && cc->fields )
keyFieldsOnly.reset( cc->fields->checkKey( cc->indexKeyPattern() ) );
+ // This manager may be stale, but it's the state of chunking when the cursor was created.
+ ShardChunkManagerPtr manager = cc->getChunkManager();
+
while ( 1 ) {
if ( !c->ok() ) {
if ( c->tailable() ) {
@@ -333,7 +140,7 @@ namespace mongo {
continue;
if( n == 0 && (queryOptions & QueryOption_AwaitData) && pass < 1000 ) {
- throw GetMoreWaitException();
+ return 0;
}
break;
@@ -345,15 +152,13 @@ namespace mongo {
cc = 0;
break;
}
+
// in some cases (clone collection) there won't be a matcher
- if ( c->matcher() && !c->matcher()->matches(c->currKey(), c->currLoc() ) ) {
+ if ( c->matcher() && !c->matcher()->matchesCurrent( c ) ) {
}
- /*
- TODO
- else if ( _chunkMatcher && ! _chunkMatcher->belongsToMe( c->currKey(), c->currLoc() ) ){
- cout << "TEMP skipping un-owned chunk: " << c->current() << endl;
+ else if ( manager && ! manager->belongsToMe( cc ) ){
+ LOG(2) << "cursor skipping document in un-owned chunk: " << c->current() << endl;
}
- */
else {
if( c->getsetdup(c->currLoc()) ) {
//out() << " but it's a dup \n";
@@ -380,7 +185,7 @@ namespace mongo {
}
c->advance();
- if ( ! cc->yieldSometimes() ) {
+ if ( ! cc->yieldSometimes( ClientCursor::MaybeCovered ) ) {
ClientCursor::erase(cursorid);
cursorid = 0;
cc = 0;
@@ -422,7 +227,7 @@ namespace mongo {
virtual void _init() {
_c = qp().newCursor();
_capped = _c->capped();
- if ( qp().exactKeyMatch() && ! matcher()->needRecord() ) {
+ if ( qp().exactKeyMatch() && ! matcher( _c )->needRecord() ) {
_query = qp().simplifiedQuery( qp().indexKey() );
_bc = dynamic_cast< BtreeCursor* >( _c.get() );
_bc->forgetEndKey();
@@ -452,6 +257,9 @@ namespace mongo {
if ( _capped ) {
msgassertedNoTrace( 13337, str::stream() << "capped cursor overrun during count: " << _ns );
}
+ else if ( qp().mustAssertOnYieldFailure() ) {
+ msgassertedNoTrace( 15891, str::stream() << "CountOp::recoverFromYield() failed to recover: " << _ns );
+ }
else {
// we don't fail query since we're fine with returning partial data if collection dropped
}
@@ -467,7 +275,7 @@ namespace mongo {
_nscanned = _c->nscanned();
if ( _bc ) {
if ( _firstMatch.isEmpty() ) {
- _firstMatch = _bc->currKeyNode().key.copy();
+ _firstMatch = _bc->currKey().getOwned();
// if not match
if ( _query.woCompare( _firstMatch, BSONObj(), false ) ) {
setComplete();
@@ -476,7 +284,7 @@ namespace mongo {
_gotOne();
}
else {
- if ( ! _firstMatch.woEqual( _bc->currKeyNode().key ) ) {
+ if ( ! _firstMatch.equal( _bc->currKey() ) ) {
setComplete();
return;
}
@@ -484,7 +292,7 @@ namespace mongo {
}
}
else {
- if ( !matcher()->matches(_c->currKey(), _c->currLoc() ) ) {
+ if ( !matcher( _c )->matchesCurrent( _c.get() ) ) {
}
else if( !_c->getsetdup(_c->currLoc()) ) {
_gotOne();
@@ -610,6 +418,8 @@ namespace mongo {
*_b << "indexBounds" << c->prettyIndexBounds();
+ c->explainDetails( *_b );
+
if ( !hint ) {
*_b << "allPlans" << _a->arr();
}
@@ -689,7 +499,7 @@ namespace mongo {
if ( qp().scanAndOrderRequired() ) {
_inMemSort = true;
- _so.reset( new ScanAndOrder( _pq.getSkip() , _pq.getNumToReturn() , _pq.getOrder() ) );
+ _so.reset( new ScanAndOrder( _pq.getSkip() , _pq.getNumToReturn() , _pq.getOrder(), qp().multikeyFrs() ) );
}
if ( _pq.isExplain() ) {
@@ -728,6 +538,9 @@ namespace mongo {
if ( _capped ) {
msgassertedNoTrace( 13338, str::stream() << "capped cursor overrun during query: " << _pq.ns() );
}
+ else if ( qp().mustAssertOnYieldFailure() ) {
+ msgassertedNoTrace( 15890, str::stream() << "UserQueryOp::recoverFromYield() failed to recover: " << _pq.ns() );
+ }
else {
// we don't fail query since we're fine with returning partial data if collection dropped
@@ -746,13 +559,13 @@ namespace mongo {
virtual void next() {
if ( _findingStartCursor.get() ) {
+ if ( !_findingStartCursor->done() ) {
+ _findingStartCursor->next();
+ }
if ( _findingStartCursor->done() ) {
- _c = _findingStartCursor->cRelease();
+ _c = _findingStartCursor->cursor();
_findingStartCursor.reset( 0 );
}
- else {
- _findingStartCursor->next();
- }
_capped = true;
return;
}
@@ -774,15 +587,15 @@ namespace mongo {
}
_nscanned = _c->nscanned();
- if ( !matcher()->matches(_c->currKey(), _c->currLoc() , &_details ) ) {
+ if ( !matcher( _c )->matchesCurrent(_c.get() , &_details ) ) {
// not a match, continue onward
- if ( _details.loadedObject )
+ if ( _details._loadedObject )
_nscannedObjects++;
}
else {
_nscannedObjects++;
DiskLoc cl = _c->currLoc();
- if ( _chunkManager && ! _chunkManager->belongsToMe( cl.obj() ) ) {
+ if ( _chunkManager && ! _chunkManager->belongsToMe( cl.obj() ) ) { // TODO: should make this covered at some point
_nChunkSkips++;
// log() << "TEMP skipping un-owned chunk: " << _c->current() << endl;
}
@@ -938,6 +751,9 @@ namespace mongo {
cc->slaveReadTill( _slaveReadTill );
}
+
+ ShardChunkManagerPtr getChunkManager(){ return _chunkManager; }
+
private:
BufBuilder _buf;
const ParsedQuery& _pq;
@@ -981,7 +797,6 @@ namespace mongo {
@return points to ns if exhaust mode. 0=normal mode
*/
const char *runQuery(Message& m, QueryMessage& q, CurOp& curop, Message &result) {
- StringBuilder& ss = curop.debug().str;
shared_ptr<ParsedQuery> pq_shared( new ParsedQuery(q) );
ParsedQuery& pq( *pq_shared );
int ntoskip = q.ntoskip;
@@ -990,15 +805,10 @@ namespace mongo {
const char *ns = q.ns;
if( logLevel >= 2 )
- log() << "query: " << ns << jsobj << endl;
-
- ss << ns;
- {
- // only say ntoreturn if nonzero.
- int n = pq.getNumToReturn();
- if( n )
- ss << " ntoreturn:" << n;
- }
+ log() << "runQuery called " << ns << " " << jsobj << endl;
+
+ curop.debug().ns = ns;
+ curop.debug().ntoreturn = pq.getNumToReturn();
curop.setQuery(jsobj);
if ( pq.couldBeCommand() ) {
@@ -1006,15 +816,16 @@ namespace mongo {
bb.skip(sizeof(QueryResult));
BSONObjBuilder cmdResBuf;
if ( runCommands(ns, jsobj, curop, bb, cmdResBuf, false, queryOptions) ) {
- ss << " command: ";
- jsobj.toString( ss );
+ curop.debug().iscommand = true;
+ curop.debug().query = jsobj;
curop.markCommand();
+
auto_ptr< QueryResult > qr;
qr.reset( (QueryResult *) bb.buf() );
bb.decouple();
qr->setResultFlagsToOk();
qr->len = bb.len();
- ss << " reslen:" << bb.len();
+ curop.debug().responseLength = bb.len();
qr->setOperation(opReply);
qr->cursorId = 0;
qr->startingFrom = 0;
@@ -1090,6 +901,7 @@ namespace mongo {
}
if ( ! (explain || pq.showDiskLoc()) && isSimpleIdQuery( query ) && !pq.hasOption( QueryOption_CursorTailable ) ) {
+
bool nsFound = false;
bool indexFound = false;
@@ -1099,8 +911,8 @@ namespace mongo {
if ( nsFound == false || indexFound == true ) {
BufBuilder bb(sizeof(QueryResult)+resObject.objsize()+32);
bb.skip(sizeof(QueryResult));
-
- ss << " idhack ";
+
+ curop.debug().idhack = true;
if ( found ) {
n = 1;
fillQueryResultFromObj( bb , pq.getFields() , resObject );
@@ -1110,13 +922,14 @@ namespace mongo {
bb.decouple();
qr->setResultFlagsToOk();
qr->len = bb.len();
- ss << " reslen:" << bb.len();
+
+ curop.debug().responseLength = bb.len();
qr->setOperation(opReply);
qr->cursorId = 0;
qr->startingFrom = 0;
qr->nReturned = n;
result.setData( qr.release(), true );
- return false;
+ return NULL;
}
}
@@ -1147,8 +960,8 @@ namespace mongo {
}
n = dqo.n();
long long nscanned = dqo.totalNscanned();
- if ( dqo.scanAndOrderRequired() )
- ss << " scanAndOrder ";
+ curop.debug().scanAndOrder = dqo.scanAndOrderRequired();
+
shared_ptr<Cursor> cursor = dqo.cursor();
if( logLevel >= 5 )
log() << " used cursor: " << cursor.get() << endl;
@@ -1159,13 +972,16 @@ namespace mongo {
bool moreClauses = mps->mayRunMore();
if ( moreClauses ) {
// this MultiCursor will use a dumb NoOp to advance(), so no need to specify mayYield
- shared_ptr< Cursor > multi( new MultiCursor( mps, cursor, dqo.matcher(), dqo ) );
+ shared_ptr< Cursor > multi( new MultiCursor( mps, cursor, dqo.matcher( cursor ), dqo ) );
cc = new ClientCursor(queryOptions, multi, ns, jsobj.getOwned());
}
else {
- if( ! cursor->matcher() ) cursor->setMatcher( dqo.matcher() );
+ if( ! cursor->matcher() ) cursor->setMatcher( dqo.matcher( cursor ) );
cc = new ClientCursor( queryOptions, cursor, ns, jsobj.getOwned() );
}
+
+ cc->setChunkManager( dqo.getChunkManager() );
+
cursorid = cc->cursorid();
DEV tlog(2) << "query has more, cursorid: " << cursorid << endl;
cc->setPos( n );
@@ -1177,7 +993,7 @@ namespace mongo {
DEV tlog() << "query has no more but tailable, cursorid: " << cursorid << endl;
if( queryOptions & QueryOption_Exhaust ) {
exhaust = ns;
- ss << " exhaust ";
+ curop.debug().exhaust = true;
}
dqo.finishForOplogReplay(cc);
}
@@ -1186,7 +1002,7 @@ namespace mongo {
qr->cursorId = cursorid;
qr->setResultFlagsToOk();
// qr->len is updated automatically by appendData()
- ss << " reslen:" << qr->len;
+ curop.debug().responseLength = qr->len;
qr->setOperation(opReply);
qr->startingFrom = 0;
qr->nReturned = n;
@@ -1194,14 +1010,10 @@ namespace mongo {
int duration = curop.elapsedMillis();
bool dbprofile = curop.shouldDBProfile( duration );
if ( dbprofile || duration >= cmdLine.slowMS ) {
- ss << " nscanned:" << nscanned << ' ';
- if ( ntoskip )
- ss << " ntoskip:" << ntoskip;
- if ( dbprofile )
- ss << " \nquery: ";
- ss << jsobj.toString() << ' ';
+ curop.debug().nscanned = (int) nscanned;
+ curop.debug().ntoskip = ntoskip;
}
- ss << " nreturned:" << n;
+ curop.debug().nreturned = n;
return exhaust;
}
diff --git a/db/query.h b/db/ops/query.h
index 5de7ced..ada2e90 100644
--- a/db/query.h
+++ b/db/ops/query.h
@@ -18,98 +18,22 @@
#pragma once
-#include "../pch.h"
-#include "../util/message.h"
-#include "dbmessage.h"
-#include "jsobj.h"
-#include "diskloc.h"
-#include "projection.h"
-
-/* db request message format
-
- unsigned opid; // arbitary; will be echoed back
- byte operation;
- int options;
-
- then for:
-
- dbInsert:
- string collection;
- a series of JSObjects
- dbDelete:
- string collection;
- int flags=0; // 1=DeleteSingle
- JSObject query;
- dbUpdate:
- string collection;
- int flags; // 1=upsert
- JSObject query;
- JSObject objectToUpdate;
- objectToUpdate may include { $inc: <field> } or { $set: ... }, see struct Mod.
- dbQuery:
- string collection;
- int nToSkip;
- int nToReturn; // how many you want back as the beginning of the cursor data (0=no limit)
- // greater than zero is simply a hint on how many objects to send back per "cursor batch".
- // a negative number indicates a hard limit.
- JSObject query;
- [JSObject fieldsToReturn]
- dbGetMore:
- string collection; // redundant, might use for security.
- int nToReturn;
- int64 cursorID;
- dbKillCursors=2007:
- int n;
- int64 cursorIDs[n];
-
- Note that on Update, there is only one object, which is different
- from insert where you can pass a list of objects to insert in the db.
- Note that the update field layout is very similar layout to Query.
-*/
+#include "../../pch.h"
+#include "../../util/net/message.h"
+#include "../dbmessage.h"
+#include "../jsobj.h"
+#include "../diskloc.h"
+#include "../projection.h"
// struct QueryOptions, QueryResult, QueryResultFlags in:
-#include "../client/dbclient.h"
+#include "../../client/dbclient.h"
namespace mongo {
extern const int MaxBytesToReturnToClientAtOnce;
- // for an existing query (ie a ClientCursor), send back additional information.
- struct GetMoreWaitException { };
-
QueryResult* processGetMore(const char *ns, int ntoreturn, long long cursorid , CurOp& op, int pass, bool& exhaust);
- struct UpdateResult {
- bool existing; // if existing objects were modified
- bool mod; // was this a $ mod
- long long num; // how many objects touched
- OID upserted; // if something was upserted, the new _id of the object
-
- UpdateResult( bool e, bool m, unsigned long long n , const BSONObj& upsertedObject = BSONObj() )
- : existing(e) , mod(m), num(n) {
- upserted.clear();
-
- BSONElement id = upsertedObject["_id"];
- if ( ! e && n == 1 && id.type() == jstOID ) {
- upserted = id.OID();
- }
- }
-
- };
-
- class RemoveSaver;
-
- /* returns true if an existing object was updated, false if no existing object was found.
- multi - update multiple objects - mostly useful with things like $set
- god - allow access to system namespaces
- */
- UpdateResult updateObjects(const char *ns, const BSONObj& updateobj, BSONObj pattern, bool upsert, bool multi , bool logop , OpDebug& debug );
- UpdateResult _updateObjects(bool god, const char *ns, const BSONObj& updateobj, BSONObj pattern,
- bool upsert, bool multi , bool logop , OpDebug& debug , RemoveSaver * rs = 0 );
-
- // If justOne is true, deletedId is set to the id of the deleted object.
- long long deleteObjects(const char *ns, BSONObj pattern, bool justOne, bool logop = false, bool god=false, RemoveSaver * rs=0);
-
long long runCount(const char *ns, const BSONObj& cmd, string& err);
const char * runQuery(Message& m, QueryMessage& q, CurOp& curop, Message &result);
@@ -142,7 +66,7 @@ namespace mongo {
* includes fields from the query message, both possible query levels
* parses everything up front
*/
- class ParsedQuery {
+ class ParsedQuery : boost::noncopyable {
public:
ParsedQuery( QueryMessage& qm )
: _ns( qm.ns ) , _ntoskip( qm.ntoskip ) , _ntoreturn( qm.ntoreturn ) , _options( qm.queryOptions ) {
@@ -155,8 +79,6 @@ namespace mongo {
initFields( fields );
}
- ~ParsedQuery() {}
-
const char * ns() const { return _ns; }
bool isLocalDB() const { return strncmp(_ns, "local.", 6) == 0; }
@@ -170,7 +92,6 @@ namespace mongo {
int getOptions() const { return _options; }
bool hasOption( int x ) const { return x & _options; }
-
bool isExplain() const { return _explain; }
bool isSnapshot() const { return _snapshot; }
bool returnKey() const { return _returnKey; }
@@ -262,27 +183,33 @@ namespace mongo {
_order = transformOrderFromArrayFormat( _order );
}
else {
- uassert(13513, "sort must be an object or array", 0);
+ uasserted(13513, "sort must be an object or array");
}
+ continue;
}
- else if ( strcmp( "$explain" , name ) == 0 )
- _explain = e.trueValue();
- else if ( strcmp( "$snapshot" , name ) == 0 )
- _snapshot = e.trueValue();
- else if ( strcmp( "$min" , name ) == 0 )
- _min = e.embeddedObject();
- else if ( strcmp( "$max" , name ) == 0 )
- _max = e.embeddedObject();
- else if ( strcmp( "$hint" , name ) == 0 )
- _hint = e;
- else if ( strcmp( "$returnKey" , name ) == 0 )
- _returnKey = e.trueValue();
- else if ( strcmp( "$maxScan" , name ) == 0 )
- _maxScan = e.numberInt();
- else if ( strcmp( "$showDiskLoc" , name ) == 0 )
- _showDiskLoc = e.trueValue();
-
+ if( *name == '$' ) {
+ name++;
+ if ( strcmp( "explain" , name ) == 0 )
+ _explain = e.trueValue();
+ else if ( strcmp( "snapshot" , name ) == 0 )
+ _snapshot = e.trueValue();
+ else if ( strcmp( "min" , name ) == 0 )
+ _min = e.embeddedObject();
+ else if ( strcmp( "max" , name ) == 0 )
+ _max = e.embeddedObject();
+ else if ( strcmp( "hint" , name ) == 0 )
+ _hint = e;
+ else if ( strcmp( "returnKey" , name ) == 0 )
+ _returnKey = e.trueValue();
+ else if ( strcmp( "maxScan" , name ) == 0 )
+ _maxScan = e.numberInt();
+ else if ( strcmp( "showDiskLoc" , name ) == 0 )
+ _showDiskLoc = e.trueValue();
+ else if ( strcmp( "comment" , name ) == 0 ) {
+ ; // no-op
+ }
+ }
}
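For reference, a wrapped query exercising these branches might look like the following (illustrative values only; $orderby is consumed by the sort branch earlier in this loop, and $comment is now accepted as a no-op):

    { $query   : { x : 1 },
      $orderby : { y : 1 },
      $explain : true,
      $maxScan : 1000,
      $comment : "why this query was issued" }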
if ( _snapshot ) {
@@ -299,20 +226,14 @@ namespace mongo {
_fields->init( fields );
}
- ParsedQuery( const ParsedQuery& other ) {
- assert(0);
- }
-
- const char* _ns;
- int _ntoskip;
+ const char * const _ns;
+ const int _ntoskip;
int _ntoreturn;
- int _options;
-
BSONObj _filter;
+ BSONObj _order;
+ const int _options;
shared_ptr< Projection > _fields;
-
bool _wantMore;
-
bool _explain;
bool _snapshot;
bool _returnKey;
@@ -320,11 +241,10 @@ namespace mongo {
BSONObj _min;
BSONObj _max;
BSONElement _hint;
- BSONObj _order;
int _maxScan;
};
} // namespace mongo
-#include "clientcursor.h"
+
diff --git a/db/update.cpp b/db/ops/update.cpp
index 8dc6c85..fd9798a 100644
--- a/db/update.cpp
+++ b/db/ops/update.cpp
@@ -18,12 +18,13 @@
#include "pch.h"
#include "query.h"
-#include "pdfile.h"
-#include "jsobjmanipulator.h"
-#include "queryoptimizer.h"
-#include "repl.h"
+#include "../pdfile.h"
+#include "../jsobjmanipulator.h"
+#include "../queryoptimizer.h"
+#include "../repl.h"
+#include "../btree.h"
+#include "../../util/stringutils.h"
#include "update.h"
-#include "btree.h"
//#define DEBUGUPDATE(x) cout << x << endl;
#define DEBUGUPDATE(x)
@@ -284,7 +285,7 @@ namespace mongo {
case BIT: {
uassert( 10136 , "$bit needs an array" , elt.type() == Object );
uassert( 10137 , "$bit can only be applied to numbers" , in.isNumber() );
- uassert( 10138 , "$bit can't use a double" , in.type() != NumberDouble );
+ uassert( 10138 , "$bit cannot update a value of type double" , in.type() != NumberDouble );
int x = in.numberInt();
long long y = in.numberLong();
@@ -293,23 +294,22 @@ namespace mongo {
while ( it.more() ) {
BSONElement e = it.next();
uassert( 10139 , "$bit field must be number" , e.isNumber() );
- if ( strcmp( e.fieldName() , "and" ) == 0 ) {
+ if ( str::equals(e.fieldName(), "and") ) {
switch( in.type() ) {
case NumberInt: x = x&e.numberInt(); break;
case NumberLong: y = y&e.numberLong(); break;
default: assert( 0 );
}
}
- else if ( strcmp( e.fieldName() , "or" ) == 0 ) {
+ else if ( str::equals(e.fieldName(), "or") ) {
switch( in.type() ) {
case NumberInt: x = x|e.numberInt(); break;
case NumberLong: y = y|e.numberLong(); break;
default: assert( 0 );
}
}
-
else {
- throw UserException( 9016, (string)"unknown bit mod:" + e.fieldName() );
+ uasserted(9016, str::stream() << "unknown $bit operation: " << e.fieldName());
}
}
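Worked example (illustrative): for a stored NumberInt value of 5, the update { $bit : { flags : { and : 1, or : 8 } } } applies its operations in document order: 5 & 1 == 1, then 1 | 8 == 9, so the field ends up as 9. Doubles are rejected earlier because bitwise operations are only well defined on integer storage.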
@@ -407,7 +407,7 @@ namespace mongo {
if ( mss->amIInPlacePossible( e.isNumber() ) ) {
// check more typing info here
if ( m.elt.type() != e.type() ) {
- // if i'm incrememnting with a double, then the storage has to be a double
+ // if i'm incrementing with a double, then the storage has to be a double
mss->amIInPlacePossible( m.elt.type() != NumberDouble );
}
@@ -509,7 +509,7 @@ namespace mongo {
}
if ( m->op == Mod::RENAME_FROM ) {
- DEBUGUPDATE( "\t\t\t\t\t appendForOpLog RENAME_FROM fielName:" << m->fieldName );
+ DEBUGUPDATE( "\t\t\t\t\t appendForOpLog RENAME_FROM fieldName:" << m->fieldName );
BSONObjBuilder bb( b.subobjStart( "$unset" ) );
bb.append( m->fieldName, 1 );
bb.done();
@@ -517,7 +517,7 @@ namespace mongo {
}
if ( m->op == Mod::RENAME_TO ) {
- DEBUGUPDATE( "\t\t\t\t\t appendForOpLog RENAME_TO fielName:" << m->fieldName );
+ DEBUGUPDATE( "\t\t\t\t\t appendForOpLog RENAME_TO fieldName:" << m->fieldName );
BSONObjBuilder bb( b.subobjStart( "$set" ) );
bb.appendAs( newVal, m->fieldName );
return;
@@ -570,13 +570,18 @@ namespace mongo {
switch ( m.m->op ) {
case Mod::UNSET:
- case Mod::PULL:
- case Mod::PULL_ALL:
case Mod::ADDTOSET:
case Mod::RENAME_FROM:
case Mod::RENAME_TO:
// this should have been handled by prepare
break;
+ case Mod::PULL:
+ case Mod::PULL_ALL:
+ // this should have been handled by prepare
+ break;
+ case Mod::POP:
+ assert( m.old.eoo() || ( m.old.isABSONObj() && m.old.Obj().isEmpty() ) );
+ break;
// [dm] the BSONElementManipulator statements below are for replication (correct?)
case Mod::INC:
if ( isOnDisk )
@@ -658,7 +663,7 @@ namespace mongo {
switch ( cmp ) {
- case LEFT_SUBFIELD: { // Mod is embeddeed under this element
+ case LEFT_SUBFIELD: { // Mod is embedded under this element
uassert( 10145 , str::stream() << "LEFT_SUBFIELD only supports Object: " << field << " not: " << e.type() , e.type() == Object || e.type() == Array );
if ( onedownseen.count( e.fieldName() ) == 0 ) {
onedownseen.insert( e.fieldName() );
@@ -739,6 +744,10 @@ namespace mongo {
return ss.str();
}
+ bool ModSetState::FieldCmp::operator()( const string &l, const string &r ) const {
+ return lexNumCmp( l.c_str(), r.c_str() ) < 0;
+ }
+
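lexNumCmp, declared in util/stringutils.h (now included at the top of this file), compares digit runs numerically so that mod paths apply in the order users expect: "a.2" sorts before "a.10", where plain string comparison would reverse them. A simplified sketch of such a comparator, assuming ASCII names and no leading zeros (the real implementation handles more cases):

    #include <cctype>
    #include <cstring>

    // illustrative only - not the util/stringutils.h implementation
    int lexNumCmpSketch( const char* s1, const char* s2 ) {
        while( *s1 && *s2 ) {
            if( isdigit( *s1 ) && isdigit( *s2 ) ) {
                size_t n1 = 0, n2 = 0;              // lengths of the digit runs
                while( isdigit( s1[n1] ) ) n1++;
                while( isdigit( s2[n2] ) ) n2++;
                if( n1 != n2 )                      // longer run == larger number
                    return n1 < n2 ? -1 : 1;
                int c = strncmp( s1, s2, n1 );      // same length: compare digits
                if( c )
                    return c < 0 ? -1 : 1;
                s1 += n1; s2 += n2;
            }
            else {
                if( *s1 != *s2 )
                    return *s1 < *s2 ? -1 : 1;
                s1++; s2++;
            }
        }
        return *s1 ? 1 : ( *s2 ? -1 : 0 );
    }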
BSONObj ModSet::createNewFromQuery( const BSONObj& query ) {
BSONObj newObj;
@@ -751,7 +760,7 @@ namespace mongo {
if ( e.fieldName()[0] == '$' ) // for $atomic and anything else we add
continue;
- if ( e.type() == Object && e.embeddedObject().firstElement().fieldName()[0] == '$' ) {
+ if ( e.type() == Object && e.embeddedObject().firstElementFieldName()[0] == '$' ) {
// this means this is a $gt type filter, so don't make part of the new object
continue;
}
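Worked example (illustrative): upserting { $inc : { n : 1 } } with the query { a : 1, b : { $gt : 5 }, $atomic : true } keeps only the exact-match field, so the base object is { a : 1 }; applying the mods on top yields the inserted document { a : 1, n : 1 }.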
@@ -793,7 +802,7 @@ namespace mongo {
BSONElement e = it.next();
const char *fn = e.fieldName();
- uassert( 10147 , "Invalid modifier specified" + string( fn ), e.type() == Object );
+ uassert( 10147 , "Invalid modifier specified: " + string( fn ), e.type() == Object );
BSONObj j = e.embeddedObject();
DEBUGUPDATE( "\t" << j );
@@ -892,7 +901,10 @@ namespace mongo {
class UpdateOp : public MultiCursor::CursorOp {
public:
- UpdateOp( bool hasPositionalField ) : _nscanned(), _hasPositionalField( hasPositionalField ) {}
+ UpdateOp( bool hasPositionalField, int orClauseIndex = -1 ) :
+ _nscanned(),
+ _hasPositionalField( hasPositionalField ),
+ _orClauseIndex( orClauseIndex ) {}
virtual void _init() {
_c = qp().newCursor();
if ( ! _c->ok() ) {
@@ -900,6 +912,9 @@ namespace mongo {
}
}
virtual bool prepareToYield() {
+ if ( _orClauseIndex > 0 ) {
+ return false;
+ }
if ( ! _cc ) {
_cc.reset( new ClientCursor( QueryOption_NoCursorTimeout , _c , qp().ns() ) );
}
@@ -921,7 +936,11 @@ namespace mongo {
return;
}
_nscanned = _c->nscanned();
- if ( matcher()->matches(_c->currKey(), _c->currLoc(), &_details ) ) {
+ if ( _orClauseIndex > 0 && _nscanned >= 100 ) {
+ setComplete();
+ return;
+ }
+ if ( matcher( _c )->matchesCurrent(_c.get(), &_details ) ) {
setComplete();
return;
}
@@ -930,7 +949,7 @@ namespace mongo {
virtual bool mayRecordPlan() const { return false; }
virtual QueryOp *_createChild() const {
- return new UpdateOp( _hasPositionalField );
+ return new UpdateOp( _hasPositionalField, _orClauseIndex + 1 );
}
// already scanned to the first match, so return _c
virtual shared_ptr< Cursor > newCursor() const { return _c; }
@@ -942,6 +961,8 @@ namespace mongo {
MatchDetails _details;
ClientCursor::CleanupPointer _cc;
ClientCursor::YieldData _yieldData;
+ // Avoid yielding in the MultiPlanScanner when not the first $or clause - just a temporary implementation for now. SERVER-3555
+ int _orClauseIndex;
};
static void checkTooLarge(const BSONObj& newObj) {
@@ -958,19 +979,47 @@ namespace mongo {
NamespaceDetailsTransient *nsdt,
bool god, const char *ns,
const BSONObj& updateobj, BSONObj patternOrig, bool logop, OpDebug& debug) {
+
DiskLoc loc;
{
IndexDetails& i = d->idx(idIdxNo);
- BSONObj key = i.getKeyFromQuery( patternOrig );
- loc = i.head.btree()->findSingle(i, i.head, key);
+ BSONObj key = i.getKeyFromQuery( patternOrig );
+ loc = i.idxInterface().findSingle(i, i.head, key);
if( loc.isNull() ) {
// no upsert support in _updateById yet, so we are done.
return UpdateResult(0, 0, 0);
}
}
-
Record *r = loc.rec();
+ if ( ! r->likelyInPhysicalMemory() ) {
+ {
+ auto_ptr<RWLockRecursive::Shared> lk( new RWLockRecursive::Shared( MongoFile::mmmutex) );
+ dbtempreleasewritelock t;
+ r->touch();
+ lk.reset(0); // we have to release mmmutex before we can re-acquire dbmutex
+ }
+
+ {
+ // we need to re-find in case something changed
+ d = nsdetails( ns );
+ if ( ! d ) {
+ // dropped
+ return UpdateResult(0, 0, 0);
+ }
+ nsdt = &NamespaceDetailsTransient::get_w(ns);
+ IndexDetails& i = d->idx(idIdxNo);
+ BSONObj key = i.getKeyFromQuery( patternOrig );
+ loc = i.idxInterface().findSingle(i, i.head, key);
+ if( loc.isNull() ) {
+ // no upsert support in _updateById yet, so we are done.
+ return UpdateResult(0, 0, 0);
+ }
+
+ r = loc.rec();
+ }
+ }
+
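The block above follows a general "touch outside the lock" shape: page the record in without holding the write lock, then reacquire and re-resolve everything the lock protected, since the collection may have been dropped or the document removed in the interim. In outline (names illustrative, not the real locking API):

    // sketch only - the real code uses RWLockRecursive::Shared and
    // dbtempreleasewritelock, as above
    // if ( !record->likelyInPhysicalMemory() ) {
    //     releaseWriteLock();              // let other operations run
    //     record->touch();                 // fault the page in, lock-free
    //     reacquireWriteLock();
    //     if ( !refind(ns, key, record) )  // re-resolve d, nsdt, loc, record
    //         return;                      // e.g. collection dropped meanwhile
    // }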
/* look for $inc etc. note as listed here, all fields to inc must be this type, you can't set some
regular ones at the moment. */
if ( isOperatorUpdate ) {
@@ -980,8 +1029,6 @@ namespace mongo {
if( mss->canApplyInPlace() ) {
mss->applyModsInPlace(true);
DEBUGUPDATE( "\t\t\t updateById doing in place update" );
- /*if ( profile )
- ss << " fastmod "; */
}
else {
BSONObj newObj = mss->createNewFromMods();
@@ -1027,19 +1074,16 @@ namespace mongo {
DEBUGUPDATE( "update: " << ns << " update: " << updateobj << " query: " << patternOrig << " upsert: " << upsert << " multi: " << multi );
Client& client = cc();
int profile = client.database()->profile;
- StringBuilder& ss = debug.str;
+
+ debug.updateobj = updateobj;
- if ( logLevel > 2 )
- ss << " update: " << updateobj.toString();
-
- /* idea with these here it to make them loop invariant for multi updates, and thus be a bit faster for that case */
- /* NOTE: when yield() is added herein, these must be refreshed after each call to yield! */
+ // the idea with these here is to make them loop invariant for multi updates, and thus be a bit faster for that case
+ // The pointers may be left invalid on a failed or terminal yield recovery.
NamespaceDetails *d = nsdetails(ns); // can be null if an upsert...
NamespaceDetailsTransient *nsdt = &NamespaceDetailsTransient::get_w(ns);
- /* end note */
auto_ptr<ModSet> mods;
- bool isOperatorUpdate = updateobj.firstElement().fieldName()[0] == '$';
+ bool isOperatorUpdate = updateobj.firstElementFieldName()[0] == '$';
int modsIsIndexed = false; // really the # of indexes
if ( isOperatorUpdate ) {
if( d && d->indexBuildInProgress ) {
@@ -1053,208 +1097,248 @@ namespace mongo {
modsIsIndexed = mods->isIndexed();
}
- if( !upsert && !multi && isSimpleIdQuery(patternOrig) && d && !modsIsIndexed ) {
+ if( !multi && isSimpleIdQuery(patternOrig) && d && !modsIsIndexed ) {
int idxNo = d->findIdIndex();
if( idxNo >= 0 ) {
- ss << " byid ";
- return _updateById(isOperatorUpdate, idxNo, mods.get(), profile, d, nsdt, god, ns, updateobj, patternOrig, logop, debug);
+ debug.idhack = true;
+ UpdateResult result = _updateById(isOperatorUpdate, idxNo, mods.get(), profile, d, nsdt, god, ns, updateobj, patternOrig, logop, debug);
+ if ( result.existing || ! upsert ) {
+ return result;
+ }
+ else if ( upsert && ! isOperatorUpdate && ! logop) {
+ // this handles repl inserts
+ checkNoMods( updateobj );
+ debug.upsert = true;
+ BSONObj no = updateobj;
+ theDataFileMgr.insertWithObjMod(ns, no, god);
+ return UpdateResult( 0 , 0 , 1 , no );
+ }
}
}
- set<DiskLoc> seenObjects;
-
int numModded = 0;
long long nscanned = 0;
- MatchDetails details;
shared_ptr< MultiCursor::CursorOp > opPtr( new UpdateOp( mods.get() && mods->hasDynamicArray() ) );
shared_ptr< MultiCursor > c( new MultiCursor( ns, patternOrig, BSONObj(), opPtr, true ) );
- auto_ptr<ClientCursor> cc;
-
- while ( c->ok() ) {
- nscanned++;
-
- bool atomic = c->matcher()->docMatcher().atomic();
+ d = nsdetails(ns);
+ nsdt = &NamespaceDetailsTransient::get_w(ns);
- // May have already matched in UpdateOp, but do again to get details set correctly
- if ( ! c->matcher()->matches( c->currKey(), c->currLoc(), &details ) ) {
- c->advance();
+ if( c->ok() ) {
+ set<DiskLoc> seenObjects;
+ MatchDetails details;
+ auto_ptr<ClientCursor> cc;
+ do {
+ nscanned++;
- if ( nscanned % 256 == 0 && ! atomic ) {
+ bool atomic = c->matcher()->docMatcher().atomic();
+
+ if ( !atomic ) {
+ // *****************
if ( cc.get() == 0 ) {
shared_ptr< Cursor > cPtr = c;
cc.reset( new ClientCursor( QueryOption_NoCursorTimeout , cPtr , ns ) );
}
- if ( ! cc->yield() ) {
+
+ bool didYield;
+ if ( ! cc->yieldSometimes( ClientCursor::WillNeed, &didYield ) ) {
cc.release();
- // TODO should we assert or something?
break;
}
if ( !c->ok() ) {
break;
}
+
+ if ( didYield ) {
+ d = nsdetails(ns);
+ nsdt = &NamespaceDetailsTransient::get_w(ns);
+ }
+ // *****************
}
- continue;
- }
- Record *r = c->_current();
- DiskLoc loc = c->currLoc();
+ // May have already matched in UpdateOp, but do again to get details set correctly
+ if ( ! c->matcher()->matchesCurrent( c.get(), &details ) ) {
+ c->advance();
- // TODO Maybe this is unnecessary since we have seenObjects
- if ( c->getsetdup( loc ) ) {
- c->advance();
- continue;
- }
-
- BSONObj js(r);
-
- BSONObj pattern = patternOrig;
-
- if ( logop ) {
- BSONObjBuilder idPattern;
- BSONElement id;
- // NOTE: If the matching object lacks an id, we'll log
- // with the original pattern. This isn't replay-safe.
- // It might make sense to suppress the log instead
- // if there's no id.
- if ( js.getObjectID( id ) ) {
- idPattern.append( id );
- pattern = idPattern.obj();
- }
- else {
- uassert( 10157 , "multi-update requires all modified objects to have an _id" , ! multi );
+ if ( nscanned % 256 == 0 && ! atomic ) {
+ if ( cc.get() == 0 ) {
+ shared_ptr< Cursor > cPtr = c;
+ cc.reset( new ClientCursor( QueryOption_NoCursorTimeout , cPtr , ns ) );
+ }
+ if ( ! cc->yield() ) {
+ cc.release();
+ // TODO should we assert or something?
+ break;
+ }
+ if ( !c->ok() ) {
+ break;
+ }
+ d = nsdetails(ns);
+ nsdt = &NamespaceDetailsTransient::get_w(ns);
+ }
+ continue;
}
- }
-
- if ( profile )
- ss << " nscanned:" << nscanned;
- /* look for $inc etc. note as listed here, all fields to inc must be this type, you can't set some
- regular ones at the moment. */
- if ( isOperatorUpdate ) {
+ Record *r = c->_current();
+ DiskLoc loc = c->currLoc();
- if ( multi ) {
- c->advance(); // go to next record in case this one moves
- if ( seenObjects.count( loc ) )
- continue;
+ // TODO Maybe this is unnecessary since we have seenObjects
+ if ( c->getsetdup( loc ) ) {
+ c->advance();
+ continue;
}
- const BSONObj& onDisk = loc.obj();
+ BSONObj js(r);
- ModSet * useMods = mods.get();
- bool forceRewrite = false;
+ BSONObj pattern = patternOrig;
- auto_ptr<ModSet> mymodset;
- if ( details.elemMatchKey && mods->hasDynamicArray() ) {
- useMods = mods->fixDynamicArray( details.elemMatchKey );
- mymodset.reset( useMods );
- forceRewrite = true;
+ if ( logop ) {
+ BSONObjBuilder idPattern;
+ BSONElement id;
+ // NOTE: If the matching object lacks an id, we'll log
+ // with the original pattern. This isn't replay-safe.
+ // It might make sense to suppress the log instead
+ // if there's no id.
+ if ( js.getObjectID( id ) ) {
+ idPattern.append( id );
+ pattern = idPattern.obj();
+ }
+ else {
+ uassert( 10157 , "multi-update requires all modified objects to have an _id" , ! multi );
+ }
}
- auto_ptr<ModSetState> mss = useMods->prepare( onDisk );
+ if ( profile && !multi )
+ debug.nscanned = (int) nscanned;
- bool indexHack = multi && ( modsIsIndexed || ! mss->canApplyInPlace() );
+ /* look for $inc etc. note as listed here, all fields to inc must be this type, you can't set some
+ regular ones at the moment. */
+ if ( isOperatorUpdate ) {
- if ( indexHack ) {
- if ( cc.get() )
- cc->updateLocation();
- else
- c->noteLocation();
- }
+ if ( multi ) {
+ c->advance(); // go to next record in case this one moves
+ if ( seenObjects.count( loc ) )
+ continue;
+ }
- if ( modsIsIndexed <= 0 && mss->canApplyInPlace() ) {
- mss->applyModsInPlace( true );// const_cast<BSONObj&>(onDisk) );
+ const BSONObj& onDisk = loc.obj();
- DEBUGUPDATE( "\t\t\t doing in place update" );
- if ( profile )
- ss << " fastmod ";
+ ModSet * useMods = mods.get();
+ bool forceRewrite = false;
- if ( modsIsIndexed ) {
- seenObjects.insert( loc );
- }
- }
- else {
- if ( rs )
- rs->goingToDelete( onDisk );
-
- BSONObj newObj = mss->createNewFromMods();
- checkTooLarge(newObj);
- DiskLoc newLoc = theDataFileMgr.updateRecord(ns, d, nsdt, r, loc , newObj.objdata(), newObj.objsize(), debug);
- if ( newLoc != loc || modsIsIndexed ) {
- // object moved, need to make sure we don' get again
- seenObjects.insert( newLoc );
+ auto_ptr<ModSet> mymodset;
+ if ( details._elemMatchKey && mods->hasDynamicArray() ) {
+ useMods = mods->fixDynamicArray( details._elemMatchKey );
+ mymodset.reset( useMods );
+ forceRewrite = true;
}
- }
+ auto_ptr<ModSetState> mss = useMods->prepare( onDisk );
- if ( logop ) {
- DEV assert( mods->size() );
+ bool indexHack = multi && ( modsIsIndexed || ! mss->canApplyInPlace() );
- if ( mss->haveArrayDepMod() ) {
- BSONObjBuilder patternBuilder;
- patternBuilder.appendElements( pattern );
- mss->appendSizeSpecForArrayDepMods( patternBuilder );
- pattern = patternBuilder.obj();
+ if ( indexHack ) {
+ if ( cc.get() )
+ cc->updateLocation();
+ else
+ c->noteLocation();
}
- if ( forceRewrite || mss->needOpLogRewrite() ) {
- DEBUGUPDATE( "\t rewrite update: " << mss->getOpLogRewrite() );
- logOp("u", ns, mss->getOpLogRewrite() , &pattern );
+ if ( modsIsIndexed <= 0 && mss->canApplyInPlace() ) {
+ mss->applyModsInPlace( true );// const_cast<BSONObj&>(onDisk) );
+
+ DEBUGUPDATE( "\t\t\t doing in place update" );
+ if ( profile && !multi )
+ debug.fastmod = true;
+
+ if ( modsIsIndexed ) {
+ seenObjects.insert( loc );
+ }
}
else {
- logOp("u", ns, updateobj, &pattern );
- }
- }
- numModded++;
- if ( ! multi )
- return UpdateResult( 1 , 1 , numModded );
- if ( indexHack )
- c->checkLocation();
+ if ( rs )
+ rs->goingToDelete( onDisk );
+
+ BSONObj newObj = mss->createNewFromMods();
+ checkTooLarge(newObj);
+ DiskLoc newLoc = theDataFileMgr.updateRecord(ns, d, nsdt, r, loc , newObj.objdata(), newObj.objsize(), debug);
+ if ( newLoc != loc || modsIsIndexed ) {
+ // object moved, need to make sure we don't get it again
+ seenObjects.insert( newLoc );
+ }
- if ( nscanned % 64 == 0 && ! atomic ) {
- if ( cc.get() == 0 ) {
- shared_ptr< Cursor > cPtr = c;
- cc.reset( new ClientCursor( QueryOption_NoCursorTimeout , cPtr , ns ) );
}
- if ( ! cc->yield() ) {
- cc.release();
- break;
+
+ if ( logop ) {
+ DEV assert( mods->size() );
+
+ if ( mss->haveArrayDepMod() ) {
+ BSONObjBuilder patternBuilder;
+ patternBuilder.appendElements( pattern );
+ mss->appendSizeSpecForArrayDepMods( patternBuilder );
+ pattern = patternBuilder.obj();
+ }
+
+ if ( forceRewrite || mss->needOpLogRewrite() ) {
+ DEBUGUPDATE( "\t rewrite update: " << mss->getOpLogRewrite() );
+ logOp("u", ns, mss->getOpLogRewrite() , &pattern );
+ }
+ else {
+ logOp("u", ns, updateobj, &pattern );
+ }
}
- if ( !c->ok() ) {
- break;
+ numModded++;
+ if ( ! multi )
+ return UpdateResult( 1 , 1 , numModded );
+ if ( indexHack )
+ c->checkLocation();
+
+ if ( nscanned % 64 == 0 && ! atomic ) {
+ if ( cc.get() == 0 ) {
+ shared_ptr< Cursor > cPtr = c;
+ cc.reset( new ClientCursor( QueryOption_NoCursorTimeout , cPtr , ns ) );
+ }
+ if ( ! cc->yield() ) {
+ cc.release();
+ break;
+ }
+ if ( !c->ok() ) {
+ break;
+ }
+ d = nsdetails(ns);
+ nsdt = &NamespaceDetailsTransient::get_w(ns);
}
- }
- getDur().commitIfNeeded();
+ getDur().commitIfNeeded();
- continue;
- }
+ continue;
+ }
- uassert( 10158 , "multi update only works with $ operators" , ! multi );
+ uassert( 10158 , "multi update only works with $ operators" , ! multi );
- BSONElementManipulator::lookForTimestamps( updateobj );
- checkNoMods( updateobj );
- theDataFileMgr.updateRecord(ns, d, nsdt, r, loc , updateobj.objdata(), updateobj.objsize(), debug, god);
- if ( logop ) {
- DEV if( god ) log() << "REALLY??" << endl; // god doesn't get logged, this would be bad.
- logOp("u", ns, updateobj, &pattern );
- }
- return UpdateResult( 1 , 0 , 1 );
- }
+ BSONElementManipulator::lookForTimestamps( updateobj );
+ checkNoMods( updateobj );
+ theDataFileMgr.updateRecord(ns, d, nsdt, r, loc , updateobj.objdata(), updateobj.objsize(), debug, god);
+ if ( logop ) {
+ DEV wassert( !god ); // god doesn't get logged, this would be bad.
+ logOp("u", ns, updateobj, &pattern );
+ }
+ return UpdateResult( 1 , 0 , 1 );
+ } while ( c->ok() );
+ } // endif
if ( numModded )
return UpdateResult( 1 , 1 , numModded );
-
if ( profile )
- ss << " nscanned:" << nscanned;
+ debug.nscanned = (int) nscanned;
if ( upsert ) {
- if ( updateobj.firstElement().fieldName()[0] == '$' ) {
+ if ( updateobj.firstElementFieldName()[0] == '$' ) {
/* upsert of an $inc. build a default */
BSONObj newObj = mods->createNewFromQuery( patternOrig );
- if ( profile )
- ss << " fastmodinsert ";
+ checkNoMods( newObj );
+ debug.fastmodinsert = true;
theDataFileMgr.insertWithObjMod(ns, newObj, god);
if ( logop )
logOp( "i", ns, newObj );
@@ -1263,8 +1347,7 @@ namespace mongo {
}
uassert( 10159 , "multi update only works with $ operators" , ! multi );
checkNoMods( updateobj );
- if ( profile )
- ss << " upsert ";
+ debug.upsert = true;
BSONObj no = updateobj;
theDataFileMgr.insertWithObjMod(ns, no, god);
if ( logop )
diff --git a/db/update.h b/db/ops/update.h
index d8396b5..de5805a 100644
--- a/db/update.h
+++ b/db/ops/update.h
@@ -16,13 +16,48 @@
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
-#include "../pch.h"
-#include "jsobj.h"
-#include "../util/embedded_builder.h"
-#include "matcher.h"
+#include "../../pch.h"
+#include "../jsobj.h"
+#include "../../util/embedded_builder.h"
+#include "../matcher.h"
namespace mongo {
+ // ---------- public -------------
+
+ struct UpdateResult {
+ bool existing; // if existing objects were modified
+ bool mod; // was this a $ mod
+ long long num; // how many objects touched
+ OID upserted; // if something was upserted, the new _id of the object
+
+ UpdateResult( bool e, bool m, unsigned long long n , const BSONObj& upsertedObject = BSONObj() )
+ : existing(e) , mod(m), num(n) {
+ upserted.clear();
+
+ BSONElement id = upsertedObject["_id"];
+ if ( ! e && n == 1 && id.type() == jstOID ) {
+ upserted = id.OID();
+ }
+ }
+
+ };
+
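A hedged usage sketch for the relocated struct (illustrative; assumes OID::isSet() reports a non-empty id):

    UpdateResult res = updateObjects( ns, updateobj, pattern,
                                      /*upsert*/ true, /*multi*/ false,
                                      /*logop*/ true, debug );
    if ( !res.existing && res.num == 1 && res.upserted.isSet() )
        log() << "upserted _id: " << res.upserted << endl;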
+
+ class RemoveSaver;
+
+ /* returns true if an existing object was updated, false if no existing object was found.
+ multi - update multiple objects - mostly useful with things like $set
+ god - allow access to system namespaces
+ */
+ UpdateResult updateObjects(const char *ns, const BSONObj& updateobj, BSONObj pattern, bool upsert, bool multi , bool logop , OpDebug& debug );
+ UpdateResult _updateObjects(bool god, const char *ns, const BSONObj& updateobj, BSONObj pattern,
+ bool upsert, bool multi , bool logop , OpDebug& debug , RemoveSaver * rs = 0 );
+
+
+
+ // ---------- private -------------
+
class ModState;
class ModSetState;
@@ -507,9 +542,7 @@ namespace mongo {
*/
class ModSetState : boost::noncopyable {
struct FieldCmp {
- bool operator()( const string &l, const string &r ) const {
- return lexNumCmp( l.c_str(), r.c_str() ) < 0;
- }
+ bool operator()( const string &l, const string &r ) const;
};
typedef map<string,ModState,FieldCmp> ModStateHolder;
const BSONObj& _obj;
diff --git a/db/pdfile.cpp b/db/pdfile.cpp
index 2aedfd4..ac7731a 100644
--- a/db/pdfile.cpp
+++ b/db/pdfile.cpp
@@ -30,10 +30,11 @@ _ disallow system* manipulations from the database.
#include "../util/hashtab.h"
#include "../util/file_allocator.h"
#include "../util/processinfo.h"
+#include "../util/file.h"
#include "btree.h"
+#include "btreebuilder.h"
#include <algorithm>
#include <list>
-#include "query.h"
#include "repl.h"
#include "dbhelpers.h"
#include "namespace-inl.h"
@@ -41,9 +42,27 @@ _ disallow system* manipulations from the database.
#include "extsort.h"
#include "curop-inl.h"
#include "background.h"
+#include "compact.h"
+#include "ops/delete.h"
+#include "instance.h"
+#include "replutil.h"
namespace mongo {
+ BOOST_STATIC_ASSERT( sizeof(Extent)-4 == 48+128 );
+ BOOST_STATIC_ASSERT( sizeof(DataFileHeader)-4 == 8192 );
+
+ bool isValidNS( const StringData& ns ) {
+ // TODO: should check for invalid characters
+
+ const char * x = strchr( ns.data() , '.' );
+ if ( ! x )
+ return false;
+
+ x++;
+ return *x > 0;
+ }
+
bool inDBRepair = false;
struct doingRepair {
doingRepair() {
@@ -86,7 +105,7 @@ namespace mongo {
}
BackgroundOperation::~BackgroundOperation() {
- assertInWriteLock();
+ wassert( dbMutex.isWriteLocked() );
dbsInProg[_ns.db]--;
nsInProg.erase(_ns.ns());
}
@@ -114,7 +133,6 @@ namespace mongo {
DatabaseHolder dbHolder;
int MAGIC = 0x1000;
- extern int otherTraceLevel;
void addNewNamespaceToCatalog(const char *ns, const BSONObj *options = 0);
void ensureIdIndexForNewNs(const char *ns) {
if ( ( strstr( ns, ".system." ) == 0 || legalClientSystemNS( ns , false ) ) &&
@@ -157,7 +175,7 @@ namespace mongo {
void _deleteDataFiles(const char *database) {
if ( directoryperdb ) {
FileAllocator::get()->waitUntilFinished();
- BOOST_CHECK_EXCEPTION( boost::filesystem::remove_all( boost::filesystem::path( dbpath ) / database ) );
+ MONGO_BOOST_CHECK_EXCEPTION_WITH_MSG( boost::filesystem::remove_all( boost::filesystem::path( dbpath ) / database ), "delete data files with a directoryperdb" );
return;
}
class : public FileOp {
@@ -206,11 +224,11 @@ namespace mongo {
}
}
- uassert( 10083 , "invalid size spec", size > 0 );
+ uassert( 10083 , "create collection invalid size spec", size > 0 );
bool newCapped = false;
int mx = 0;
- if( options.getBoolField("capped") ) {
+ if( options["capped"].trueValue() ) {
newCapped = true;
BSONElement e = options.getField("max");
if ( e.isNumber() ) {
@@ -232,7 +250,7 @@ namespace mongo {
// $nExtents is just for testing - always allocate new extents
// rather than reuse existing extents so we have some predictability
// in the extent size used by our tests
- database->suitableFile( (int) size, false )->createExtent( ns, (int) size, newCapped );
+ database->suitableFile( ns, (int) size, false, false )->createExtent( ns, (int) size, newCapped );
}
}
else if ( int( e.number() ) > 0 ) {
@@ -244,7 +262,7 @@ namespace mongo {
// $nExtents is just for testing - always allocate new extents
// rather than reuse existing extents so we have some predictability
// in the extent size used by our tests
- database->suitableFile( (int) size, false )->createExtent( ns, (int) size, newCapped );
+ database->suitableFile( ns, (int) size, false, false )->createExtent( ns, (int) size, newCapped );
}
}
else {
@@ -256,7 +274,7 @@ namespace mongo {
desiredExtentSize = Extent::minSize();
}
desiredExtentSize &= 0xffffff00;
- Extent *e = database->allocExtent( ns, desiredExtentSize, newCapped );
+ Extent *e = database->allocExtent( ns, desiredExtentSize, newCapped, true );
size -= e->length;
}
}
@@ -325,13 +343,13 @@ namespace mongo {
}
}
- void MongoDataFile::badOfs2(int ofs) const {
+ NOINLINE_DECL void MongoDataFile::badOfs2(int ofs) const {
stringstream ss;
ss << "bad offset:" << ofs << " accessing file: " << mmf.filename() << " - consider repairing database";
uasserted(13441, ss.str());
}
- void MongoDataFile::badOfs(int ofs) const {
+ NOINLINE_DECL void MongoDataFile::badOfs(int ofs) const {
stringstream ss;
ss << "bad offset:" << ofs << " accessing file: " << mmf.filename() << " - consider repairing database";
uasserted(13440, ss.str());
@@ -339,39 +357,17 @@ namespace mongo {
int MongoDataFile::defaultSize( const char *filename ) const {
int size;
-
if ( fileNo <= 4 )
size = (64*1024*1024) << fileNo;
else
size = 0x7ff00000;
-
if ( cmdLine.smallfiles ) {
size = size >> 2;
}
-
-
return size;
}
void MongoDataFile::open( const char *filename, int minSize, bool preallocateOnly ) {
- {
- /* check quotas
- very simple temporary implementation for now
- */
- if ( cmdLine.quota && fileNo > cmdLine.quotaFiles && !MMF::exists(filename) ) {
- /* todo: if we were adding / changing keys in an index did we do some
- work previously that needs cleaning up? Possible. We should
- check code like that and have it catch the exception and do
- something reasonable.
- */
- string s = "db disk space quota exceeded ";
- Database *database = cc().database();
- if ( database )
- s += database->name;
- uasserted(12501,s);
- }
- }
-
long size = defaultSize( filename );
while ( size < minSize ) {
if ( size < maxSize() / 2 )
@@ -438,13 +434,20 @@ namespace mongo {
}
Extent* MongoDataFile::createExtent(const char *ns, int approxSize, bool newCapped, int loops) {
+ {
+ // make sizes align with VM page size
+ int newSize = (approxSize + 0xfff) & 0xfffff000;
+ assert( newSize >= 0 );
+ if( newSize < Extent::maxSize() )
+ approxSize = newSize;
+ }
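The mask arithmetic rounds a size up to the next 4 KB boundary: add 0xfff, then clear the low 12 bits. A minimal self-contained sketch of the same computation (assumption: sizes are nonnegative and well below 2^31):

    #include <cassert>

    // round a nonnegative size up to a 4 KB multiple (mirrors the code above)
    int roundUpTo4k( int n ) {
        return ( n + 0xfff ) & 0xfffff000;
    }

    int main() {
        assert( roundUpTo4k( 10000 ) == 12288 );  // three 4 KB pages
        assert( roundUpTo4k( 4096 )  == 4096  );  // already aligned
        assert( roundUpTo4k( 1 )     == 4096  );
        return 0;
    }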
massert( 10357 , "shutdown in progress", ! inShutdown() );
massert( 10358 , "bad new extent size", approxSize >= Extent::minSize() && approxSize <= Extent::maxSize() );
massert( 10359 , "header==0 on new extent: 32 bit mmap space exceeded?", header() ); // null if file open failed
- int ExtentSize = approxSize <= header()->unusedLength ? approxSize : header()->unusedLength;
+ int ExtentSize = min(header()->unusedLength, approxSize);
DiskLoc loc;
if ( ExtentSize < Extent::minSize() ) {
- /* not there could be a lot of looping here is db just started and
+ /* note there could be a lot of looping here if the db just started and
no files are open yet. we might want to do something about that. */
if ( loops > 8 ) {
assert( loops < 10000 );
@@ -455,12 +458,12 @@ namespace mongo {
}
int offset = header()->unused.getOfs();
- DataFileHeader *h = getDur().writing(header());
- h->unused.set( fileNo, offset + ExtentSize );
- h->unusedLength -= ExtentSize;
+ DataFileHeader *h = header();
+ h->unused.writing().set( fileNo, offset + ExtentSize );
+ getDur().writingInt(h->unusedLength) = h->unusedLength - ExtentSize;
loc.set(fileNo, offset);
Extent *e = _getExtent(loc);
- DiskLoc emptyLoc = getDur().writing(e)->init(ns, ExtentSize, fileNo, offset);
+ DiskLoc emptyLoc = getDur().writing(e)->init(ns, ExtentSize, fileNo, offset, newCapped);
addNewExtentToNamespace(ns, e, loc, emptyLoc, newCapped);
@@ -484,11 +487,15 @@ namespace mongo {
low = (int) (approxSize * 0.8);
high = (int) (approxSize * 1.4);
}
- if( high < 0 ) high = approxSize;
+ if( high <= 0 ) {
+ // overflowed
+ high = max(approxSize, Extent::maxSize());
+ }
int n = 0;
Extent *best = 0;
int bestDiff = 0x7fffffff;
{
+ Timer t;
DiskLoc L = f->firstExtent;
while( !L.isNull() ) {
Extent * e = L.ext();
@@ -497,16 +504,35 @@ namespace mongo {
if( diff < bestDiff ) {
bestDiff = diff;
best = e;
- if( diff == 0 )
+ if( ((double) diff) / approxSize < 0.1 ) {
+ // close enough
break;
+ }
+ if( t.seconds() >= 2 ) {
+ // have spent lots of time in write lock, and we are in [low,high], so close enough
+ // could come into play if extent freelist is very long
+ break;
+ }
+ }
+ else {
+ OCCASIONALLY {
+ if( high < 64 * 1024 && t.seconds() >= 2 ) {
+ // be less picky if it is taking a long time
+ high = 64 * 1024;
+ }
+ }
}
}
L = e->xnext;
++n;
-
+ }
+ if( t.seconds() >= 10 ) {
+ log() << "warning: slow scan in allocFromFreeList (in write lock)" << endl;
}
}
- OCCASIONALLY if( n > 512 ) log() << "warning: newExtent " << n << " scanned\n";
+
+ if( n > 128 ) log( n < 512 ) << "warning: newExtent " << n << " scanned\n";
+
if( best ) {
Extent *e = best;
// remove from the free list
@@ -521,7 +547,7 @@ namespace mongo {
// use it
OCCASIONALLY if( n > 512 ) log() << "warning: newExtent " << n << " scanned\n";
- DiskLoc emptyLoc = e->reuse(ns);
+ DiskLoc emptyLoc = e->reuse(ns, capped);
addNewExtentToNamespace(ns, e, e->myLoc, emptyLoc, capped);
return e;
}
@@ -533,24 +559,43 @@ namespace mongo {
/*---------------------------------------------------------------------*/
- DiskLoc Extent::reuse(const char *nsname) {
- return getDur().writing(this)->_reuse(nsname);
- }
- DiskLoc Extent::_reuse(const char *nsname) {
- log(3) << "reset extent was:" << nsDiagnostic.toString() << " now:" << nsname << '\n';
- massert( 10360 , "Extent::reset bad magic value", magic == 0x41424344 );
+ void Extent::markEmpty() {
xnext.Null();
xprev.Null();
- nsDiagnostic = nsname;
firstRecord.Null();
lastRecord.Null();
+ }
- DiskLoc emptyLoc = myLoc;
- emptyLoc.inc( (int) (_extentData-(char*)this) );
+ DiskLoc Extent::reuse(const char *nsname, bool capped) {
+ return getDur().writing(this)->_reuse(nsname, capped);
+ }
- int delRecLength = length - (_extentData - (char *) this);
+ void getEmptyLoc(const char *ns, const DiskLoc extentLoc, int extentLength, bool capped, /*out*/DiskLoc& emptyLoc, /*out*/int& delRecLength) {
+ emptyLoc = extentLoc;
+ emptyLoc.inc( Extent::HeaderSize() );
+ delRecLength = extentLength - Extent::HeaderSize();
+ if( delRecLength >= 32*1024 && str::contains(ns, '$') && !capped ) {
+ // probably an index. so skip forward to keep its records page aligned
+ int& ofs = emptyLoc.GETOFS();
+ int newOfs = (ofs + 0xfff) & ~0xfff;
+ delRecLength -= (newOfs-ofs);
+ dassert( delRecLength > 0 );
+ ofs = newOfs;
+ }
+ }
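Worked numbers (illustrative): if the empty region would start at offset 0x1234, newOfs = (0x1234 + 0xfff) & ~0xfff = 0x2000, so the start advances by 0xdcc (3532) bytes and delRecLength shrinks by the same amount; btree buckets allocated from the region then begin on a 4 KB page boundary.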
- DeletedRecord *empty = DataFileMgr::makeDeletedRecord(emptyLoc, delRecLength);//(DeletedRecord *) getRecord(emptyLoc);
+ DiskLoc Extent::_reuse(const char *nsname, bool capped) {
+ LOG(3) << "reset extent was:" << nsDiagnostic.toString() << " now:" << nsname << '\n';
+ massert( 10360 , "Extent::reset bad magic value", magic == 0x41424344 );
+ nsDiagnostic = nsname;
+ markEmpty();
+
+ DiskLoc emptyLoc;
+ int delRecLength;
+ getEmptyLoc(nsname, myLoc, length, capped, emptyLoc, delRecLength);
+
+ // todo: some dup code here and below in Extent::init
+ DeletedRecord *empty = DataFileMgr::makeDeletedRecord(emptyLoc, delRecLength);
empty = getDur().writing(empty);
empty->lengthWithHeaders = delRecLength;
empty->extentOfs = myLoc.getOfs();
@@ -560,7 +605,7 @@ namespace mongo {
}
/* assumes already zeroed -- insufficient for block 'reuse' perhaps */
- DiskLoc Extent::init(const char *nsname, int _length, int _fileNo, int _offset) {
+ DiskLoc Extent::init(const char *nsname, int _length, int _fileNo, int _offset, bool capped) {
magic = 0x41424344;
myLoc.set(_fileNo, _offset);
xnext.Null();
@@ -570,12 +615,12 @@ namespace mongo {
firstRecord.Null();
lastRecord.Null();
- DiskLoc emptyLoc = myLoc;
- emptyLoc.inc( (int) (_extentData-(char*)this) );
+ DiskLoc emptyLoc;
+ int delRecLength;
+ getEmptyLoc(nsname, myLoc, _length, capped, emptyLoc, delRecLength);
- int l = _length - (_extentData - (char *) this);
- DeletedRecord *empty = getDur().writing( DataFileMgr::makeDeletedRecord(emptyLoc, l) );
- empty->lengthWithHeaders = l;
+ DeletedRecord *empty = getDur().writing( DataFileMgr::makeDeletedRecord(emptyLoc, delRecLength) );
+ empty->lengthWithHeaders = delRecLength;
empty->extentOfs = myLoc.getOfs();
return emptyLoc;
}
@@ -673,7 +718,7 @@ namespace mongo {
/* todo: if extent is empty, free it for reuse elsewhere.
that is a bit complicated have to clean up the freelists.
*/
- RARELY out() << "info DFM::findAll(): extent " << loc.toString() << " was empty, skipping ahead " << ns << endl;
+ RARELY out() << "info DFM::findAll(): extent " << loc.toString() << " was empty, skipping ahead. ns:" << ns << endl;
// find a nonempty extent
// it might be nice to free the whole extent here! but have to clean up free recs then.
e = e->getNextExtent();
@@ -713,7 +758,7 @@ namespace mongo {
void printFreeList() {
string s = cc().database()->name + ".$freelist";
- log() << "dump freelist " << s << '\n';
+ log() << "dump freelist " << s << endl;
NamespaceDetails *freeExtents = nsdetails(s.c_str());
if( freeExtents == 0 ) {
log() << " freeExtents==0" << endl;
@@ -722,11 +767,48 @@ namespace mongo {
DiskLoc a = freeExtents->firstExtent;
while( !a.isNull() ) {
Extent *e = a.ext();
- log() << " " << a.toString() << " len:" << e->length << " prev:" << e->xprev.toString() << '\n';
+ log() << " extent " << a.toString() << " len:" << e->length << " prev:" << e->xprev.toString() << endl;
a = e->xnext;
}
- log() << " end freelist" << endl;
+ log() << "end freelist" << endl;
+ }
+
+ /** free a list of extents that are no longer in use. this is a double linked list of extents
+ (could be just one in the list)
+ */
+ void freeExtents(DiskLoc firstExt, DiskLoc lastExt) {
+ {
+ assert( !firstExt.isNull() && !lastExt.isNull() );
+ Extent *f = firstExt.ext();
+ Extent *l = lastExt.ext();
+ assert( f->xprev.isNull() );
+ assert( l->xnext.isNull() );
+ assert( f==l || !f->xnext.isNull() );
+ assert( f==l || !l->xprev.isNull() );
+ }
+
+ string s = cc().database()->name + ".$freelist";
+ NamespaceDetails *freeExtents = nsdetails(s.c_str());
+ if( freeExtents == 0 ) {
+ string err;
+ _userCreateNS(s.c_str(), BSONObj(), err, 0); // todo: this actually allocates an extent, which is bad!
+ freeExtents = nsdetails(s.c_str());
+ massert( 10361 , "can't create .$freelist", freeExtents);
+ }
+ if( freeExtents->firstExtent.isNull() ) {
+ freeExtents->firstExtent.writing() = firstExt;
+ freeExtents->lastExtent.writing() = lastExt;
+ }
+ else {
+ DiskLoc a = freeExtents->firstExtent;
+ assert( a.ext()->xprev.isNull() );
+ getDur().writingDiskLoc( a.ext()->xprev ) = lastExt;
+ getDur().writingDiskLoc( lastExt.ext()->xnext ) = a;
+ getDur().writingDiskLoc( freeExtents->firstExtent ) = firstExt;
+ }
+
+ //printFreeList();
}
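The else branch is a standard doubly-linked-list prepend: the freed chain [firstExt..lastExt] is stitched in ahead of the current freelist head. The same operation with plain pointers, DiskLoc and durability writes omitted (illustrative only):

    struct Ext { Ext *xprev, *xnext; };

    // prepend the chain [first..last] before *head (tail bookkeeping omitted)
    void prependChain( Ext*& head, Ext* first, Ext* last ) {
        if ( head ) {
            head->xprev = last;   // old head now follows the spliced chain
            last->xnext = head;
        }
        head = first;             // chain's first extent becomes the new head
    }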
/* drop a collection/namespace */
@@ -755,27 +837,9 @@ namespace mongo {
// free extents
if( !d->firstExtent.isNull() ) {
- string s = cc().database()->name + ".$freelist";
- NamespaceDetails *freeExtents = nsdetails(s.c_str());
- if( freeExtents == 0 ) {
- string err;
- _userCreateNS(s.c_str(), BSONObj(), err, 0);
- freeExtents = nsdetails(s.c_str());
- massert( 10361 , "can't create .$freelist", freeExtents);
- }
- if( freeExtents->firstExtent.isNull() ) {
- freeExtents->firstExtent.writing() = d->firstExtent;
- freeExtents->lastExtent.writing() = d->lastExtent;
- }
- else {
- DiskLoc a = freeExtents->firstExtent;
- assert( a.ext()->xprev.isNull() );
- getDur().writingDiskLoc( a.ext()->xprev ) = d->lastExtent;
- getDur().writingDiskLoc( d->lastExtent.ext()->xnext ) = a;
- getDur().writingDiskLoc( freeExtents->firstExtent ) = d->firstExtent;
- getDur().writingDiskLoc( d->firstExtent ).setInvalid();
- getDur().writingDiskLoc( d->lastExtent ).setInvalid();
- }
+ freeExtents(d->firstExtent, d->lastExtent);
+ getDur().writingDiskLoc( d->firstExtent ).setInvalid();
+ getDur().writingDiskLoc( d->lastExtent ).setInvalid();
}
// remove from the catalog hashtable
@@ -810,22 +874,17 @@ namespace mongo {
dropNS(name);
}
- int nUnindexes = 0;
-
/* unindex all keys in index for this record. */
static void _unindexRecord(IndexDetails& id, BSONObj& obj, const DiskLoc& dl, bool logMissing = true) {
- BSONObjSetDefaultOrder keys;
+ BSONObjSet keys;
id.getKeysFromObject(obj, keys);
- for ( BSONObjSetDefaultOrder::iterator i=keys.begin(); i != keys.end(); i++ ) {
+ IndexInterface& ii = id.idxInterface();
+ for ( BSONObjSet::iterator i=keys.begin(); i != keys.end(); i++ ) {
BSONObj j = *i;
- if ( otherTraceLevel >= 5 ) {
- out() << "_unindexRecord() " << obj.toString();
- out() << "\n unindex:" << j.toString() << endl;
- }
- nUnindexes++;
+
bool ok = false;
try {
- ok = id.head.btree()->unindex(id.head, id, j, dl);
+ ok = ii.unindex(id.head, id, j, dl);
}
catch (AssertionException& e) {
problem() << "Assertion failure: _unindex failed " << id.indexNamespace() << endl;
@@ -837,7 +896,7 @@ namespace mongo {
}
if ( !ok && logMissing ) {
- out() << "unindex failed (key too big?) " << id.indexNamespace() << '\n';
+ log() << "unindex failed (key too big?) " << id.indexNamespace() << " key: " << j << " " << obj["_id"] << endl;
}
}
}
@@ -910,7 +969,7 @@ namespace mongo {
}
}
- void DataFileMgr::deleteRecord(const char *ns, Record *todelete, const DiskLoc& dl, bool cappedOK, bool noWarn) {
+ void DataFileMgr::deleteRecord(const char *ns, Record *todelete, const DiskLoc& dl, bool cappedOK, bool noWarn, bool doLog ) {
dassert( todelete == dl.rec() );
NamespaceDetails* d = nsdetails(ns);
@@ -919,6 +978,14 @@ namespace mongo {
uassert( 10089 , "can't remove from a capped collection" , 0 );
return;
}
+
+ BSONObj toDelete;
+ if ( doLog ) {
+ BSONElement e = dl.obj()["_id"];
+ if ( e.type() ) {
+ toDelete = e.wrap();
+ }
+ }
/* check if any cursors point to us. if so, advance them. */
ClientCursor::aboutToDelete(dl);
@@ -927,6 +994,10 @@ namespace mongo {
_deleteRecord(d, ns, todelete, dl);
NamespaceDetailsTransient::get_w( ns ).notifyOfWriteOp();
+
+ if ( ! toDelete.isEmpty() ) {
+ logOp( "d" , ns , toDelete );
+ }
}
@@ -938,7 +1009,7 @@ namespace mongo {
NamespaceDetailsTransient *nsdt,
Record *toupdate, const DiskLoc& dl,
const char *_buf, int _len, OpDebug& debug, bool god) {
- StringBuilder& ss = debug.str;
+
dassert( toupdate == dl.rec() );
BSONObj objOld(toupdate);
@@ -972,8 +1043,7 @@ namespace mongo {
// doesn't fit. reallocate -----------------------------------------------------
uassert( 10003 , "failing update: objects in a capped ns cannot grow", !(d && d->capped));
d->paddingTooSmall();
- if ( cc().database()->profile )
- ss << " moved ";
+ debug.moved = true;
deleteRecord(ns, toupdate, dl);
return insert(ns, objNew.objdata(), objNew.objsize(), god);
}
@@ -987,12 +1057,17 @@ namespace mongo {
int z = d->nIndexesBeingBuilt();
for ( int x = 0; x < z; x++ ) {
IndexDetails& idx = d->idx(x);
+ IndexInterface& ii = idx.idxInterface();
for ( unsigned i = 0; i < changes[x].removed.size(); i++ ) {
try {
- idx.head.btree()->unindex(idx.head, idx, *changes[x].removed[i], dl);
+ bool found = ii.unindex(idx.head, idx, *changes[x].removed[i], dl);
+ if ( ! found ) {
+ RARELY warning() << "ns: " << ns << " couldn't unindex key: " << *changes[x].removed[i]
+ << " for doc: " << objOld["_id"] << endl;
+ }
}
catch (AssertionException&) {
- ss << " exception update unindex ";
+ debug.extra << " exception update unindex ";
problem() << " caught assertion update unindex " << idx.indexNamespace() << endl;
}
}
@@ -1003,18 +1078,18 @@ namespace mongo {
for ( unsigned i = 0; i < changes[x].added.size(); i++ ) {
try {
/* we did the dupCheck() above. so we don't have to worry about it here. */
- idx.head.btree()->bt_insert(
+ ii.bt_insert(
idx.head,
dl, *changes[x].added[i], ordering, /*dupsAllowed*/true, idx);
}
catch (AssertionException& e) {
- ss << " exception update index ";
- problem() << " caught assertion update index " << idx.indexNamespace() << " " << e << endl;
+ debug.extra << " exception update index ";
+ problem() << " caught assertion update index " << idx.indexNamespace() << " " << e << " " << objNew["_id"] << endl;
}
}
}
- if( keyUpdates && cc().database()->profile )
- ss << '\n' << keyUpdates << " key updates ";
+
+ debug.keyUpdates = keyUpdates;
}
// update in place
@@ -1047,19 +1122,21 @@ namespace mongo {
/* add keys to index idxNo for a new record */
static inline void _indexRecord(NamespaceDetails *d, int idxNo, BSONObj& obj, DiskLoc recordLoc, bool dupsAllowed) {
IndexDetails& idx = d->idx(idxNo);
- BSONObjSetDefaultOrder keys;
+ BSONObjSet keys;
idx.getKeysFromObject(obj, keys);
+ if( keys.empty() )
+ return;
BSONObj order = idx.keyPattern();
+ IndexInterface& ii = idx.idxInterface();
Ordering ordering = Ordering::make(order);
int n = 0;
- for ( BSONObjSetDefaultOrder::iterator i=keys.begin(); i != keys.end(); i++ ) {
+ for ( BSONObjSet::iterator i=keys.begin(); i != keys.end(); i++ ) {
if( ++n == 2 ) {
d->setIndexIsMultikey(idxNo);
}
assert( !recordLoc.isNull() );
try {
- idx.head.btree()->bt_insert(idx.head, recordLoc,
- *i, ordering, dupsAllowed, idx);
+ ii.bt_insert(idx.head, recordLoc, *i, ordering, dupsAllowed, idx);
}
catch (AssertionException& e) {
if( e.getCode() == 10287 && idxNo == d->nIndexes ) {
@@ -1070,17 +1147,18 @@ namespace mongo {
// dup key exception, presumably.
throw;
}
- problem() << " caught assertion _indexRecord " << idx.indexNamespace() << endl;
+ problem() << " caught assertion _indexRecord " << idx.indexNamespace() << " " << obj["_id"] << endl;
}
}
}
+#if 0
void testSorting() {
BSONObjBuilder b;
b.appendNull("");
BSONObj x = b.obj();
- BSONObjExternalSorter sorter;
+ BSONObjExternalSorter sorter(*IndexDetails::iis[1]);
sorter.add(x, DiskLoc(3,7));
sorter.add(x, DiskLoc(4,7));
@@ -1098,6 +1176,62 @@ namespace mongo {
cout<<"SORTER next:" << d.first.toString() << endl;*/
}
}
+#endif
+
+ SortPhaseOne *precalced = 0;
+
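+ /** phases 2 and 3 of a foreground ("fast") index build: drain the external
+ sorter's merged key stream into a bottom-up btree builder, then commit.
+ templated on the on-disk btree format version (V0 or V1). */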
+ template< class V >
+ void buildBottomUpPhases2And3(bool dupsAllowed, IndexDetails& idx, BSONObjExternalSorter& sorter,
+ bool dropDups, list<DiskLoc> &dupsToDrop, CurOp * op, SortPhaseOne *phase1, ProgressMeterHolder &pm,
+ Timer& t
+ )
+ {
+ BtreeBuilder<V> btBuilder(dupsAllowed, idx);
+ BSONObj keyLast;
+ auto_ptr<BSONObjExternalSorter::Iterator> i = sorter.iterator();
+ assert( pm == op->setMessage( "index: (2/3) btree bottom up" , phase1->nkeys , 10 ) );
+ while( i->more() ) {
+ RARELY killCurrentOp.checkForInterrupt();
+ BSONObjExternalSorter::Data d = i->next();
+
+ try {
+ if ( !dupsAllowed && dropDups ) {
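+ // suppress lastError while adding the key: dup key failures are expected
+ // here and are handled below by queueing the duplicate doc for deletion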
+ LastError::Disabled led( lastError.get() );
+ btBuilder.addKey(d.first, d.second);
+ }
+ else {
+ btBuilder.addKey(d.first, d.second);
+ }
+ }
+ catch( AssertionException& e ) {
+ if ( dupsAllowed ) {
+ // unknown exception?
+ throw;
+ }
+
+ if( e.interrupted() ) {
+ killCurrentOp.checkForInterrupt();
+ }
+
+ if ( ! dropDups )
+ throw;
+
+ /* we could queue these on disk, but normally there are very few dups, so instead we
+ keep them in ram and have a limit.
+ */
+ dupsToDrop.push_back(d.second);
+ uassert( 10092 , "too many dups on index build with dropDups=true", dupsToDrop.size() < 1000000 );
+ }
+ pm.hit();
+ }
+ pm.finished();
+ op->setMessage( "index: (3/3) btree-middle" );
+ log(t.seconds() > 10 ? 0 : 1 ) << "\t done building bottom layer, going to commit" << endl;
+ btBuilder.commit();
+ if ( btBuilder.getn() != phase1->nkeys && ! dropDups ) {
+ warning() << "not all entries were added to the index, probably some keys were too large" << endl;
+ }
+ }
// throws DBException
unsigned long long fastBuildIndex(const char *ns, NamespaceDetails *d, IndexDetails& idx, int idxNo) {
@@ -1116,39 +1250,36 @@ namespace mongo {
if ( logLevel > 1 ) printMemInfo( "before index start" );
/* get and sort all the keys ----- */
- unsigned long long n = 0;
- shared_ptr<Cursor> c = theDataFileMgr.findAll(ns);
- BSONObjExternalSorter sorter(order);
- sorter.hintNumObjects( d->stats.nrecords );
- unsigned long long nkeys = 0;
ProgressMeterHolder pm( op->setMessage( "index: (1/3) external sort" , d->stats.nrecords , 10 ) );
- while ( c->ok() ) {
- BSONObj o = c->current();
- DiskLoc loc = c->currLoc();
-
- BSONObjSetDefaultOrder keys;
- idx.getKeysFromObject(o, keys);
- int k = 0;
- for ( BSONObjSetDefaultOrder::iterator i=keys.begin(); i != keys.end(); i++ ) {
- if( ++k == 2 ) {
- d->setIndexIsMultikey(idxNo);
+ SortPhaseOne _ours;
+ SortPhaseOne *phase1 = precalced;
+ if( phase1 == 0 ) {
+ phase1 = &_ours;
+ SortPhaseOne& p1 = *phase1;
+ shared_ptr<Cursor> c = theDataFileMgr.findAll(ns);
+ p1.sorter.reset( new BSONObjExternalSorter(idx.idxInterface(), order) );
+ p1.sorter->hintNumObjects( d->stats.nrecords );
+ const IndexSpec& spec = idx.getSpec();
+ while ( c->ok() ) {
+ BSONObj o = c->current();
+ DiskLoc loc = c->currLoc();
+ p1.addKeys(spec, o, loc);
+ c->advance();
+ pm.hit();
+ if ( logLevel > 1 && p1.n % 10000 == 0 ) {
+ printMemInfo( "\t iterating objects" );
}
- sorter.add(*i, loc);
- nkeys++;
- }
+ }
+ }
+ pm.finished();
- c->advance();
- n++;
- pm.hit();
- if ( logLevel > 1 && n % 10000 == 0 ) {
- printMemInfo( "\t iterating objects" );
- }
+ BSONObjExternalSorter& sorter = *(phase1->sorter);
- };
- pm.finished();
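+ // phase 1 recorded whether any document produced more than one key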
+ if( phase1->multi )
+ d->setIndexIsMultikey(idxNo);
if ( logLevel > 1 ) printMemInfo( "before final sort" );
- sorter.sort();
+ phase1->sorter->sort();
if ( logLevel > 1 ) printMemInfo( "after final sort" );
log(t.seconds() > 5 ? 0 : 1) << "\t external sort used : " << sorter.numFiles() << " files " << " in " << t.seconds() << " secs" << endl;
@@ -1156,55 +1287,21 @@ namespace mongo {
list<DiskLoc> dupsToDrop;
/* build index --- */
- {
- BtreeBuilder btBuilder(dupsAllowed, idx);
- BSONObj keyLast;
- auto_ptr<BSONObjExternalSorter::Iterator> i = sorter.iterator();
- assert( pm == op->setMessage( "index: (2/3) btree bottom up" , nkeys , 10 ) );
- while( i->more() ) {
- RARELY killCurrentOp.checkForInterrupt();
- BSONObjExternalSorter::Data d = i->next();
-
- try {
- btBuilder.addKey(d.first, d.second);
- }
- catch( AssertionException& e ) {
- if ( dupsAllowed ) {
- // unknow exception??
- throw;
- }
-
- if( e.interrupted() )
- throw;
-
- if ( ! dropDups )
- throw;
-
- /* we could queue these on disk, but normally there are very few dups, so instead we
- keep in ram and have a limit.
- */
- dupsToDrop.push_back(d.second);
- uassert( 10092 , "too may dups on index build with dropDups=true", dupsToDrop.size() < 1000000 );
- }
- pm.hit();
- }
- pm.finished();
- op->setMessage( "index: (3/3) btree-middle" );
- log(t.seconds() > 10 ? 0 : 1 ) << "\t done building bottom layer, going to commit" << endl;
- btBuilder.commit();
- if ( btBuilder.getn() != nkeys && ! dropDups ) {
- warning() << "not all entries were added to the index, probably some keys were too large" << endl;
- }
- }
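+ // dispatch on the index's on-disk format version: V0 (legacy) or V1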
+ if( idx.version() == 0 )
+ buildBottomUpPhases2And3<V0>(dupsAllowed, idx, sorter, dropDups, dupsToDrop, op, phase1, pm, t);
+ else if( idx.version() == 1 )
+ buildBottomUpPhases2And3<V1>(dupsAllowed, idx, sorter, dropDups, dupsToDrop, op, phase1, pm, t);
+ else
+ assert(false);
log(1) << "\t fastBuildIndex dupsToDrop:" << dupsToDrop.size() << endl;
for( list<DiskLoc>::iterator i = dupsToDrop.begin(); i != dupsToDrop.end(); i++ ){
- theDataFileMgr.deleteRecord( ns, i->rec(), *i, false, true );
+ theDataFileMgr.deleteRecord( ns, i->rec(), *i, false /* cappedOk */ , true /* noWarn */ , isMaster( ns ) /* logOp */ );
getDur().commitIfNeeded();
}
- return n;
+ return phase1->n;
}
class BackgroundIndexBuildJob : public BackgroundOperation {
@@ -1226,18 +1323,27 @@ namespace mongo {
while ( cc->ok() ) {
BSONObj js = cc->current();
try {
- _indexRecord(d, idxNo, js, cc->currLoc(), dupsAllowed);
+ {
+ if ( !dupsAllowed && dropDups ) {
+ LastError::Disabled led( lastError.get() );
+ _indexRecord(d, idxNo, js, cc->currLoc(), dupsAllowed);
+ }
+ else {
+ _indexRecord(d, idxNo, js, cc->currLoc(), dupsAllowed);
+ }
+ }
cc->advance();
}
catch( AssertionException& e ) {
- if( e.interrupted() )
- throw;
+ if( e.interrupted() ) {
+ killCurrentOp.checkForInterrupt();
+ }
if ( dropDups ) {
DiskLoc toDelete = cc->currLoc();
bool ok = cc->advance();
cc->updateLocation();
- theDataFileMgr.deleteRecord( ns, toDelete.rec(), toDelete, false, true );
+ theDataFileMgr.deleteRecord( ns, toDelete.rec(), toDelete, false, true , true );
if( ClientCursor::find(id, false) == 0 ) {
cc.release();
if( !ok ) {
@@ -1259,7 +1365,10 @@ namespace mongo {
getDur().commitIfNeeded();
- if ( n % 128 == 0 && !cc->yield() ) {
+ if ( cc->yieldSometimes( ClientCursor::WillNeed ) ) {
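+ // still alive after the yield; writers may have run while the lock was
+ // released, so refresh the progress meter's total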
+ progress.setTotalWhileRunning( d->stats.nrecords );
+ }
+ else {
cc.release();
uasserted(12584, "cursor gone during bg index");
break;
@@ -1292,7 +1401,7 @@ namespace mongo {
prep(ns.c_str(), d);
assert( idxNo == d->nIndexes );
try {
- idx.head.writing() = BtreeBucket::addBucket(idx);
+ idx.head.writing() = idx.idxInterface().addBucket(idx);
n = addExistingToIndex(ns.c_str(), d, idx, idxNo);
}
catch(...) {
@@ -1336,18 +1445,18 @@ namespace mongo {
// throws DBException
static void buildAnIndex(string ns, NamespaceDetails *d, IndexDetails& idx, int idxNo, bool background) {
- tlog() << "building new index on " << idx.keyPattern() << " for " << ns << ( background ? " background" : "" ) << endl;
+ tlog() << "build index " << ns << ' ' << idx.keyPattern() << ( background ? " background" : "" ) << endl;
Timer t;
unsigned long long n;
- if( background ) {
- log(2) << "buildAnIndex: background=true\n";
- }
-
assert( !BackgroundOperation::inProgForNs(ns.c_str()) ); // should have been checked earlier, better not be...
assert( d->indexBuildInProgress == 0 );
assertInWriteLock();
RecoverableIndexState recoverable( d );
+
+ // Build index spec here in case the collection is empty and the index details are invalid
+ idx.getSpec();
+
if( inDBRepair || !background ) {
n = fastBuildIndex(ns.c_str(), d, idx, idxNo);
assert( !idx.head.isNull() );
@@ -1356,7 +1465,7 @@ namespace mongo {
BackgroundIndexBuildJob j(ns.c_str());
n = j.go(ns, d, idx, idxNo);
}
- tlog() << "done for " << n << " records " << t.millis() / 1000.0 << "secs" << endl;
+ tlog() << "build index done " << n << " records " << t.millis() / 1000.0 << " secs" << endl;
}
/* add keys to indexes for a new record */
@@ -1436,17 +1545,16 @@ namespace mongo {
logOp( "i", ns, tmp );
}
+ /** @param o the object to insert. can be modified to add _id and thus be an in/out param
+ */
DiskLoc DataFileMgr::insertWithObjMod(const char *ns, BSONObj &o, bool god) {
- DiskLoc loc = insert( ns, o.objdata(), o.objsize(), god );
- if ( !loc.isNull() )
+ bool addedID = false;
+ DiskLoc loc = insert( ns, o.objdata(), o.objsize(), god, true, &addedID );
+ if( addedID && !loc.isNull() )
o = BSONObj( loc.rec() );
return loc;
}
- void DataFileMgr::insertNoReturnVal(const char *ns, BSONObj o, bool god) {
- insert( ns, o.objdata(), o.objsize(), god );
- }
-
bool prepareToBuildIndex(const BSONObj& io, bool god, string& sourceNS, NamespaceDetails *&sourceCollection, BSONObj& fixedIndexObject );
// We are now doing two btree scans for all unique indexes (one here, and one when we've
@@ -1457,55 +1565,186 @@ namespace mongo {
for ( int idxNo = 0; idxNo < d->nIndexes; idxNo++ ) {
if( d->idx(idxNo).unique() ) {
IndexDetails& idx = d->idx(idxNo);
- BSONObjSetDefaultOrder keys;
+ BSONObjSet keys;
idx.getKeysFromObject(obj, keys);
BSONObj order = idx.keyPattern();
- for ( BSONObjSetDefaultOrder::iterator i=keys.begin(); i != keys.end(); i++ ) {
+ IndexInterface& ii = idx.idxInterface();
+ for ( BSONObjSet::iterator i=keys.begin(); i != keys.end(); i++ ) {
+ // WARNING: findSingle may not be compound index safe. this may need to change. see notes in
+ // findSingle code.
uassert( 12582, "duplicate key insert for unique index of capped collection",
- idx.head.btree()->findSingle(idx, idx.head, *i ).isNull() );
+ ii.findSingle(idx, idx.head, *i ).isNull() );
}
}
}
}
- /* note: if god==true, you may pass in obuf of NULL and then populate the returned DiskLoc
- after the call -- that will prevent a double buffer copy in some cases (btree.cpp).
+ /** add a record to the end of the linked list chain within this extent.
+ require: you must have already declared write intent for the record header.
*/
- DiskLoc DataFileMgr::insert(const char *ns, const void *obuf, int len, bool god, const BSONElement &writeId, bool mayAddIndex) {
- bool wouldAddIndex = false;
- massert( 10093 , "cannot insert into reserved $ collection", god || isANormalNSName( ns ) );
- uassert( 10094 , str::stream() << "invalid ns: " << ns , isValidNS( ns ) );
- const char *sys = strstr(ns, "system.");
- if ( sys ) {
- uassert( 10095 , "attempt to insert in reserved database name 'system'", sys != ns);
- if ( strstr(ns, ".system.") ) {
- // later:check for dba-type permissions here if have that at some point separate
- if ( strstr(ns, ".system.indexes" ) )
- wouldAddIndex = true;
- else if ( legalClientSystemNS( ns , true ) )
- ;
- else if ( !god ) {
- out() << "ERROR: attempt to insert in system namespace " << ns << endl;
- return DiskLoc();
+ void addRecordToRecListInExtent(Record *r, DiskLoc loc) {
+ dassert( loc.rec() == r );
+ Extent *e = r->myExtent(loc);
+ if ( e->lastRecord.isNull() ) {
+ Extent::FL *fl = getDur().writing(e->fl());
+ fl->firstRecord = fl->lastRecord = loc;
+ r->prevOfs = r->nextOfs = DiskLoc::NullOfs;
+ }
+ else {
+ Record *oldlast = e->lastRecord.rec();
+ r->prevOfs = e->lastRecord.getOfs();
+ r->nextOfs = DiskLoc::NullOfs;
+ getDur().writingInt(oldlast->nextOfs) = loc.getOfs();
+ getDur().writingDiskLoc(e->lastRecord) = loc;
+ }
+ }
+
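+ /** insert slow path: allocate a new extent and retry the allocation.
+ capped collections never grow, so for them this returns a null loc. */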
+ NOINLINE_DECL DiskLoc outOfSpace(const char *ns, NamespaceDetails *d, int lenWHdr, bool god, DiskLoc extentLoc) {
+ DiskLoc loc;
+ if ( d->capped == 0 ) { // size capped doesn't grow
+ log(1) << "allocating new extent for " << ns << " padding:" << d->paddingFactor << " lenWHdr: " << lenWHdr << endl;
+ cc().database()->allocExtent(ns, Extent::followupSize(lenWHdr, d->lastExtentSize), false, !god);
+ loc = d->alloc(ns, lenWHdr, extentLoc);
+ if ( loc.isNull() ) {
+ log() << "warning: alloc() failed after allocating new extent. lenWHdr: " << lenWHdr << " last extent size:" << d->lastExtentSize << "; trying again\n";
+ for ( int z=0; z<10 && lenWHdr > d->lastExtentSize; z++ ) {
+ log() << "try #" << z << endl;
+ cc().database()->allocExtent(ns, Extent::followupSize(lenWHdr, d->lastExtentSize), false, !god);
+ loc = d->alloc(ns, lenWHdr, extentLoc);
+ if ( ! loc.isNull() )
+ break;
}
}
- else
- sys = 0;
}
+ return loc;
+ }
+
+ /** used by insert and also compact
+ * @return null loc if out of space
+ */
+ DiskLoc allocateSpaceForANewRecord(const char *ns, NamespaceDetails *d, int lenWHdr, bool god) {
+ DiskLoc extentLoc;
+ DiskLoc loc = d->alloc(ns, lenWHdr, extentLoc);
+ if ( loc.isNull() ) {
+ loc = outOfSpace(ns, d, lenWHdr, god, extentLoc);
+ }
+ return loc;
+ }
+
+ bool NOINLINE_DECL insert_checkSys(const char *sys, const char *ns, bool& wouldAddIndex, const void *obuf, bool god) {
+ uassert( 10095 , "attempt to insert in reserved database name 'system'", sys != ns);
+ if ( strstr(ns, ".system.") ) {
+ // later:check for dba-type permissions here if have that at some point separate
+ if ( strstr(ns, ".system.indexes" ) )
+ wouldAddIndex = true;
+ else if ( legalClientSystemNS( ns , true ) ) {
+ if ( obuf && strstr( ns , ".system.users" ) ) {
+ BSONObj t( reinterpret_cast<const char *>( obuf ) );
+ uassert( 14051 , "system.users entry needs 'user' field to be a string" , t["user"].type() == String );
+ uassert( 14052 , "system.users entry needs 'pwd' field to be a string" , t["pwd"].type() == String );
+ uassert( 14053 , "system.users entry needs 'user' field to be non-empty" , t["user"].String().size() );
+ uassert( 14054 , "system.users entry needs 'pwd' field to be non-empty" , t["pwd"].String().size() );
+ }
+ }
+ else if ( !god ) {
+ // todo this should probably uassert rather than doing this:
+ log() << "ERROR: attempt to insert in system namespace " << ns << endl;
+ return false;
+ }
+ }
+ return true;
+ }
+
+ NOINLINE_DECL NamespaceDetails* insert_newNamespace(const char *ns, int len, bool god) {
+ addNewNamespaceToCatalog(ns);
+ /* todo: shouldn't be in the namespace catalog until after the allocations here work.
+ also if this is an addIndex, those checks should happen before this!
+ */
+ // This may create first file in the database.
+ int ies = Extent::initialSize(len);
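+ // a record within ~256 bytes of a V1 btree bucket, in a '$'-containing ns,
+ // is almost certainly an index bucket rather than a user document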
+ if( str::contains(ns, '$') && len + Record::HeaderSize >= BtreeData_V1::BucketSize - 256 && len + Record::HeaderSize <= BtreeData_V1::BucketSize + 256 ) {
+ // probably an index. so we pick a value here for the first extent instead of using initialExtentSize() which is more
+ // for user collections. TODO: we could look at the # of records in the parent collection to be smarter here.
+ ies = (32+4) * 1024;
+ }
+ cc().database()->allocExtent(ns, ies, false, false);
+ NamespaceDetails *d = nsdetails(ns);
+ if ( !god )
+ ensureIdIndexForNewNs(ns);
+ return d;
+ }
+
+ void NOINLINE_DECL insert_makeIndex(NamespaceDetails *tableToIndex, const string& tabletoidxns, const DiskLoc& loc) {
+ uassert( 13143 , "can't create index on system.indexes" , tabletoidxns.find( ".system.indexes" ) == string::npos );
+
+ BSONObj info = loc.obj();
+ bool background = info["background"].trueValue();
+ if( background && cc().isSyncThread() ) {
+ /* don't do background indexing on slaves. there are nuances. this could be added later
+ but requires more code.
+ */
+ log() << "info: indexing in foreground on this replica; was a background index build on the primary" << endl;
+ background = false;
+ }
+
+ int idxNo = tableToIndex->nIndexes;
+ IndexDetails& idx = tableToIndex->addIndex(tabletoidxns.c_str(), !background); // clear transient info caches so they refresh; increments nIndexes
+ getDur().writingDiskLoc(idx.info) = loc;
+ try {
+ buildAnIndex(tabletoidxns, tableToIndex, idx, idxNo, background);
+ }
+ catch( DBException& e ) {
+ // save our error msg string as an exception or dropIndexes will overwrite our message
+ LastError *le = lastError.get();
+ int savecode = 0;
+ string saveerrmsg;
+ if ( le ) {
+ savecode = le->code;
+ saveerrmsg = le->msg;
+ }
+ else {
+ savecode = e.getCode();
+ saveerrmsg = e.what();
+ }
+
+ // roll back this index
+ string name = idx.indexName();
+ BSONObjBuilder b;
+ string errmsg;
+ bool ok = dropIndexes(tableToIndex, tabletoidxns.c_str(), name.c_str(), errmsg, b, true);
+ if( !ok ) {
+ log() << "failed to drop index after a unique key error building it: " << errmsg << ' ' << tabletoidxns << ' ' << name << endl;
+ }
+
+ assert( le && !saveerrmsg.empty() );
+ raiseError(savecode,saveerrmsg.c_str());
+ throw;
+ }
+ }
+
+ /* if god==true, you may pass in obuf of NULL and then populate the returned DiskLoc
+ after the call -- that will prevent a double buffer copy in some cases (btree.cpp).
+
+ @param mayAddIndex almost always true, except for invocation from rename namespace command.
+ @param addedID if not null, set to true if an _id element is added. you must set it to false
+ before calling if you use it.
+ */
+
+ DiskLoc DataFileMgr::insert(const char *ns, const void *obuf, int len, bool god, bool mayAddIndex, bool *addedID) {
+ bool wouldAddIndex = false;
+ massert( 10093 , "cannot insert into reserved $ collection", god || NamespaceString::normal( ns ) );
+ uassert( 10094 , str::stream() << "invalid ns: " << ns , isValidNS( ns ) );
+ {
+ const char *sys = strstr(ns, "system.");
+ if ( sys && !insert_checkSys(sys, ns, wouldAddIndex, obuf, god) )
+ return DiskLoc();
+ }
bool addIndex = wouldAddIndex && mayAddIndex;
NamespaceDetails *d = nsdetails(ns);
if ( d == 0 ) {
- addNewNamespaceToCatalog(ns);
- /* todo: shouldn't be in the namespace catalog until after the allocations here work.
- also if this is an addIndex, those checks should happen before this!
- */
- // This may create first file in the database.
- cc().database()->allocExtent(ns, Extent::initialSize(len), false);
- d = nsdetails(ns);
- if ( !god )
- ensureIdIndexForNewNs(ns);
+ d = insert_newNamespace(ns, len, god);
}
d->paddingFits();
@@ -1516,18 +1755,18 @@ namespace mongo {
if ( addIndex ) {
assert( obuf );
BSONObj io((const char *) obuf);
- if( !prepareToBuildIndex(io, god, tabletoidxns, tableToIndex, fixedIndexObject ) )
+ if( !prepareToBuildIndex(io, god, tabletoidxns, tableToIndex, fixedIndexObject ) ) {
+ // prepare creates _id itself, or this indicates to fail the build silently (such
+ // as if index already exists)
return DiskLoc();
-
+ }
if ( ! fixedIndexObject.isEmpty() ) {
obuf = fixedIndexObject.objdata();
len = fixedIndexObject.objsize();
}
-
}
- const BSONElement *newId = &writeId;
- int addID = 0;
+ int addID = 0; // 0 if not adding _id; if adding, the length of that new element
if( !god ) {
/* Check if we have an _id field. If we don't, we'll add it.
Note that btree buckets which we insert aren't BSONObj's, but in that case god==true.
@@ -1535,20 +1774,18 @@ namespace mongo {
BSONObj io((const char *) obuf);
BSONElement idField = io.getField( "_id" );
uassert( 10099 , "_id cannot be an array", idField.type() != Array );
- if( idField.eoo() && !wouldAddIndex && strstr(ns, ".local.") == 0 ) {
+ // we don't add _id for capped collections as they don't have an _id index
+ if( idField.eoo() && !wouldAddIndex && strstr(ns, ".local.") == 0 && d->haveIdIndex() ) {
+ if( addedID )
+ *addedID = true;
addID = len;
- if ( writeId.eoo() ) {
- // Very likely we'll add this elt, so little harm in init'ing here.
- idToInsert_.oid.init();
- newId = &idToInsert;
- }
- len += newId->size();
+ idToInsert_.oid.init();
+ len += idToInsert.size();
}
BSONElementManipulator::lookForTimestamps( io );
}
- DiskLoc extentLoc;
int lenWHdr = len + Record::HeaderSize;
lenWHdr = (int) (lenWHdr * d->paddingFactor);
if ( lenWHdr == 0 ) {
@@ -1564,29 +1801,11 @@ namespace mongo {
checkNoIndexConflicts( d, BSONObj( reinterpret_cast<const char *>( obuf ) ) );
}
- DiskLoc loc = d->alloc(ns, lenWHdr, extentLoc);
+ DiskLoc loc = allocateSpaceForANewRecord(ns, d, lenWHdr, god);
if ( loc.isNull() ) {
- // out of space
- if ( d->capped == 0 ) { // size capped doesn't grow
- log(1) << "allocating new extent for " << ns << " padding:" << d->paddingFactor << " lenWHdr: " << lenWHdr << endl;
- cc().database()->allocExtent(ns, Extent::followupSize(lenWHdr, d->lastExtentSize), false);
- loc = d->alloc(ns, lenWHdr, extentLoc);
- if ( loc.isNull() ) {
- log() << "WARNING: alloc() failed after allocating new extent. lenWHdr: " << lenWHdr << " last extent size:" << d->lastExtentSize << "; trying again\n";
- for ( int zzz=0; zzz<10 && lenWHdr > d->lastExtentSize; zzz++ ) {
- log() << "try #" << zzz << endl;
- cc().database()->allocExtent(ns, Extent::followupSize(len, d->lastExtentSize), false);
- loc = d->alloc(ns, lenWHdr, extentLoc);
- if ( ! loc.isNull() )
- break;
- }
- }
- }
- if ( loc.isNull() ) {
- log() << "insert: couldn't alloc space for object ns:" << ns << " capped:" << d->capped << endl;
- assert(d->capped);
- return DiskLoc();
- }
+ log() << "insert: couldn't alloc space for object ns:" << ns << " capped:" << d->capped << endl;
+ assert(d->capped);
+ return DiskLoc();
}
Record *r = loc.rec();
@@ -1595,31 +1814,17 @@ namespace mongo {
r = (Record*) getDur().writingPtr(r, lenWHdr);
if( addID ) {
/* a little effort was made here to avoid a double copy when we add an ID */
- ((int&)*r->data) = *((int*) obuf) + newId->size();
- memcpy(r->data+4, newId->rawdata(), newId->size());
- memcpy(r->data+4+newId->size(), ((char *)obuf)+4, addID-4);
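+ // BSON layout is [int32 totalSize][elements...][EOO]: patch the size word,
+ // write the new _id element just after it, then copy the rest of the
+ // original object, skipping its own 4-byte size header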
+ ((int&)*r->data) = *((int*) obuf) + idToInsert.size();
+ memcpy(r->data+4, idToInsert.rawdata(), idToInsert.size());
+ memcpy(r->data+4+idToInsert.size(), ((char *)obuf)+4, addID-4);
}
else {
- if( obuf )
+ if( obuf ) // obuf can be null from internal callers
memcpy(r->data, obuf, len);
}
}
- {
- Extent *e = r->myExtent(loc);
- if ( e->lastRecord.isNull() ) {
- Extent::FL *fl = getDur().writing(e->fl());
- fl->firstRecord = fl->lastRecord = loc;
- r->prevOfs = r->nextOfs = DiskLoc::NullOfs;
- }
- else {
- Record *oldlast = e->lastRecord.rec();
- r->prevOfs = e->lastRecord.getOfs();
- r->nextOfs = DiskLoc::NullOfs;
- getDur().writingInt(oldlast->nextOfs) = loc.getOfs();
- getDur().writingDiskLoc(e->lastRecord) = loc;
- }
- }
+ addRecordToRecListInExtent(r, loc);
/* durability todo : this could be a bit annoying / slow to record constantly */
{
@@ -1628,56 +1833,12 @@ namespace mongo {
s->nrecords++;
}
- // we don't bother clearing those stats for the god tables - also god is true when adidng a btree bucket
+ // we don't bother resetting query optimizer stats for the god tables - also god is true when adding a btree bucket
if ( !god )
NamespaceDetailsTransient::get_w( ns ).notifyOfWriteOp();
if ( tableToIndex ) {
- uassert( 13143 , "can't create index on system.indexes" , tabletoidxns.find( ".system.indexes" ) == string::npos );
-
- BSONObj info = loc.obj();
- bool background = info["background"].trueValue();
- if( background && cc().isSyncThread() ) {
- /* don't do background indexing on slaves. there are nuances. this could be added later
- but requires more code.
- */
- log() << "info: indexing in foreground on this replica; was a background index build on the primary" << endl;
- background = false;
- }
-
- int idxNo = tableToIndex->nIndexes;
- IndexDetails& idx = tableToIndex->addIndex(tabletoidxns.c_str(), !background); // clear transient info caches so they refresh; increments nIndexes
- getDur().writingDiskLoc(idx.info) = loc;
- try {
- buildAnIndex(tabletoidxns, tableToIndex, idx, idxNo, background);
- }
- catch( DBException& e ) {
- // save our error msg string as an exception or dropIndexes will overwrite our message
- LastError *le = lastError.get();
- int savecode = 0;
- string saveerrmsg;
- if ( le ) {
- savecode = le->code;
- saveerrmsg = le->msg;
- }
- else {
- savecode = e.getCode();
- saveerrmsg = e.what();
- }
-
- // roll back this index
- string name = idx.indexName();
- BSONObjBuilder b;
- string errmsg;
- bool ok = dropIndexes(tableToIndex, tabletoidxns.c_str(), name.c_str(), errmsg, b, true);
- if( !ok ) {
- log() << "failed to drop index after a unique key error building it: " << errmsg << ' ' << tabletoidxns << ' ' << name << endl;
- }
-
- assert( le && !saveerrmsg.empty() );
- raiseError(savecode,saveerrmsg.c_str());
- throw;
- }
+ insert_makeIndex(tableToIndex, tabletoidxns, loc);
}
/* add this record to our indexes */
@@ -1703,7 +1864,6 @@ namespace mongo {
}
}
- // out() << " inserted at loc:" << hex << loc.getOfs() << " lenwhdr:" << hex << lenWHdr << dec << ' ' << ns << endl;
return loc;
}
@@ -1718,10 +1878,7 @@ namespace mongo {
DiskLoc extentLoc;
int lenWHdr = len + Record::HeaderSize;
DiskLoc loc = d->alloc(ns, lenWHdr, extentLoc);
- if ( loc.isNull() ) {
- assert(false);
- return 0;
- }
+ assert( !loc.isNull() );
Record *r = loc.rec();
assert( r->lengthWithHeaders >= lenWHdr );
@@ -1782,6 +1939,15 @@ namespace mongo {
BackgroundOperation::assertNoBgOpInProgForDb(d->name.c_str());
+ dbMutex.assertWriteLocked();
+
+ // Not sure we need this here, so removed. If we do, we need to move it down
+ // within the other calls, both (1) because they could be called from elsewhere and
+ // (2) to keep the lock order right - groupcommitmutex must be locked before
+ // mmmutex (if both are locked).
+ //
+ // RWLockRecursive::Exclusive lk(MongoFile::mmmutex);
+
getDur().syncDataAndTruncateJournal();
Database::closeDatabase( d->name.c_str(), d->path );
@@ -1889,21 +2055,6 @@ namespace mongo {
return sa.size();
}
-#if !defined(_WIN32)
-} // namespace mongo
-#include <sys/statvfs.h>
-namespace mongo {
-#endif
- boost::intmax_t freeSpace ( const string &path ) {
-#if !defined(_WIN32)
- struct statvfs info;
- assert( !statvfs( path.c_str() , &info ) );
- return boost::intmax_t( info.f_bavail ) * info.f_frsize;
-#else
- return -1;
-#endif
- }
-
bool repairDatabase( string dbNameS , string &errmsg,
bool preserveClonedFilesOnFailure, bool backupOriginalFiles ) {
doingRepair dr;
@@ -1923,7 +2074,7 @@ namespace mongo {
getDur().syncDataAndTruncateJournal(); // Must be done before and after repair
boost::intmax_t totalSize = dbSize( dbName );
- boost::intmax_t freeSize = freeSpace( repairpath );
+ boost::intmax_t freeSize = File::freeSpace(repairpath);
if ( freeSize > -1 && freeSize < totalSize ) {
stringstream ss;
ss << "Cannot repair database " << dbName << " having size: " << totalSize
@@ -1946,12 +2097,15 @@ namespace mongo {
assert( ctx.justCreated() );
res = cloneFrom(localhost.c_str(), errmsg, dbName,
- /*logForReplication=*/false, /*slaveok*/false, /*replauth*/false, /*snapshot*/false);
+ /*logForReplication=*/false, /*slaveok*/false, /*replauth*/false,
+ /*snapshot*/false, /*mayYield*/false, /*mayBeInterrupted*/true);
Database::closeDatabase( dbName, reservedPathString.c_str() );
}
if ( !res ) {
- problem() << "clone failed for " << dbName << " with error: " << errmsg << endl;
+ errmsg = str::stream() << "clone failed for " << dbName << " with error: " << errmsg;
+ problem() << errmsg << endl;
+
if ( !preserveClonedFilesOnFailure )
BOOST_CHECK_EXCEPTION( boost::filesystem::remove_all( reservedPath ) );
@@ -1996,7 +2150,7 @@ namespace mongo {
bool ok = false;
BOOST_CHECK_EXCEPTION( ok = fo.apply( q ) );
if ( ok )
- log(2) << fo.op() << " file " << q.string() << '\n';
+ log(2) << fo.op() << " file " << q.string() << endl;
int i = 0;
int extra = 10; // should not be necessary, this is defensive in case there are missing files
while ( 1 ) {
@@ -2060,16 +2214,4 @@ namespace mongo {
return true;
}
- bool isValidNS( const StringData& ns ) {
- // TODO: should check for invalid characters
-
- const char * x = strchr( ns.data() , '.' );
- if ( ! x )
- return false;
-
- x++;
- return *x > 0;
- }
-
-
} // namespace mongo
diff --git a/db/pdfile.h b/db/pdfile.h
index 91f4877..64dba68 100644
--- a/db/pdfile.h
+++ b/db/pdfile.h
@@ -52,9 +52,6 @@ namespace mongo {
bool userCreateNS(const char *ns, BSONObj j, string& err, bool logForReplication, bool *deferIdIndex = 0);
shared_ptr<Cursor> findTableScan(const char *ns, const BSONObj& order, const DiskLoc &startLoc=DiskLoc());
- // -1 if library unavailable.
- boost::intmax_t freeSpace( const string &path = dbpath );
-
bool isValidNS( const StringData& ns );
/*---------------------------------------------------------------------*/
@@ -123,13 +120,16 @@ namespace mongo {
// The object o may be updated if modified on insert.
void insertAndLog( const char *ns, const BSONObj &o, bool god = false );
- /** @param obj both and in and out param -- insert can sometimes modify an object (such as add _id). */
- DiskLoc insertWithObjMod(const char *ns, BSONObj &o, bool god = false);
+ /** insert will add an _id to the object if not present. if you would like to see the final object
+ after such an addition, use this method.
+ @param o both an in and out param
+ */
+ DiskLoc insertWithObjMod(const char *ns, BSONObj & /*out*/o, bool god = false);
/** @param obj in value only for this version. */
void insertNoReturnVal(const char *ns, BSONObj o, bool god = false);
- DiskLoc insert(const char *ns, const void *buf, int len, bool god = false, const BSONElement &writeId = BSONElement(), bool mayAddIndex = true);
+ DiskLoc insert(const char *ns, const void *buf, int len, bool god = false, bool mayAddIndex = true, bool *addedID = 0);
static shared_ptr<Cursor> findAll(const char *ns, const DiskLoc &startLoc = DiskLoc());
/* special version of insert for transaction logging -- streamlined a bit.
@@ -142,7 +142,7 @@ namespace mongo {
static Record* getRecord(const DiskLoc& dl);
static DeletedRecord* makeDeletedRecord(const DiskLoc& dl, int len);
- void deleteRecord(const char *ns, Record *todelete, const DiskLoc& dl, bool cappedOK = false, bool noWarn = false);
+ void deleteRecord(const char *ns, Record *todelete, const DiskLoc& dl, bool cappedOK = false, bool noWarn = false, bool logOp=false);
/* does not clean up indexes, etc. : just deletes the record in the pdfile. use deleteRecord() to unindex */
void _deleteRecord(NamespaceDetails *d, const char *ns, Record *todelete, const DiskLoc& dl);
@@ -160,6 +160,9 @@ namespace mongo {
int lengthWithHeaders;
int extentOfs;
DiskLoc nextDeleted;
+ DiskLoc myExtentLoc(const DiskLoc& myLoc) const {
+ return DiskLoc(myLoc.a(), extentOfs);
+ }
Extent* myExtent(const DiskLoc& myLoc) {
return DataFileMgr::getExtent(DiskLoc(myLoc.a(), extentOfs));
}
@@ -174,7 +177,7 @@ namespace mongo {
(11:04:29 AM) dm10gen: we can do this as we know the record's address, and it has the same fileNo
(11:04:33 AM) dm10gen: see class DiskLoc for more info
(11:04:43 AM) dm10gen: so that is how Record::myExtent() works
- (11:04:53 AM) dm10gen: on an alloc(), when we build a new Record, we must popular its extentOfs then
+ (11:04:53 AM) dm10gen: on an alloc(), when we build a new Record, we must populate its extentOfs then
*/
class Record {
public:
@@ -204,11 +207,43 @@ namespace mongo {
DiskLoc getNext(const DiskLoc& myLoc);
DiskLoc getPrev(const DiskLoc& myLoc);
+ DiskLoc nextInExtent(const DiskLoc& myLoc) {
+ if ( nextOfs == DiskLoc::NullOfs )
+ return DiskLoc();
+ assert( nextOfs );
+ return DiskLoc(myLoc.a(), nextOfs);
+ }
+
struct NP {
int nextOfs;
int prevOfs;
};
NP* np() { return (NP*) &nextOfs; }
+
+ // ---------------------
+ // memory cache
+ // ---------------------
+
+ /**
+ * touches the data so that it is in physical memory
+ * @param entireRecord if false, only the header and first byte is touched
+ * if true, the entire record is touched
+ * */
+ void touch( bool entireRecord = false );
+
+ /**
+ * @return if this record is likely in physical memory
+ * it's not guaranteed because it's possible it gets swapped out in a very unlucky window
+ */
+ bool likelyInPhysicalMemory();
+
+ /**
+ * tell the cache this Record was accessed
+ * @return this, for simple chaining
+ */
+ Record* accessed();
+
+ static bool MemoryTrackingEnabled;
};
/* extents are datafile regions where all the records within the region
@@ -240,6 +275,12 @@ namespace mongo {
length >= 0 && !myLoc.isNull();
}
+ BSONObj dump() {
+ return BSON( "loc" << myLoc.toString() << "xnext" << xnext.toString() << "xprev" << xprev.toString()
+ << "nsdiag" << nsDiagnostic.toString()
+ << "size" << length << "firstRecord" << firstRecord.toString() << "lastRecord" << lastRecord.toString());
+ }
+
void dump(iostream& s) {
s << " loc:" << myLoc.toString() << " xnext:" << xnext.toString() << " xprev:" << xprev.toString() << '\n';
s << " nsdiag:" << nsDiagnostic.toString() << '\n';
@@ -250,10 +291,10 @@ namespace mongo {
Returns a DeletedRecord location which is the data in the extent ready for us.
Caller will need to add that to the freelist structure in namespacedetail.
*/
- DiskLoc init(const char *nsname, int _length, int _fileNo, int _offset);
+ DiskLoc init(const char *nsname, int _length, int _fileNo, int _offset, bool capped);
/* like init(), but for a reuse case */
- DiskLoc reuse(const char *nsname);
+ DiskLoc reuse(const char *nsname, bool newUseIsAsCapped);
bool isOk() const { return magic == 0x41424344; }
void assertOk() const { assert(isOk()); }
@@ -279,8 +320,8 @@ namespace mongo {
*/
static int followupSize(int len, int lastExtentLen);
- /**
- * @param len lengt of record we need
+ /** get a suggested size for the first extent in a namespace
+ * @param len length of record we need to insert
*/
static int initialSize(int len);
@@ -292,8 +333,11 @@ namespace mongo {
this helper is for that -- for use with getDur().writing() method
*/
FL* fl() { return (FL*) &firstRecord; }
+
+ /** caller must declare write intent first */
+ void markEmpty();
private:
- DiskLoc _reuse(const char *nsname);
+ DiskLoc _reuse(const char *nsname, bool newUseIsAsCapped); // recycle an extent and reuse it for a different ns
};
/* a datafile - i.e. the "dbname.<#>" files :
@@ -318,7 +362,7 @@ namespace mongo {
int unusedLength;
char reserved[8192 - 4*4 - 8];
- char data[4];
+ char data[4]; // first extent starts here
enum { HeaderSize = 8192 };
@@ -414,7 +458,7 @@ namespace mongo {
return DataFileMgr::getRecord(*this);
}
inline BSONObj DiskLoc::obj() const {
- return BSONObj(rec());
+ return BSONObj(rec()->accessed());
}
inline DeletedRecord* DiskLoc::drec() const {
assert( _a != -1 );
@@ -423,9 +467,12 @@ namespace mongo {
inline Extent* DiskLoc::ext() const {
return DataFileMgr::getExtent(*this);
}
- inline const BtreeBucket* DiskLoc::btree() const {
+
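+ // a btree bucket is stored directly in a record's data area; templated on the
+ // on-disk format version so callers must name V0 or V1 explicitly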
+ template< class V >
+ inline
+ const BtreeBucket<V> * DiskLoc::btree() const {
assert( _a != -1 );
- return (const BtreeBucket *) rec()->data;
+ return (const BtreeBucket<V> *) rec()->data;
}
} // namespace mongo
@@ -478,19 +525,8 @@ namespace mongo {
bool dropIndexes( NamespaceDetails *d, const char *ns, const char *name, string &errmsg, BSONObjBuilder &anObjBuilder, bool maydeleteIdIndex );
-
- /**
- * @return true if ns is 'normal'. $ used for collections holding index data, which do not contain BSON objects in their records.
- * special case for the local.oplog.$main ns -- naming it as such was a mistake.
- */
- inline bool isANormalNSName( const char* ns ) {
- if ( strchr( ns , '$' ) == 0 )
- return true;
- return strcmp( ns, "local.oplog.$main" ) == 0;
- }
-
inline BSONObj::BSONObj(const Record *r) {
- init(r->data, false);
+ init(r->data);
}
} // namespace mongo
diff --git a/db/projection.cpp b/db/projection.cpp
index 3dcfef7..d07e565 100644
--- a/db/projection.cpp
+++ b/db/projection.cpp
@@ -61,7 +61,7 @@ namespace mongo {
}
}
else {
- uassert(13097, string("Unsupported projection option: ") + obj.firstElement().fieldName(), false);
+ uassert(13097, string("Unsupported projection option: ") + obj.firstElementFieldName(), false);
}
}
diff --git a/db/projection.h b/db/projection.h
index fd3b856..b5e0a0c 100644
--- a/db/projection.h
+++ b/db/projection.h
@@ -94,6 +94,8 @@ namespace mongo {
*/
KeyOnly* checkKey( const BSONObj& keyPattern ) const;
+ bool includeID() const { return _includeID; }
+
private:
/**
diff --git a/db/queryoptimizer.cpp b/db/queryoptimizer.cpp
index 4eb2a99..692e9fd 100644
--- a/db/queryoptimizer.cpp
+++ b/db/queryoptimizer.cpp
@@ -1,4 +1,4 @@
-/* queryoptimizer.cpp */
+// @file queryoptimizer.cpp
/**
* Copyright (C) 2008 10gen Inc.
@@ -24,7 +24,6 @@
#include "queryoptimizer.h"
#include "cmdline.h"
#include "clientcursor.h"
-#include <queue>
//#define DEBUGQO(x) cout << x << endl;
#define DEBUGQO(x)
@@ -53,9 +52,10 @@ namespace mongo {
QueryPlan::QueryPlan(
NamespaceDetails *d, int idxNo,
- const FieldRangeSet &fbs, const FieldRangeSet &originalFrs, const BSONObj &originalQuery, const BSONObj &order, const BSONObj &startKey, const BSONObj &endKey , string special ) :
+ const FieldRangeSetPair &frsp, const FieldRangeSetPair *originalFrsp, const BSONObj &originalQuery, const BSONObj &order, bool mustAssertOnYieldFailure, const BSONObj &startKey, const BSONObj &endKey , string special ) :
_d(d), _idxNo(idxNo),
- _fbs( fbs ),
+ _frs( frsp.frsForIndex( _d, _idxNo ) ),
+ _frsMulti( frsp.frsForIndex( _d, -1 ) ),
_originalQuery( originalQuery ),
_order( order ),
_index( 0 ),
@@ -65,36 +65,44 @@ namespace mongo {
_direction( 0 ),
_endKeyInclusive( endKey.isEmpty() ),
_unhelpful( false ),
+ _impossible( false ),
_special( special ),
_type(0),
- _startOrEndSpec( !startKey.isEmpty() || !endKey.isEmpty() ) {
+ _startOrEndSpec( !startKey.isEmpty() || !endKey.isEmpty() ),
+ _mustAssertOnYieldFailure( mustAssertOnYieldFailure ) {
- if ( !_fbs.matchPossible() ) {
- _unhelpful = true;
+ BSONObj idxKey = _idxNo < 0 ? BSONObj() : d->idx( _idxNo ).keyPattern();
+
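+ // _impossible is stronger than _unhelpful: the ranges exclude every document
+ // for this index, so newCursor() can return a dummy cursor that scans nothing
+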
+ if ( !_frs.matchPossibleForIndex( idxKey ) ) {
+ _impossible = true;
_scanAndOrderRequired = false;
return;
}
-
- if( _idxNo >= 0 ) {
- _index = &d->idx(_idxNo);
- }
- else {
- // full table scan case
- if ( _order.isEmpty() || !strcmp( _order.firstElement().fieldName(), "$natural" ) )
+
+ if ( willScanTable() ) {
+ if ( _order.isEmpty() || !strcmp( _order.firstElementFieldName(), "$natural" ) )
_scanAndOrderRequired = false;
- return;
+ return;
}
+
+ _index = &d->idx(_idxNo);
+
+ // If the parsing or index indicates this is a special query, don't continue the processing
+ if ( _special.size() ||
+ ( _index->getSpec().getType() && _index->getSpec().getType()->suitability( originalQuery, order ) != USELESS ) ) {
+
+ if( _special.size() ) _optimal = true;
- if ( _special.size() ) {
- _optimal = true;
_type = _index->getSpec().getType();
+ if( !_special.size() ) _special = _index->getSpec().getType()->getPlugin()->getName();
+
massert( 13040 , (string)"no type for special: " + _special , _type );
// hopefully safe to use original query in these contexts - don't think we can mix special with $or clause separation yet
_scanAndOrderRequired = _type->scanAndOrderRequired( _originalQuery , order );
return;
}
- BSONObj idxKey = _index->keyPattern();
+ const IndexSpec &idxSpec = _index->getSpec();
BSONObjIterator o( order );
BSONObjIterator k( idxKey );
if ( !o.moreWithEOO() )
@@ -114,7 +122,7 @@ namespace mongo {
goto doneCheckOrder;
if ( strcmp( oe.fieldName(), ke.fieldName() ) == 0 )
break;
- if ( !fbs.range( ke.fieldName() ).equality() )
+ if ( !_frs.range( ke.fieldName() ).equality() )
goto doneCheckOrder;
}
int d = elementDirection( oe ) == elementDirection( ke ) ? 1 : -1;
@@ -130,41 +138,46 @@ doneCheckOrder:
int exactIndexedQueryCount = 0;
int optimalIndexedQueryCount = 0;
bool stillOptimalIndexedQueryCount = true;
- set< string > orderFieldsUnindexed;
+ set<string> orderFieldsUnindexed;
order.getFieldNames( orderFieldsUnindexed );
while( i.moreWithEOO() ) {
BSONElement e = i.next();
if ( e.eoo() )
break;
- const FieldRange &fb = fbs.range( e.fieldName() );
+ const FieldRange &fr = _frs.range( e.fieldName() );
if ( stillOptimalIndexedQueryCount ) {
- if ( fb.nontrivial() )
+ if ( fr.nontrivial() )
++optimalIndexedQueryCount;
- if ( !fb.equality() )
+ if ( !fr.equality() )
stillOptimalIndexedQueryCount = false;
}
else {
- if ( fb.nontrivial() )
+ if ( fr.nontrivial() )
optimalIndexedQueryCount = -1;
}
- if ( fb.equality() ) {
- BSONElement e = fb.max();
+ if ( fr.equality() ) {
+ BSONElement e = fr.max();
if ( !e.isNumber() && !e.mayEncapsulate() && e.type() != RegEx )
++exactIndexedQueryCount;
}
orderFieldsUnindexed.erase( e.fieldName() );
}
if ( !_scanAndOrderRequired &&
- ( optimalIndexedQueryCount == fbs.nNontrivialRanges() ) )
+ ( optimalIndexedQueryCount == _frs.nNontrivialRanges() ) )
_optimal = true;
- if ( exactIndexedQueryCount == fbs.nNontrivialRanges() &&
+ if ( exactIndexedQueryCount == _frs.nNontrivialRanges() &&
orderFieldsUnindexed.size() == 0 &&
- exactIndexedQueryCount == _index->keyPattern().nFields() &&
+ exactIndexedQueryCount == idxKey.nFields() &&
exactIndexedQueryCount == _originalQuery.nFields() ) {
_exactKeyMatch = true;
}
- _frv.reset( new FieldRangeVector( fbs, idxKey, _direction ) );
- _originalFrv.reset( new FieldRangeVector( originalFrs, idxKey, _direction ) );
+ _frv.reset( new FieldRangeVector( _frs, idxSpec, _direction ) );
+ if ( originalFrsp ) {
+ _originalFrv.reset( new FieldRangeVector( originalFrsp->frsForIndex( _d, _idxNo ), idxSpec, _direction ) );
+ }
+ else {
+ _originalFrv = _frv;
+ }
if ( _startOrEndSpec ) {
BSONObj newStart, newEnd;
if ( !startKey.isEmpty() )
@@ -178,7 +191,7 @@ doneCheckOrder:
}
if ( ( _scanAndOrderRequired || _order.isEmpty() ) &&
- !fbs.range( idxKey.firstElement().fieldName() ).nontrivial() ) {
+ !_frs.range( idxKey.firstElementFieldName() ).nontrivial() ) {
_unhelpful = true;
}
}
@@ -190,39 +203,57 @@ doneCheckOrder:
return _type->newCursor( _originalQuery , _order , numWanted );
}
- if ( !_fbs.matchPossible() ) {
- if ( _fbs.nNontrivialRanges() )
- checkTableScanAllowed( _fbs.ns() );
+ if ( _impossible ) {
+ // TODO We might want to allow this dummy table scan even in no table
+ // scan mode, since it won't scan anything.
+ if ( _frs.nNontrivialRanges() )
+ checkTableScanAllowed( _frs.ns() );
return shared_ptr<Cursor>( new BasicCursor( DiskLoc() ) );
}
- if ( !_index ) {
- if ( _fbs.nNontrivialRanges() )
- checkTableScanAllowed( _fbs.ns() );
- return findTableScan( _fbs.ns(), _order, startLoc );
- }
+ if ( willScanTable() ) {
+ if ( _frs.nNontrivialRanges() ) {
+ checkTableScanAllowed( _frs.ns() );
+
+ // if we are doing a table scan on _id and it's a capped collection,
+ // we warn, as this is a common user error
+ // (.system. and local collections are exempt)
+ if ( _d && _d->capped && _frs.range( "_id" ).nontrivial() ) {
+ if ( cc().isSyncThread() ||
+ str::contains( _frs.ns() , ".system." ) ||
+ str::startsWith( _frs.ns() , "local." ) ) {
+ // ok
+ }
+ else {
+ warning() << "_id query on capped collection without an _id index, performance will be poor collection: " << _frs.ns() << endl;
+ //uassert( 14820, str::stream() << "doing _id query on a capped collection without an index is not allowed: " << _frs.ns() ,
+ }
+ }
+ }
+ return findTableScan( _frs.ns(), _order, startLoc );
+ }
+
massert( 10363 , "newCursor() with start location not implemented for indexed plans", startLoc.isNull() );
if ( _startOrEndSpec ) {
// we are sure to spec _endKeyInclusive
- return shared_ptr<Cursor>( new BtreeCursor( _d, _idxNo, *_index, _startKey, _endKey, _endKeyInclusive, _direction >= 0 ? 1 : -1 ) );
+ return shared_ptr<Cursor>( BtreeCursor::make( _d, _idxNo, *_index, _startKey, _endKey, _endKeyInclusive, _direction >= 0 ? 1 : -1 ) );
}
else if ( _index->getSpec().getType() ) {
- return shared_ptr<Cursor>( new BtreeCursor( _d, _idxNo, *_index, _frv->startKey(), _frv->endKey(), true, _direction >= 0 ? 1 : -1 ) );
+ return shared_ptr<Cursor>( BtreeCursor::make( _d, _idxNo, *_index, _frv->startKey(), _frv->endKey(), true, _direction >= 0 ? 1 : -1 ) );
}
else {
- return shared_ptr<Cursor>( new BtreeCursor( _d, _idxNo, *_index, _frv, _direction >= 0 ? 1 : -1 ) );
+ return shared_ptr<Cursor>( BtreeCursor::make( _d, _idxNo, *_index, _frv, _direction >= 0 ? 1 : -1 ) );
}
}
shared_ptr<Cursor> QueryPlan::newReverseCursor() const {
- if ( !_fbs.matchPossible() )
- return shared_ptr<Cursor>( new BasicCursor( DiskLoc() ) );
- if ( !_index ) {
+ if ( willScanTable() ) {
int orderSpec = _order.getIntField( "$natural" );
if ( orderSpec == INT_MIN )
orderSpec = 1;
- return findTableScan( _fbs.ns(), BSON( "$natural" << -orderSpec ) );
+ return findTableScan( _frs.ns(), BSON( "$natural" << -orderSpec ) );
}
massert( 10364 , "newReverseCursor() not implemented for indexed plans", false );
return shared_ptr<Cursor>();
@@ -235,23 +266,51 @@ doneCheckOrder:
}
void QueryPlan::registerSelf( long long nScanned ) const {
- if ( _fbs.matchPossible() ) {
- scoped_lock lk(NamespaceDetailsTransient::_qcMutex);
- NamespaceDetailsTransient::get_inlock( ns() ).registerIndexForPattern( _fbs.pattern( _order ), indexKey(), nScanned );
+ // FIXME SERVER-2864 Otherwise no query pattern can be generated.
+ if ( _frs.matchPossible() ) {
+ SimpleMutex::scoped_lock lk(NamespaceDetailsTransient::_qcMutex);
+ NamespaceDetailsTransient::get_inlock( ns() ).registerIndexForPattern( _frs.pattern( _order ), indexKey(), nScanned );
}
}
+
+ /**
+ * @return a copy of the inheriting class, which will be run with its own
+ * query plan. If multiple plan sets are required for an $or query, the
+ * QueryOp of the winning plan from a given set will be cloned to generate
+ * QueryOps for the subsequent plan set. This function should only be called
+ * after the query op has completed executing.
+ */
+ QueryOp *QueryOp::createChild() {
+ if( _orConstraint.get() ) {
+ _matcher->advanceOrClause( _orConstraint );
+ _orConstraint.reset();
+ }
+ QueryOp *ret = _createChild();
+ ret->_oldMatcher = _matcher;
+ return ret;
+ }
bool QueryPlan::isMultiKey() const {
if ( _idxNo < 0 )
return false;
return _d->isMultikey( _idxNo );
}
+
+ void QueryOp::init() {
+ if ( _oldMatcher.get() ) {
+ _matcher.reset( _oldMatcher->nextClauseMatcher( qp().indexKey() ) );
+ }
+ else {
+ _matcher.reset( new CoveredIndexMatcher( qp().originalQuery(), qp().indexKey(), alwaysUseRecord() ) );
+ }
+ _init();
+ }
- QueryPlanSet::QueryPlanSet( const char *ns, auto_ptr< FieldRangeSet > frs, auto_ptr< FieldRangeSet > originalFrs, const BSONObj &originalQuery, const BSONObj &order, const BSONElement *hint, bool honorRecordedPlan, const BSONObj &min, const BSONObj &max, bool bestGuessOnly, bool mayYield ) :
+ QueryPlanSet::QueryPlanSet( const char *ns, auto_ptr<FieldRangeSetPair> frsp, auto_ptr<FieldRangeSetPair> originalFrsp, const BSONObj &originalQuery, const BSONObj &order, bool mustAssertOnYieldFailure, const BSONElement *hint, bool honorRecordedPlan, const BSONObj &min, const BSONObj &max, bool bestGuessOnly, bool mayYield ) :
_ns(ns),
_originalQuery( originalQuery ),
- _fbs( frs ),
- _originalFrs( originalFrs ),
+ _frsp( frsp ),
+ _originalFrsp( originalFrsp ),
_mayRecordPlan( true ),
_usingPrerecordedPlan( false ),
_hint( BSONObj() ),
@@ -262,7 +321,8 @@ doneCheckOrder:
_max( max.getOwned() ),
_bestGuessOnly( bestGuessOnly ),
_mayYield( mayYield ),
- _yieldSometimesTracker( 256, 20 ) {
+ _yieldSometimesTracker( 256, 20 ),
+ _mustAssertOnYieldFailure( mustAssertOnYieldFailure ) {
if ( hint && !hint->eoo() ) {
_hint = hint->wrap();
}
@@ -289,10 +349,10 @@ doneCheckOrder:
string errmsg;
BSONObj keyPattern = id.keyPattern();
// This reformats _min and _max to be used for index lookup.
- massert( 10365 , errmsg, indexDetailsForRange( _fbs->ns(), errmsg, _min, _max, keyPattern ) );
+ massert( 10365 , errmsg, indexDetailsForRange( _frsp->ns(), errmsg, _min, _max, keyPattern ) );
}
NamespaceDetails *d = nsdetails(_ns);
- _plans.push_back( QueryPlanPtr( new QueryPlan( d, d->idxNo(id), *_fbs, *_originalFrs, _originalQuery, _order, _min, _max ) ) );
+ _plans.push_back( QueryPlanPtr( new QueryPlan( d, d->idxNo(id), *_frsp, _originalFrsp.get(), _originalQuery, _order, _mustAssertOnYieldFailure, _min, _max ) ) );
}
// returns an IndexDetails * for a hint, 0 if hint is $natural.
@@ -312,7 +372,7 @@ doneCheckOrder:
else if( hint.type() == Object ) {
BSONObj hintobj = hint.embeddedObject();
uassert( 10112 , "bad hint", !hintobj.isEmpty() );
- if ( !strcmp( hintobj.firstElement().fieldName(), "$natural" ) ) {
+ if ( !strcmp( hintobj.firstElementFieldName(), "$natural" ) ) {
return 0;
}
NamespaceDetails::IndexIterator i = d->ii();
@@ -329,15 +389,16 @@ doneCheckOrder:
void QueryPlanSet::init() {
DEBUGQO( "QueryPlanSet::init " << ns << "\t" << _originalQuery );
+ _runner.reset();
_plans.clear();
_mayRecordPlan = true;
_usingPrerecordedPlan = false;
- const char *ns = _fbs->ns();
+ const char *ns = _frsp->ns();
NamespaceDetails *d = nsdetails( ns );
- if ( !d || !_fbs->matchPossible() ) {
+ if ( !d || !_frsp->matchPossible() ) {
// Table scan plan, when no matches are possible
- _plans.push_back( QueryPlanPtr( new QueryPlan( d, -1, *_fbs, *_originalFrs, _originalQuery, _order ) ) );
+ _plans.push_back( QueryPlanPtr( new QueryPlan( d, -1, *_frsp, _originalFrsp.get(), _originalQuery, _order, _mustAssertOnYieldFailure ) ) );
return;
}
@@ -351,7 +412,7 @@ doneCheckOrder:
else {
massert( 10366 , "natural order cannot be specified with $min/$max", _min.isEmpty() && _max.isEmpty() );
// Table scan plan
- _plans.push_back( QueryPlanPtr( new QueryPlan( d, -1, *_fbs, *_originalFrs, _originalQuery, _order ) ) );
+ _plans.push_back( QueryPlanPtr( new QueryPlan( d, -1, *_frsp, _originalFrsp.get(), _originalQuery, _order, _mustAssertOnYieldFailure ) ) );
}
return;
}
@@ -361,7 +422,7 @@ doneCheckOrder:
BSONObj keyPattern;
IndexDetails *idx = indexDetailsForRange( ns, errmsg, _min, _max, keyPattern );
massert( 10367 , errmsg, idx );
- _plans.push_back( QueryPlanPtr( new QueryPlan( d, d->idxNo(*idx), *_fbs, *_originalFrs, _originalQuery, _order, _min, _max ) ) );
+ _plans.push_back( QueryPlanPtr( new QueryPlan( d, d->idxNo(*idx), *_frsp, _originalFrsp.get(), _originalQuery, _order, _mustAssertOnYieldFailure, _min, _max ) ) );
return;
}
@@ -370,19 +431,19 @@ doneCheckOrder:
if ( idx >= 0 ) {
_usingPrerecordedPlan = true;
_mayRecordPlan = false;
- _plans.push_back( QueryPlanPtr( new QueryPlan( d , idx , *_fbs , *_fbs , _originalQuery, _order ) ) );
+ _plans.push_back( QueryPlanPtr( new QueryPlan( d , idx , *_frsp , _originalFrsp.get() , _originalQuery, _order, _mustAssertOnYieldFailure ) ) );
return;
}
}
if ( _originalQuery.isEmpty() && _order.isEmpty() ) {
- _plans.push_back( QueryPlanPtr( new QueryPlan( d, -1, *_fbs, *_originalFrs, _originalQuery, _order ) ) );
+ _plans.push_back( QueryPlanPtr( new QueryPlan( d, -1, *_frsp, _originalFrsp.get(), _originalQuery, _order, _mustAssertOnYieldFailure ) ) );
return;
}
- DEBUGQO( "\t special : " << _fbs->getSpecial() );
- if ( _fbs->getSpecial().size() ) {
- _special = _fbs->getSpecial();
+ DEBUGQO( "\t special : " << _frsp->getSpecial() );
+ if ( _frsp->getSpecial().size() ) {
+ _special = _frsp->getSpecial();
NamespaceDetails::IndexIterator i = d->ii();
while( i.more() ) {
int j = i.pos();
@@ -391,8 +452,8 @@ doneCheckOrder:
if ( spec.getTypeName() == _special && spec.suitability( _originalQuery , _order ) ) {
_usingPrerecordedPlan = true;
_mayRecordPlan = false;
- _plans.push_back( QueryPlanPtr( new QueryPlan( d , j , *_fbs , *_fbs , _originalQuery, _order ,
- BSONObj() , BSONObj() , _special ) ) );
+ _plans.push_back( QueryPlanPtr( new QueryPlan( d , j , *_frsp , _originalFrsp.get() , _originalQuery, _order ,
+ _mustAssertOnYieldFailure , BSONObj() , BSONObj() , _special ) ) );
return;
}
}
@@ -400,15 +461,15 @@ doneCheckOrder:
}
if ( _honorRecordedPlan ) {
- scoped_lock lk(NamespaceDetailsTransient::_qcMutex);
- NamespaceDetailsTransient& nsd = NamespaceDetailsTransient::get_inlock( ns );
- BSONObj bestIndex = nsd.indexForPattern( _fbs->pattern( _order ) );
+ pair< BSONObj, long long > best = QueryUtilIndexed::bestIndexForPatterns( *_frsp, _order );
+ BSONObj bestIndex = best.first;
+ long long oldNScanned = best.second;
if ( !bestIndex.isEmpty() ) {
QueryPlanPtr p;
- _oldNScanned = nsd.nScannedForPattern( _fbs->pattern( _order ) );
- if ( !strcmp( bestIndex.firstElement().fieldName(), "$natural" ) ) {
+ _oldNScanned = oldNScanned;
+ if ( !strcmp( bestIndex.firstElementFieldName(), "$natural" ) ) {
// Table scan plan
- p.reset( new QueryPlan( d, -1, *_fbs, *_originalFrs, _originalQuery, _order ) );
+ p.reset( new QueryPlan( d, -1, *_frsp, _originalFrsp.get(), _originalQuery, _order, _mustAssertOnYieldFailure ) );
}
NamespaceDetails::IndexIterator i = d->ii();
@@ -416,7 +477,7 @@ doneCheckOrder:
int j = i.pos();
IndexDetails& ii = i.next();
if( ii.keyPattern().woCompare(bestIndex) == 0 ) {
- p.reset( new QueryPlan( d, j, *_fbs, *_originalFrs, _originalQuery, _order ) );
+ p.reset( new QueryPlan( d, j, *_frsp, _originalFrsp.get(), _originalQuery, _order, _mustAssertOnYieldFailure ) );
}
}
@@ -434,67 +495,111 @@ doneCheckOrder:
}
void QueryPlanSet::addOtherPlans( bool checkFirst ) {
- const char *ns = _fbs->ns();
+ const char *ns = _frsp->ns();
NamespaceDetails *d = nsdetails( ns );
if ( !d )
return;
// If table scan is optimal or natural order requested or tailable cursor requested
- if ( !_fbs->matchPossible() || ( _fbs->nNontrivialRanges() == 0 && _order.isEmpty() ) ||
- ( !_order.isEmpty() && !strcmp( _order.firstElement().fieldName(), "$natural" ) ) ) {
+ if ( !_frsp->matchPossible() || ( _frsp->noNontrivialRanges() && _order.isEmpty() ) ||
+ ( !_order.isEmpty() && !strcmp( _order.firstElementFieldName(), "$natural" ) ) ) {
// Table scan plan
- addPlan( QueryPlanPtr( new QueryPlan( d, -1, *_fbs, *_originalFrs, _originalQuery, _order ) ), checkFirst );
+ addPlan( QueryPlanPtr( new QueryPlan( d, -1, *_frsp, _originalFrsp.get(), _originalQuery, _order, _mustAssertOnYieldFailure ) ), checkFirst );
return;
}
bool normalQuery = _hint.isEmpty() && _min.isEmpty() && _max.isEmpty();
PlanSet plans;
+ QueryPlanPtr optimalPlan;
for( int i = 0; i < d->nIndexes; ++i ) {
- IndexDetails& id = d->idx(i);
- const IndexSpec& spec = id.getSpec();
- IndexSuitability suitability = HELPFUL;
if ( normalQuery ) {
- suitability = spec.suitability( _fbs->simplifiedQuery() , _order );
- if ( suitability == USELESS )
+ BSONObj keyPattern = d->idx( i ).keyPattern();
+ if ( !_frsp->matchPossibleForIndex( d, i, keyPattern ) ) {
+ // If no match is possible, only generate a trivial plan that won't
+ // scan any documents.
+ QueryPlanPtr p( new QueryPlan( d, i, *_frsp, _originalFrsp.get(), _originalQuery, _order, _mustAssertOnYieldFailure ) );
+ addPlan( p, checkFirst );
+ return;
+ }
+ if ( !QueryUtilIndexed::indexUseful( *_frsp, d, i, _order ) ) {
continue;
+ }
}
- QueryPlanPtr p( new QueryPlan( d, i, *_fbs, *_originalFrs, _originalQuery, _order ) );
+ QueryPlanPtr p( new QueryPlan( d, i, *_frsp, _originalFrsp.get(), _originalQuery, _order, _mustAssertOnYieldFailure ) );
if ( p->optimal() ) {
- addPlan( p, checkFirst );
- return;
+ if ( !optimalPlan.get() ) {
+ optimalPlan = p;
+ }
}
else if ( !p->unhelpful() ) {
plans.push_back( p );
}
}
+ if ( optimalPlan.get() ) {
+ addPlan( optimalPlan, checkFirst );
+ return;
+ }
for( PlanSet::iterator i = plans.begin(); i != plans.end(); ++i )
addPlan( *i, checkFirst );
// Table scan plan
- addPlan( QueryPlanPtr( new QueryPlan( d, -1, *_fbs, *_originalFrs, _originalQuery, _order ) ), checkFirst );
+ addPlan( QueryPlanPtr( new QueryPlan( d, -1, *_frsp, _originalFrsp.get(), _originalQuery, _order, _mustAssertOnYieldFailure ) ), checkFirst );
}
- shared_ptr< QueryOp > QueryPlanSet::runOp( QueryOp &op ) {
+ shared_ptr<QueryOp> QueryPlanSet::runOp( QueryOp &op ) {
if ( _usingPrerecordedPlan ) {
Runner r( *this, op );
- shared_ptr< QueryOp > res = r.run();
- // _plans.size() > 1 if addOtherPlans was called in Runner::run().
+ shared_ptr<QueryOp> res = r.runUntilFirstCompletes();
+ // _plans.size() > 1 if addOtherPlans was called in Runner::runUntilFirstCompletes().
if ( _bestGuessOnly || res->complete() || _plans.size() > 1 )
return res;
- {
- scoped_lock lk(NamespaceDetailsTransient::_qcMutex);
- NamespaceDetailsTransient::get_inlock( _fbs->ns() ).registerIndexForPattern( _fbs->pattern( _order ), BSONObj(), 0 );
- }
+ // Retry with all candidate plans.
+ QueryUtilIndexed::clearIndexesForPatterns( *_frsp, _order );
init();
}
Runner r( *this, op );
- return r.run();
+ return r.runUntilFirstCompletes();
+ }
+
+ shared_ptr<QueryOp> QueryPlanSet::nextOp( QueryOp &originalOp, bool retried ) {
+ if ( !_runner ) {
+ _runner.reset( new Runner( *this, originalOp ) );
+ shared_ptr<QueryOp> op = _runner->init();
+ if ( op->complete() ) {
+ return op;
+ }
+ }
+ shared_ptr<QueryOp> op = _runner->nextNonError();
+ if ( !op->error() ) {
+ return op;
+ }
+ if ( !_usingPrerecordedPlan || _bestGuessOnly || _plans.size() > 1 ) {
+ return op;
+ }
+
+ // Avoid an infinite loop here
+ uassert( 15878, str::stream() << "query plans not successful even with no constraints, potentially due to additional sort", ! retried );
+
+ // Retry with all candidate plans.
+ QueryUtilIndexed::clearIndexesForPatterns( *_frsp, _order );
+ init();
+ return nextOp( originalOp, true );
}
+ bool QueryPlanSet::prepareToYield() {
+ return _runner ? _runner->prepareToYield() : true;
+ }
+
+ void QueryPlanSet::recoverFromYield() {
+ if ( _runner ) {
+ _runner->recoverFromYield();
+ }
+ }
+
BSONObj QueryPlanSet::explain() const {
- vector< BSONObj > arr;
+ vector<BSONObj> arr;
for( PlanSet::const_iterator i = _plans.begin(); i != _plans.end(); ++i ) {
shared_ptr<Cursor> c = (*i)->newCursor();
BSONObjBuilder explain;
@@ -515,17 +620,16 @@ doneCheckOrder:
return _plans[i];
}
- stringstream ss;
- ss << "best guess plan requested, but scan and order required:";
- ss << " query: " << _fbs->simplifiedQuery();
- ss << " order: " << _order;
- ss << " choices: ";
- for ( unsigned i=0; i<_plans.size(); i++ ) {
- ss << _plans[i]->indexKey() << " ";
- }
+            stringstream ss;
+            ss << "best guess query plan requested, but scan and order are required for all plans"
+               << " query: " << _originalQuery
+               << " order: " << _order
+               << " choices: ";
+            for ( unsigned i=0; i<_plans.size(); i++ )
+                ss << _plans[i]->indexKey() << " ";
+            warning() << ss.str() << endl;
- string s = ss.str();
- msgassertedNoTrace( 13284, s.c_str() );
+ return QueryPlanPtr();
}
return _plans[0];
}
@@ -535,101 +639,134 @@ doneCheckOrder:
_plans( plans ) {
}
- void QueryPlanSet::Runner::mayYield( const vector< shared_ptr< QueryOp > > &ops ) {
- if ( _plans._mayYield ) {
- if ( _plans._yieldSometimesTracker.ping() ) {
- int micros = ClientCursor::yieldSuggest();
- if ( micros > 0 ) {
- for( vector< shared_ptr< QueryOp > >::const_iterator i = ops.begin(); i != ops.end(); ++i ) {
- if ( !prepareToYield( **i ) ) {
- return;
- }
- }
- ClientCursor::staticYield( micros , _plans._ns );
- for( vector< shared_ptr< QueryOp > >::const_iterator i = ops.begin(); i != ops.end(); ++i ) {
- recoverFromYield( **i );
- }
- }
+ bool QueryPlanSet::Runner::prepareToYield() {
+ for( vector<shared_ptr<QueryOp> >::const_iterator i = _ops.begin(); i != _ops.end(); ++i ) {
+ if ( !prepareToYieldOp( **i ) ) {
+ return false;
}
}
+ return true;
}
- struct OpHolder {
- OpHolder( const shared_ptr< QueryOp > &op ) : _op( op ), _offset() {}
- shared_ptr< QueryOp > _op;
- long long _offset;
- bool operator<( const OpHolder &other ) const {
- return _op->nscanned() + _offset > other._op->nscanned() + other._offset;
- }
- };
+ void QueryPlanSet::Runner::recoverFromYield() {
+ for( vector<shared_ptr<QueryOp> >::const_iterator i = _ops.begin(); i != _ops.end(); ++i ) {
+ recoverFromYieldOp( **i );
+ }
+ }
+
+ void QueryPlanSet::Runner::mayYield() {
+ if ( ! _plans._mayYield )
+ return;
+
+ if ( ! _plans._yieldSometimesTracker.ping() )
+ return;
+
+ int micros = ClientCursor::yieldSuggest();
+ if ( micros <= 0 )
+ return;
+
+ if ( !prepareToYield() )
+ return;
+
+ ClientCursor::staticYield( micros , _plans._ns , 0 );
+ recoverFromYield();
+ }
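+    // A minimal standalone sketch (illustrative only; all names hypothetical)
+    // of the throttling above: three cheap early-outs guard the expensive
+    // prepare/yield/recover sequence, and any op that fails prepareToYield()
+    // vetoes the yield entirely.
+    static bool shouldYieldSketch( bool mayYield, bool trackerPinged,
+                                   int suggestedMicros, bool allOpsPrepared ) {
+        if ( !mayYield )
+            return false;           // yielding disabled for this runner
+        if ( !trackerPinged )
+            return false;           // too soon since the last yield
+        if ( suggestedMicros <= 0 )
+            return false;           // no other client is waiting for the lock
+        return allOpsPrepared;      // a failed prepare vetoes the yield
+    }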
- shared_ptr< QueryOp > QueryPlanSet::Runner::run() {
+ shared_ptr<QueryOp> QueryPlanSet::Runner::init() {
massert( 10369 , "no plans", _plans._plans.size() > 0 );
-
- vector< shared_ptr< QueryOp > > ops;
+
if ( _plans._bestGuessOnly ) {
- shared_ptr< QueryOp > op( _op.createChild() );
+ shared_ptr<QueryOp> op( _op.createChild() );
op->setQueryPlan( _plans.getBestGuess().get() );
- ops.push_back( op );
+ _ops.push_back( op );
}
else {
if ( _plans._plans.size() > 1 )
log(1) << " running multiple plans" << endl;
for( PlanSet::iterator i = _plans._plans.begin(); i != _plans._plans.end(); ++i ) {
- shared_ptr< QueryOp > op( _op.createChild() );
+ shared_ptr<QueryOp> op( _op.createChild() );
op->setQueryPlan( i->get() );
- ops.push_back( op );
+ _ops.push_back( op );
}
}
-
- for( vector< shared_ptr< QueryOp > >::iterator i = ops.begin(); i != ops.end(); ++i ) {
+
+ // Initialize ops.
+ for( vector<shared_ptr<QueryOp> >::iterator i = _ops.begin(); i != _ops.end(); ++i ) {
initOp( **i );
if ( (*i)->complete() )
return *i;
}
-
- std::priority_queue< OpHolder > queue;
- for( vector< shared_ptr< QueryOp > >::iterator i = ops.begin(); i != ops.end(); ++i ) {
+
+ // Put runnable ops in the priority queue.
+ for( vector<shared_ptr<QueryOp> >::iterator i = _ops.begin(); i != _ops.end(); ++i ) {
if ( !(*i)->error() ) {
- queue.push( *i );
+ _queue.push( *i );
}
}
-
- while( !queue.empty() ) {
- mayYield( ops );
- OpHolder holder = queue.top();
- queue.pop();
- QueryOp &op = *holder._op;
- nextOp( op );
- if ( op.complete() ) {
- if ( _plans._mayRecordPlan && op.mayRecordPlan() ) {
- op.qp().registerSelf( op.nscanned() );
- }
- return holder._op;
- }
- if ( op.error() ) {
- continue;
- }
- queue.push( holder );
- if ( !_plans._bestGuessOnly && _plans._usingPrerecordedPlan && op.nscanned() > _plans._oldNScanned * 10 && _plans._special.empty() ) {
- holder._offset = -op.nscanned();
- _plans.addOtherPlans( true );
- PlanSet::iterator i = _plans._plans.begin();
- ++i;
- for( ; i != _plans._plans.end(); ++i ) {
- shared_ptr< QueryOp > op( _op.createChild() );
- op->setQueryPlan( i->get() );
- ops.push_back( op );
- initOp( *op );
- if ( op->complete() )
- return op;
- queue.push( op );
- }
- _plans._mayRecordPlan = true;
- _plans._usingPrerecordedPlan = false;
+
+ return *_ops.begin();
+ }
+
+ shared_ptr<QueryOp> QueryPlanSet::Runner::nextNonError() {
+ if ( _queue.empty() ) {
+ return *_ops.begin();
+ }
+ shared_ptr<QueryOp> ret;
+ do {
+ ret = next();
+ } while( ret->error() && !_queue.empty() );
+ return ret;
+ }
+
+ shared_ptr<QueryOp> QueryPlanSet::Runner::next() {
+ mayYield();
+ dassert( !_queue.empty() );
+ OpHolder holder = _queue.pop();
+ QueryOp &op = *holder._op;
+ nextOp( op );
+ if ( op.complete() ) {
+ if ( _plans._mayRecordPlan && op.mayRecordPlan() ) {
+ op.qp().registerSelf( op.nscanned() );
}
+ return holder._op;
+ }
+ if ( op.error() ) {
+ return holder._op;
}
- return ops[ 0 ];
+ _queue.push( holder );
+ if ( !_plans._bestGuessOnly && _plans._usingPrerecordedPlan && op.nscanned() > _plans._oldNScanned * 10 && _plans._special.empty() ) {
+ holder._offset = -op.nscanned();
+ _plans.addOtherPlans( /* avoid duplicating the initial plan */ true );
+ PlanSet::iterator i = _plans._plans.begin();
+ ++i;
+ for( ; i != _plans._plans.end(); ++i ) {
+ shared_ptr<QueryOp> op( _op.createChild() );
+ op->setQueryPlan( i->get() );
+ _ops.push_back( op );
+ initOp( *op );
+ if ( op->complete() )
+ return op;
+ _queue.push( op );
+ }
+ _plans._mayRecordPlan = true;
+ _plans._usingPrerecordedPlan = false;
+ }
+ return holder._op;
+ }
+
+ shared_ptr<QueryOp> QueryPlanSet::Runner::runUntilFirstCompletes() {
+ shared_ptr<QueryOp> potentialFinisher = init();
+ if ( potentialFinisher->complete() ) {
+ return potentialFinisher;
+ }
+
+ while( !_queue.empty() ) {
+ shared_ptr<QueryOp> potentialFinisher = next();
+ if ( potentialFinisher->complete() ) {
+ return potentialFinisher;
+ }
+ }
+ return _ops[ 0 ];
}
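    // A minimal standalone sketch (illustrative only; RacerSketch and
    // raceSketch are hypothetical) of the interleaving above: the candidate
    // that has scanned the least so far advances one step, and the race ends
    // as soon as any candidate completes.
    struct RacerSketch {
        long long nscanned;
        bool done;
    };
    static size_t raceSketch( std::vector<RacerSketch> &racers, long long completeAt ) {
        // assumes at least one racer, mirroring the massert in init()
        for ( ;; ) {
            size_t least = 0;
            for ( size_t i = 1; i < racers.size(); ++i ) {
                if ( racers[ i ].nscanned < racers[ least ].nscanned )
                    least = i;
            }
            ++racers[ least ].nscanned;                 // "next()": scan one document
            racers[ least ].done = ( racers[ least ].nscanned >= completeAt );
            if ( racers[ least ].done )
                return least;                           // first op to complete wins
        }
    }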
#define GUARD_OP_EXCEPTION( op, expression ) \
@@ -655,22 +792,46 @@ doneCheckOrder:
GUARD_OP_EXCEPTION( op, if ( !op.error() ) { op.next(); } );
}
- bool QueryPlanSet::Runner::prepareToYield( QueryOp &op ) {
+ bool QueryPlanSet::Runner::prepareToYieldOp( QueryOp &op ) {
GUARD_OP_EXCEPTION( op,
if ( op.error() ) {
- return true;
- }
- else {
- return op.prepareToYield();
+ return true;
+ }
+ else {
+ return op.prepareToYield();
} );
return true;
}
- void QueryPlanSet::Runner::recoverFromYield( QueryOp &op ) {
+ void QueryPlanSet::Runner::recoverFromYieldOp( QueryOp &op ) {
GUARD_OP_EXCEPTION( op, if ( !op.error() ) { op.recoverFromYield(); } );
}
-
+ /**
+ * NOTE on our $or implementation: In our current query optimizer implementation we don't
+ * keep statistics on our data, but we can conceptualize the problem of
+ * selecting an index when statistics exist for all index ranges. The
+ * d-hitting set problem on k sets and n elements can be reduced to the
+ * problem of index selection on k $or clauses and n index ranges (where
+ * d is the max number of indexes, and the number of ranges n is unbounded).
+ * In light of the fact that d-hitting set is NP-complete, and we don't even
+ * track statistics (so cost calculations are expensive) our first
+ * implementation uses the following greedy approach: We take one $or clause
+ * at a time and treat each as a separate query for index selection purposes.
+ * But if an index range is scanned for a particular $or clause, we eliminate
+ * that range from all subsequent clauses. One could imagine an opposite
+ * implementation where we select indexes based on the union of index ranges
+ * for all $or clauses, but this can have much poorer worst case behavior.
+ * (An index range that suits one $or clause may not suit another, and this
+ * is worse than the typical case of index range choice staleness because
+ * with $or the clauses may likely be logically distinct.) The greedy
+ * implementation won't do any worse than all the $or clauses individually,
+ * and it can often do better. In the first cut we are intentionally using
+ * QueryPattern tracking to record successful plans on $or clauses for use by
+ * subsequent $or clauses, even though there may be a significant aggregate
+ * $nor component that would not be represented in QueryPattern.
+ */
+
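+    // A minimal standalone sketch (illustrative only; ints stand in for index
+    // ranges, and std::list/std::set are assumed visible via the pch) of the
+    // greedy strategy described above: each $or clause is planned on its own,
+    // and a range scanned for one clause is removed from every remaining
+    // clause before the next clause is planned.
+    static void popOrClauseSketch( std::list< std::set<int> > &clauses, int scannedRange ) {
+        clauses.pop_front();                   // this clause is fully handled
+        for ( std::list< std::set<int> >::iterator i = clauses.begin();
+              i != clauses.end(); ++i ) {
+            i->erase( scannedRange );          // never rescan the same range
+        }
+    }
+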
MultiPlanScanner::MultiPlanScanner( const char *ns,
const BSONObj &query,
const BSONObj &order,
@@ -683,24 +844,29 @@ doneCheckOrder:
_ns( ns ),
_or( !query.getField( "$or" ).eoo() ),
_query( query.getOwned() ),
- _fros( ns, _query ),
_i(),
_honorRecordedPlan( honorRecordedPlan ),
_bestGuessOnly( bestGuessOnly ),
_hint( ( hint && !hint->eoo() ) ? hint->wrap() : BSONObj() ),
_mayYield( mayYield ),
_tableScanned() {
- if ( !order.isEmpty() || !min.isEmpty() || !max.isEmpty() || !_fros.getSpecial().empty() ) {
+ if ( !order.isEmpty() || !min.isEmpty() || !max.isEmpty() ) {
_or = false;
}
- if ( _or && uselessOr( _hint.firstElement() ) ) {
- _or = false;
+ if ( _or ) {
+ // Only construct an OrRangeGenerator if we may handle $or clauses.
+ _org.reset( new OrRangeGenerator( ns, _query ) );
+ if ( !_org->getSpecial().empty() ) {
+ _or = false;
+ }
+ else if ( uselessOr( _hint.firstElement() ) ) {
+ _or = false;
+ }
}
// if _or == false, don't use or clauses for index selection
if ( !_or ) {
- auto_ptr< FieldRangeSet > frs( new FieldRangeSet( ns, _query ) );
- auto_ptr< FieldRangeSet > oldFrs( new FieldRangeSet( *frs ) );
- _currentQps.reset( new QueryPlanSet( ns, frs, oldFrs, _query, order, hint, honorRecordedPlan, min, max, _bestGuessOnly, _mayYield ) );
+ auto_ptr<FieldRangeSetPair> frsp( new FieldRangeSetPair( ns, _query, true ) );
+ _currentQps.reset( new QueryPlanSet( ns, frsp, auto_ptr<FieldRangeSetPair>(), _query, order, false, hint, honorRecordedPlan, min, max, _bestGuessOnly, _mayYield ) );
}
else {
BSONElement e = _query.getField( "$or" );
@@ -708,71 +874,168 @@ doneCheckOrder:
}
}
- shared_ptr< QueryOp > MultiPlanScanner::runOpOnce( QueryOp &op ) {
- massert( 13271, "can't run more ops", mayRunMore() );
+ shared_ptr<QueryOp> MultiPlanScanner::runOpOnce( QueryOp &op ) {
+ assertMayRunMore();
if ( !_or ) {
++_i;
return _currentQps->runOp( op );
}
++_i;
- auto_ptr< FieldRangeSet > frs( _fros.topFrs() );
- auto_ptr< FieldRangeSet > originalFrs( _fros.topFrsOriginal() );
+ auto_ptr<FieldRangeSetPair> frsp( _org->topFrsp() );
+ auto_ptr<FieldRangeSetPair> originalFrsp( _org->topFrspOriginal() );
BSONElement hintElt = _hint.firstElement();
- _currentQps.reset( new QueryPlanSet( _ns, frs, originalFrs, _query, BSONObj(), &hintElt, _honorRecordedPlan, BSONObj(), BSONObj(), _bestGuessOnly, _mayYield ) );
- shared_ptr< QueryOp > ret( _currentQps->runOp( op ) );
+ _currentQps.reset( new QueryPlanSet( _ns, frsp, originalFrsp, _query, BSONObj(), true, &hintElt, _honorRecordedPlan, BSONObj(), BSONObj(), _bestGuessOnly, _mayYield ) );
+ shared_ptr<QueryOp> ret( _currentQps->runOp( op ) );
+ if ( ! ret->complete() )
+ throw MsgAssertionException( ret->exception() );
if ( ret->qp().willScanTable() ) {
_tableScanned = true;
+ } else {
+ // The table was not scanned in full, so pop the $or clause covered by this plan.
+ _org->popOrClause( ret->qp().nsd(), ret->qp().idxNo(), ret->qp().indexed() ? ret->qp().indexKey() : BSONObj() );
}
- _fros.popOrClause( ret->qp().indexed() ? ret->qp().indexKey() : BSONObj() );
return ret;
}
- shared_ptr< QueryOp > MultiPlanScanner::runOp( QueryOp &op ) {
- shared_ptr< QueryOp > ret = runOpOnce( op );
+ shared_ptr<QueryOp> MultiPlanScanner::runOp( QueryOp &op ) {
+ shared_ptr<QueryOp> ret = runOpOnce( op );
while( !ret->stopRequested() && mayRunMore() ) {
ret = runOpOnce( *ret );
}
return ret;
}
+
+ shared_ptr<QueryOp> MultiPlanScanner::nextOpHandleEndOfClause() {
+ shared_ptr<QueryOp> op = _currentQps->nextOp( *_baseOp );
+ if ( !op->complete() ) {
+ return op;
+ }
+ if ( op->qp().willScanTable() ) {
+ _tableScanned = true;
+ } else {
+ _org->popOrClause( op->qp().nsd(), op->qp().idxNo(), op->qp().indexed() ? op->qp().indexKey() : BSONObj() );
+ }
+ return op;
+ }
+
+ shared_ptr<QueryOp> MultiPlanScanner::nextOpBeginningClause() {
+ assertMayRunMore();
+ shared_ptr<QueryOp> op;
+ while( mayRunMore() ) {
+ ++_i;
+ auto_ptr<FieldRangeSetPair> frsp( _org->topFrsp() );
+ auto_ptr<FieldRangeSetPair> originalFrsp( _org->topFrspOriginal() );
+ BSONElement hintElt = _hint.firstElement();
+ _currentQps.reset( new QueryPlanSet( _ns, frsp, originalFrsp, _query, BSONObj(), true, &hintElt, _honorRecordedPlan, BSONObj(), BSONObj(), _bestGuessOnly, _mayYield ) );
+ op = nextOpHandleEndOfClause();
+ if ( !op->complete() ) {
+ return op;
+ }
+ _baseOp = op;
+ }
+ return op;
+ }
+
+ shared_ptr<QueryOp> MultiPlanScanner::nextOp() {
+ if ( !_or ) {
+ if ( _i == 0 ) {
+ assertMayRunMore();
+ ++_i;
+ }
+ return _currentQps->nextOp( *_baseOp );
+ }
+ if ( _i == 0 ) {
+ return nextOpBeginningClause();
+ }
+ shared_ptr<QueryOp> op = nextOpHandleEndOfClause();
+ if ( !op->complete() ) {
+ return op;
+ }
+ if ( !op->stopRequested() && mayRunMore() ) {
+ // Finished scanning the clause, but stop hasn't been requested.
+ // Start scanning the next clause.
+ _baseOp = op;
+ return nextOpBeginningClause();
+ }
+ return op;
+ }
+
+ bool MultiPlanScanner::prepareToYield() {
+ return _currentQps.get() ? _currentQps->prepareToYield() : true;
+ }
+
+ void MultiPlanScanner::recoverFromYield() {
+ if ( _currentQps.get() ) {
+ _currentQps->recoverFromYield();
+ }
+ }
+
+ shared_ptr<Cursor> MultiPlanScanner::singleCursor() const {
+ if ( _or || _currentQps->nPlans() != 1 || _currentQps->firstPlan()->scanAndOrderRequired() ) {
+ return shared_ptr<Cursor>();
+ }
+ // If there is only one plan and it does not require an in-memory
+ // sort, we do not expect its cursor op to throw an exception and
+ // so do not need a QueryOptimizerCursor to handle this case.
+ return _currentQps->firstPlan()->newCursor();
+ }
bool MultiPlanScanner::uselessOr( const BSONElement &hint ) const {
NamespaceDetails *nsd = nsdetails( _ns );
if ( !nsd ) {
return true;
}
- IndexDetails *id = 0;
if ( !hint.eoo() ) {
IndexDetails *id = parseHint( hint, nsd );
if ( !id ) {
return true;
}
+ return QueryUtilIndexed::uselessOr( *_org, nsd, nsd->idxNo( *id ) );
}
- vector< BSONObj > ret;
- _fros.allClausesSimplified( ret );
- for( vector< BSONObj >::const_iterator i = ret.begin(); i != ret.end(); ++i ) {
- if ( id ) {
- if ( id->getSpec().suitability( *i, BSONObj() ) == USELESS ) {
- return true;
- }
- }
- else {
- bool useful = false;
- NamespaceDetails::IndexIterator j = nsd->ii();
- while( j.more() ) {
- IndexDetails &id = j.next();
- if ( id.getSpec().suitability( *i, BSONObj() ) != USELESS ) {
- useful = true;
- break;
- }
- }
- if ( !useful ) {
- return true;
- }
+ return QueryUtilIndexed::uselessOr( *_org, nsd, -1 );
+ }
+
+ MultiCursor::MultiCursor( const char *ns, const BSONObj &pattern, const BSONObj &order, shared_ptr<CursorOp> op, bool mayYield )
+ : _mps( new MultiPlanScanner( ns, pattern, order, 0, true, BSONObj(), BSONObj(), !op.get(), mayYield ) ), _nscanned() {
+ if ( op.get() ) {
+ _op = op;
+ }
+ else {
+ _op.reset( new NoOp() );
+ }
+ if ( _mps->mayRunMore() ) {
+ nextClause();
+ if ( !ok() ) {
+ advance();
}
}
- return false;
+ else {
+ _c.reset( new BasicCursor( DiskLoc() ) );
+ }
+ }
+
+ MultiCursor::MultiCursor( auto_ptr<MultiPlanScanner> mps, const shared_ptr<Cursor> &c, const shared_ptr<CoveredIndexMatcher> &matcher, const QueryOp &op, long long nscanned )
+ : _op( new NoOp( op ) ), _c( c ), _mps( mps ), _matcher( matcher ), _nscanned( nscanned ) {
+ _mps->setBestGuessOnly();
+ _mps->mayYield( false ); // with a NoOp, there's no need to yield in QueryPlanSet
+ if ( !ok() ) {
+ // would have been advanced by UserQueryOp if possible
+ advance();
+ }
}
-
+
+ void MultiCursor::nextClause() {
+ if ( _nscanned >= 0 && _c.get() ) {
+ _nscanned += _c->nscanned();
+ }
+ shared_ptr<CursorOp> best = _mps->runOpOnce( *_op );
+ if ( ! best->complete() )
+ throw MsgAssertionException( best->exception() );
+ _c = best->newCursor();
+ _matcher = best->matcher( _c );
+ _op = best;
+ }
+
bool indexWorks( const BSONObj &idxPattern, const BSONObj &sampleKey, int direction, int firstSignificantField ) {
BSONObjIterator p( idxPattern );
BSONObjIterator k( sampleKey );
@@ -816,7 +1079,7 @@ doneCheckOrder:
return b.obj();
}
- pair< int, int > keyAudit( const BSONObj &min, const BSONObj &max ) {
+ pair<int,int> keyAudit( const BSONObj &min, const BSONObj &max ) {
int direction = 0;
int firstSignificantField = 0;
BSONObjIterator i( min );
@@ -841,7 +1104,7 @@ doneCheckOrder:
return make_pair( direction, firstSignificantField );
}
- pair< int, int > flexibleKeyAudit( const BSONObj &min, const BSONObj &max ) {
+ pair<int,int> flexibleKeyAudit( const BSONObj &min, const BSONObj &max ) {
if ( min.isEmpty() || max.isEmpty() ) {
return make_pair( 1, -1 );
}
@@ -865,7 +1128,7 @@ doneCheckOrder:
return 0;
}
- pair< int, int > ret = flexibleKeyAudit( min, max );
+ pair<int,int> ret = flexibleKeyAudit( min, max );
if ( ret == make_pair( -1, -1 ) ) {
errmsg = "min and max keys do not share pattern";
return 0;
@@ -924,5 +1187,115 @@ doneCheckOrder:
return id;
}
+
+ bool isSimpleIdQuery( const BSONObj& query ) {
+ BSONObjIterator i(query);
+
+ if( !i.more() )
+ return false;
+ BSONElement e = i.next();
+
+ if( i.more() )
+ return false;
+
+ if( strcmp("_id", e.fieldName()) != 0 )
+ return false;
+
+ if ( e.isSimpleType() ) // e.g. not something like { _id : { $gt : ...
+ return true;
+
+ if ( e.type() == Object )
+ return e.Obj().firstElementFieldName()[0] != '$';
+
+ return false;
+ }
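+
+    // A hedged usage sketch (illustrative only and never called; BSON() and GT
+    // are the existing builder helpers from bsonmisc.h):
+    static void isSimpleIdQueryExamplesSketch() {
+        assert( isSimpleIdQuery( BSON( "_id" << 3 ) ) );                // plain value
+        assert( isSimpleIdQuery( BSON( "_id" << BSON( "a" << 1 ) ) ) ); // subobject, no $ operator
+        assert( !isSimpleIdQuery( BSON( "_id" << GT << 3 ) ) );         // { _id : { $gt : 3 } }
+        assert( !isSimpleIdQuery( BSON( "_id" << 3 << "x" << 1 ) ) );   // extra field
+        assert( !isSimpleIdQuery( BSONObj() ) );                        // no fields
+    }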
+
+ shared_ptr<Cursor> bestGuessCursor( const char *ns, const BSONObj &query, const BSONObj &sort ) {
+ if( !query.getField( "$or" ).eoo() ) {
+ return shared_ptr<Cursor>( new MultiCursor( ns, query, sort ) );
+ }
+ else {
+ auto_ptr<FieldRangeSetPair> frsp( new FieldRangeSetPair( ns, query, true ) );
+ auto_ptr<FieldRangeSetPair> origFrsp( new FieldRangeSetPair( *frsp ) );
+
+ QueryPlanSet qps( ns, frsp, origFrsp, query, sort, false );
+ QueryPlanSet::QueryPlanPtr qpp = qps.getBestGuess();
+ if( ! qpp.get() ) return shared_ptr<Cursor>();
+
+ shared_ptr<Cursor> ret = qpp->newCursor();
+
+ // If we don't already have a matcher, supply one.
+ if ( !query.isEmpty() && ! ret->matcher() ) {
+ shared_ptr<CoveredIndexMatcher> matcher( new CoveredIndexMatcher( query, ret->indexKeyPattern() ) );
+ ret->setMatcher( matcher );
+ }
+ return ret;
+ }
+ }
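+
+    // A hedged usage sketch (illustrative only; "test.foo" is a hypothetical
+    // namespace): the returned shared_ptr may be empty when the requested sort
+    // has no supporting index, so callers must check before dereferencing.
+    //
+    //   shared_ptr<Cursor> c = bestGuessCursor( "test.foo",
+    //                                           BSON( "a" << 1 ),    // query
+    //                                           BSON( "b" << 1 ) );  // sort
+    //   if ( !c.get() ) {
+    //       // fall back to an unsorted plan or report the missing index
+    //   }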
+
+ bool QueryUtilIndexed::indexUseful( const FieldRangeSetPair &frsp, NamespaceDetails *d, int idxNo, const BSONObj &order ) {
+ DEV frsp.assertValidIndex( d, idxNo );
+ BSONObj keyPattern = d->idx( idxNo ).keyPattern();
+ if ( !frsp.matchPossibleForIndex( d, idxNo, keyPattern ) ) {
+ // No matches are possible for this index, so it is still 'useful':
+ // it generates a trivial plan that scans no documents.
+ return true;
+ }
+ return d->idx( idxNo ).getSpec().suitability( frsp.simplifiedQueryForIndex( d, idxNo, keyPattern ), order ) != USELESS;
+ }
+
+ void QueryUtilIndexed::clearIndexesForPatterns( const FieldRangeSetPair &frsp, const BSONObj &order ) {
+ SimpleMutex::scoped_lock lk(NamespaceDetailsTransient::_qcMutex);
+ NamespaceDetailsTransient& nsd = NamespaceDetailsTransient::get_inlock( frsp.ns() );
+ nsd.registerIndexForPattern( frsp._singleKey.pattern( order ), BSONObj(), 0 );
+ nsd.registerIndexForPattern( frsp._multiKey.pattern( order ), BSONObj(), 0 );
+ }
+
+ pair< BSONObj, long long > QueryUtilIndexed::bestIndexForPatterns( const FieldRangeSetPair &frsp, const BSONObj &order ) {
+ SimpleMutex::scoped_lock lk(NamespaceDetailsTransient::_qcMutex);
+ NamespaceDetailsTransient& nsd = NamespaceDetailsTransient::get_inlock( frsp.ns() );
+ // TODO Maybe it would make sense to return the index with the lowest
+ // nscanned if there are two possibilities.
+ if ( frsp._singleKey.matchPossible() ) {
+ QueryPattern pattern = frsp._singleKey.pattern( order );
+ BSONObj oldIdx = nsd.indexForPattern( pattern );
+ if ( !oldIdx.isEmpty() ) {
+ long long oldNScanned = nsd.nScannedForPattern( pattern );
+ return make_pair( oldIdx, oldNScanned );
+ }
+ }
+ if ( frsp._multiKey.matchPossible() ) {
+ QueryPattern pattern = frsp._multiKey.pattern( order );
+ BSONObj oldIdx = nsd.indexForPattern( pattern );
+ if ( !oldIdx.isEmpty() ) {
+ long long oldNScanned = nsd.nScannedForPattern( pattern );
+ return make_pair( oldIdx, oldNScanned );
+ }
+ }
+ return make_pair( BSONObj(), 0 );
+ }
+
+ bool QueryUtilIndexed::uselessOr( const OrRangeGenerator &org, NamespaceDetails *d, int hintIdx ) {
+ for( list<FieldRangeSetPair>::const_iterator i = org._originalOrSets.begin(); i != org._originalOrSets.end(); ++i ) {
+ if ( hintIdx != -1 ) {
+ if ( !indexUseful( *i, d, hintIdx, BSONObj() ) ) {
+ return true;
+ }
+ }
+ else {
+ bool useful = false;
+ for( int j = 0; j < d->nIndexes; ++j ) {
+ if ( indexUseful( *i, d, j, BSONObj() ) ) {
+ useful = true;
+ break;
+ }
+ }
+ if ( !useful ) {
+ return true;
+ }
+ }
+ }
+ return false;
+ }
+
} // namespace mongo
diff --git a/db/queryoptimizer.h b/db/queryoptimizer.h
index ebd264e..fea6c0b 100644
--- a/db/queryoptimizer.h
+++ b/db/queryoptimizer.h
@@ -1,4 +1,4 @@
-/* queryoptimizer.h */
+// @file queryoptimizer.h
/**
* Copyright (C) 2008 10gen Inc.
@@ -22,58 +22,79 @@
#include "jsobj.h"
#include "queryutil.h"
#include "matcher.h"
-#include "../util/message.h"
+#include "../util/net/listen.h"
+#include <queue>
namespace mongo {
class IndexDetails;
class IndexType;
+ class ElapsedTracker;
+ /** A plan for executing a query using the given index spec and FieldRangeSet. */
class QueryPlan : boost::noncopyable {
public:
+ /**
+ * @param originalFrsp - original constraints for this query clause. If null, frsp will be used instead.
+ */
QueryPlan(NamespaceDetails *d,
int idxNo, // -1 = no index
- const FieldRangeSet &fbs,
- const FieldRangeSet &originalFrs,
+ const FieldRangeSetPair &frsp,
+ const FieldRangeSetPair *originalFrsp,
const BSONObj &originalQuery,
const BSONObj &order,
+ bool mustAssertOnYieldFailure = true,
const BSONObj &startKey = BSONObj(),
- const BSONObj &endKey = BSONObj() ,
+ const BSONObj &endKey = BSONObj(),
string special="" );
- /* If true, no other index can do better. */
+ /** @return true iff no other plans should be considered. */
bool optimal() const { return _optimal; }
- /* ScanAndOrder processing will be required if true */
+ /** @return true iff this plan should not be considered at all. */
+ bool unhelpful() const { return _unhelpful; }
+ /** @return true iff ScanAndOrder processing will be required for result set. */
bool scanAndOrderRequired() const { return _scanAndOrderRequired; }
- /* When true, the index we are using has keys such that it can completely resolve the
- query expression to match by itself without ever checking the main object.
+ /**
+ * @return true iff the index we are using has keys such that it can completely resolve the
+ * query expression to match by itself without ever checking the main object.
*/
bool exactKeyMatch() const { return _exactKeyMatch; }
- /* If true, the startKey and endKey are unhelpful and the index order doesn't match the
- requested sort order */
- bool unhelpful() const { return _unhelpful; }
- int direction() const { return _direction; }
+ /** @return true iff this QueryPlan would perform an unindexed scan. */
+ bool willScanTable() const { return _idxNo < 0 && !_impossible; }
+
+ /** @return a new cursor based on this QueryPlan's index and FieldRangeSet. */
shared_ptr<Cursor> newCursor( const DiskLoc &startLoc = DiskLoc() , int numWanted=0 ) const;
+ /** @return a new reverse cursor if this is an unindexed plan. */
shared_ptr<Cursor> newReverseCursor() const;
+ /** Register this plan as a winner for its QueryPattern, with specified 'nscanned'. */
+ void registerSelf( long long nScanned ) const;
+
+ int direction() const { return _direction; }
BSONObj indexKey() const;
bool indexed() const { return _index; }
- bool willScanTable() const { return !_index && _fbs.matchPossible(); }
- const char *ns() const { return _fbs.ns(); }
+ int idxNo() const { return _idxNo; }
+ const char *ns() const { return _frs.ns(); }
NamespaceDetails *nsd() const { return _d; }
BSONObj originalQuery() const { return _originalQuery; }
- BSONObj simplifiedQuery( const BSONObj& fields = BSONObj() ) const { return _fbs.simplifiedQuery( fields ); }
- const FieldRange &range( const char *fieldName ) const { return _fbs.range( fieldName ); }
- void registerSelf( long long nScanned ) const;
- shared_ptr< FieldRangeVector > originalFrv() const { return _originalFrv; }
- // just for testing
- shared_ptr< FieldRangeVector > frv() const { return _frv; }
+ BSONObj simplifiedQuery( const BSONObj& fields = BSONObj() ) const { return _frs.simplifiedQuery( fields ); }
+ const FieldRange &range( const char *fieldName ) const { return _frs.range( fieldName ); }
+ shared_ptr<FieldRangeVector> originalFrv() const { return _originalFrv; }
+
+ const FieldRangeSet &multikeyFrs() const { return _frsMulti; }
+
+ bool mustAssertOnYieldFailure() const { return _mustAssertOnYieldFailure; }
+
+ /** just for testing */
+
+ shared_ptr<FieldRangeVector> frv() const { return _frv; }
bool isMultiKey() const;
private:
NamespaceDetails * _d;
int _idxNo;
- const FieldRangeSet &_fbs;
+ const FieldRangeSet &_frs;
+ const FieldRangeSet &_frsMulti;
const BSONObj &_originalQuery;
const BSONObj &_order;
const IndexDetails * _index;
@@ -81,86 +102,104 @@ namespace mongo {
bool _scanAndOrderRequired;
bool _exactKeyMatch;
int _direction;
- shared_ptr< FieldRangeVector > _frv;
- shared_ptr< FieldRangeVector > _originalFrv;
+ shared_ptr<FieldRangeVector> _frv;
+ shared_ptr<FieldRangeVector> _originalFrv;
BSONObj _startKey;
BSONObj _endKey;
bool _endKeyInclusive;
bool _unhelpful;
+ bool _impossible;
string _special;
IndexType * _type;
bool _startOrEndSpec;
+ bool _mustAssertOnYieldFailure;
};
- // Inherit from this interface to implement a new query operation.
- // The query optimizer will clone the QueryOp that is provided, giving
- // each clone its own query plan.
+ /**
+ * Inherit from this interface to implement a new query operation.
+ * The query optimizer will clone the QueryOp that is provided, giving
+ * each clone its own query plan.
+ *
+ * Normal sequence of events:
+ * 1) A new QueryOp is generated using createChild().
+ * 2) A QueryPlan is assigned to this QueryOp with setQueryPlan().
+ * 3) _init() is called on the QueryOp.
+ * 4) next() is called repeatedly, with nscanned() checked after each call.
+ * 5) In one of these calls to next(), setComplete() is called.
+ * 6) The QueryPattern for the QueryPlan may be recorded as a winner.
+ */
class QueryOp {
public:
QueryOp() : _complete(), _stopRequested(), _qp(), _error() {}
- // Used when handing off from one QueryOp type to another
+ /** Used when handing off from one QueryOp to another. */
QueryOp( const QueryOp &other ) :
_complete(), _stopRequested(), _qp(), _error(), _matcher( other._matcher ),
_orConstraint( other._orConstraint ) {}
virtual ~QueryOp() {}
- /** these gets called after a query plan is set */
- void init() {
- if ( _oldMatcher.get() ) {
- _matcher.reset( _oldMatcher->nextClauseMatcher( qp().indexKey() ) );
- }
- else {
- _matcher.reset( new CoveredIndexMatcher( qp().originalQuery(), qp().indexKey(), alwaysUseRecord() ) );
- }
- _init();
- }
+ /** @return QueryPlan assigned to this QueryOp by the query optimizer. */
+ const QueryPlan &qp() const { return *_qp; }
+
+ /** Advance to next potential matching document (eg using a cursor). */
virtual void next() = 0;
-
- virtual bool mayRecordPlan() const = 0;
-
+ /**
+ * @return current 'nscanned' metric for this QueryOp. Used to compare
+ * cost to other QueryOps.
+ */
+ virtual long long nscanned() = 0;
+ /** Take any steps necessary before the db mutex is yielded. */
virtual bool prepareToYield() { massert( 13335, "yield not supported", false ); return false; }
+ /** Recover once the db mutex is regained. */
virtual void recoverFromYield() { massert( 13336, "yield not supported", false ); }
+
+ /**
+ * @return true iff the QueryPlan for this QueryOp may be registered
+ * as a winning plan.
+ */
+ virtual bool mayRecordPlan() const = 0;
- virtual long long nscanned() = 0;
-
- /** @return a copy of the inheriting class, which will be run with its own
- query plan. If multiple plan sets are required for an $or query,
- the QueryOp of the winning plan from a given set will be cloned
- to generate QueryOps for the subsequent plan set. This function
- should only be called after the query op has completed executing.
- */
- QueryOp *createChild() {
- if( _orConstraint.get() ) {
- _matcher->advanceOrClause( _orConstraint );
- _orConstraint.reset();
- }
- QueryOp *ret = _createChild();
- ret->_oldMatcher = _matcher;
- return ret;
- }
+ /** @return true iff the implementation called setComplete() or setStop(). */
bool complete() const { return _complete; }
- bool error() const { return _error; }
+ /** @return true iff the implementation called setStop(). */
bool stopRequested() const { return _stopRequested; }
+ /** @return true iff the implementation threw an exception. */
+ bool error() const { return _error; }
+ /** @return the exception thrown by implementation if one was thrown. */
ExceptionInfo exception() const { return _exception; }
- const QueryPlan &qp() const { return *_qp; }
- // To be called by QueryPlanSet::Runner only.
- void setQueryPlan( const QueryPlan *qp ) { _qp = qp; }
+
+ /** To be called by QueryPlanSet::Runner only. */
+
+ QueryOp *createChild();
+ void setQueryPlan( const QueryPlan *qp ) { _qp = qp; assert( _qp != NULL ); }
+ void init();
void setException( const DBException &e ) {
_error = true;
_exception = e.getInfo();
}
- shared_ptr< CoveredIndexMatcher > matcher() const { return _matcher; }
+
+ shared_ptr<CoveredIndexMatcher> matcher( const shared_ptr<Cursor>& c ) const {
+ return matcher( c.get() );
+ }
+ shared_ptr<CoveredIndexMatcher> matcher( Cursor* c ) const {
+ if( ! c ) return _matcher;
+ return c->matcher() ? c->matcherPtr() : _matcher;
+ }
+
protected:
+ /** Call if all results have been found. */
void setComplete() {
_orConstraint = qp().originalFrv();
_complete = true;
}
+ /** Call if the scan is complete even if not all results have been found. */
void setStop() { setComplete(); _stopRequested = true; }
+ /** Handle initialization after a QueryPlan has been set. */
virtual void _init() = 0;
+ /** @return a copy of the inheriting class, which will be run with its own query plan. */
virtual QueryOp *_createChild() const = 0;
virtual bool alwaysUseRecord() const { return false; }
@@ -171,42 +210,98 @@ namespace mongo {
ExceptionInfo _exception;
const QueryPlan *_qp;
bool _error;
- shared_ptr< CoveredIndexMatcher > _matcher;
- shared_ptr< CoveredIndexMatcher > _oldMatcher;
- shared_ptr< FieldRangeVector > _orConstraint;
+ shared_ptr<CoveredIndexMatcher> _matcher;
+ shared_ptr<CoveredIndexMatcher> _oldMatcher;
+ shared_ptr<FieldRangeVector> _orConstraint;
};
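    // A hedged skeletal QueryOp subclass (illustrative only; CountingOpSketch
    // is hypothetical) following the lifecycle documented above: _init() runs
    // after setQueryPlan(), next() is called repeatedly with nscanned()
    // compared across ops, and setComplete() ends the op's participation.
    class CountingOpSketch : public QueryOp {
    public:
        CountingOpSketch() : _n() {}
        virtual void next() {
            // A real op would advance a cursor here and check for a match.
            if ( ++_n >= 10 )
                setComplete();
        }
        virtual long long nscanned() { return _n; }
        virtual bool mayRecordPlan() const { return true; }
    private:
        virtual void _init() { _n = 0; }
        virtual QueryOp *_createChild() const { return new CountingOpSketch(); }
        long long _n;
    };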
- // Set of candidate query plans for a particular query. Used for running
- // a QueryOp on these plans.
+    // Temporary. Unlike a regular stl priority queue, this class still works when
+    // T::operator< is not constant over time (our ordering key, nscanned, changes
+    // while ops are queued). pop() is a slow linear scan, but if v.size() stays
+    // very small it is fine, maybe even faster than a smarter impl that does more
+    // memory allocations.
+ template<class T>
+ class our_priority_queue : boost::noncopyable {
+ vector<T> v;
+ public:
+ our_priority_queue() {
+ v.reserve(4);
+ }
+ int size() const { return v.size(); }
+ bool empty() const { return v.empty(); }
+ void push(const T & x) {
+ v.push_back(x);
+ }
+ T pop() {
+ size_t t = 0;
+ for( size_t i = 1; i < v.size(); i++ ) {
+ if( v[t] < v[i] )
+ t = i;
+ }
+ T ret = v[t];
+ v.erase(v.begin()+t);
+ return ret;
+ }
+ };
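+
+    // A hedged usage sketch (illustrative only): pop() rescans the whole vector
+    // on every call, so it always honors the *current* ordering even though the
+    // Runner's ordering key (an op's nscanned) keeps changing while queued.
+    //
+    //   our_priority_queue<int> q;
+    //   q.push( 3 ); q.push( 7 ); q.push( 5 );
+    //   int top = q.pop();   // 7 : the maximum under operator<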
+
+ /**
+ * A set of candidate query plans for a query. This class can return a best guess plan or run a
+ * QueryOp on all the plans.
+ */
class QueryPlanSet {
public:
- typedef boost::shared_ptr< QueryPlan > QueryPlanPtr;
- typedef vector< QueryPlanPtr > PlanSet;
+ typedef boost::shared_ptr<QueryPlan> QueryPlanPtr;
+ typedef vector<QueryPlanPtr> PlanSet;
+ /**
+ * @param originalFrsp - original constraints for this query clause; if null, frsp will be used.
+ */
QueryPlanSet( const char *ns,
- auto_ptr< FieldRangeSet > frs,
- auto_ptr< FieldRangeSet > originalFrs,
+ auto_ptr<FieldRangeSetPair> frsp,
+ auto_ptr<FieldRangeSetPair> originalFrsp,
const BSONObj &originalQuery,
const BSONObj &order,
+ bool mustAssertOnYieldFailure = true,
const BSONElement *hint = 0,
bool honorRecordedPlan = true,
const BSONObj &min = BSONObj(),
const BSONObj &max = BSONObj(),
bool bestGuessOnly = false,
bool mayYield = false);
+
+ /** @return number of candidate plans. */
int nPlans() const { return _plans.size(); }
- shared_ptr< QueryOp > runOp( QueryOp &op );
- template< class T >
- shared_ptr< T > runOp( T &op ) {
- return dynamic_pointer_cast< T >( runOp( static_cast< QueryOp& >( op ) ) );
+
+ /**
+ * Clone op for each query plan, and @return the first cloned op to call
+ * setComplete() or setStop().
+ */
+
+ shared_ptr<QueryOp> runOp( QueryOp &op );
+ template<class T>
+ shared_ptr<T> runOp( T &op ) {
+ return dynamic_pointer_cast<T>( runOp( static_cast<QueryOp&>( op ) ) );
}
+
+ /** Initialize or iterate a runner generated from @param originalOp. */
+ shared_ptr<QueryOp> nextOp( QueryOp &originalOp, bool retried = false );
+
+ /** Yield the runner member. */
+
+ bool prepareToYield();
+ void recoverFromYield();
+
+ QueryPlanPtr firstPlan() const { return _plans[ 0 ]; }
+
+ /** @return metadata about cursors and index bounds for all plans, suitable for explain output. */
BSONObj explain() const;
+ /** @return true iff a plan is selected based on previous success of this plan. */
bool usingPrerecordedPlan() const { return _usingPrerecordedPlan; }
+ /** @return a single plan that may work well for the specified query. */
QueryPlanPtr getBestGuess() const;
+
//for testing
- const FieldRangeSet &fbs() const { return *_fbs; }
- const FieldRangeSet &originalFrs() const { return *_originalFrs; }
+ const FieldRangeSetPair &frsp() const { return *_frsp; }
+ const FieldRangeSetPair *originalFrsp() const { return _originalFrsp.get(); }
bool modifiedKeys() const;
bool hasMultiKey() const;
@@ -219,22 +314,55 @@ namespace mongo {
}
void init();
void addHint( IndexDetails &id );
- struct Runner {
+ class Runner {
+ public:
Runner( QueryPlanSet &plans, QueryOp &op );
- shared_ptr< QueryOp > run();
- void mayYield( const vector< shared_ptr< QueryOp > > &ops );
+
+ /**
+ * Iterate interactively through candidate documents on all plans.
+ * QueryOp objects are returned at each interleaved step.
+ */
+
+            /** @return an op that has completed, otherwise an arbitrary op. */
+            shared_ptr<QueryOp> init();
+            /**
+             * Move the Runner forward one iteration, and @return the op for
+             * this iteration.
+             */
+            shared_ptr<QueryOp> next();
+ /** @return next non error op if there is one, otherwise an error op. */
+ shared_ptr<QueryOp> nextNonError();
+
+ bool prepareToYield();
+ void recoverFromYield();
+
+ /** Run until first op completes. */
+ shared_ptr<QueryOp> runUntilFirstCompletes();
+
+ void mayYield();
QueryOp &_op;
QueryPlanSet &_plans;
static void initOp( QueryOp &op );
static void nextOp( QueryOp &op );
- static bool prepareToYield( QueryOp &op );
- static void recoverFromYield( QueryOp &op );
+ static bool prepareToYieldOp( QueryOp &op );
+ static void recoverFromYieldOp( QueryOp &op );
+ private:
+ vector<shared_ptr<QueryOp> > _ops;
+ struct OpHolder {
+ OpHolder( const shared_ptr<QueryOp> &op ) : _op( op ), _offset() {}
+ shared_ptr<QueryOp> _op;
+ long long _offset;
+ bool operator<( const OpHolder &other ) const {
+ return _op->nscanned() + _offset > other._op->nscanned() + other._offset;
+ }
+ };
+ our_priority_queue<OpHolder> _queue;
};
const char *_ns;
BSONObj _originalQuery;
- auto_ptr< FieldRangeSet > _fbs;
- auto_ptr< FieldRangeSet > _originalFrs;
+ auto_ptr<FieldRangeSetPair> _frsp;
+ auto_ptr<FieldRangeSetPair> _originalFrsp;
PlanSet _plans;
bool _mayRecordPlan;
bool _usingPrerecordedPlan;
@@ -248,31 +376,11 @@ namespace mongo {
bool _bestGuessOnly;
bool _mayYield;
ElapsedTracker _yieldSometimesTracker;
+ shared_ptr<Runner> _runner;
+ bool _mustAssertOnYieldFailure;
};
- // Handles $or type queries by generating a QueryPlanSet for each $or clause
- // NOTE on our $or implementation: In our current qo implementation we don't
- // keep statistics on our data, but we can conceptualize the problem of
- // selecting an index when statistics exist for all index ranges. The
- // d-hitting set problem on k sets and n elements can be reduced to the
- // problem of index selection on k $or clauses and n index ranges (where
- // d is the max number of indexes, and the number of ranges n is unbounded).
- // In light of the fact that d-hitting set is np complete, and we don't even
- // track statistics (so cost calculations are expensive) our first
- // implementation uses the following greedy approach: We take one $or clause
- // at a time and treat each as a separate query for index selection purposes.
- // But if an index range is scanned for a particular $or clause, we eliminate
- // that range from all subsequent clauses. One could imagine an opposite
- // implementation where we select indexes based on the union of index ranges
- // for all $or clauses, but this can have much poorer worst case behavior.
- // (An index range that suits one $or clause may not suit another, and this
- // is worse than the typical case of index range choice staleness because
- // with $or the clauses may likely be logically distinct.) The greedy
- // implementation won't do any worse than all the $or clauses individually,
- // and it can often do better. In the first cut we are intentionally using
- // QueryPattern tracking to record successful plans on $or clauses for use by
- // subsequent $or clauses, even though there may be a significant aggregate
- // $nor component that would not be represented in QueryPattern.
+ /** Handles $or type queries by generating a QueryPlanSet for each $or clause. */
class MultiPlanScanner {
public:
MultiPlanScanner( const char *ns,
@@ -284,23 +392,54 @@ namespace mongo {
const BSONObj &max = BSONObj(),
bool bestGuessOnly = false,
bool mayYield = false);
- shared_ptr< QueryOp > runOp( QueryOp &op );
- template< class T >
- shared_ptr< T > runOp( T &op ) {
- return dynamic_pointer_cast< T >( runOp( static_cast< QueryOp& >( op ) ) );
+
+ /**
+ * Clone op for each query plan of a single $or clause, and @return the first cloned op
+ * to call setComplete() or setStop().
+ */
+
+ shared_ptr<QueryOp> runOpOnce( QueryOp &op );
+ template<class T>
+ shared_ptr<T> runOpOnce( T &op ) {
+ return dynamic_pointer_cast<T>( runOpOnce( static_cast<QueryOp&>( op ) ) );
}
- shared_ptr< QueryOp > runOpOnce( QueryOp &op );
- template< class T >
- shared_ptr< T > runOpOnce( T &op ) {
- return dynamic_pointer_cast< T >( runOpOnce( static_cast< QueryOp& >( op ) ) );
+
+ /**
+ * For each $or clause, calls runOpOnce on the child QueryOp cloned from the winning QueryOp
+ * of the previous $or clause (or from the supplied 'op' for the first $or clause).
+ */
+
+ shared_ptr<QueryOp> runOp( QueryOp &op );
+ template<class T>
+ shared_ptr<T> runOp( T &op ) {
+ return dynamic_pointer_cast<T>( runOp( static_cast<QueryOp&>( op ) ) );
}
- bool mayRunMore() const { return _or ? ( !_tableScanned && !_fros.orFinished() ) : _i == 0; }
+
+ /** Initialize or iterate a runner generated from @param originalOp. */
+
+ void initialOp( const shared_ptr<QueryOp> &originalOp ) { _baseOp = originalOp; }
+ shared_ptr<QueryOp> nextOp();
+
+ /** Yield the runner member. */
+
+ bool prepareToYield();
+ void recoverFromYield();
+
+ /**
+ * @return a single simple cursor if the scanner would run a single cursor
+ * for this query, otherwise return an empty shared_ptr.
+ */
+ shared_ptr<Cursor> singleCursor() const;
+
+ /** @return true iff more $or clauses need to be scanned. */
+ bool mayRunMore() const { return _or ? ( !_tableScanned && !_org->orFinished() ) : _i == 0; }
+ /** @return non-$or version of explain output. */
BSONObj oldExplain() const { assertNotOr(); return _currentQps->explain(); }
- // just report this when only one query op
- bool usingPrerecordedPlan() const {
- return !_or && _currentQps->usingPrerecordedPlan();
- }
+ /** @return true iff this is not a $or query and a plan is selected based on previous success of this plan. */
+ bool usingPrerecordedPlan() const { return !_or && _currentQps->usingPrerecordedPlan(); }
+ /** Don't attempt to scan multiple plans, just use the best guess. */
void setBestGuessOnly() { _bestGuessOnly = true; }
+ /** Set whether yielding is allowed while running each QueryPlan. */
void mayYield( bool val ) { _mayYield = val; }
bool modifiedKeys() const { return _currentQps->modifiedKeys(); }
bool hasMultiKey() const { return _currentQps->hasMultiKey(); }
@@ -309,57 +448,46 @@ namespace mongo {
void assertNotOr() const {
massert( 13266, "not implemented for $or query", !_or );
}
+ void assertMayRunMore() const {
+ massert( 13271, "can't run more ops", mayRunMore() );
+ }
+ shared_ptr<QueryOp> nextOpBeginningClause();
+ shared_ptr<QueryOp> nextOpHandleEndOfClause();
bool uselessOr( const BSONElement &hint ) const;
const char * _ns;
bool _or;
BSONObj _query;
- FieldRangeOrSet _fros;
- auto_ptr< QueryPlanSet > _currentQps;
+ shared_ptr<OrRangeGenerator> _org; // May be null in certain non $or query cases.
+ auto_ptr<QueryPlanSet> _currentQps;
int _i;
bool _honorRecordedPlan;
bool _bestGuessOnly;
BSONObj _hint;
bool _mayYield;
bool _tableScanned;
+ shared_ptr<QueryOp> _baseOp;
};
+ /** Provides a cursor interface for certain limited uses of a MultiPlanScanner. */
class MultiCursor : public Cursor {
public:
class CursorOp : public QueryOp {
public:
CursorOp() {}
CursorOp( const QueryOp &other ) : QueryOp( other ) {}
- virtual shared_ptr< Cursor > newCursor() const = 0;
+ virtual shared_ptr<Cursor> newCursor() const = 0;
};
- // takes ownership of 'op'
- MultiCursor( const char *ns, const BSONObj &pattern, const BSONObj &order, shared_ptr< CursorOp > op = shared_ptr< CursorOp >(), bool mayYield = false )
- : _mps( new MultiPlanScanner( ns, pattern, order, 0, true, BSONObj(), BSONObj(), !op.get(), mayYield ) ), _nscanned() {
- if ( op.get() ) {
- _op = op;
- }
- else {
- _op.reset( new NoOp() );
- }
- if ( _mps->mayRunMore() ) {
- nextClause();
- if ( !ok() ) {
- advance();
- }
- }
- else {
- _c.reset( new BasicCursor( DiskLoc() ) );
- }
- }
- // used to handoff a query to a getMore()
- MultiCursor( auto_ptr< MultiPlanScanner > mps, const shared_ptr< Cursor > &c, const shared_ptr< CoveredIndexMatcher > &matcher, const QueryOp &op )
- : _op( new NoOp( op ) ), _c( c ), _mps( mps ), _matcher( matcher ), _nscanned( -1 ) {
- _mps->setBestGuessOnly();
- _mps->mayYield( false ); // with a NoOp, there's no need to yield in QueryPlanSet
- if ( !ok() ) {
- // would have been advanced by UserQueryOp if possible
- advance();
- }
- }
+ /** takes ownership of 'op' */
+ MultiCursor( const char *ns, const BSONObj &pattern, const BSONObj &order, shared_ptr<CursorOp> op = shared_ptr<CursorOp>(), bool mayYield = false );
+ /**
+ * Used
+ * 1. To hand off a query to a getMore()
+ * 2. To hand off a QueryOptimizerCursor
+ * @param nscanned is an optional initial value; if not supplied, nscanned()
+ * will always return -1
+ */
+ MultiCursor( auto_ptr<MultiPlanScanner> mps, const shared_ptr<Cursor> &c, const shared_ptr<CoveredIndexMatcher> &matcher, const QueryOp &op, long long nscanned = -1 );
+
virtual bool ok() { return _c->ok(); }
virtual Record* _current() { return _c->_current(); }
virtual BSONObj current() { return _c->current(); }
@@ -373,31 +501,30 @@ namespace mongo {
}
virtual BSONObj currKey() const { return _c->currKey(); }
virtual DiskLoc refLoc() { return _c->refLoc(); }
- virtual void noteLocation() {
- _c->noteLocation();
- }
- virtual void checkLocation() {
- _c->checkLocation();
- }
+ virtual void noteLocation() { _c->noteLocation(); }
+ virtual void checkLocation() { _c->checkLocation(); }
virtual bool supportGetMore() { return true; }
virtual bool supportYields() { return _c->supportYields(); }
+ virtual BSONObj indexKeyPattern() { return _c->indexKeyPattern(); }
- // with update we could potentially get the same document on multiple
- // indexes, but update appears to already handle this with seenObjects
- // so we don't have to do anything special here.
- virtual bool getsetdup(DiskLoc loc) {
- return _c->getsetdup( loc );
- }
+ /**
+ * with update we could potentially get the same document on multiple
+ * indexes, but update appears to already handle this with seenObjects
+ * so we don't have to do anything special here.
+ */
+ virtual bool getsetdup(DiskLoc loc) { return _c->getsetdup( loc ); }
virtual bool modifiedKeys() const { return _mps->modifiedKeys(); }
virtual bool isMultiKey() const { return _mps->hasMultiKey(); }
- virtual CoveredIndexMatcher *matcher() const { return _matcher.get(); }
- // return -1 if we're a getmore handoff
+ virtual shared_ptr< CoveredIndexMatcher > matcherPtr() const { return _matcher; }
+ virtual CoveredIndexMatcher* matcher() const { return _matcher.get(); }
+
+ /** return -1 if we're a getmore handoff */
virtual long long nscanned() { return _nscanned >= 0 ? _nscanned + _c->nscanned() : _nscanned; }
- // just for testing
- shared_ptr< Cursor > sub_c() const { return _c; }
+ /** just for testing */
+ shared_ptr<Cursor> sub_c() const { return _c; }
private:
class NoOp : public CursorOp {
public:
@@ -407,55 +534,45 @@ namespace mongo {
virtual void next() {}
virtual bool mayRecordPlan() const { return false; }
virtual QueryOp *_createChild() const { return new NoOp(); }
- virtual shared_ptr< Cursor > newCursor() const { return qp().newCursor(); }
+ virtual shared_ptr<Cursor> newCursor() const { return qp().newCursor(); }
virtual long long nscanned() { assert( false ); return 0; }
};
- void nextClause() {
- if ( _nscanned >= 0 && _c.get() ) {
- _nscanned += _c->nscanned();
- }
- shared_ptr< CursorOp > best = _mps->runOpOnce( *_op );
- if ( ! best->complete() )
- throw MsgAssertionException( best->exception() );
- _c = best->newCursor();
- _matcher = best->matcher();
- _op = best;
- }
- shared_ptr< CursorOp > _op;
- shared_ptr< Cursor > _c;
- auto_ptr< MultiPlanScanner > _mps;
- shared_ptr< CoveredIndexMatcher > _matcher;
+ void nextClause();
+ shared_ptr<CursorOp> _op;
+ shared_ptr<Cursor> _c;
+ auto_ptr<MultiPlanScanner> _mps;
+ shared_ptr<CoveredIndexMatcher> _matcher;
long long _nscanned;
};
- // NOTE min, max, and keyPattern will be updated to be consistent with the selected index.
+ /** NOTE min, max, and keyPattern will be updated to be consistent with the selected index. */
IndexDetails *indexDetailsForRange( const char *ns, string &errmsg, BSONObj &min, BSONObj &max, BSONObj &keyPattern );
- inline bool isSimpleIdQuery( const BSONObj& query ) {
- BSONObjIterator i(query);
- if( !i.more() ) return false;
- BSONElement e = i.next();
- if( i.more() ) return false;
- if( strcmp("_id", e.fieldName()) != 0 ) return false;
- return e.isSimpleType(); // e.g. not something like { _id : { $gt : ...
- }
-
- // matcher() will always work on the returned cursor
- inline shared_ptr< Cursor > bestGuessCursor( const char *ns, const BSONObj &query, const BSONObj &sort ) {
- if( !query.getField( "$or" ).eoo() ) {
- return shared_ptr< Cursor >( new MultiCursor( ns, query, sort ) );
- }
- else {
- auto_ptr< FieldRangeSet > frs( new FieldRangeSet( ns, query ) );
- auto_ptr< FieldRangeSet > origFrs( new FieldRangeSet( *frs ) );
- shared_ptr< Cursor > ret = QueryPlanSet( ns, frs, origFrs, query, sort ).getBestGuess()->newCursor();
- // If we don't already have a matcher, supply one.
- if ( !query.isEmpty() && ! ret->matcher() ) {
- shared_ptr< CoveredIndexMatcher > matcher( new CoveredIndexMatcher( query, ret->indexKeyPattern() ) );
- ret->setMatcher( matcher );
- }
- return ret;
- }
- }
-
+ bool isSimpleIdQuery( const BSONObj& query );
+
+ /**
+ * @return a single cursor that may work well for the given query.
+ * No cursor is returned if the requested sort is not supported by any index. Clients that are
+ * not sure a suitable index exists must check for an empty result and fall back to an
+ * unsorted plan.
+ */
+ shared_ptr<Cursor> bestGuessCursor( const char *ns, const BSONObj &query, const BSONObj &sort );
+
+ /**
+ * Add-on functionality for queryutil classes requiring access to indexing
+ * functionality not currently linked to mongos.
+ * TODO Clean this up a bit, possibly with separate sharded and non-sharded
+ * implementations for the appropriate queryutil classes or by pulling index
+ * related functionality into separate wrapper classes.
+ */
+ struct QueryUtilIndexed {
+ /** @return true if the index may be useful according to its KeySpec. */
+ static bool indexUseful( const FieldRangeSetPair &frsp, NamespaceDetails *d, int idxNo, const BSONObj &order );
+ /** Clear any indexes recorded as the best for either the single or multi key pattern. */
+ static void clearIndexesForPatterns( const FieldRangeSetPair &frsp, const BSONObj &order );
+ /** Return a recorded best index for the single or multi key pattern. */
+ static pair< BSONObj, long long > bestIndexForPatterns( const FieldRangeSetPair &frsp, const BSONObj &order );
+ static bool uselessOr( const OrRangeGenerator& org, NamespaceDetails *d, int hintIdx );
+ };
+
} // namespace mongo
diff --git a/db/queryoptimizercursor.cpp b/db/queryoptimizercursor.cpp
new file mode 100644
index 0000000..9260889
--- /dev/null
+++ b/db/queryoptimizercursor.cpp
@@ -0,0 +1,387 @@
+// @file queryoptimizercursor.cpp
+
+/**
+ * Copyright (C) 2011 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "pch.h"
+#include "queryoptimizer.h"
+#include "pdfile.h"
+#include "clientcursor.h"
+#include "btree.h"
+
+namespace mongo {
+
+ static const int OutOfOrderDocumentsAssertionCode = 14810;
+
+ /**
+ * A QueryOp implementation utilized by the QueryOptimizerCursor
+ */
+ class QueryOptimizerCursorOp : public QueryOp {
+ public:
+ /**
+ * @param aggregateNscanned - shared int counting total nscanned for
+ * query ops for all cursors.
+ */
+ QueryOptimizerCursorOp( long long &aggregateNscanned ) : _matchCount(), _mustAdvance(), _nscanned(), _capped(), _aggregateNscanned( aggregateNscanned ) {}
+
+ virtual void _init() {
+ if ( qp().scanAndOrderRequired() ) {
+ throw MsgAssertionException( OutOfOrderDocumentsAssertionCode, "order spec cannot be satisfied with index" );
+ }
+ _c = qp().newCursor();
+ _capped = _c->capped();
+ mayAdvance();
+ }
+
+ virtual long long nscanned() {
+ return _c ? _c->nscanned() : _nscanned;
+ }
+
+ virtual bool prepareToYield() {
+ if ( _c && !_cc ) {
+ _cc.reset( new ClientCursor( QueryOption_NoCursorTimeout , _c , qp().ns() ) );
+ }
+ if ( _cc ) {
+ _posBeforeYield = currLoc();
+ return _cc->prepareToYield( _yieldData );
+ }
+ // no active cursor - ok to yield
+ return true;
+ }
+
+ virtual void recoverFromYield() {
+ if ( _cc && !ClientCursor::recoverFromYield( _yieldData ) ) {
+ _c.reset();
+ _cc.reset();
+
+ if ( _capped ) {
+ msgassertedNoTrace( 13338, str::stream() << "capped cursor overrun: " << qp().ns() );
+ }
+ else if ( qp().mustAssertOnYieldFailure() ) {
+ msgassertedNoTrace( 15892, str::stream() << "QueryOptimizerCursorOp::recoverFromYield() failed to recover" );
+ }
+ else {
+ // we don't fail query since we're fine with returning partial data if collection dropped
+ // also, see SERVER-2454
+ }
+ }
+ else {
+ if ( _posBeforeYield != currLoc() ) {
+ // If the yield advanced our position, the next call to next() will be a no-op.
+ _mustAdvance = false;
+ }
+ }
+ }
+
+ virtual void next() {
+ mayAdvance();
+
+ if ( _matchCount >= 101 ) {
+ // This is equivalent to the default condition for switching from
+ // a query to a getMore.
+ setStop();
+ return;
+ }
+ if ( !_c || !_c->ok() ) {
+ setComplete();
+ return;
+ }
+
+ if ( matcher( _c )->matchesCurrent( _c.get() ) && !_c->getsetdup( _c->currLoc() ) ) {
+ ++_matchCount;
+ }
+ _mustAdvance = true;
+ }
+ virtual QueryOp *_createChild() const {
+ QueryOptimizerCursorOp *ret = new QueryOptimizerCursorOp( _aggregateNscanned );
+ ret->_matchCount = _matchCount;
+ return ret;
+ }
+ DiskLoc currLoc() const { return _c ? _c->currLoc() : DiskLoc(); }
+ BSONObj currKey() const { return _c ? _c->currKey() : BSONObj(); }
+ virtual bool mayRecordPlan() const {
+ return complete() && !stopRequested();
+ }
+ shared_ptr<Cursor> cursor() const { return _c; }
+ private:
+        void mayAdvance() {
+            if ( !_c ) {
+                // The cursor may have been discarded after a failed yield recovery.
+                return;
+            }
+            if ( _mustAdvance ) {
+                _c->advance();
+                _mustAdvance = false;
+            }
+            _aggregateNscanned += ( _c->nscanned() - _nscanned );
+            _nscanned = _c->nscanned();
+        }
+ int _matchCount;
+ bool _mustAdvance;
+ long long _nscanned;
+ bool _capped;
+ shared_ptr<Cursor> _c;
+ ClientCursor::CleanupPointer _cc;
+ DiskLoc _posBeforeYield;
+ ClientCursor::YieldData _yieldData;
+ long long &_aggregateNscanned;
+ };
+
+ /**
+ * This cursor runs a MultiPlanScanner iteratively and returns results from
+ * the scanner's cursors as they become available. Once the scanner chooses
+ * a single plan, this cursor becomes a simple wrapper around that single
+ * plan's cursor (called the 'takeover' cursor).
+ */
+ class QueryOptimizerCursor : public Cursor {
+ public:
+ QueryOptimizerCursor( auto_ptr<MultiPlanScanner> &mps ) :
+ _mps( mps ),
+ _originalOp( new QueryOptimizerCursorOp( _nscanned ) ),
+ _currOp(),
+ _nscanned() {
+ _mps->initialOp( _originalOp );
+ shared_ptr<QueryOp> op = _mps->nextOp();
+ rethrowOnError( op );
+ if ( !op->complete() ) {
+ _currOp = dynamic_cast<QueryOptimizerCursorOp*>( op.get() );
+ }
+ }
+
+ virtual bool ok() { return _takeover ? _takeover->ok() : !currLoc().isNull(); }
+ virtual Record* _current() {
+ if ( _takeover ) {
+ return _takeover->_current();
+ }
+ assertOk();
+ return currLoc().rec();
+ }
+ virtual BSONObj current() {
+ if ( _takeover ) {
+ return _takeover->current();
+ }
+ assertOk();
+ return currLoc().obj();
+ }
+ virtual DiskLoc currLoc() { return _takeover ? _takeover->currLoc() : _currLoc(); }
+ DiskLoc _currLoc() const {
+ verify( 14826, !_takeover );
+ if ( _currOp ) {
+ return _currOp->currLoc();
+ }
+ return DiskLoc();
+ }
+ virtual bool advance() {
+ if ( _takeover ) {
+ return _takeover->advance();
+ }
+
+ // Ok to advance if currOp is in an error state due to failed yield recovery.
+ // This may be the case when advance() is called by recoverFromYield().
+ if ( !( _currOp && _currOp->error() ) && !ok() ) {
+ return false;
+ }
+
+ _currOp = 0;
+ shared_ptr<QueryOp> op = _mps->nextOp();
+ rethrowOnError( op );
+
+ QueryOptimizerCursorOp *qocop = dynamic_cast<QueryOptimizerCursorOp*>( op.get() );
+ if ( !op->complete() ) {
+ // 'qocop' will be valid until we call _mps->nextOp() again.
+ _currOp = qocop;
+ }
+ else if ( op->stopRequested() ) {
+ if ( qocop->cursor() ) {
+ _takeover.reset( new MultiCursor( _mps,
+ qocop->cursor(),
+ op->matcher( qocop->cursor() ),
+ *op,
+ _nscanned - qocop->cursor()->nscanned() ) );
+ }
+ }
+
+ return ok();
+ }
+ virtual BSONObj currKey() const {
+ if ( _takeover ) {
+ return _takeover->currKey();
+ }
+ assertOk();
+ return _currOp->currKey();
+ }
+
+ /** This cursor will be ignored for yielding by the client cursor implementation. */
+ virtual DiskLoc refLoc() { return _takeover ? _takeover->refLoc() : DiskLoc(); }
+
+ virtual BSONObj indexKeyPattern() {
+ if ( _takeover ) {
+ return _takeover->indexKeyPattern();
+ }
+ assertOk();
+ return _currOp->cursor()->indexKeyPattern();
+ }
+
+ virtual bool supportGetMore() { return false; }
+
+ virtual bool supportYields() { return _takeover ? _takeover->supportYields() : true; }
+ virtual bool prepareToYield() {
+ if ( _takeover ) {
+ return _takeover->prepareToYield();
+ }
+ else if ( _currOp ) {
+ return _mps->prepareToYield();
+ }
+ else {
+ return true;
+ }
+ }
+ virtual void recoverFromYield() {
+ if ( _takeover ) {
+ _takeover->recoverFromYield();
+ return;
+ }
+ if ( _currOp ) {
+ _mps->recoverFromYield();
+ if ( _currOp->error() ) {
+ // See if we can advance to a non error op.
+ advance();
+ }
+ }
+ }
+
+ virtual string toString() { return "QueryOptimizerCursor"; }
+
+ virtual bool getsetdup(DiskLoc loc) {
+ if ( _takeover ) {
+ if ( getdupInternal( loc ) ) {
+ return true;
+ }
+ return _takeover->getsetdup( loc );
+ }
+ assertOk();
+ return getsetdupInternal( loc );
+ }
+
+        /** The matcher needs to know if the cursor being forwarded to is multikey. */
+ virtual bool isMultiKey() const {
+ if ( _takeover ) {
+ return _takeover->isMultiKey();
+ }
+ assertOk();
+ return _currOp->cursor()->isMultiKey();
+ }
+
+ virtual bool modifiedKeys() const { return true; }
+
+ virtual long long nscanned() { return _takeover ? _takeover->nscanned() : _nscanned; }
+
+ /** @return the matcher for the takeover cursor or current active op. */
+ virtual shared_ptr< CoveredIndexMatcher > matcherPtr() const {
+ if ( _takeover ) {
+ return _takeover->matcherPtr();
+ }
+ assertOk();
+ return _currOp->matcher( _currOp->cursor() );
+ }
+
+ /** @return the matcher for the takeover cursor or current active op. */
+ virtual CoveredIndexMatcher* matcher() const {
+ if ( _takeover ) {
+ return _takeover->matcher();
+ }
+ assertOk();
+ return _currOp->matcher( _currOp->cursor() ).get();
+ }
+
+ private:
+ void rethrowOnError( const shared_ptr< QueryOp > &op ) {
+ // If all plans have erred out, assert.
+ if ( op->error() ) {
+ throw MsgAssertionException( op->exception() );
+ }
+ }
+
+ void assertOk() const {
+ massert( 14809, "Invalid access for cursor that is not ok()", !_currLoc().isNull() );
+ }
+
+ /** Insert and check for dups before takeover occurs */
+ bool getsetdupInternal(const DiskLoc &loc) {
+ pair<set<DiskLoc>::iterator, bool> p = _dups.insert(loc);
+ return !p.second;
+ }
+
+ /** Just check for dups - after takeover occurs */
+ bool getdupInternal(const DiskLoc &loc) {
+ return _dups.count( loc ) > 0;
+ }
+
+ auto_ptr<MultiPlanScanner> _mps;
+ shared_ptr<QueryOptimizerCursorOp> _originalOp;
+ QueryOptimizerCursorOp *_currOp;
+ set<DiskLoc> _dups;
+ shared_ptr<Cursor> _takeover;
+ long long _nscanned;
+ };
+
+ shared_ptr<Cursor> newQueryOptimizerCursor( auto_ptr<MultiPlanScanner> mps ) {
+ try {
+ return shared_ptr<Cursor>( new QueryOptimizerCursor( mps ) );
+ } catch( const AssertionException &e ) {
+ if ( e.getCode() == OutOfOrderDocumentsAssertionCode ) {
+ // If no indexes follow the requested sort order, return an
+ // empty pointer.
+ return shared_ptr<Cursor>();
+ }
+ throw;
+ }
+        return shared_ptr<Cursor>(); // unreachable; avoids a missing-return warning
+ }
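+
+    /**
+     * Editor's illustrative sketch, not upstream code: one way a caller
+     * might drive the optimizer cursor.  Assumes the caller holds the read
+     * lock for 'ns'; 'processDoc' is a hypothetical callback.
+     */
+    inline void exampleOptimizerCursorScan( const char *ns, const BSONObj &query,
+                                            const BSONObj &order,
+                                            void (*processDoc)( const BSONObj & ) ) {
+        auto_ptr<MultiPlanScanner> mps( new MultiPlanScanner( ns, query, order ) );
+        shared_ptr<Cursor> c = newQueryOptimizerCursor( mps );
+        if ( !c ) {
+            // No index can provide the requested sort order.
+            return;
+        }
+        while( c->ok() ) {
+            // Match, then deduplicate, mirroring QueryOptimizerCursorOp::next().
+            if ( c->matcher()->matchesCurrent( c.get() ) && !c->getsetdup( c->currLoc() ) ) {
+                processDoc( c->current() );
+            }
+            c->advance();
+        }
+    }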
+
+ shared_ptr<Cursor> NamespaceDetailsTransient::getCursor( const char *ns, const BSONObj &query, const BSONObj &order ) {
+ if ( query.isEmpty() && order.isEmpty() ) {
+ // TODO This will not use a covered index.
+ return theDataFileMgr.findAll( ns );
+ }
+ if ( isSimpleIdQuery( query ) ) {
+ Database *database = cc().database();
+ assert( database );
+ NamespaceDetails *d = database->namespaceIndex.details(ns);
+ if ( d ) {
+ int idxNo = d->findIdIndex();
+ if ( idxNo >= 0 ) {
+ IndexDetails& i = d->idx( idxNo );
+ BSONObj key = i.getKeyFromQuery( query );
+ return shared_ptr<Cursor>( BtreeCursor::make( d, idxNo, i, key, key, true, 1 ) );
+ }
+ }
+ }
+ auto_ptr<MultiPlanScanner> mps( new MultiPlanScanner( ns, query, order ) ); // mayYield == false
+ shared_ptr<Cursor> single = mps->singleCursor();
+ if ( single ) {
+ if ( !query.isEmpty() && !single->matcher() ) {
+ shared_ptr<CoveredIndexMatcher> matcher( new CoveredIndexMatcher( query, single->indexKeyPattern() ) );
+ single->setMatcher( matcher );
+ }
+ return single;
+ }
+ return newQueryOptimizerCursor( mps );
+ }
+
+ /** This interface just available for testing. */
+ shared_ptr<Cursor> newQueryOptimizerCursor( const char *ns, const BSONObj &query, const BSONObj &order ) {
+ auto_ptr<MultiPlanScanner> mps( new MultiPlanScanner( ns, query, order ) ); // mayYield == false
+ return newQueryOptimizerCursor( mps );
+ }
+
+} // namespace mongo
diff --git a/db/querypattern.cpp b/db/querypattern.cpp
new file mode 100644
index 0000000..589182d
--- /dev/null
+++ b/db/querypattern.cpp
@@ -0,0 +1,54 @@
+// @file querypattern.cpp - Query pattern matching for selecting similar plans given similar queries.
+
+/* Copyright 2011 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "querypattern.h"
+
+namespace mongo {
+
+ /** for testing only - speed unimportant */
+ bool QueryPattern::operator==( const QueryPattern &other ) const {
+ bool less = operator<( other );
+ bool more = other.operator<( *this );
+ assert( !( less && more ) );
+ return !( less || more );
+ }
+
+ /** for testing only - speed unimportant */
+ bool QueryPattern::operator!=( const QueryPattern &other ) const {
+ return !operator==( other );
+ }
+
+ void QueryPattern::setSort( const BSONObj sort ) {
+ _sort = normalizeSort( sort );
+ }
+
+ BSONObj QueryPattern::normalizeSort( const BSONObj &spec ) {
+ if ( spec.isEmpty() )
+ return spec;
+ int direction = ( spec.firstElement().number() >= 0 ) ? 1 : -1;
+ BSONObjIterator i( spec );
+ BSONObjBuilder b;
+ while( i.moreWithEOO() ) {
+ BSONElement e = i.next();
+ if ( e.eoo() )
+ break;
+ b.append( e.fieldName(), direction * ( ( e.number() >= 0 ) ? -1 : 1 ) );
+ }
+ return b.obj();
+ }
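+
+    /* Worked example (editor illustration): both { a: 1, b: -1 } and its
+     * reverse { a: -1, b: 1 } normalize to { a: -1, b: 1 }, so queries that
+     * differ only in overall scan direction produce equal QueryPatterns and
+     * can share a cached plan. */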
+
+} // namespace mongo
diff --git a/db/querypattern.h b/db/querypattern.h
new file mode 100644
index 0000000..d87cc64
--- /dev/null
+++ b/db/querypattern.h
@@ -0,0 +1,76 @@
+// @file querypattern.h - Query pattern matching for selecting similar plans given similar queries.
+
+/* Copyright 2011 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "jsobj.h"
+
+namespace mongo {
+
+ /**
+ * Implements query pattern matching, used to determine if a query is
+ * similar to an earlier query and should use the same plan.
+ *
+ * Two queries will generate the same QueryPattern, and therefore match each
+ * other, if their fields have the same Types and they have the same sort
+ * spec.
+ */
+ class QueryPattern {
+ public:
+ friend class FieldRangeSet;
+ enum Type {
+ Equality,
+ LowerBound,
+ UpperBound,
+ UpperAndLowerBound
+ };
+ bool operator<( const QueryPattern &other ) const;
+ /** for testing only */
+ bool operator==( const QueryPattern &other ) const;
+ /** for testing only */
+ bool operator!=( const QueryPattern &other ) const;
+ private:
+ QueryPattern() {}
+ void setSort( const BSONObj sort );
+ static BSONObj normalizeSort( const BSONObj &spec );
+ map<string,Type> _fieldTypes;
+ BSONObj _sort;
+ };
+
+ inline bool QueryPattern::operator<( const QueryPattern &other ) const {
+ map<string,Type>::const_iterator i = _fieldTypes.begin();
+ map<string,Type>::const_iterator j = other._fieldTypes.begin();
+ while( i != _fieldTypes.end() ) {
+ if ( j == other._fieldTypes.end() )
+ return false;
+ if ( i->first < j->first )
+ return true;
+ else if ( i->first > j->first )
+ return false;
+ if ( i->second < j->second )
+ return true;
+ else if ( i->second > j->second )
+ return false;
+ ++i;
+ ++j;
+ }
+ if ( j != other._fieldTypes.end() )
+ return true;
+ return _sort.woCompare( other._sort ) < 0;
+ }
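+
+    /* Example (editor illustration): { a: 5 } and { a: 7 } both map 'a' to
+     * Equality, while { a: { $gt: 5 } } and { a: { $gt: 9 } } both map 'a'
+     * to LowerBound, so each pair yields equal QueryPatterns whenever the
+     * sort specs also match. */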
+
+} // namespace mongo
diff --git a/db/queryutil-inl.h b/db/queryutil-inl.h
new file mode 100644
index 0000000..d0fc212
--- /dev/null
+++ b/db/queryutil-inl.h
@@ -0,0 +1,153 @@
+// @file queryutil-inl.h - Inline definitions for frequently called queryutil.h functions
+
+/* Copyright 2011 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+namespace mongo {
+
+ inline bool FieldInterval::equality() const {
+ if ( _cachedEquality == -1 ) {
+ _cachedEquality = ( _lower._inclusive && _upper._inclusive && _lower._bound.woCompare( _upper._bound, false ) == 0 );
+ }
+ return _cachedEquality;
+ }
+
+ inline bool FieldRange::equality() const {
+ return
+ !empty() &&
+ min().woCompare( max(), false ) == 0 &&
+ maxInclusive() &&
+ minInclusive();
+ }
+
+ inline bool FieldRange::inQuery() const {
+ if ( equality() ) {
+ return true;
+ }
+ for( vector<FieldInterval>::const_iterator i = _intervals.begin(); i != _intervals.end(); ++i ) {
+ if ( !i->equality() ) {
+ return false;
+ }
+ }
+ return true;
+ }
+
+ /**
+ * TODO Assumes intervals are contiguous and minKey/maxKey will not be
+ * matched against.
+ */
+ inline bool FieldRange::nontrivial() const {
+ return
+ ! empty() &&
+ ( _intervals.size() != 1 ||
+ minKey.firstElement().woCompare( min(), false ) != 0 ||
+ maxKey.firstElement().woCompare( max(), false ) != 0 );
+ }
+
+ inline const FieldRange &FieldRangeSet::range( const char *fieldName ) const {
+ map<string,FieldRange>::const_iterator f = _ranges.find( fieldName );
+ if ( f == _ranges.end() )
+ return trivialRange();
+ return f->second;
+ }
+
+ inline FieldRange &FieldRangeSet::range( const char *fieldName ) {
+ map<string,FieldRange>::iterator f = _ranges.find( fieldName );
+ if ( f == _ranges.end() ) {
+ _ranges.insert( make_pair( string( fieldName ), trivialRange() ) );
+ return _ranges.find( fieldName )->second;
+ }
+ return f->second;
+ }
+
+ inline int FieldRangeSet::nNontrivialRanges() const {
+ int count = 0;
+ for( map<string,FieldRange>::const_iterator i = _ranges.begin(); i != _ranges.end(); ++i ) {
+ if ( i->second.nontrivial() )
+ ++count;
+ }
+ return count;
+ }
+
+ inline bool FieldRangeSet::matchPossible() const {
+ for( map<string,FieldRange>::const_iterator i = _ranges.begin(); i != _ranges.end(); ++i ) {
+ if ( i->second.empty() ) {
+ return false;
+ }
+ }
+ return true;
+ }
+
+ inline bool FieldRangeSet::matchPossibleForIndex( const BSONObj &keyPattern ) const {
+ if ( !_singleKey ) {
+ return matchPossible();
+ }
+ BSONObjIterator i( keyPattern );
+ while( i.more() ) {
+ BSONElement e = i.next();
+ if ( e.fieldName() == string( "$natural" ) ) {
+ return true;
+ }
+ if ( range( e.fieldName() ).empty() ) {
+ return false;
+ }
+ }
+ return true;
+ }
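+
+    /* Example (editor illustration): for { a: { $gt: 1, $lt: 0 } } the
+     * single key range of 'a' is empty, so matchPossibleForIndex() is false
+     * for an index on { a: 1 }.  On a multikey (array) field the two
+     * constraints may be satisfied by different array elements, so the
+     * multikey range is not empty and only matchPossible() applies. */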
+
+ inline long long FieldRangeVector::size() {
+ long long ret = 1;
+ for( vector<FieldRange>::const_iterator i = _ranges.begin(); i != _ranges.end(); ++i ) {
+ ret *= i->intervals().size();
+ }
+ return ret;
+ }
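+
+    /* Example (editor illustration): for index key { a: 1, b: 1 } with 'a'
+     * constrained by $in: [2,3,5] (three point intervals) and 'b' by a
+     * single interval, size() == 3 * 1 == 3 scan boxes. */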
+
+ inline FieldRangeSetPair *OrRangeGenerator::topFrsp() const {
+ FieldRangeSetPair *ret = new FieldRangeSetPair( _baseSet );
+ if (_orSets.size()) {
+ *ret &= _orSets.front();
+ }
+ return ret;
+ }
+
+ inline FieldRangeSetPair *OrRangeGenerator::topFrspOriginal() const {
+ FieldRangeSetPair *ret = new FieldRangeSetPair( _baseSet );
+ if (_originalOrSets.size()) {
+ *ret &= _originalOrSets.front();
+ }
+ return ret;
+ }
+
+ inline bool FieldRangeSetPair::matchPossibleForIndex( NamespaceDetails *d, int idxNo, const BSONObj &keyPattern ) const {
+ assertValidIndexOrNoIndex( d, idxNo );
+ if ( !matchPossible() ) {
+ return false;
+ }
+ if ( idxNo < 0 ) {
+            // No index is in use and matchPossible() above already verified the
+            // multikey bounds, so a match is possible.
+ return true;
+ }
+ return frsForIndex( d, idxNo ).matchPossibleForIndex( keyPattern );
+ }
+
+ inline void FieldRangeSetPair::assertValidIndexOrNoIndex( const NamespaceDetails *d, int idxNo ) const {
+ massert( 14049, "FieldRangeSetPair invalid index specified", idxNo >= -1 );
+ if ( idxNo >= 0 ) {
+ assertValidIndex( d, idxNo );
+ }
+ }
+
+} // namespace mongo
diff --git a/db/queryutil.cpp b/db/queryutil.cpp
index 1cd750b..717eac8 100644
--- a/db/queryutil.cpp
+++ b/db/queryutil.cpp
@@ -1,4 +1,4 @@
-// queryutil.cpp
+// @file queryutil.cpp
/* Copyright 2009 10gen Inc.
*
@@ -24,9 +24,11 @@
#include "../util/unittest.h"
#include "dbmessage.h"
#include "indexkey.h"
+#include "../util/mongoutils/str.h"
namespace mongo {
extern BSONObj staticNull;
+ extern BSONObj staticUndefined;
/** returns a string that when used as a matcher, would match a super set of regex()
returns "" for complex regular expressions
@@ -78,21 +80,39 @@ namespace mongo {
r = r.substr( 0 , r.size() - 1 );
return r; //breaking here fails with /^a?/
}
+ else if (c == '|') {
+ // whole match so far is optional. Nothing we can do here.
+ return string();
+ }
else if (c == '\\') {
- // slash followed by non-alphanumeric represents the following char
c = *(regex++);
- if ((c >= 'A' && c <= 'Z') ||
+ if (c == 'Q'){
+ // \Q...\E quotes everything inside
+ while (*regex) {
+ c = (*regex++);
+ if (c == '\\' && (*regex == 'E')){
+ regex++; //skip the 'E'
+ break; // go back to start of outer loop
+ }
+ else {
+ ss << c; // character should match itself
+ }
+ }
+ }
+ else if ((c >= 'A' && c <= 'Z') ||
(c >= 'a' && c <= 'z') ||
(c >= '0' && c <= '9') ||
(c == '\0')) {
+ // don't know what to do with these
r = ss.str();
break;
}
else {
+ // slash followed by non-alphanumeric represents the following char
ss << c;
}
}
- else if (strchr("^$.[|()+{", c)) {
+ else if (strchr("^$.[()+{", c)) {
// list of "metacharacters" from man pcrepattern
r = ss.str();
break;
@@ -136,42 +156,63 @@ namespace mongo {
}
- FieldRange::FieldRange( const BSONElement &e, bool isNot, bool optimize ) {
+ FieldRange::FieldRange( const BSONElement &e, bool singleKey, bool isNot, bool optimize )
+ : _singleKey( singleKey ) {
+ int op = e.getGtLtOp();
+
// NOTE with $not, we could potentially form a complementary set of intervals.
- if ( !isNot && !e.eoo() && e.type() != RegEx && e.getGtLtOp() == BSONObj::opIN ) {
- set< BSONElement, element_lt > vals;
- vector< FieldRange > regexes;
+ if ( !isNot && !e.eoo() && e.type() != RegEx && op == BSONObj::opIN ) {
+ set<BSONElement,element_lt> vals;
+ vector<FieldRange> regexes;
uassert( 12580 , "invalid query" , e.isABSONObj() );
BSONObjIterator i( e.embeddedObject() );
while( i.more() ) {
BSONElement ie = i.next();
+ uassert( 15881, "$elemMatch not allowed within $in",
+ ie.type() != Object ||
+ ie.embeddedObject().firstElement().getGtLtOp() != BSONObj::opELEM_MATCH );
if ( ie.type() == RegEx ) {
- regexes.push_back( FieldRange( ie, false, optimize ) );
+ regexes.push_back( FieldRange( ie, singleKey, false, optimize ) );
}
else {
- vals.insert( ie );
+ // A document array may be indexed by its first element, by undefined
+ // if it is empty, or as a full array if it is embedded within another
+ // array.
+ vals.insert( ie );
+ if ( ie.type() == Array ) {
+ BSONElement temp = ie.embeddedObject().firstElement();
+ if ( temp.eoo() ) {
+ temp = staticUndefined.firstElement();
+ }
+ vals.insert( temp );
+ }
}
}
- for( set< BSONElement, element_lt >::const_iterator i = vals.begin(); i != vals.end(); ++i )
+ for( set<BSONElement,element_lt>::const_iterator i = vals.begin(); i != vals.end(); ++i )
_intervals.push_back( FieldInterval(*i) );
- for( vector< FieldRange >::const_iterator i = regexes.begin(); i != regexes.end(); ++i )
+ for( vector<FieldRange>::const_iterator i = regexes.begin(); i != regexes.end(); ++i )
*this |= *i;
return;
}
- if ( e.type() == Array && e.getGtLtOp() == BSONObj::Equality ) {
+ // A document array may be indexed by its first element, by undefined
+ // if it is empty, or as a full array if it is embedded within another
+ // array.
+ if ( e.type() == Array && op == BSONObj::Equality ) {
_intervals.push_back( FieldInterval(e) );
-
- const BSONElement& temp = e.embeddedObject().firstElement();
- if ( ! temp.eoo() ) {
- if ( temp < e )
- _intervals.insert( _intervals.begin() , temp );
- else
- _intervals.push_back( FieldInterval(temp) );
+ BSONElement temp = e.embeddedObject().firstElement();
+ if ( temp.eoo() ) {
+ temp = staticUndefined.firstElement();
+ }
+ if ( temp < e ) {
+ _intervals.insert( _intervals.begin() , temp );
+ }
+ else {
+ _intervals.push_back( FieldInterval(temp) );
}
return;
@@ -190,7 +231,12 @@ namespace mongo {
if ( e.eoo() )
return;
- int op = e.getGtLtOp();
+
+ bool existsSpec = false;
+ if ( op == BSONObj::opEXISTS ) {
+ existsSpec = e.trueValue();
+ }
+
if ( e.type() == RegEx
|| (e.type() == Object && !e.embeddedObject()["$regex"].eoo())
) {
@@ -254,6 +300,9 @@ namespace mongo {
case BSONObj::GTE:
op = BSONObj::LT;
break;
+ case BSONObj::opEXISTS:
+ existsSpec = !existsSpec;
+ break;
default: // otherwise doesn't matter
break;
}
@@ -286,7 +335,7 @@ namespace mongo {
lower = e;
break;
case BSONObj::opALL: {
- massert( 10370 , "$all requires array", e.type() == Array );
+ uassert( 10370 , "$all requires array", e.type() == Array );
BSONObjIterator i( e.embeddedObject() );
bool bound = false;
while ( i.more() ) {
@@ -356,6 +405,13 @@ namespace mongo {
case BSONObj::opWITHIN:
_special = "2d";
break;
+ case BSONObj::opEXISTS: {
+ if ( !existsSpec ) {
+ lower = upper = staticNull.firstElement();
+ }
+ optimize = false;
+ break;
+ }
default:
break;
}
@@ -367,6 +423,8 @@ namespace mongo {
upper = addObj( b.obj() ).firstElement();
}
else if ( lower.type() == MinKey && upper.type() != MaxKey && upper.isSimpleType() ) { // TODO: get rid of isSimpleType
+ if( upper.type() == Date )
+ lowerInclusive = false;
BSONObjBuilder b;
b.appendMinForType( upper.fieldName() , upper.type() );
lower = addObj( b.obj() ).firstElement();
@@ -375,9 +433,9 @@ namespace mongo {
}
- void FieldRange::finishOperation( const vector< FieldInterval > &newIntervals, const FieldRange &other ) {
+ void FieldRange::finishOperation( const vector<FieldInterval> &newIntervals, const FieldRange &other ) {
_intervals = newIntervals;
- for( vector< BSONObj >::const_iterator i = other._objData.begin(); i != other._objData.end(); ++i )
+ for( vector<BSONObj>::const_iterator i = other._objData.begin(); i != other._objData.end(); ++i )
_objData.push_back( *i );
if ( _special.size() == 0 && other._special.size() )
_special = other._special;
@@ -407,9 +465,15 @@ namespace mongo {
}
const FieldRange &FieldRange::operator&=( const FieldRange &other ) {
- vector< FieldInterval > newIntervals;
- vector< FieldInterval >::const_iterator i = _intervals.begin();
- vector< FieldInterval >::const_iterator j = other._intervals.begin();
+ if ( !_singleKey && nontrivial() ) {
+ if ( other <= *this ) {
+ *this = other;
+ }
+ return *this;
+ }
+ vector<FieldInterval> newIntervals;
+ vector<FieldInterval>::const_iterator i = _intervals.begin();
+ vector<FieldInterval>::const_iterator j = other._intervals.begin();
while( i != _intervals.end() && j != other._intervals.end() ) {
FieldInterval overlap;
if ( fieldIntervalOverlap( *i, *j, overlap ) ) {
@@ -426,7 +490,7 @@ namespace mongo {
return *this;
}
- void handleInterval( const FieldInterval &lower, FieldBound &low, FieldBound &high, vector< FieldInterval > &newIntervals ) {
+ void handleInterval( const FieldInterval &lower, FieldBound &low, FieldBound &high, vector<FieldInterval> &newIntervals ) {
if ( low._bound.eoo() ) {
low = lower._lower; high = lower._upper;
}
@@ -446,11 +510,11 @@ namespace mongo {
}
const FieldRange &FieldRange::operator|=( const FieldRange &other ) {
- vector< FieldInterval > newIntervals;
+ vector<FieldInterval> newIntervals;
FieldBound low;
FieldBound high;
- vector< FieldInterval >::const_iterator i = _intervals.begin();
- vector< FieldInterval >::const_iterator j = other._intervals.begin();
+ vector<FieldInterval>::const_iterator i = _intervals.begin();
+ vector<FieldInterval>::const_iterator j = other._intervals.begin();
while( i != _intervals.end() && j != other._intervals.end() ) {
int cmp = i->_lower._bound.woCompare( j->_lower._bound, false );
if ( ( cmp == 0 && i->_lower._inclusive ) || cmp < 0 ) {
@@ -479,9 +543,9 @@ namespace mongo {
}
const FieldRange &FieldRange::operator-=( const FieldRange &other ) {
- vector< FieldInterval > newIntervals;
- vector< FieldInterval >::iterator i = _intervals.begin();
- vector< FieldInterval >::const_iterator j = other._intervals.begin();
+ vector<FieldInterval> newIntervals;
+ vector<FieldInterval>::iterator i = _intervals.begin();
+ vector<FieldInterval>::const_iterator j = other._intervals.begin();
while( i != _intervals.end() && j != other._intervals.end() ) {
int cmp = i->_lower._bound.woCompare( j->_lower._bound, false );
if ( cmp < 0 ||
@@ -543,20 +607,60 @@ namespace mongo {
}
// TODO write a proper implementation that doesn't do a full copy
- bool FieldRange::operator<=( const FieldRange &other ) {
+ bool FieldRange::operator<=( const FieldRange &other ) const {
FieldRange temp = *this;
temp -= other;
return temp.empty();
}
+ void FieldRange::setExclusiveBounds() {
+ for( vector<FieldInterval>::iterator i = _intervals.begin(); i != _intervals.end(); ++i ) {
+ i->_lower._inclusive = false;
+ i->_upper._inclusive = false;
+ }
+ }
+
+ void FieldRange::reverse( FieldRange &ret ) const {
+ assert( _special.empty() );
+ ret._intervals.clear();
+ ret._objData = _objData;
+ for( vector<FieldInterval>::const_reverse_iterator i = _intervals.rbegin(); i != _intervals.rend(); ++i ) {
+ FieldInterval fi;
+ fi._lower = i->_upper;
+ fi._upper = i->_lower;
+ ret._intervals.push_back( fi );
+ }
+ }
+
BSONObj FieldRange::addObj( const BSONObj &o ) {
_objData.push_back( o );
return o;
}
+ string FieldInterval::toString() const {
+ StringBuilder buf;
+ buf << ( _lower._inclusive ? "[" : "(" );
+ buf << _lower._bound;
+ buf << " , ";
+ buf << _upper._bound;
+ buf << ( _upper._inclusive ? "]" : ")" );
+ return buf.str();
+ }
+
+ string FieldRange::toString() const {
+ StringBuilder buf;
+ buf << "(FieldRange special: " << _special << " singleKey: " << _special << " intervals: ";
+ for( vector<FieldInterval>::const_iterator i = _intervals.begin(); i != _intervals.end(); ++i ) {
+ buf << i->toString();
+ }
+
+ buf << ")";
+ return buf.str();
+ }
+
string FieldRangeSet::getSpecial() const {
string s = "";
- for ( map<string,FieldRange>::iterator i=_ranges.begin(); i!=_ranges.end(); i++ ) {
+ for ( map<string,FieldRange>::const_iterator i=_ranges.begin(); i!=_ranges.end(); i++ ) {
if ( i->second.getSpecial().size() == 0 )
continue;
uassert( 13033 , "can't have 2 special fields" , s.size() == 0 );
@@ -565,12 +669,111 @@ namespace mongo {
return s;
}
+ /**
+     * Btree scanning for a multidimensional key range will yield a
+ * multidimensional box. The idea here is that if an 'other'
+ * multidimensional box contains the current box we don't have to scan
+ * the current box. If the 'other' box contains the current box in
+ * all dimensions but one, we can safely subtract the values of 'other'
+ * along that one dimension from the values for the current box on the
+ * same dimension. In other situations, subtracting the 'other'
+ * box from the current box yields a result that is not a box (but
+ * rather can be expressed as a union of boxes). We don't support
+ * such splitting currently in calculating index ranges. Note that
+ * where I have said 'box' above, I actually mean sets of boxes because
+ * a field range can consist of multiple intervals.
+ */
+ const FieldRangeSet &FieldRangeSet::operator-=( const FieldRangeSet &other ) {
+ int nUnincluded = 0;
+ string unincludedKey;
+ map<string,FieldRange>::iterator i = _ranges.begin();
+ map<string,FieldRange>::const_iterator j = other._ranges.begin();
+ while( nUnincluded < 2 && i != _ranges.end() && j != other._ranges.end() ) {
+ int cmp = i->first.compare( j->first );
+ if ( cmp == 0 ) {
+ if ( i->second <= j->second ) {
+ // nothing
+ }
+ else {
+ ++nUnincluded;
+ unincludedKey = i->first;
+ }
+ ++i;
+ ++j;
+ }
+ else if ( cmp < 0 ) {
+ ++i;
+ }
+ else {
+ // other has a bound we don't, nothing can be done
+ return *this;
+ }
+ }
+ if ( j != other._ranges.end() ) {
+ // other has a bound we don't, nothing can be done
+ return *this;
+ }
+ if ( nUnincluded > 1 ) {
+ return *this;
+ }
+ if ( nUnincluded == 0 ) {
+ makeEmpty();
+ return *this;
+ }
+ // nUnincluded == 1
+ range( unincludedKey.c_str() ) -= other.range( unincludedKey.c_str() );
+ appendQueries( other );
+ return *this;
+ }
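+
+    /* Worked example (editor illustration): subtracting
+     * other == { a: [0,10], b: [0,5] } from *this == { a: [0,10], b: [0,10] }
+     * leaves every field but 'b' fully covered, so only 'b' is reduced:
+     * the result is { a: [0,10], b: (5,10] }.  Had two fields been only
+     * partially covered, the result would not be a box and *this would be
+     * left unchanged. */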
+
+ const FieldRangeSet &FieldRangeSet::operator&=( const FieldRangeSet &other ) {
+ map<string,FieldRange>::iterator i = _ranges.begin();
+ map<string,FieldRange>::const_iterator j = other._ranges.begin();
+ while( i != _ranges.end() && j != other._ranges.end() ) {
+ int cmp = i->first.compare( j->first );
+ if ( cmp == 0 ) {
+ // Same field name, so find range intersection.
+ i->second &= j->second;
+ ++i;
+ ++j;
+ }
+ else if ( cmp < 0 ) {
+ // Field present in *this.
+ ++i;
+ }
+ else {
+ // Field not present in *this, so add it.
+ range( j->first.c_str() ) = j->second;
+ ++j;
+ }
+ }
+ while( j != other._ranges.end() ) {
+ // Field not present in *this, add it.
+ range( j->first.c_str() ) = j->second;
+ ++j;
+ }
+ appendQueries( other );
+ return *this;
+ }
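+
+    /* Example (editor illustration, single key semantics):
+     * { a: [0,10] } &= { a: [5,20], b: [1,2] } yields
+     * { a: [5,10], b: [1,2] } -- ranges for shared fields intersect and
+     * fields present only in 'other' are added. */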
+
+ void FieldRangeSet::appendQueries( const FieldRangeSet &other ) {
+ for( vector<BSONObj>::const_iterator i = other._queries.begin(); i != other._queries.end(); ++i ) {
+ _queries.push_back( *i );
+ }
+ }
+
+ void FieldRangeSet::makeEmpty() {
+ for( map<string,FieldRange>::iterator i = _ranges.begin(); i != _ranges.end(); ++i ) {
+ i->second.makeEmpty();
+ }
+ }
+
void FieldRangeSet::processOpElement( const char *fieldName, const BSONElement &f, bool isNot, bool optimize ) {
BSONElement g = f;
int op2 = g.getGtLtOp();
if ( op2 == BSONObj::opALL ) {
BSONElement h = g;
- massert( 13050 , "$all requires array", h.type() == Array );
+ uassert( 13050 , "$all requires array", h.type() == Array );
BSONObjIterator i( h.embeddedObject() );
if( i.more() ) {
BSONElement x = i.next();
@@ -590,29 +793,56 @@ namespace mongo {
int op3 = getGtLtOp( h );
if ( op3 == BSONObj::Equality ) {
- _ranges[ fullname ] &= FieldRange( h , isNot , optimize );
+ range( fullname.c_str() ) &= FieldRange( h , _singleKey , isNot , optimize );
}
else {
BSONObjIterator l( h.embeddedObject() );
while ( l.more() ) {
- _ranges[ fullname ] &= FieldRange( l.next() , isNot , optimize );
+ range( fullname.c_str() ) &= FieldRange( l.next() , _singleKey , isNot , optimize );
}
}
}
}
else {
- _ranges[ fieldName ] &= FieldRange( f , isNot , optimize );
+ range( fieldName ) &= FieldRange( f , _singleKey , isNot , optimize );
}
}
void FieldRangeSet::processQueryField( const BSONElement &e, bool optimize ) {
+ if ( e.fieldName()[ 0 ] == '$' ) {
+ if ( strcmp( e.fieldName(), "$and" ) == 0 ) {
+ uassert( 14816 , "$and expression must be a nonempty array" , e.type() == Array && e.embeddedObject().nFields() > 0 );
+ BSONObjIterator i( e.embeddedObject() );
+ while( i.more() ) {
+ BSONElement e = i.next();
+ uassert( 14817 , "$and elements must be objects" , e.type() == Object );
+ BSONObjIterator j( e.embeddedObject() );
+ while( j.more() ) {
+ processQueryField( j.next(), optimize );
+ }
+ }
+ }
+
+ if ( strcmp( e.fieldName(), "$where" ) == 0 ) {
+ return;
+ }
+
+ if ( strcmp( e.fieldName(), "$or" ) == 0 ) {
+ return;
+ }
+
+ if ( strcmp( e.fieldName(), "$nor" ) == 0 ) {
+ return;
+ }
+ }
+
bool equality = ( getGtLtOp( e ) == BSONObj::Equality );
if ( equality && e.type() == Object ) {
- equality = ( strcmp( e.embeddedObject().firstElement().fieldName(), "$not" ) != 0 );
+ equality = ( strcmp( e.embeddedObject().firstElementFieldName(), "$not" ) != 0 );
}
if ( equality || ( e.type() == Object && !e.embeddedObject()[ "$regex" ].eoo() ) ) {
- _ranges[ e.fieldName() ] &= FieldRange( e , false , optimize );
+ range( e.fieldName() ) &= FieldRange( e , _singleKey , false , optimize );
}
if ( !equality ) {
BSONObjIterator j( e.embeddedObject() );
@@ -643,93 +873,97 @@ namespace mongo {
}
}
- FieldRangeSet::FieldRangeSet( const char *ns, const BSONObj &query , bool optimize )
- : _ns( ns ), _queries( 1, query.getOwned() ) {
+ FieldRangeSet::FieldRangeSet( const char *ns, const BSONObj &query, bool singleKey, bool optimize )
+ : _ns( ns ), _queries( 1, query.getOwned() ), _singleKey( singleKey ) {
BSONObjIterator i( _queries[ 0 ] );
while( i.more() ) {
+ processQueryField( i.next(), optimize );
+ }
+ }
+
+ FieldRangeVector::FieldRangeVector( const FieldRangeSet &frs, const IndexSpec &indexSpec, int direction )
+ :_indexSpec( indexSpec ), _direction( direction >= 0 ? 1 : -1 ) {
+ _queries = frs._queries;
+ BSONObjIterator i( _indexSpec.keyPattern );
+ set< string > baseObjectNontrivialPrefixes;
+ while( i.more() ) {
BSONElement e = i.next();
- // e could be x:1 or x:{$gt:1}
-
- if ( strcmp( e.fieldName(), "$where" ) == 0 ) {
- continue;
+ const FieldRange *range = &frs.range( e.fieldName() );
+ if ( !frs.singleKey() ) {
+ string prefix = str::before( e.fieldName(), '.' );
+ if ( baseObjectNontrivialPrefixes.count( prefix ) > 0 ) {
+                    // A field with the same parent field has already been
+                    // constrained, and with a multikey index we cannot also
+                    // constrain this field.
+ range = &frs.trivialRange();
+ } else {
+ if ( range->nontrivial() ) {
+ baseObjectNontrivialPrefixes.insert( prefix );
+ }
+ }
}
-
- if ( strcmp( e.fieldName(), "$or" ) == 0 ) {
- continue;
+ int number = (int) e.number(); // returns 0.0 if not numeric
+ bool forward = ( ( number >= 0 ? 1 : -1 ) * ( direction >= 0 ? 1 : -1 ) > 0 );
+ if ( forward ) {
+ _ranges.push_back( *range );
}
-
- if ( strcmp( e.fieldName(), "$nor" ) == 0 ) {
- continue;
+ else {
+ _ranges.push_back( FieldRange( BSONObj().firstElement(), frs.singleKey(), false, true ) );
+ range->reverse( _ranges.back() );
}
+ assert( !_ranges.back().empty() );
+ }
+ uassert( 13385, "combinatorial limit of $in partitioning of result set exceeded", size() < 1000000 );
+ }
- processQueryField( e, optimize );
+ BSONObj FieldRangeVector::startKey() const {
+ BSONObjBuilder b;
+ for( vector<FieldRange>::const_iterator i = _ranges.begin(); i != _ranges.end(); ++i ) {
+ const FieldInterval &fi = i->intervals().front();
+ b.appendAs( fi._lower._bound, "" );
}
+ return b.obj();
}
- FieldRangeOrSet::FieldRangeOrSet( const char *ns, const BSONObj &query , bool optimize )
- : _baseSet( ns, query, optimize ), _orFound() {
-
- BSONObjIterator i( _baseSet._queries[ 0 ] );
-
- while( i.more() ) {
- BSONElement e = i.next();
- if ( strcmp( e.fieldName(), "$or" ) == 0 ) {
- massert( 13262, "$or requires nonempty array", e.type() == Array && e.embeddedObject().nFields() > 0 );
- BSONObjIterator j( e.embeddedObject() );
- while( j.more() ) {
- BSONElement f = j.next();
- massert( 13263, "$or array must contain objects", f.type() == Object );
- _orSets.push_back( FieldRangeSet( ns, f.embeddedObject(), optimize ) );
- massert( 13291, "$or may not contain 'special' query", _orSets.back().getSpecial().empty() );
- _originalOrSets.push_back( _orSets.back() );
- }
- _orFound = true;
- continue;
- }
+ BSONObj FieldRangeVector::endKey() const {
+ BSONObjBuilder b;
+ for( vector<FieldRange>::const_iterator i = _ranges.begin(); i != _ranges.end(); ++i ) {
+ const FieldInterval &fi = i->intervals().back();
+ b.appendAs( fi._upper._bound, "" );
}
+ return b.obj();
}
- void FieldRangeOrSet::popOrClause( const BSONObj &indexSpec ) {
- massert( 13274, "no or clause to pop", !orFinished() );
- auto_ptr< FieldRangeSet > holder;
- FieldRangeSet *toDiff = &_originalOrSets.front();
- if ( toDiff->matchPossible() && !indexSpec.isEmpty() ) {
- holder.reset( toDiff->subset( indexSpec ) );
- toDiff = holder.get();
- }
- list< FieldRangeSet >::iterator i = _orSets.begin();
- list< FieldRangeSet >::iterator j = _originalOrSets.begin();
- ++i;
- ++j;
- while( i != _orSets.end() ) {
- *i -= *toDiff;
- if( !i->matchPossible() ) {
- i = _orSets.erase( i );
- j = _originalOrSets.erase( j );
- }
- else {
- ++i;
- ++j;
- }
+ BSONObj FieldRangeVector::obj() const {
+ BSONObjBuilder b;
+ BSONObjIterator k( _indexSpec.keyPattern );
+ for( int i = 0; i < (int)_ranges.size(); ++i ) {
+ BSONArrayBuilder a( b.subarrayStart( k.next().fieldName() ) );
+ for( vector<FieldInterval>::const_iterator j = _ranges[ i ].intervals().begin();
+ j != _ranges[ i ].intervals().end(); ++j ) {
+ a << BSONArray( BSON_ARRAY( j->_lower._bound << j->_upper._bound ).clientReadable() );
+ }
+ a.done();
}
- _oldOrSets.push_front( _orSets.front() );
- _orSets.pop_front();
- _originalOrSets.pop_front();
+ return b.obj();
}
-
- FieldRange *FieldRangeSet::trivialRange_ = 0;
- FieldRange &FieldRangeSet::trivialRange() {
- if ( trivialRange_ == 0 )
- trivialRange_ = new FieldRange();
- return *trivialRange_;
+
+ FieldRange *FieldRangeSet::__singleKeyTrivialRange = 0;
+ FieldRange *FieldRangeSet::__multiKeyTrivialRange = 0;
+ const FieldRange &FieldRangeSet::trivialRange() const {
+ FieldRange *&ret = _singleKey ? __singleKeyTrivialRange : __multiKeyTrivialRange;
+ if ( ret == 0 ) {
+ ret = new FieldRange( BSONObj().firstElement(), _singleKey, false, true );
+ }
+ return *ret;
}
BSONObj FieldRangeSet::simplifiedQuery( const BSONObj &_fields ) const {
BSONObj fields = _fields;
if ( fields.isEmpty() ) {
BSONObjBuilder b;
- for( map< string, FieldRange >::const_iterator i = _ranges.begin(); i != _ranges.end(); ++i ) {
+ for( map<string,FieldRange>::const_iterator i = _ranges.begin(); i != _ranges.end(); ++i ) {
b.append( i->first, 1 );
}
fields = b.obj();
@@ -739,17 +973,17 @@ namespace mongo {
while( i.more() ) {
BSONElement e = i.next();
const char *name = e.fieldName();
- const FieldRange &range = _ranges[ name ];
- assert( !range.empty() );
- if ( range.equality() )
- b.appendAs( range.min(), name );
- else if ( range.nontrivial() ) {
+ const FieldRange &eRange = range( name );
+ assert( !eRange.empty() );
+ if ( eRange.equality() )
+ b.appendAs( eRange.min(), name );
+ else if ( eRange.nontrivial() ) {
BSONObj o;
BSONObjBuilder c;
- if ( range.min().type() != MinKey )
- c.appendAs( range.min(), range.minInclusive() ? "$gte" : "$gt" );
- if ( range.max().type() != MaxKey )
- c.appendAs( range.max(), range.maxInclusive() ? "$lte" : "$lt" );
+ if ( eRange.min().type() != MinKey )
+ c.appendAs( eRange.min(), eRange.minInclusive() ? "$gte" : "$gt" );
+ if ( eRange.max().type() != MaxKey )
+ c.appendAs( eRange.max(), eRange.maxInclusive() ? "$lte" : "$lt" );
o = c.obj();
b.append( name, o );
}
@@ -759,7 +993,7 @@ namespace mongo {
QueryPattern FieldRangeSet::pattern( const BSONObj &sort ) const {
QueryPattern qp;
- for( map< string, FieldRange >::const_iterator i = _ranges.begin(); i != _ranges.end(); ++i ) {
+ for( map<string,FieldRange>::const_iterator i = _ranges.begin(); i != _ranges.end(); ++i ) {
assert( !i->second.empty() );
if ( i->second.equality() ) {
qp._fieldTypes[ i->first ] = QueryPattern::Equality;
@@ -781,9 +1015,9 @@ namespace mongo {
// TODO get rid of this
BoundList FieldRangeSet::indexBounds( const BSONObj &keyPattern, int direction ) const {
- typedef vector< pair< shared_ptr< BSONObjBuilder >, shared_ptr< BSONObjBuilder > > > BoundBuilders;
+ typedef vector<pair<shared_ptr<BSONObjBuilder>, shared_ptr<BSONObjBuilder> > > BoundBuilders;
BoundBuilders builders;
- builders.push_back( make_pair( shared_ptr< BSONObjBuilder >( new BSONObjBuilder() ), shared_ptr< BSONObjBuilder >( new BSONObjBuilder() ) ) );
+ builders.push_back( make_pair( shared_ptr<BSONObjBuilder>( new BSONObjBuilder() ), shared_ptr<BSONObjBuilder>( new BSONObjBuilder() ) ) );
BSONObjIterator i( keyPattern );
bool ineq = false; // until ineq is true, we are just dealing with equality and $in bounds
while( i.more() ) {
@@ -803,16 +1037,16 @@ namespace mongo {
ineq = true;
}
BoundBuilders newBuilders;
- const vector< FieldInterval > &intervals = fr.intervals();
+ const vector<FieldInterval> &intervals = fr.intervals();
for( BoundBuilders::const_iterator i = builders.begin(); i != builders.end(); ++i ) {
BSONObj first = i->first->obj();
BSONObj second = i->second->obj();
const unsigned maxCombinations = 4000000;
if ( forward ) {
- for( vector< FieldInterval >::const_iterator j = intervals.begin(); j != intervals.end(); ++j ) {
+ for( vector<FieldInterval>::const_iterator j = intervals.begin(); j != intervals.end(); ++j ) {
uassert( 13303, "combinatorial limit of $in partitioning of result set exceeded", newBuilders.size() < maxCombinations );
- newBuilders.push_back( make_pair( shared_ptr< BSONObjBuilder >( new BSONObjBuilder() ), shared_ptr< BSONObjBuilder >( new BSONObjBuilder() ) ) );
+ newBuilders.push_back( make_pair( shared_ptr<BSONObjBuilder>( new BSONObjBuilder() ), shared_ptr<BSONObjBuilder>( new BSONObjBuilder() ) ) );
newBuilders.back().first->appendElements( first );
newBuilders.back().second->appendElements( second );
newBuilders.back().first->appendAs( j->_lower._bound, "" );
@@ -820,9 +1054,9 @@ namespace mongo {
}
}
else {
- for( vector< FieldInterval >::const_reverse_iterator j = intervals.rbegin(); j != intervals.rend(); ++j ) {
+ for( vector<FieldInterval>::const_reverse_iterator j = intervals.rbegin(); j != intervals.rend(); ++j ) {
uassert( 13304, "combinatorial limit of $in partitioning of result set exceeded", newBuilders.size() < maxCombinations );
- newBuilders.push_back( make_pair( shared_ptr< BSONObjBuilder >( new BSONObjBuilder() ), shared_ptr< BSONObjBuilder >( new BSONObjBuilder() ) ) );
+ newBuilders.push_back( make_pair( shared_ptr<BSONObjBuilder>( new BSONObjBuilder() ), shared_ptr<BSONObjBuilder>( new BSONObjBuilder() ) ) );
newBuilders.back().first->appendElements( first );
newBuilders.back().second->appendElements( second );
newBuilders.back().first->appendAs( j->_upper._bound, "" );
@@ -847,18 +1081,52 @@ namespace mongo {
}
FieldRangeSet *FieldRangeSet::subset( const BSONObj &fields ) const {
- FieldRangeSet *ret = new FieldRangeSet( _ns, BSONObj() );
+ FieldRangeSet *ret = new FieldRangeSet( _ns, BSONObj(), _singleKey, true );
BSONObjIterator i( fields );
while( i.more() ) {
BSONElement e = i.next();
- if ( _ranges[ e.fieldName() ].nontrivial() ) {
- ret->_ranges[ e.fieldName() ] = _ranges[ e.fieldName() ];
+ if ( range( e.fieldName() ).nontrivial() ) {
+ ret->range( e.fieldName() ) = range( e.fieldName() );
}
}
ret->_queries = _queries;
return ret;
}
+
+ bool FieldRangeSetPair::noNontrivialRanges() const {
+ return _singleKey.matchPossible() && _singleKey.nNontrivialRanges() == 0 &&
+ _multiKey.matchPossible() && _multiKey.nNontrivialRanges() == 0;
+ }
+
+ FieldRangeSetPair &FieldRangeSetPair::operator&=( const FieldRangeSetPair &other ) {
+ _singleKey &= other._singleKey;
+ _multiKey &= other._multiKey;
+ return *this;
+ }
+ FieldRangeSetPair &FieldRangeSetPair::operator-=( const FieldRangeSet &scanned ) {
+ _singleKey -= scanned;
+ _multiKey -= scanned;
+ return *this;
+ }
+
+ BSONObj FieldRangeSetPair::simplifiedQueryForIndex( NamespaceDetails *d, int idxNo, const BSONObj &keyPattern ) const {
+ return frsForIndex( d, idxNo ).simplifiedQuery( keyPattern );
+ }
+
+ void FieldRangeSetPair::assertValidIndex( const NamespaceDetails *d, int idxNo ) const {
+ massert( 14048, "FieldRangeSetPair invalid index specified", idxNo >= 0 && idxNo < d->nIndexes );
+ }
+
+ const FieldRangeSet &FieldRangeSetPair::frsForIndex( const NamespaceDetails* nsd, int idxNo ) const {
+ assertValidIndexOrNoIndex( nsd, idxNo );
+ if ( idxNo < 0 ) {
+ // An unindexed cursor cannot have a "single key" constraint.
+ return _multiKey;
+ }
+ return nsd->isMultikey( idxNo ) ? _multiKey : _singleKey;
+ }
+
bool FieldRangeVector::matchesElement( const BSONElement &e, int i, bool forward ) const {
bool eq;
int l = matchingLowElement( e, i, forward, eq );
@@ -913,41 +1181,52 @@ namespace mongo {
return l;
}
- bool FieldRangeVector::matches( const BSONObj &obj ) const {
- if ( !_indexSpec.get() ) {
- _indexSpec.reset( new IndexSpec( _keyPattern ) );
+ bool FieldRangeVector::matchesKey( const BSONObj &key ) const {
+ BSONObjIterator j( key );
+ BSONObjIterator k( _indexSpec.keyPattern );
+ for( int l = 0; l < (int)_ranges.size(); ++l ) {
+ int number = (int) k.next().number();
+ bool forward = ( number >= 0 ? 1 : -1 ) * ( _direction >= 0 ? 1 : -1 ) > 0;
+ if ( !matchesElement( j.next(), l, forward ) ) {
+ return false;
+ }
}
+ return true;
+ }
+
+ bool FieldRangeVector::matches( const BSONObj &obj ) const {
// TODO The representation of matching keys could potentially be optimized
// more for the case at hand. (For example, we can potentially consider
// fields individually instead of constructing several bson objects using
// multikey arrays.) But getKeys() canonically defines the key set for a
// given object and for now we are using it as is.
- BSONObjSetDefaultOrder keys;
- _indexSpec->getKeys( obj, keys );
- for( BSONObjSetDefaultOrder::const_iterator i = keys.begin(); i != keys.end(); ++i ) {
- BSONObjIterator j( *i );
- BSONObjIterator k( _keyPattern );
- bool match = true;
- for( int l = 0; l < (int)_ranges.size(); ++l ) {
- int number = (int) k.next().number();
- bool forward = ( number >= 0 ? 1 : -1 ) * ( _direction >= 0 ? 1 : -1 ) > 0;
- if ( !matchesElement( j.next(), l, forward ) ) {
- match = false;
- break;
- }
- }
- if ( match ) {
- // The *i key matched a valid range for every element.
- return true;
+ BSONObjSet keys;
+ _indexSpec.getKeys( obj, keys );
+ for( BSONObjSet::const_iterator i = keys.begin(); i != keys.end(); ++i ) {
+ if ( matchesKey( *i ) ) {
+ return true;
}
}
return false;
}
+ BSONObj FieldRangeVector::firstMatch( const BSONObj &obj ) const {
+ // NOTE Only works in forward direction.
+ assert( _direction >= 0 );
+ BSONObjSet keys( BSONObjCmp( _indexSpec.keyPattern ) );
+ _indexSpec.getKeys( obj, keys );
+ for( BSONObjSet::const_iterator i = keys.begin(); i != keys.end(); ++i ) {
+ if ( matchesKey( *i ) ) {
+ return *i;
+ }
+ }
+ return BSONObj();
+ }
+
// TODO optimize more
- int FieldRangeVector::Iterator::advance( const BSONObj &curr ) {
+ int FieldRangeVectorIterator::advance( const BSONObj &curr ) {
BSONObjIterator j( curr );
- BSONObjIterator o( _v._keyPattern );
+ BSONObjIterator o( _v._indexSpec.keyPattern );
// track first field for which we are not at the end of the valid values,
// since we may need to advance from the key prefix ending with this field
int latestNonEndpoint = -1;
@@ -1085,13 +1364,109 @@ namespace mongo {
return -1;
}
- void FieldRangeVector::Iterator::prepDive() {
+ void FieldRangeVectorIterator::prepDive() {
for( int j = 0; j < (int)_i.size(); ++j ) {
_cmp[ j ] = &_v._ranges[ j ].intervals().front()._lower._bound;
_inc[ j ] = _v._ranges[ j ].intervals().front()._lower._inclusive;
}
}
+ BSONObj FieldRangeVectorIterator::startKey() {
+ BSONObjBuilder b;
+        for( unsigned int i = 0; i < _i.size(); ++i ) {
+ const FieldInterval &fi = _v._ranges[ i ].intervals()[ _i[ i ] ];
+ b.appendAs( fi._lower._bound, "" );
+ }
+ return b.obj();
+ }
+
+ // temp
+ BSONObj FieldRangeVectorIterator::endKey() {
+ BSONObjBuilder b;
+        for( unsigned int i = 0; i < _i.size(); ++i ) {
+ const FieldInterval &fi = _v._ranges[ i ].intervals()[ _i[ i ] ];
+ b.appendAs( fi._upper._bound, "" );
+ }
+ return b.obj();
+ }
+
+ OrRangeGenerator::OrRangeGenerator( const char *ns, const BSONObj &query , bool optimize )
+ : _baseSet( ns, query, optimize ), _orFound() {
+
+ BSONObjIterator i( _baseSet.originalQuery() );
+
+ while( i.more() ) {
+ BSONElement e = i.next();
+ if ( strcmp( e.fieldName(), "$or" ) == 0 ) {
+ uassert( 13262, "$or requires nonempty array", e.type() == Array && e.embeddedObject().nFields() > 0 );
+ BSONObjIterator j( e.embeddedObject() );
+ while( j.more() ) {
+ BSONElement f = j.next();
+ uassert( 13263, "$or array must contain objects", f.type() == Object );
+ _orSets.push_back( FieldRangeSetPair( ns, f.embeddedObject(), optimize ) );
+ uassert( 13291, "$or may not contain 'special' query", _orSets.back().getSpecial().empty() );
+ _originalOrSets.push_back( _orSets.back() );
+ }
+ _orFound = true;
+ continue;
+ }
+ }
+ }
+
+ void OrRangeGenerator::assertMayPopOrClause() {
+ massert( 13274, "no or clause to pop", !orFinished() );
+ }
+
+ void OrRangeGenerator::popOrClause( NamespaceDetails *nsd, int idxNo, const BSONObj &keyPattern ) {
+ assertMayPopOrClause();
+ auto_ptr<FieldRangeSet> holder;
+ const FieldRangeSet *toDiff = &_originalOrSets.front().frsForIndex( nsd, idxNo );
+ BSONObj indexSpec = keyPattern;
+ if ( !indexSpec.isEmpty() && toDiff->matchPossibleForIndex( indexSpec ) ) {
+ holder.reset( toDiff->subset( indexSpec ) );
+ toDiff = holder.get();
+ }
+ popOrClause( toDiff, nsd, idxNo, keyPattern );
+ }
+
+ void OrRangeGenerator::popOrClauseSingleKey() {
+ assertMayPopOrClause();
+ FieldRangeSet *toDiff = &_originalOrSets.front()._singleKey;
+ popOrClause( toDiff );
+ }
+
+ /**
+ * Removes the top or clause, which would have been recently scanned, and
+ * removes the field ranges it covers from all subsequent or clauses. As a
+     * side effect, this function may invalidate the return values of topFrsp()
+     * calls made before this function was called.
+ * @param indexSpec - Keys of the index that was used to satisfy the last or
+ * clause. Used to determine the range of keys that were scanned. If
+ * empty we do not constrain the previous clause's ranges using index keys,
+ * which may reduce opportunities for range elimination.
+ */
+ void OrRangeGenerator::popOrClause( const FieldRangeSet *toDiff, NamespaceDetails *d, int idxNo, const BSONObj &keyPattern ) {
+ list<FieldRangeSetPair>::iterator i = _orSets.begin();
+ list<FieldRangeSetPair>::iterator j = _originalOrSets.begin();
+ ++i;
+ ++j;
+ while( i != _orSets.end() ) {
+ *i -= *toDiff;
+ // Check if match is possible at all, and if it is possible for the recently scanned index.
+ if( !i->matchPossible() || ( d && !i->matchPossibleForIndex( d, idxNo, keyPattern ) ) ) {
+ i = _orSets.erase( i );
+ j = _originalOrSets.erase( j );
+ }
+ else {
+ ++i;
+ ++j;
+ }
+ }
+ _oldOrSets.push_front( _orSets.front() );
+ _orSets.pop_front();
+ _originalOrSets.pop_front();
+ }
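+
+    /* Example (editor illustration): for { $or: [ { a: { $lt: 5 } },
+     * { a: { $lt: 10 } } ] }, popping the first clause after it is scanned
+     * subtracts a < 5 from the second clause, leaving 5 <= a < 10 so
+     * matching documents are not returned twice. */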
+
struct SimpleRegexUnitTest : UnitTest {
void run() {
{
@@ -1148,6 +1523,16 @@ namespace mongo {
BSONObj o = b.done();
assert( simpleRegex(o.firstElement()) == "foo #" );
}
+ {
+ assert( simpleRegex("^\\Qasdf\\E", "", NULL) == "asdf" );
+ assert( simpleRegex("^\\Qasdf\\E.*", "", NULL) == "asdf" );
+ assert( simpleRegex("^\\Qasdf", "", NULL) == "asdf" ); // PCRE supports this
+ assert( simpleRegex("^\\Qasdf\\\\E", "", NULL) == "asdf\\" );
+ assert( simpleRegex("^\\Qas.*df\\E", "", NULL) == "as.*df" );
+ assert( simpleRegex("^\\Qas\\Q[df\\E", "", NULL) == "as\\Q[df" );
+ assert( simpleRegex("^\\Qas\\E\\\\E\\Q$df\\E", "", NULL) == "as\\E$df" ); // quoted string containing \E
+ }
+
}
} simple_regex_unittest;
@@ -1173,36 +1558,5 @@ namespace mongo {
return num;
}
- string debugString( Message& m ) {
- stringstream ss;
- ss << "op: " << opToString( m.operation() ) << " len: " << m.size();
- if ( m.operation() >= 2000 && m.operation() < 2100 ) {
- DbMessage d(m);
- ss << " ns: " << d.getns();
- switch ( m.operation() ) {
- case dbUpdate: {
- int flags = d.pullInt();
- BSONObj q = d.nextJsObj();
- BSONObj o = d.nextJsObj();
- ss << " flags: " << flags << " query: " << q << " update: " << o;
- break;
- }
- case dbInsert:
- ss << d.nextJsObj();
- break;
- case dbDelete: {
- int flags = d.pullInt();
- BSONObj q = d.nextJsObj();
- ss << " flags: " << flags << " query: " << q;
- break;
- }
- default:
- ss << " CANNOT HANDLE YET";
- }
-
-
- }
- return ss.str();
- }
} // namespace mongo
diff --git a/db/queryutil.h b/db/queryutil.h
index 2746695..104cde2 100644
--- a/db/queryutil.h
+++ b/db/queryutil.h
@@ -1,4 +1,4 @@
-// queryutil.h
+// @file queryutil.h - Utility classes representing ranges of valid BSONElement values for a query.
/* Copyright 2009 10gen Inc.
*
@@ -18,9 +18,14 @@
#pragma once
#include "jsobj.h"
+#include "indexkey.h"
namespace mongo {
+ /**
+ * One side of an interval of valid BSONElements, specified by a value and a
+ * boolean indicating whether the interval includes the value.
+ */
struct FieldBound {
BSONElement _bound;
bool _inclusive;
@@ -31,6 +36,7 @@ namespace mongo {
void flipInclusive() { _inclusive = !_inclusive; }
};
+ /** A closed interval composed of a lower and an upper FieldBound. */
struct FieldInterval {
FieldInterval() : _cachedEquality( -1 ) {}
FieldInterval( const BSONElement& e ) : _cachedEquality( -1 ) {
@@ -39,381 +45,270 @@ namespace mongo {
}
FieldBound _lower;
FieldBound _upper;
+        /** @return true iff the interval is valid, i.e. at least one element can be contained in it. */
bool strictValid() const {
int cmp = _lower._bound.woCompare( _upper._bound, false );
return ( cmp < 0 || ( cmp == 0 && _lower._inclusive && _upper._inclusive ) );
}
- bool equality() const {
- if ( _cachedEquality == -1 ) {
- _cachedEquality = ( _lower._inclusive && _upper._inclusive && _lower._bound.woCompare( _upper._bound, false ) == 0 );
- }
- return _cachedEquality;
- }
+ /** @return true iff the interval is an equality constraint. */
+ bool equality() const;
mutable int _cachedEquality;
+
+ string toString() const;
};
- // range of a field's value that may be determined from query -- used to
- // determine index limits
+ /**
+ * An ordered list of FieldIntervals expressing constraints on valid
+ * BSONElement values for a field.
+ */
class FieldRange {
public:
- FieldRange( const BSONElement &e = BSONObj().firstElement() , bool isNot=false , bool optimize=true );
+ FieldRange( const BSONElement &e , bool singleKey , bool isNot=false , bool optimize=true );
+
+ /** @return Range intersection with 'other'. */
const FieldRange &operator&=( const FieldRange &other );
+ /** @return Range union with 'other'. */
const FieldRange &operator|=( const FieldRange &other );
+        /** @return Range of elements included in 'this' but not 'other'. */
const FieldRange &operator-=( const FieldRange &other );
- // true iff other includes this
- bool operator<=( const FieldRange &other );
+ /** @return true iff this range is a subset of 'other'. */
+ bool operator<=( const FieldRange &other ) const;
+
+ /**
+ * If there are any valid values for this range, the extreme values can
+ * be extracted.
+ */
+
BSONElement min() const { assert( !empty() ); return _intervals[ 0 ]._lower._bound; }
BSONElement max() const { assert( !empty() ); return _intervals[ _intervals.size() - 1 ]._upper._bound; }
bool minInclusive() const { assert( !empty() ); return _intervals[ 0 ]._lower._inclusive; }
bool maxInclusive() const { assert( !empty() ); return _intervals[ _intervals.size() - 1 ]._upper._inclusive; }
- bool equality() const {
- return
- !empty() &&
- min().woCompare( max(), false ) == 0 &&
- maxInclusive() &&
- minInclusive();
- }
- bool inQuery() const {
- if ( equality() ) {
- return true;
- }
- for( vector< FieldInterval >::const_iterator i = _intervals.begin(); i != _intervals.end(); ++i ) {
- if ( !i->equality() ) {
- return false;
- }
- }
- return true;
- }
- bool nontrivial() const {
- return
- ! empty() &&
- ( _intervals.size() != 1 ||
- minKey.firstElement().woCompare( min(), false ) != 0 ||
- maxKey.firstElement().woCompare( max(), false ) != 0 );
- }
+
+ /** @return true iff this range expresses a single equality interval. */
+ bool equality() const;
+ /** @return true if all the intervals for this range are equalities */
+ bool inQuery() const;
+ /** @return true iff this range does not include every BSONElement */
+ bool nontrivial() const;
+ /** @return true iff this range matches no BSONElements. */
bool empty() const { return _intervals.empty(); }
+
+ /** Empty the range so it matches no BSONElements. */
void makeEmpty() { _intervals.clear(); }
- const vector< FieldInterval > &intervals() const { return _intervals; }
+ const vector<FieldInterval> &intervals() const { return _intervals; }
string getSpecial() const { return _special; }
- void setExclusiveBounds() {
- for( vector< FieldInterval >::iterator i = _intervals.begin(); i != _intervals.end(); ++i ) {
- i->_lower._inclusive = false;
- i->_upper._inclusive = false;
- }
- }
- // constructs a range which is the reverse of the current one
- // note - the resulting intervals may not be strictValid()
- void reverse( FieldRange &ret ) const {
- assert( _special.empty() );
- ret._intervals.clear();
- ret._objData = _objData;
- for( vector< FieldInterval >::const_reverse_iterator i = _intervals.rbegin(); i != _intervals.rend(); ++i ) {
- FieldInterval fi;
- fi._lower = i->_upper;
- fi._upper = i->_lower;
- ret._intervals.push_back( fi );
- }
- }
+ /** Make component intervals noninclusive. */
+ void setExclusiveBounds();
+ /**
+ * Constructs a range where all FieldIntervals and FieldBounds are in
+ * the opposite order of the current range.
+ * NOTE the resulting intervals might not be strictValid().
+ */
+ void reverse( FieldRange &ret ) const;
+
+ string toString() const;
private:
BSONObj addObj( const BSONObj &o );
- void finishOperation( const vector< FieldInterval > &newIntervals, const FieldRange &other );
- vector< FieldInterval > _intervals;
- vector< BSONObj > _objData;
+ void finishOperation( const vector<FieldInterval> &newIntervals, const FieldRange &other );
+ vector<FieldInterval> _intervals;
+ // Owns memory for our BSONElements.
+ vector<BSONObj> _objData;
string _special;
+ bool _singleKey;
};
- // implements query pattern matching, used to determine if a query is
- // similar to an earlier query and should use the same plan
- class QueryPattern {
- public:
- friend class FieldRangeSet;
- enum Type {
- Equality,
- LowerBound,
- UpperBound,
- UpperAndLowerBound
- };
- // for testing only, speed unimportant
- bool operator==( const QueryPattern &other ) const {
- bool less = operator<( other );
- bool more = other.operator<( *this );
- assert( !( less && more ) );
- return !( less || more );
- }
- bool operator!=( const QueryPattern &other ) const {
- return !operator==( other );
- }
- bool operator<( const QueryPattern &other ) const {
- map< string, Type >::const_iterator i = _fieldTypes.begin();
- map< string, Type >::const_iterator j = other._fieldTypes.begin();
- while( i != _fieldTypes.end() ) {
- if ( j == other._fieldTypes.end() )
- return false;
- if ( i->first < j->first )
- return true;
- else if ( i->first > j->first )
- return false;
- if ( i->second < j->second )
- return true;
- else if ( i->second > j->second )
- return false;
- ++i;
- ++j;
- }
- if ( j != other._fieldTypes.end() )
- return true;
- return _sort.woCompare( other._sort ) < 0;
- }
- private:
- QueryPattern() {}
- void setSort( const BSONObj sort ) {
- _sort = normalizeSort( sort );
- }
- BSONObj static normalizeSort( const BSONObj &spec ) {
- if ( spec.isEmpty() )
- return spec;
- int direction = ( spec.firstElement().number() >= 0 ) ? 1 : -1;
- BSONObjIterator i( spec );
- BSONObjBuilder b;
- while( i.moreWithEOO() ) {
- BSONElement e = i.next();
- if ( e.eoo() )
- break;
- b.append( e.fieldName(), direction * ( ( e.number() >= 0 ) ? -1 : 1 ) );
- }
- return b.obj();
- }
- map< string, Type > _fieldTypes;
- BSONObj _sort;
- };
-
- // a BoundList contains intervals specified by inclusive start
- // and end bounds. The intervals should be nonoverlapping and occur in
- // the specified direction of traversal. For example, given a simple index {i:1}
- // and direction +1, one valid BoundList is: (1, 2); (4, 6). The same BoundList
- // would be valid for index {i:-1} with direction -1.
- typedef vector< pair< BSONObj, BSONObj > > BoundList;
+ /**
+ * A BoundList contains intervals specified by inclusive start
+ * and end bounds. The intervals should be nonoverlapping and occur in
+ * the specified direction of traversal. For example, given a simple index {i:1}
+ * and direction +1, one valid BoundList is: (1, 2); (4, 6). The same BoundList
+ * would be valid for index {i:-1} with direction -1.
+ */
+ typedef vector<pair<BSONObj,BSONObj> > BoundList;
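+
+    // Illustrative sketch (added commentary, not part of the original
+    // source): the example BoundList above, built with index-key style
+    // BSONObjs (empty field names), could look like:
+    //
+    //   BoundList bounds;
+    //   bounds.push_back( make_pair( BSON( "" << 1 ), BSON( "" << 2 ) ) );
+    //   bounds.push_back( make_pair( BSON( "" << 4 ), BSON( "" << 6 ) ) );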
- // ranges of fields' value that may be determined from query -- used to
- // determine index limits
+ class QueryPattern;
+
+ /**
+ * A set of FieldRanges determined from constraints on the fields of a query,
+ * that may be used to determine index bounds.
+ */
class FieldRangeSet {
public:
- friend class FieldRangeOrSet;
+ friend class OrRangeGenerator;
friend class FieldRangeVector;
- FieldRangeSet( const char *ns, const BSONObj &query , bool optimize=true );
+ FieldRangeSet( const char *ns, const BSONObj &query , bool singleKey , bool optimize=true );
+
+ /** @return true if there is a nontrivial range for the given field. */
bool hasRange( const char *fieldName ) const {
- map< string, FieldRange >::const_iterator f = _ranges.find( fieldName );
+ map<string, FieldRange>::const_iterator f = _ranges.find( fieldName );
return f != _ranges.end();
}
- const FieldRange &range( const char *fieldName ) const {
- map< string, FieldRange >::const_iterator f = _ranges.find( fieldName );
- if ( f == _ranges.end() )
- return trivialRange();
- return f->second;
- }
- FieldRange &range( const char *fieldName ) {
- map< string, FieldRange >::iterator f = _ranges.find( fieldName );
- if ( f == _ranges.end() )
- return trivialRange();
- return f->second;
- }
- int nNontrivialRanges() const {
- int count = 0;
- for( map< string, FieldRange >::const_iterator i = _ranges.begin(); i != _ranges.end(); ++i ) {
- if ( i->second.nontrivial() )
- ++count;
- }
- return count;
- }
+ /** @return range for the given field. */
+ const FieldRange &range( const char *fieldName ) const;
+ /** @return range for the given field. */
+ FieldRange &range( const char *fieldName );
+ /** @return the number of nontrivial ranges. */
+ int nNontrivialRanges() const;
+ /**
+ * @return true if a match could be possible on every field. Generally this
+ * is not useful information for a single key FieldRangeSet and
+ * matchPossibleForIndex() should be used instead.
+ */
+ bool matchPossible() const;
+ /**
+ * @return true if a match could be possible given the value of _singleKey
+ * and index key 'keyPattern'.
+ * @param keyPattern May be {} or {$natural:1} for a non-index scan.
+ */
+ bool matchPossibleForIndex( const BSONObj &keyPattern ) const;
+
const char *ns() const { return _ns; }
- // if fields is specified, order fields of returned object to match those of 'fields'
+
+ /**
+ * @return a simplified query from the extreme values of the nontrivial
+ * fields.
+ * @param fields If specified, the fields of the returned object are
+ * ordered to match those of 'fields'.
+ */
BSONObj simplifiedQuery( const BSONObj &fields = BSONObj() ) const;
- bool matchPossible() const {
- for( map< string, FieldRange >::const_iterator i = _ranges.begin(); i != _ranges.end(); ++i )
- if ( i->second.empty() )
- return false;
- return true;
- }
+
QueryPattern pattern( const BSONObj &sort = BSONObj() ) const;
string getSpecial() const;
- // Btree scanning for a multidimentional key range will yield a
- // multidimensional box. The idea here is that if an 'other'
- // multidimensional box contains the current box we don't have to scan
- // the current box. If the 'other' box contains the current box in
- // all dimensions but one, we can safely subtract the values of 'other'
- // along that one dimension from the values for the current box on the
- // same dimension. In other situations, subtracting the 'other'
- // box from the current box yields a result that is not a box (but
- // rather can be expressed as a union of boxes). We don't support
- // such splitting currently in calculating index ranges. Note that
- // where I have said 'box' above, I actually mean sets of boxes because
- // a field range can consist of multiple intervals.
- const FieldRangeSet &operator-=( const FieldRangeSet &other ) {
- int nUnincluded = 0;
- string unincludedKey;
- map< string, FieldRange >::iterator i = _ranges.begin();
- map< string, FieldRange >::const_iterator j = other._ranges.begin();
- while( nUnincluded < 2 && i != _ranges.end() && j != other._ranges.end() ) {
- int cmp = i->first.compare( j->first );
- if ( cmp == 0 ) {
- if ( i->second <= j->second ) {
- // nothing
- }
- else {
- ++nUnincluded;
- unincludedKey = i->first;
- }
- ++i;
- ++j;
- }
- else if ( cmp < 0 ) {
- ++i;
- }
- else {
- // other has a bound we don't, nothing can be done
- return *this;
- }
- }
- if ( j != other._ranges.end() ) {
- // other has a bound we don't, nothing can be done
- return *this;
- }
- if ( nUnincluded > 1 ) {
- return *this;
- }
- if ( nUnincluded == 0 ) {
- makeEmpty();
- return *this;
- }
- // nUnincluded == 1
- _ranges[ unincludedKey ] -= other._ranges[ unincludedKey ];
- appendQueries( other );
- return *this;
- }
- const FieldRangeSet &operator&=( const FieldRangeSet &other ) {
- map< string, FieldRange >::iterator i = _ranges.begin();
- map< string, FieldRange >::const_iterator j = other._ranges.begin();
- while( i != _ranges.end() && j != other._ranges.end() ) {
- int cmp = i->first.compare( j->first );
- if ( cmp == 0 ) {
- i->second &= j->second;
- ++i;
- ++j;
- }
- else if ( cmp < 0 ) {
- ++i;
- }
- else {
- _ranges[ j->first ] = j->second;
- ++j;
- }
- }
- while( j != other._ranges.end() ) {
- _ranges[ j->first ] = j->second;
- ++j;
- }
- appendQueries( other );
- return *this;
- }
- // TODO get rid of this
+
+ /**
+ * @return a FieldRangeSet approximation of the documents in 'this' but
+ * not in 'other'. The approximation will be a superset of the documents
+ * in 'this' but not 'other'.
+ */
+ const FieldRangeSet &operator-=( const FieldRangeSet &other );
+ /** @return intersection of 'this' with 'other'. */
+ const FieldRangeSet &operator&=( const FieldRangeSet &other );
+
+ /**
+ * @return an ordered list of bounds generated using an index key pattern
+ * and traversal direction.
+ *
+ * NOTE This function is deprecated in the query optimizer and is
+ * currently only used by the sharding code.
+ */
BoundList indexBounds( const BSONObj &keyPattern, int direction ) const;
/**
- * @param return - A new FieldRangeSet based on this FieldRangeSet, but with only
+ * @return A new FieldRangeSet based on this FieldRangeSet, but with only
* a subset of the fields.
* @param fields - Only fields which are represented as field names in this object
* will be included in the returned FieldRangeSet.
*/
FieldRangeSet *subset( const BSONObj &fields ) const;
+
+ bool singleKey() const { return _singleKey; }
+
+ BSONObj originalQuery() const { return _queries[ 0 ]; }
private:
- void appendQueries( const FieldRangeSet &other ) {
- for( vector< BSONObj >::const_iterator i = other._queries.begin(); i != other._queries.end(); ++i ) {
- _queries.push_back( *i );
- }
- }
- void makeEmpty() {
- for( map< string, FieldRange >::iterator i = _ranges.begin(); i != _ranges.end(); ++i ) {
- i->second.makeEmpty();
- }
- }
+ void appendQueries( const FieldRangeSet &other );
+ void makeEmpty();
void processQueryField( const BSONElement &e, bool optimize );
void processOpElement( const char *fieldName, const BSONElement &f, bool isNot, bool optimize );
- static FieldRange *trivialRange_;
- static FieldRange &trivialRange();
- mutable map< string, FieldRange > _ranges;
+ static FieldRange *__singleKeyTrivialRange;
+ static FieldRange *__multiKeyTrivialRange;
+ const FieldRange &trivialRange() const;
+ map<string,FieldRange> _ranges;
const char *_ns;
- // make sure memory for FieldRange BSONElements is owned
- vector< BSONObj > _queries;
+ // Owns memory for FieldRange BSONElements.
+ vector<BSONObj> _queries;
+ bool _singleKey;
};
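+
+    // Illustrative sketch (added commentary, not part of the original
+    // source): intersecting the ranges of two conjuncts. Namespace and
+    // queries are assumptions for the example.
+    //
+    //   FieldRangeSet a( "test.coll", BSON( "x" << GT << 0 ), true );
+    //   FieldRangeSet b( "test.coll", BSON( "x" << LT << 10 ), true );
+    //   a &= b; // a's range for "x" is now the open interval (0, 10)
+    //   bool possible = a.matchPossible(); // false only if some range is empty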
+ class NamespaceDetails;
+
+ /**
+ * A pair of FieldRangeSets, one representing constraints for single key
+ * indexes and the other representing constraints for multi key indexes and
+ * unindexed scans. In several member functions the caller is asked to
+ * supply an index so that the implementation may utilize the proper
+ * FieldRangeSet and return results that are appropriate with respect to that
+ * supplied index.
+ */
+ class FieldRangeSetPair {
+ public:
+ FieldRangeSetPair( const char *ns, const BSONObj &query, bool optimize=true )
+ :_singleKey( ns, query, true, optimize ), _multiKey( ns, query, false, optimize ) {}
+
+ /**
+ * @return the appropriate single or multi key FieldRangeSet for the specified index.
+ * @param idxNo -1 for a non-index scan.
+ */
+ const FieldRangeSet &frsForIndex( const NamespaceDetails* nsd, int idxNo ) const;
+
+ /** @return a field range in the single key FieldRangeSet. */
+ const FieldRange &singleKeyRange( const char *fieldName ) const {
+ return _singleKey.range( fieldName );
+ }
+ /** @return true if the range limits are equivalent to an empty query. */
+ bool noNontrivialRanges() const;
+ /** @return false if a match is impossible regardless of index. */
+ bool matchPossible() const { return _multiKey.matchPossible(); }
+ /**
+ * @return false if a match is impossible on the specified index.
+ * @param idxNo -1 for a non-index scan.
+ */
+ bool matchPossibleForIndex( NamespaceDetails *d, int idxNo, const BSONObj &keyPattern ) const;
+
+ const char *ns() const { return _singleKey.ns(); }
+
+ string getSpecial() const { return _singleKey.getSpecial(); }
+
+ /** Intersect with another FieldRangeSetPair. */
+ FieldRangeSetPair &operator&=( const FieldRangeSetPair &other );
+ /**
+ * Subtract a FieldRangeSet, generally one expressing a range that has
+ * already been scanned.
+ */
+ FieldRangeSetPair &operator-=( const FieldRangeSet &scanned );
+
+ BoundList singleKeyIndexBounds( const BSONObj &keyPattern, int direction ) const {
+ return _singleKey.indexBounds( keyPattern, direction );
+ }
+
+ BSONObj originalQuery() const { return _singleKey.originalQuery(); }
+
+ private:
+ FieldRangeSetPair( const FieldRangeSet &singleKey, const FieldRangeSet &multiKey )
+ :_singleKey( singleKey ), _multiKey( multiKey ) {}
+ void assertValidIndex( const NamespaceDetails *d, int idxNo ) const;
+ void assertValidIndexOrNoIndex( const NamespaceDetails *d, int idxNo ) const;
+ /** matchPossibleForIndex() must be true. */
+ BSONObj simplifiedQueryForIndex( NamespaceDetails *d, int idxNo, const BSONObj &keyPattern ) const;
+ FieldRangeSet _singleKey;
+ FieldRangeSet _multiKey;
+ friend class OrRangeGenerator;
+ friend struct QueryUtilIndexed;
+ };
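+
+    // Illustrative sketch (added commentary, not part of the original
+    // source): one FieldRangeSetPair is built per query and consulted per
+    // candidate index; 'nsd', 'idxNo' and 'keyPattern' are assumed to come
+    // from the namespace's index catalog.
+    //
+    //   FieldRangeSetPair frsp( "test.coll", query );
+    //   if ( frsp.matchPossibleForIndex( nsd, idxNo, keyPattern ) ) {
+    //       const FieldRangeSet &frs = frsp.frsForIndex( nsd, idxNo );
+    //       // ... derive index bounds from frs ...
+    //   }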
+
class IndexSpec;
/**
- * This class manages the ranges of valid element values for each field in
- * an ordered list of signed fields corresponding to an index specification.
+ * An ordered list of fields and their FieldRanges, corresponding to valid
+ * index keys for a given index spec.
*/
class FieldRangeVector {
public:
/**
* @param frs The valid ranges for all fields, as defined by the query spec
- * @prarm keyPattern The index key pattern
+ * @param indexSpec The index spec (key pattern and info)
* @param direction The direction of index traversal
*/
- FieldRangeVector( const FieldRangeSet &frs, const BSONObj &keyPattern, int direction )
- :_keyPattern( keyPattern ), _direction( direction >= 0 ? 1 : -1 ) {
- _queries = frs._queries;
- BSONObjIterator i( _keyPattern );
- while( i.more() ) {
- BSONElement e = i.next();
- int number = (int) e.number(); // returns 0.0 if not numeric
- bool forward = ( ( number >= 0 ? 1 : -1 ) * ( direction >= 0 ? 1 : -1 ) > 0 );
- if ( forward ) {
- _ranges.push_back( frs.range( e.fieldName() ) );
- }
- else {
- _ranges.push_back( FieldRange() );
- frs.range( e.fieldName() ).reverse( _ranges.back() );
- }
- assert( !_ranges.back().empty() );
- }
- uassert( 13385, "combinatorial limit of $in partitioning of result set exceeded", size() < 1000000 );
- }
- long long size() {
- long long ret = 1;
- for( vector< FieldRange >::const_iterator i = _ranges.begin(); i != _ranges.end(); ++i ) {
- ret *= i->intervals().size();
- }
- return ret;
- }
- BSONObj startKey() const {
- BSONObjBuilder b;
- for( vector< FieldRange >::const_iterator i = _ranges.begin(); i != _ranges.end(); ++i ) {
- const FieldInterval &fi = i->intervals().front();
- b.appendAs( fi._lower._bound, "" );
- }
- return b.obj();
- }
- BSONObj endKey() const {
- BSONObjBuilder b;
- for( vector< FieldRange >::const_iterator i = _ranges.begin(); i != _ranges.end(); ++i ) {
- const FieldInterval &fi = i->intervals().back();
- b.appendAs( fi._upper._bound, "" );
- }
- return b.obj();
- }
- BSONObj obj() const {
- BSONObjBuilder b;
- BSONObjIterator k( _keyPattern );
- for( int i = 0; i < (int)_ranges.size(); ++i ) {
- BSONArrayBuilder a( b.subarrayStart( k.next().fieldName() ) );
- for( vector< FieldInterval >::const_iterator j = _ranges[ i ].intervals().begin();
- j != _ranges[ i ].intervals().end(); ++j ) {
- a << BSONArray( BSON_ARRAY( j->_lower._bound << j->_upper._bound ).clientReadable() );
- }
- a.done();
- }
- return b.obj();
- }
+ FieldRangeVector( const FieldRangeSet &frs, const IndexSpec &indexSpec, int direction );
+
+ /** @return the number of index ranges represented by 'this' */
+ long long size();
+ /** @return starting point for an index traversal. */
+ BSONObj startKey() const;
+ /** @return end point for an index traversal. */
+ BSONObj endKey() const;
+ /** @return a client-readable representation of 'this' */
+ BSONObj obj() const;
+
/**
* @return true iff the provided document matches valid ranges on all
* of this FieldRangeVector's fields, which is the case iff this document
@@ -421,144 +316,109 @@ namespace mongo {
* FieldRangeVector. This function is used for $or clause deduping.
*/
bool matches( const BSONObj &obj ) const;
- class Iterator {
- public:
- Iterator( const FieldRangeVector &v ) : _v( v ), _i( _v._ranges.size(), -1 ), _cmp( _v._ranges.size(), 0 ), _inc( _v._ranges.size(), false ), _after() {
- }
- static BSONObj minObject() {
- BSONObjBuilder b;
- b.appendMinKey( "" );
- return b.obj();
- }
- static BSONObj maxObject() {
- BSONObjBuilder b;
- b.appendMaxKey( "" );
- return b.obj();
- }
- bool advance() {
- int i = _i.size() - 1;
- while( i >= 0 && _i[ i ] >= ( (int)_v._ranges[ i ].intervals().size() - 1 ) ) {
- --i;
- }
- if( i >= 0 ) {
- _i[ i ]++;
- for( unsigned j = i + 1; j < _i.size(); ++j ) {
- _i[ j ] = 0;
- }
- }
- else {
- _i[ 0 ] = _v._ranges[ 0 ].intervals().size();
- }
- return ok();
- }
- // return value
- // -2 end of iteration
- // -1 no skipping
- // >= 0 skip parameter
- int advance( const BSONObj &curr );
- const vector< const BSONElement * > &cmp() const { return _cmp; }
- const vector< bool > &inc() const { return _inc; }
- bool after() const { return _after; }
- void prepDive();
- void setZero( int i ) {
- for( int j = i; j < (int)_i.size(); ++j ) {
- _i[ j ] = 0;
- }
- }
- void setMinus( int i ) {
- for( int j = i; j < (int)_i.size(); ++j ) {
- _i[ j ] = -1;
- }
- }
- bool ok() {
- return _i[ 0 ] < (int)_v._ranges[ 0 ].intervals().size();
- }
- BSONObj startKey() {
- BSONObjBuilder b;
- for( int unsigned i = 0; i < _i.size(); ++i ) {
- const FieldInterval &fi = _v._ranges[ i ].intervals()[ _i[ i ] ];
- b.appendAs( fi._lower._bound, "" );
- }
- return b.obj();
- }
- // temp
- BSONObj endKey() {
- BSONObjBuilder b;
- for( int unsigned i = 0; i < _i.size(); ++i ) {
- const FieldInterval &fi = _v._ranges[ i ].intervals()[ _i[ i ] ];
- b.appendAs( fi._upper._bound, "" );
- }
- return b.obj();
- }
- // check
- private:
- const FieldRangeVector &_v;
- vector< int > _i;
- vector< const BSONElement* > _cmp;
- vector< bool > _inc;
- bool _after;
- };
+
+ /**
+ * @return first key of 'obj' that would be encountered by a forward
+ * index scan using this FieldRangeVector, BSONObj() if no such key.
+ */
+ BSONObj firstMatch( const BSONObj &obj ) const;
+
private:
int matchingLowElement( const BSONElement &e, int i, bool direction, bool &lowEquality ) const;
bool matchesElement( const BSONElement &e, int i, bool direction ) const;
- vector< FieldRange > _ranges;
- BSONObj _keyPattern;
+ bool matchesKey( const BSONObj &key ) const;
+ vector<FieldRange> _ranges;
+ const IndexSpec &_indexSpec;
int _direction;
- vector< BSONObj > _queries; // make sure mem owned
- // This IndexSpec is lazily constructed directly from _keyPattern if needed.
- mutable shared_ptr< IndexSpec > _indexSpec;
+ vector<BSONObj> _queries; // make sure mem owned
+ friend class FieldRangeVectorIterator;
};
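+
+    // Illustrative sketch (added commentary, not part of the original
+    // source): computing btree scan endpoints; 'spec' is assumed to be the
+    // IndexSpec of the index under consideration.
+    //
+    //   FieldRangeSet frs( "test.coll", query, true );
+    //   FieldRangeVector frv( frs, spec, 1 /* forward */ );
+    //   BSONObj start = frv.startKey(); // lowest key allowed by the ranges
+    //   BSONObj end = frv.endKey();     // highest key allowed by the ranges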
-
- // generages FieldRangeSet objects, accounting for or clauses
- class FieldRangeOrSet {
+
+ /**
+ * Helper class for iterating through an ordered representation of keys
+ * to find those keys that match a specified FieldRangeVector.
+ */
+ class FieldRangeVectorIterator {
public:
- FieldRangeOrSet( const char *ns, const BSONObj &query , bool optimize=true );
- // if there's a useless or clause, we won't use or ranges to help with scanning
- bool orFinished() const { return _orFound && _orSets.empty(); }
- /**
- * Removes the top or clause, which would have been recently scanned, and
- * removes the field ranges it covers from all subsequent or clauses. As a
- * side effect, this function may invalidate the return values of topFrs()
- * calls made before this function was called.
- * @param indexSpec - Keys of the index that was used to satisfy the last or
- * clause. Used to determine the range of keys that were scanned. If
- * empty we do not constrain the previous clause's ranges using index keys,
- * which may reduce opportunities for range elimination.
- */
- void popOrClause( const BSONObj &indexSpec = BSONObj() );
- FieldRangeSet *topFrs() const {
- FieldRangeSet *ret = new FieldRangeSet( _baseSet );
- if (_orSets.size()) {
- *ret &= _orSets.front();
- }
- return ret;
+ FieldRangeVectorIterator( const FieldRangeVector &v ) :
+ _v( v ), _i( _v._ranges.size(), -1 ), _cmp( _v._ranges.size(), 0 ),
+ _inc( _v._ranges.size(), false ), _after() {
}
- // while the original bounds are looser, they are composed of fewer
- // ranges and it is faster to do operations with them; when they can be
- // used instead of more precise bounds, they should
- FieldRangeSet *topFrsOriginal() const {
- FieldRangeSet *ret = new FieldRangeSet( _baseSet );
- if (_originalOrSets.size()) {
- *ret &= _originalOrSets.front();
- }
- return ret;
+ static BSONObj minObject() {
+ BSONObjBuilder b; b.appendMinKey( "" );
+ return b.obj();
}
- void allClausesSimplified( vector< BSONObj > &ret ) const {
- for( list< FieldRangeSet >::const_iterator i = _orSets.begin(); i != _orSets.end(); ++i ) {
- if ( i->matchPossible() ) {
- ret.push_back( i->simplifiedQuery() );
- }
- }
+ static BSONObj maxObject() {
+ BSONObjBuilder b; b.appendMaxKey( "" );
+ return b.obj();
}
+ /**
+ * @return Suggested advance method, based on current key.
+ * -2 Iteration is complete, no need to advance.
+ * -1 Advance to the next key, without skipping.
+ * >=0 Skip parameter. If @return is r, skip to the key comprised
+ * of the first r elements of curr followed by the (r+1)th and
+ * remaining elements of cmp() (with inclusivity specified by
+ * the (r+1)th and remaining elements of inc()). If after() is
+ * true, skip past this key, not to it.
+ */
+ int advance( const BSONObj &curr );
+ const vector<const BSONElement *> &cmp() const { return _cmp; }
+ const vector<bool> &inc() const { return _inc; }
+ bool after() const { return _after; }
+ void prepDive();
+ void setZero( int i ) { for( int j = i; j < (int)_i.size(); ++j ) _i[ j ] = 0; }
+ void setMinus( int i ) { for( int j = i; j < (int)_i.size(); ++j ) _i[ j ] = -1; }
+ bool ok() { return _i[ 0 ] < (int)_v._ranges[ 0 ].intervals().size(); }
+ BSONObj startKey();
+ // temp
+ BSONObj endKey();
+ private:
+ const FieldRangeVector &_v;
+ vector<int> _i;
+ vector<const BSONElement*> _cmp;
+ vector<bool> _inc;
+ bool _after;
+ };
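+
+    // Illustrative sketch (added commentary, not part of the original
+    // source) of the advance() protocol documented above, as a cursor
+    // might drive it; 'frv' and the key source are assumptions.
+    //
+    //   FieldRangeVectorIterator it( frv );
+    //   it.prepDive();
+    //   while ( haveKey ) {
+    //       int r = it.advance( currentKey );
+    //       if ( r == -2 ) break;      // iteration complete
+    //       if ( r == -1 ) { /* step to the next key */ }
+    //       else { /* r >= 0: skip using the first r elements of currentKey
+    //                 plus it.cmp()/it.inc(), past the key if it.after() */ }
+    //   }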
+
+ /**
+ * As we iterate through $or clauses this class generates a FieldRangeSetPair
+ * for the current $or clause, in some cases by excluding ranges that were
+ * included in a previous clause.
+ */
+ class OrRangeGenerator {
+ public:
+ OrRangeGenerator( const char *ns, const BSONObj &query , bool optimize=true );
+
+ /**
+ * @return true iff we are done scanning $or clauses. If there's a
+ * useless $or clause, we won't use $or index ranges to help with scanning.
+ */
+ bool orFinished() const { return _orFound && _orSets.empty(); }
+ /** Iterates to the next $or clause by removing the current $or clause. */
+ void popOrClause( NamespaceDetails *nsd, int idxNo, const BSONObj &keyPattern );
+ void popOrClauseSingleKey();
+ /** @return FieldRangeSetPair for the current $or clause. */
+ FieldRangeSetPair *topFrsp() const;
+ /**
+ * @return original FieldRangeSetPair for the current $or clause. While the
+ * original bounds are looser, they are composed of fewer ranges and it
+ * is faster to do operations with them; when they can be used instead of
+ * more precise bounds, they should.
+ */
+ FieldRangeSetPair *topFrspOriginal() const;
+
string getSpecial() const { return _baseSet.getSpecial(); }
bool moreOrClauses() const { return !_orSets.empty(); }
private:
- FieldRangeSet _baseSet;
- list< FieldRangeSet > _orSets;
- list< FieldRangeSet > _originalOrSets;
- list< FieldRangeSet > _oldOrSets; // make sure memory is owned
+ void assertMayPopOrClause();
+ void popOrClause( const FieldRangeSet *toDiff, NamespaceDetails *d = 0, int idxNo = -1, const BSONObj &keyPattern = BSONObj() );
+ FieldRangeSetPair _baseSet;
+ list<FieldRangeSetPair> _orSets;
+ list<FieldRangeSetPair> _originalOrSets;
+ // ensure memory is owned
+ list<FieldRangeSetPair> _oldOrSets;
bool _orFound;
+ friend struct QueryUtilIndexed;
};
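+
+    // Illustrative sketch (added commentary, not part of the original
+    // source): iterating $or clauses in the single key case; planning and
+    // scanning details are elided.
+    //
+    //   OrRangeGenerator org( "test.coll", query );
+    //   while ( org.moreOrClauses() ) {
+    //       scoped_ptr<FieldRangeSetPair> frsp( org.topFrsp() );
+    //       // ... plan and execute a scan for this clause ...
+    //       org.popOrClauseSingleKey();
+    //   }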
/** returns a string that when used as a matcher, would match a superset of regex()
@@ -575,3 +435,5 @@ namespace mongo {
long long applySkipLimit( long long num , const BSONObj& cmd );
} // namespace mongo
+
+#include "queryutil-inl.h"
diff --git a/db/record.cpp b/db/record.cpp
new file mode 100644
index 0000000..51dc520
--- /dev/null
+++ b/db/record.cpp
@@ -0,0 +1,230 @@
+// record.cpp
+
+#include "pch.h"
+#include "pdfile.h"
+#include "../util/processinfo.h"
+#include "../util/net/listen.h"
+
+namespace mongo {
+
+ namespace ps {
+
+ enum State {
+ In , Out, Unk
+ };
+
+ enum Constants {
+ SliceSize = 65536 ,
+ MaxChain = 20 , // intentionally very low
+ NumSlices = 10 ,
+ RotateTimeSecs = 90
+ };
+
+ int hash( size_t region ) {
+ return
+ abs( ( ( 7 + (int)(region & 0xFFFF) )
+ * ( 11 + (int)( ( region >> 16 ) & 0xFFFF ) )
+#if defined(_WIN64) || defined(__amd64__)
+ * ( 13 + (int)( ( region >> 32 ) & 0xFFFF ) )
+ * ( 17 + (int)( ( region >> 48 ) & 0xFFFF ) )
+#endif
+ ) % SliceSize );
+ }
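+
+        // Added commentary (not in the original source): the 16-bit chunks
+        // of the region id are mixed multiplicatively, with small prime
+        // offsets so a zero chunk does not zero the product, then reduced
+        // mod SliceSize; abs() guards against signed overflow of the
+        // product, so the result is a bucket index in [0, SliceSize).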
+
+
+ /**
+ * simple hash map for region -> status
+ * this constitutes a single slice of time
+ * it does chaining (probing at most MaxChain consecutive buckets), so chains stay very short
+ */
+ class Slice {
+
+ struct Entry {
+ size_t region;
+ unsigned long long value;
+ };
+
+ public:
+
+ Slice() {
+ reset();
+ }
+
+ void reset() {
+ memset( _data , 0 , SliceSize * sizeof(Entry) );
+ }
+
+ State get( int regionHash , size_t region , short offset ) {
+ DEV assert( hash( region ) == regionHash );
+
+ Entry * e = _get( regionHash , region , false );
+ if ( ! e )
+ return Unk;
+
+ return ( e->value & ( ((unsigned long long)1) << offset ) ) ? In : Out;
+ }
+
+ /**
+ * @return true if added, false if full
+ */
+ bool in( int regionHash , size_t region , short offset ) {
+ DEV assert( hash( region ) == regionHash );
+
+ Entry * e = _get( regionHash , region , true );
+ if ( ! e )
+ return false;
+
+ e->value |= ((unsigned long long)1) << offset;
+ return true;
+ }
+
+ private:
+
+ Entry* _get( int start , size_t region , bool add ) {
+ for ( int i=0; i<MaxChain; i++ ) {
+
+ int bucket = ( start + i ) % SliceSize;
+
+ if ( _data[bucket].region == 0 ) {
+ if ( ! add )
+ return 0;
+
+ _data[bucket].region = region;
+ return &_data[bucket];
+ }
+
+ if ( _data[bucket].region == region ) {
+ return &_data[bucket];
+ }
+ }
+ return 0;
+ }
+
+ Entry _data[SliceSize];
+ };
+
+
+ /**
+ * this contains many slices of time
+ * the idea is that you record mem status in the current time slice
+ * and after a certain period it rolls off, so we check again
+ */
+ class Rolling {
+
+ public:
+ Rolling() {
+ _curSlice = 0;
+ _lastRotate = Listener::getElapsedTimeMillis();
+ }
+
+
+ /**
+ * after this call, we assume the page is in ram
+ * @param doHalf if this is a known good access, want to put in first half
+ * @return whether we know the page is in ram
+ */
+ bool access( size_t region , short offset , bool doHalf ) {
+ int regionHash = hash(region);
+
+ scoped_spinlock lk( _lock );
+
+ static int rarely_count = 0;
+ if ( rarely_count++ % 2048 == 0 ) {
+ long long now = Listener::getElapsedTimeMillis();
+ RARELY if ( now == 0 ) {
+ tlog() << "warning Listener::getElapsedTimeMillis returning 0ms" << endl;
+ }
+
+ if ( now - _lastRotate > ( 1000 * RotateTimeSecs ) ) {
+ _rotate();
+ }
+ }
+
+ for ( int i=0; i<NumSlices / ( doHalf ? 2 : 1 ); i++ ) {
+ int pos = (_curSlice+i)%NumSlices;
+ State s = _slices[pos].get( regionHash , region , offset );
+
+ if ( s == In )
+ return true;
+
+ if ( s == Out ) {
+ _slices[pos].in( regionHash , region , offset );
+ return false;
+ }
+ }
+
+ // we weren't in any slice, so add to the current one
+ if ( ! _slices[_curSlice].in( regionHash , region , offset ) ) {
+ _rotate();
+ _slices[_curSlice].in( regionHash , region , offset );
+ }
+ return false;
+ }
+
+ private:
+
+ void _rotate() {
+ _curSlice = ( _curSlice + 1 ) % NumSlices;
+ _slices[_curSlice].reset();
+ _lastRotate = Listener::getElapsedTimeMillis();
+ }
+
+ int _curSlice;
+ long long _lastRotate;
+ Slice _slices[NumSlices];
+
+ SpinLock _lock;
+ } rolling;
+
+ }
+
+ bool Record::MemoryTrackingEnabled = true;
+
+
+ volatile int __record_touch_dummy = 1; // accumulates touched bytes so the compiler can't optimize the reads away
+ void Record::touch( bool entireRecord ) {
+
+ if ( lengthWithHeaders > HeaderSize ) { // this also makes sure lengthWithHeaders is in memory
+ char * addr = data;
+ char * end = data + netLength();
+ for ( ; addr <= end ; addr += 2048 ) {
+ __record_touch_dummy += addr[0];
+
+ break; // TODO: remove this, pending SERVER-3711
+
+ if ( ! entireRecord )
+ break;
+ }
+ }
+
+ }
+
+ bool Record::likelyInPhysicalMemory() {
+ if ( ! MemoryTrackingEnabled )
+ return true;
+
+ static bool blockSupported = ProcessInfo::blockCheckSupported();
+
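+        // Added commentary (not in the original source): strip the low 12
+        // bits of the address to get the 4KB page number; each region
+        // covers 64 pages (256KB) and 'offset' is the page's bit position
+        // in the region's 64-bit presence mask.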
+ const size_t page = (size_t)data >> 12;
+ const size_t region = page >> 6;
+ const size_t offset = page & 0x3f;
+
+ if ( ps::rolling.access( region , offset , false ) )
+ return true;
+
+ if ( ! blockSupported )
+ return false;
+ return ProcessInfo::blockInMemory( data );
+ }
+
+ Record* Record::accessed() {
+ const size_t page = (size_t)data >> 12;
+ const size_t region = page >> 6;
+ const size_t offset = page & 0x3f;
+
+ ps::rolling.access( region , offset , true );
+ return this;
+ }
+
+}
diff --git a/db/repl.cpp b/db/repl.cpp
index b14034d..a18d725 100644
--- a/db/repl.cpp
+++ b/db/repl.cpp
@@ -26,27 +26,30 @@
local.sources - indicates what sources we pull from as a "slave", and the last update of each
local.oplog.$main - our op log as "master"
local.dbinfo.<dbname> - no longer used???
- local.pair.startup - can contain a special value indicating for a pair that we have the master copy.
+ local.pair.startup - [deprecated] can contain a special value indicating for a pair that we have the master copy.
used when replacing other half of the pair which has permanently failed.
- local.pair.sync - { initialsynccomplete: 1 }
+ local.pair.sync - [deprecated] { initialsynccomplete: 1 }
*/
#include "pch.h"
#include "jsobj.h"
#include "../util/goodies.h"
#include "repl.h"
-#include "../util/message.h"
+#include "../util/net/message.h"
#include "../util/background.h"
#include "../client/dbclient.h"
#include "../client/connpool.h"
#include "pdfile.h"
-#include "query.h"
+#include "ops/query.h"
#include "db.h"
#include "commands.h"
#include "security.h"
#include "cmdline.h"
#include "repl_block.h"
#include "repl/rs.h"
+#include "replutil.h"
+#include "repl/connections.h"
+#include "ops/update.h"
namespace mongo {
@@ -57,11 +60,6 @@ namespace mongo {
volatile int syncing = 0;
static volatile int relinquishSyncingSome = 0;
- /* if true replace our peer in a replication pair -- don't worry about if his
- local.oplog.$main is empty.
- */
- bool replacePeer = false;
-
/* "dead" means something really bad happened like replication falling completely out of sync.
when non-null, we are dead and the string is informational
*/
@@ -69,23 +67,10 @@ namespace mongo {
time_t lastForcedResync = 0;
- IdTracker &idTracker = *( new IdTracker() );
-
} // namespace mongo
-#include "replpair.h"
-
namespace mongo {
- PairSync *pairSync = new PairSync();
- bool getInitialSyncCompleted() {
- return pairSync->initialSyncCompleted();
- }
-
- /* --- ReplPair -------------------------------- */
-
- ReplPair *replPair = 0;
-
/* output by the web console */
const char *replInfo = "";
struct ReplInfo {
@@ -97,116 +82,6 @@ namespace mongo {
}
};
- void ReplPair::setMaster(int n, const char *_comment ) {
- if ( n == State_Master && !getInitialSyncCompleted() )
- return;
- info = _comment;
- if ( n != state && !cmdLine.quiet )
- tlog() << "pair: setting master=" << n << " was " << state << endl;
- state = n;
- }
-
- /* peer unreachable, try our arbiter */
- void ReplPair::arbitrate() {
- ReplInfo r("arbitrate");
-
- if ( arbHost == "-" ) {
- // no arbiter. we are up, let's assume partner is down and network is not partitioned.
- setMasterLocked(State_Master, "remote unreachable");
- return;
- }
-
- auto_ptr<DBClientConnection> conn( newClientConnection() );
- string errmsg;
- if ( !conn->connect(arbHost.c_str(), errmsg) ) {
- tlog() << "repl: cantconn arbiter " << errmsg << endl;
- setMasterLocked(State_CantArb, "can't connect to arb");
- return;
- }
-
- negotiate( conn.get(), "arbiter" );
- }
-
- /* --------------------------------------------- */
-
- class CmdReplacePeer : public Command {
- public:
- virtual bool slaveOk() const {
- return true;
- }
- virtual bool adminOnly() const {
- return true;
- }
- virtual LockType locktype() const { return WRITE; }
- void help(stringstream&h) const { h << "replace a node in a replica pair"; }
- CmdReplacePeer() : Command("replacePeer", false, "replacepeer") { }
- virtual bool run(const string& , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
- if ( replPair == 0 ) {
- errmsg = "not paired";
- return false;
- }
- if ( !getInitialSyncCompleted() ) {
- errmsg = "not caught up cannot replace peer";
- return false;
- }
- if ( syncing < 0 ) {
- errmsg = "replacepeer already invoked";
- return false;
- }
- Timer t;
- while ( 1 ) {
- if ( syncing == 0 || t.millis() > 30000 )
- break;
- {
- dbtemprelease t;
- relinquishSyncingSome = 1;
- sleepmillis(1);
- }
- }
- if ( syncing ) {
- assert( syncing > 0 );
- errmsg = "timeout waiting for sync() to finish";
- return false;
- }
- {
- ReplSource::SourceVector sources;
- ReplSource::loadAll(sources);
- if ( sources.size() != 1 ) {
- errmsg = "local.sources.count() != 1, cannot replace peer";
- return false;
- }
- }
- {
- Helpers::emptyCollection("local.sources");
- BSONObj o = fromjson("{\"replacepeer\":1}");
- Helpers::putSingleton("local.pair.startup", o);
- }
- syncing = -1;
- replAllDead = "replacepeer invoked -- adjust local.sources hostname then restart this db process";
- result.append("info", "adjust local.sources hostname; db restart now required");
- return true;
- }
- } cmdReplacePeer;
-
- class CmdForceDead : public Command {
- public:
- virtual bool slaveOk() const {
- return true;
- }
- virtual bool adminOnly() const {
- return true;
- }
- virtual void help(stringstream& h) const { h << "internal"; }
- virtual LockType locktype() const { return WRITE; }
- CmdForceDead() : Command("forcedead") { }
- virtual bool run(const string& , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
- replAllDead = "replication forced to stop by 'forcedead' command";
- log() << "*********************************************************\n";
- log() << "received 'forcedead' command, replication forced to stop" << endl;
- return true;
- }
- } cmdForceDead;
-
/* operator requested resynchronization of replication (on the slave). { resync : 1 } */
class CmdResync : public Command {
public:
@@ -220,7 +95,7 @@ namespace mongo {
virtual LockType locktype() const { return WRITE; }
void help(stringstream&h) const { h << "resync (from scratch) an out of date replica slave.\nhttp://www.mongodb.org/display/DOCS/Master+Slave"; }
CmdResync() : Command("resync") { }
- virtual bool run(const string& , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ virtual bool run(const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
if( cmdLine.usingReplSets() ) {
errmsg = "resync command not currently supported with replica sets. See RS102 info in the mongodb documentations";
result.append("info", "http://www.mongodb.org/display/DOCS/Resyncing+a+Very+Stale+Replica+Set+Member");
@@ -265,7 +140,7 @@ namespace mongo {
} cmdResync;
bool anyReplEnabled() {
- return replPair || replSettings.slave || replSettings.master || theReplSet;
+ return replSettings.slave || replSettings.master || theReplSet;
}
bool replAuthenticate(DBClientBase *conn);
@@ -276,7 +151,7 @@ namespace mongo {
if( theReplSet == 0 ) {
result.append("ismaster", false);
result.append("secondary", false);
- result.append("info", ReplSet::startupStatusMsg);
+ result.append("info", ReplSet::startupStatusMsg.get());
result.append( "isreplicaset" , true );
return;
}
@@ -287,21 +162,9 @@ namespace mongo {
if ( replAllDead ) {
result.append("ismaster", 0);
- if( authed ) {
- if ( replPair )
- result.append("remote", replPair->remote);
- }
string s = string("dead: ") + replAllDead;
result.append("info", s);
}
- else if ( replPair ) {
- result.append("ismaster", replPair->state);
- if( authed ) {
- result.append("remote", replPair->remote);
- if ( !replPair->info.empty() )
- result.append("info", replPair->info.toString());
- }
- }
else {
result.appendBool("ismaster", _isMaster() );
}
@@ -369,7 +232,7 @@ namespace mongo {
}
virtual LockType locktype() const { return NONE; }
CmdIsMaster() : Command("isMaster", true, "ismaster") { }
- virtual bool run(const string& , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool /*fromRepl*/) {
+ virtual bool run(const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool /*fromRepl*/) {
/* currently request to arbiter is (somewhat arbitrarily) an ismaster request that is not
authenticated.
we allow unauthenticated ismaster but we aren't as verbose informationally if
@@ -383,159 +246,11 @@ namespace mongo {
}
} cmdismaster;
- class CmdIsInitialSyncComplete : public Command {
- public:
- virtual bool requiresAuth() { return false; }
- virtual bool slaveOk() const {
- return true;
- }
- virtual LockType locktype() const { return NONE; }
- CmdIsInitialSyncComplete() : Command( "isinitialsynccomplete" ) {}
- virtual bool run(const string&, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool /*fromRepl*/) {
- result.appendBool( "initialsynccomplete", getInitialSyncCompleted() );
- return true;
- }
- } cmdisinitialsynccomplete;
-
- /* negotiate who is master
-
- -1=not set (probably means we just booted)
- 0=was slave
- 1=was master
-
- remote,local -> new remote,local
- !1,1 -> 0,1
- 1,!1 -> 1,0
- -1,-1 -> dominant->1, nondom->0
- 0,0 -> dominant->1, nondom->0
- 1,1 -> dominant->1, nondom->0
-
- { negotiatemaster:1, i_was:<state>, your_name:<hostname> }
- returns:
- { ok:1, you_are:..., i_am:... }
- */
- class CmdNegotiateMaster : public Command {
- public:
- CmdNegotiateMaster() : Command("negotiatemaster") { }
- virtual bool slaveOk() const {
- return true;
- }
- virtual bool adminOnly() const {
- return true;
- }
- virtual LockType locktype() const { return WRITE; }
- virtual bool run(const string& , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool) {
- if ( replPair == 0 ) {
- massert( 10383 , "Another mongod instance believes incorrectly that this node is its peer", !cmdObj.getBoolField( "fromArbiter" ) );
- // assume that we are an arbiter and should forward the request
- string host = cmdObj.getStringField("your_name");
- int port = cmdObj.getIntField( "your_port" );
- if ( port == INT_MIN ) {
- errmsg = "no port specified";
- problem() << errmsg << endl;
- return false;
- }
- stringstream ss;
- ss << host << ":" << port;
- string remote = ss.str();
- BSONObj ret;
- {
- dbtemprelease t;
- auto_ptr<DBClientConnection> conn( new DBClientConnection() );
- if ( !conn->connect( remote.c_str(), errmsg ) ) {
- result.append( "you_are", ReplPair::State_Master );
- return true;
- }
- BSONObjBuilder forwardCommand;
- forwardCommand.appendElements( cmdObj );
- forwardCommand.appendBool( "fromArbiter", true );
- ret = conn->findOne( "admin.$cmd", forwardCommand.done() );
- }
- BSONObjIterator i( ret );
- while( i.moreWithEOO() ) {
- BSONElement e = i.next();
- if ( e.eoo() )
- break;
- if ( e.fieldName() != string( "ok" ) )
- result.append( e );
- }
- return ret["ok"].trueValue();
- }
-
- int was = cmdObj.getIntField("i_was");
- string myname = cmdObj.getStringField("your_name");
- if ( myname.empty() || was < -3 ) {
- errmsg = "your_name/i_was not specified";
- return false;
- }
-
- int N = ReplPair::State_Negotiating;
- int M = ReplPair::State_Master;
- int S = ReplPair::State_Slave;
-
- if ( !replPair->dominant( myname ) ) {
- result.append( "you_are", N );
- result.append( "i_am", replPair->state );
- return true;
- }
-
- int me, you;
- if ( !getInitialSyncCompleted() || ( replPair->state != M && was == M ) ) {
- me=S;
- you=M;
- }
- else {
- me=M;
- you=S;
- }
- replPair->setMaster( me, "CmdNegotiateMaster::run()" );
-
- result.append("you_are", you);
- result.append("i_am", me);
-
- return true;
- }
- } cmdnegotiatemaster;
-
- int ReplPair::negotiate(DBClientConnection *conn, string method) {
- BSONObjBuilder b;
- b.append("negotiatemaster",1);
- b.append("i_was", state);
- b.append("your_name", remoteHost);
- b.append("your_port", remotePort);
- BSONObj cmd = b.done();
- BSONObj res = conn->findOne("admin.$cmd", cmd);
- if ( ! res["ok"].trueValue() ) {
- string message = method + " negotiate failed";
- problem() << message << ": " << res.toString() << '\n';
- setMasterLocked(State_Confused, message.c_str());
- return State_Confused;
- }
- int x = res.getIntField("you_are");
- int remote = res.getIntField("i_am");
- // State_Negotiating means the remote node is not dominant and cannot
- // choose who is master.
- if ( x != State_Slave && x != State_Master && x != State_Negotiating ) {
- problem() << method << " negotiate: bad you_are value " << res.toString() << endl;
- }
- else if ( x != State_Negotiating ) {
- string message = method + " negotiation";
- setMasterLocked(x, message.c_str());
- }
- return remote;
- }
-
- /* --------------------------------------------------------------*/
-
ReplSource::ReplSource() {
- replacing = false;
nClonedThisPass = 0;
- paired = false;
}
ReplSource::ReplSource(BSONObj o) : nClonedThisPass(0) {
- replacing = false;
- paired = false;
only = o.getStringField("only");
hostName = o.getStringField("host");
_sourceName = o.getStringField("source");
@@ -569,8 +284,6 @@ namespace mongo {
incompleteCloneDbs.insert( e.fieldName() );
}
}
-
- _lastSavedLocalTs = OpTime( o.getField( "localLogTs" ).date() );
}
/* Turn our C++ Source object into a BSONObj */
@@ -583,8 +296,6 @@ namespace mongo {
if ( !syncedTo.isNull() )
b.appendTimestamp("syncedTo", syncedTo.asDate());
- b.appendTimestamp("localLogTs", _lastSavedLocalTs.asDate());
-
BSONObjBuilder dbsNextPassBuilder;
int n = 0;
for ( set<string>::iterator i = addDbNextPass.begin(); i != addDbNextPass.end(); i++ ) {
@@ -625,16 +336,6 @@ namespace mongo {
assert( ! res.mod );
assert( res.num == 1 );
}
-
- if ( replacing ) {
- /* if we were in "replace" mode, we now have synced up with the replacement,
- so turn that off.
- */
- replacing = false;
- wassert( replacePeer );
- replacePeer = false;
- Helpers::emptyCollection("local.pair.startup");
- }
}
static void addSourceToList(ReplSource::SourceVector &v, ReplSource& s, ReplSource::SourceVector &old) {
@@ -660,8 +361,6 @@ namespace mongo {
SourceVector old = v;
v.clear();
- bool gotPairWith = false;
-
if ( !cmdLine.source.empty() ) {
// --source <host> specified.
// check that no items are in sources other than that
@@ -705,71 +404,21 @@ namespace mongo {
}
}
- if ( replPair ) {
- const string &remote = replPair->remote;
- // --pairwith host specified.
- if ( replSettings.fastsync ) {
- Helpers::emptyCollection( "local.sources" ); // ignore saved sources
- }
- // check that no items are in sources other than that
- // add if missing
- shared_ptr<Cursor> c = findTableScan("local.sources", BSONObj());
- int n = 0;
- while ( c->ok() ) {
- n++;
- ReplSource tmp(c->current());
- if ( tmp.hostName != remote ) {
- log() << "pairwith " << remote << " != " << tmp.hostName << " from local.sources collection" << endl;
- log() << "terminating after 30 seconds" << endl;
- sleepsecs(30);
- dbexit( EXIT_REPLICATION_ERROR );
- }
- c->advance();
- }
- uassert( 10122 , "local.sources collection corrupt?", n<2 );
- if ( n == 0 ) {
- // source missing. add.
- ReplSource s;
- s.hostName = remote;
- s.save();
- }
- }
-
shared_ptr<Cursor> c = findTableScan("local.sources", BSONObj());
while ( c->ok() ) {
ReplSource tmp(c->current());
- if ( replPair && tmp.hostName == replPair->remote && tmp.sourceName() == "main" ) {
- gotPairWith = true;
- tmp.paired = true;
- if ( replacePeer ) {
- // peer was replaced -- start back at the beginning.
- tmp.syncedTo = OpTime();
- tmp.replacing = true;
- }
- }
- if ( ( !replPair && tmp.syncedTo.isNull() ) ||
- ( replPair && replSettings.fastsync ) ) {
+ if ( tmp.syncedTo.isNull() ) {
DBDirectClient c;
if ( c.exists( "local.oplog.$main" ) ) {
BSONObj op = c.findOne( "local.oplog.$main", QUERY( "op" << NE << "n" ).sort( BSON( "$natural" << -1 ) ) );
if ( !op.isEmpty() ) {
tmp.syncedTo = op[ "ts" ].date();
- tmp._lastSavedLocalTs = op[ "ts" ].date();
}
}
}
addSourceToList(v, tmp, old);
c->advance();
}
-
- if ( !gotPairWith && replPair ) {
- /* add the --pairwith server */
- shared_ptr< ReplSource > s( new ReplSource() );
- s->paired = true;
- s->hostName = replPair->remote;
- s->replacing = replacePeer;
- v.push_back(s);
- }
}
BSONObj opTimeQuery = fromjson("{\"getoptime\":1}");
@@ -789,6 +438,7 @@ namespace mongo {
SourceVector sources;
ReplSource::loadAll(sources);
for( SourceVector::iterator i = sources.begin(); i != sources.end(); ++i ) {
+ log() << requester << " forcing resync from " << (*i)->hostName << endl;
(*i)->forceResync( requester );
}
replAllDead = 0;
@@ -798,7 +448,9 @@ namespace mongo {
BSONObj info;
{
dbtemprelease t;
- oplogReader.connect(hostName);
+ if (!oplogReader.connect(hostName)) {
+ msgassertedNoTrace( 14051 , "unable to connect to resync");
+ }
/* todo use getDatabaseNames() method here */
bool ok = oplogReader.conn()->runCommand( "admin", BSON( "listDatabases" << 1 ), info );
massert( 10385 , "Unable to get database list", ok );
@@ -830,22 +482,132 @@ namespace mongo {
}
/* grab initial copy of a database from the master */
- bool ReplSource::resync(string db) {
+ void ReplSource::resync(string db) {
string dummyNs = resyncDrop( db.c_str(), "internal" );
Client::Context ctx( dummyNs );
{
log() << "resync: cloning database " << db << " to get an initial copy" << endl;
ReplInfo r("resync: cloning a database");
string errmsg;
- bool ok = cloneFrom(hostName.c_str(), errmsg, cc().database()->name, false, /*slaveok*/ true, /*replauth*/ true, /*snapshot*/false);
+ int errCode = 0;
+ bool ok = cloneFrom(hostName.c_str(), errmsg, cc().database()->name, false, /*slaveok*/ true, /*replauth*/ true, /*snapshot*/false, /*mayYield*/true, /*mayBeInterrupted*/false, &errCode);
if ( !ok ) {
- problem() << "resync of " << db << " from " << hostName << " failed " << errmsg << endl;
- throw SyncException();
+ if ( errCode == DatabaseDifferCaseCode ) {
+ resyncDrop( db.c_str(), "internal" );
+ log() << "resync: database " << db << " not valid on the master due to a name conflict, dropping." << endl;
+ return;
+ }
+ else {
+ problem() << "resync of " << db << " from " << hostName << " failed " << errmsg << endl;
+ throw SyncException();
+ }
}
}
log() << "resync: done with initial clone for db: " << db << endl;
+ return;
+ }
+
+ DatabaseIgnorer ___databaseIgnorer;
+
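+    // Added commentary (not in the original source): DatabaseIgnorer tracks,
+    // per database, an optime through which replicated ops should be
+    // skipped; ignoreAt() answers that question and clears the entry once
+    // the optime has passed.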
+ void DatabaseIgnorer::doIgnoreUntilAfter( const string &db, const OpTime &futureOplogTime ) {
+ if ( futureOplogTime > _ignores[ db ] ) {
+ _ignores[ db ] = futureOplogTime;
+ }
+ }
+
+ bool DatabaseIgnorer::ignoreAt( const string &db, const OpTime &currentOplogTime ) {
+ if ( _ignores[ db ].isNull() ) {
+ return false;
+ }
+ if ( _ignores[ db ] >= currentOplogTime ) {
+ return true;
+ } else {
+ // The ignore state has expired, so clear it.
+ _ignores.erase( db );
+ return false;
+ }
+ }
+
+ bool ReplSource::handleDuplicateDbName( const BSONObj &op, const char *ns, const char *db ) {
+ if ( dbHolder.isLoaded( ns, dbpath ) ) {
+ // Database is already present.
+ return true;
+ }
+ BSONElement ts = op.getField( "ts" );
+ if ( ( ts.type() == Date || ts.type() == Timestamp ) && ___databaseIgnorer.ignoreAt( db, ts.date() ) ) {
+ // Database is ignored due to a previous indication that it is
+ // missing from master after optime "ts".
+ return false;
+ }
+ if ( Database::duplicateUncasedName( db, dbpath ).empty() ) {
+ // No duplicate database names are present.
+ return true;
+ }
+
+ OpTime lastTime;
+ bool dbOk = false;
+ {
+ dbtemprelease release;
+
+ // We always log an operation after executing it (never before), so
+ // a database list will always be valid as of an oplog entry generated
+ // before it was retrieved.
+
+ BSONObj last = oplogReader.findOne( this->ns().c_str(), Query().sort( BSON( "$natural" << -1 ) ) );
+ if ( !last.isEmpty() ) {
+ BSONElement ts = last.getField( "ts" );
+ massert( 14032, "Invalid 'ts' in remote log", ts.type() == Date || ts.type() == Timestamp );
+ lastTime = OpTime( ts.date() );
+ }
+
+ BSONObj info;
+ bool ok = oplogReader.conn()->runCommand( "admin", BSON( "listDatabases" << 1 ), info );
+ massert( 14033, "Unable to get database list", ok );
+ BSONObjIterator i( info.getField( "databases" ).embeddedObject() );
+ while( i.more() ) {
+ BSONElement e = i.next();
+
+ const char * name = e.embeddedObject().getField( "name" ).valuestr();
+ if ( strcasecmp( name, db ) != 0 )
+ continue;
+
+ if ( strcmp( name, db ) == 0 ) {
+ // The db exists on master, still need to check that no conflicts exist there.
+ dbOk = true;
+ continue;
+ }
+
+ // The master has a db name that conflicts with the requested name.
+ dbOk = false;
+ break;
+ }
+ }
+
+ if ( !dbOk ) {
+ ___databaseIgnorer.doIgnoreUntilAfter( db, lastTime );
+ incompleteCloneDbs.erase(db);
+ addDbNextPass.erase(db);
+ return false;
+ }
+
+ // Check for duplicates again, since we released the lock above.
+ set< string > duplicates;
+ Database::duplicateUncasedName( db, dbpath, &duplicates );
+
+ // The database is present on the master and no conflicting databases
+ // are present on the master. Drop any local conflicts.
+ for( set< string >::const_iterator i = duplicates.begin(); i != duplicates.end(); ++i ) {
+ ___databaseIgnorer.doIgnoreUntilAfter( *i, lastTime );
+ incompleteCloneDbs.erase(*i);
+ addDbNextPass.erase(*i);
+ Client::Context ctx(*i);
+ dropDatabase(*i);
+ }
+
+ massert( 14034, "Duplicate database names present after attempting to delete duplicates",
+ Database::duplicateUncasedName( db, dbpath ).empty() );
return true;
}
@@ -869,7 +631,7 @@ namespace mongo {
@param alreadyLocked caller already put us in write lock if true
*/
- void ReplSource::sync_pullOpLog_applyOperation(BSONObj& op, OpTime *localLogTail, bool alreadyLocked) {
+ void ReplSource::sync_pullOpLog_applyOperation(BSONObj& op, bool alreadyLocked) {
if( logLevel >= 6 ) // op.tostring is expensive so doing this check explicitly
log(6) << "processing op: " << op << endl;
@@ -936,17 +698,16 @@ namespace mongo {
scoped_ptr<writelock> lk( alreadyLocked ? 0 : new writelock() );
- if ( localLogTail && replPair && replPair->state == ReplPair::State_Master ) {
- updateSetsWithLocalOps( *localLogTail, true ); // allow unlocking
- updateSetsWithLocalOps( *localLogTail, false ); // don't allow unlocking or conversion to db backed storage
- }
-
if ( replAllDead ) {
// hmmm why is this check here and not at top of this function? does it get set between top and here?
log() << "replAllDead, throwing SyncException: " << replAllDead << endl;
throw SyncException();
}
+ if ( !handleDuplicateDbName( op, ns, clientName ) ) {
+ return;
+ }
+
Client::Context ctx( ns );
ctx.getClient()->curop()->reset();
@@ -988,78 +749,11 @@ namespace mongo {
save();
}
else {
- bool mod;
- if ( replPair && replPair->state == ReplPair::State_Master ) {
- BSONObj id = idForOp( op, mod );
- if ( !idTracker.haveId( ns, id ) ) {
- applyOperation( op );
- }
- else if ( idTracker.haveModId( ns, id ) ) {
- log( 6 ) << "skipping operation matching mod id object " << op << endl;
- BSONObj existing;
- if ( Helpers::findOne( ns, id, existing ) )
- logOp( "i", ns, existing );
- }
- else {
- log( 6 ) << "skipping operation matching changed id object " << op << endl;
- }
- }
- else {
- applyOperation( op );
- }
+ applyOperation( op );
addDbNextPass.erase( clientName );
}
}
- BSONObj ReplSource::idForOp( const BSONObj &op, bool &mod ) {
- mod = false;
- const char *opType = op.getStringField( "op" );
- BSONObj o = op.getObjectField( "o" );
- switch( opType[ 0 ] ) {
- case 'i': {
- BSONObjBuilder idBuilder;
- BSONElement id;
- if ( !o.getObjectID( id ) )
- return BSONObj();
- idBuilder.append( id );
- return idBuilder.obj();
- }
- case 'u': {
- BSONObj o2 = op.getObjectField( "o2" );
- if ( strcmp( o2.firstElement().fieldName(), "_id" ) != 0 )
- return BSONObj();
- if ( o.firstElement().fieldName()[ 0 ] == '$' )
- mod = true;
- return o2;
- }
- case 'd': {
- if ( opType[ 1 ] != '\0' )
- return BSONObj(); // skip "db" op type
- return o;
- }
- default:
- break;
- }
- return BSONObj();
- }
-
- void ReplSource::updateSetsWithOp( const BSONObj &op, bool mayUnlock ) {
- if ( mayUnlock ) {
- idTracker.mayUpgradeStorage();
- }
- bool mod;
- BSONObj id = idForOp( op, mod );
- if ( !id.isEmpty() ) {
- const char *ns = op.getStringField( "ns" );
- // Since our range of local ops may not be the same as our peer's
- // range of unapplied ops, it is always necessary to rewrite objects
- // to the oplog after a mod update.
- if ( mod )
- idTracker.haveModId( ns, id, true );
- idTracker.haveId( ns, id, true );
- }
- }
-
void ReplSource::syncToTailOfRemoteLog() {
string _ns = ns();
BSONObjBuilder b;
@@ -1074,65 +768,6 @@ namespace mongo {
}
}
- OpTime ReplSource::nextLastSavedLocalTs() const {
- Client::Context ctx( "local.oplog.$main" );
- shared_ptr<Cursor> c = findTableScan( "local.oplog.$main", BSON( "$natural" << -1 ) );
- if ( c->ok() )
- return OpTime( c->current().getField( "ts" ).date() );
- return OpTime();
- }
-
- void ReplSource::setLastSavedLocalTs( const OpTime &nextLocalTs ) {
- _lastSavedLocalTs = nextLocalTs;
- log( 3 ) << "updated _lastSavedLocalTs to: " << _lastSavedLocalTs << endl;
- }
-
- void ReplSource::resetSlave() {
- log() << "**********************************************************\n";
- log() << "Sending forcedead command to slave to stop its replication\n";
- log() << "Host: " << hostName << " paired: " << paired << endl;
- massert( 10387 , "request to kill slave replication failed",
- oplogReader.conn()->simpleCommand( "admin", 0, "forcedead" ) );
- syncToTailOfRemoteLog();
- {
- dblock lk;
- setLastSavedLocalTs( nextLastSavedLocalTs() );
- save();
- oplogReader.resetCursor();
- }
- }
-
- bool ReplSource::updateSetsWithLocalOps( OpTime &localLogTail, bool mayUnlock ) {
- Client::Context ctx( "local.oplog.$main" );
- shared_ptr<Cursor> localLog = findTableScan( "local.oplog.$main", BSON( "$natural" << -1 ) );
- OpTime newTail;
- for( ; localLog->ok(); localLog->advance() ) {
- BSONObj op = localLog->current();
- OpTime ts( localLog->current().getField( "ts" ).date() );
- if ( newTail.isNull() ) {
- newTail = ts;
- }
- if ( !( localLogTail < ts ) )
- break;
- updateSetsWithOp( op, mayUnlock );
- if ( mayUnlock ) {
- RARELY {
- dbtemprelease t;
- }
- }
- }
- if ( !localLogTail.isNull() && !localLog->ok() ) {
- // local log filled up
- idTracker.reset();
- dbtemprelease t;
- resetSlave();
- massert( 10388 , "local master log filled, forcing slave resync", false );
- }
- if ( !newTail.isNull() )
- localLogTail = newTail;
- return true;
- }
-
extern unsigned replApplyBatchSize;
/* slave: pull some data from the master's oplog
@@ -1149,12 +784,6 @@ namespace mongo {
bool tailing = true;
oplogReader.tailCheck();
- if ( replPair && replPair->state == ReplPair::State_Master ) {
- dblock lk;
- idTracker.reset();
- }
- OpTime localLogTail = _lastSavedLocalTs;
-
bool initial = syncedTo.isNull();
if ( !oplogReader.haveCursor() || initial ) {
@@ -1215,7 +844,7 @@ namespace mongo {
b.append("ns", *i + '.');
b.append("op", "db");
BSONObj op = b.done();
- sync_pullOpLog_applyOperation(op, 0, false);
+ sync_pullOpLog_applyOperation(op, false);
}
}
@@ -1231,13 +860,6 @@ namespace mongo {
}
{
dblock lk;
- OpTime nextLastSaved = nextLastSavedLocalTs();
- {
- dbtemprelease t;
- if ( !oplogReader.more() ) {
- setLastSavedLocalTs( nextLastSaved );
- }
- }
save();
}
return okResultCode;
@@ -1266,19 +888,6 @@ namespace mongo {
}
}
- if ( replPair && replPair->state == ReplPair::State_Master ) {
-
- OpTime next( ts.date() );
- if ( !tailing && !initial && next != syncedTo ) {
- log() << "remote slave log filled, forcing slave resync" << endl;
- resetSlave();
- return 1;
- }
-
- dblock lk;
- updateSetsWithLocalOps( localLogTail, true );
- }
-
nextOpTime = OpTime( ts.date() );
log(2) << "repl: first op time received: " << nextOpTime.toString() << '\n';
if ( initial ) {
@@ -1320,37 +929,21 @@ namespace mongo {
int n = 0;
time_t saveLast = time(0);
while ( 1 ) {
- /* from a.s.:
- I think the idea here is that we can establish a sync point between the local op log and the remote log with the following steps:
-
- 1) identify most recent op in local log -- call it O
- 2) ask "does nextOpTime reflect the tail of the remote op log?" (in other words, is more() false?) - If yes, all subsequent ops after nextOpTime in the remote log must have occurred after O. If no, we can't establish a sync point.
-
- Note that we can't do step (2) followed by step (1) because if we do so ops may be added to both machines between steps (2) and (1) and we can't establish a sync point. (In particular, between (2) and (1) an op may be added to the remote log before a different op is added to the local log. In this case, the newest remote op will have occurred after nextOpTime but before O.)
-
- Now, for performance reasons we don't want to have to identify the most recent op in the local log every time we call c->more() because in performance sensitive situations more() will be true most of the time. So we do:
-
- 0) more()?
- 1) find most recent op in local log
- 2) more()?
- */
bool moreInitialSyncsPending = !addDbNextPass.empty() && n; // we need "&& n" to assure we actually process at least one op to get a sync point recorded in the first place.
if ( moreInitialSyncsPending || !oplogReader.more() ) {
dblock lk;
- OpTime nextLastSaved = nextLastSavedLocalTs();
+
+ // NOTE aaron 2011-03-29 This block may be unnecessary, but I'm leaving it in place to avoid changing timing behavior.
{
dbtemprelease t;
if ( !moreInitialSyncsPending && oplogReader.more() ) {
- if ( getInitialSyncCompleted() ) { // if initial sync hasn't completed, break out of loop so we can set to completed or clone more dbs
- continue;
- }
- }
- else {
- setLastSavedLocalTs( nextLastSaved );
+ continue;
}
+ // otherwise, break out of loop so we can set to completed or clone more dbs
}
+
if( oplogReader.awaitCapable() && tailing )
okResultCode = 0; // don't sleep
syncedTo = nextOpTime;
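
The slimmed-down pull loop this hunk leaves behind, in outline (lock helpers named as in the real code, everything else elided): when the tailing cursor looks drained, re-check once with the db lock temporarily released; only if it is still empty do we record a sync point.

    while ( true ) {
        if ( !oplogReader.more() ) {
            dblock lk;                    // db write lock
            {
                dbtemprelease t;          // briefly release while polling the master
                if ( oplogReader.more() )
                    continue;             // ops raced in; go back and apply them
            }
            syncedTo = nextOpTime;        // cursor truly drained: record progress
            save();                       // persist syncedTo in local.sources
            break;                        // sleep until the next pass
        }
        BSONObj op = oplogReader.next();
        applyOp( op );                    // sync_pullOpLog_applyOperation in the real code
    }
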
@@ -1415,7 +1008,7 @@ namespace mongo {
return okResultCode;
}
- sync_pullOpLog_applyOperation(op, &localLogTail, !justOne);
+ sync_pullOpLog_applyOperation(op, !justOne);
n++;
if( --b == 0 )
@@ -1438,6 +1031,9 @@ namespace mongo {
BSONObj userReplQuery = fromjson("{\"user\":\"repl\"}");
bool replAuthenticate(DBClientBase *conn) {
+ if( noauth ) {
+ return true;
+ }
if( ! cc().isAdmin() ) {
log() << "replauthenticate: requires admin permissions, failing\n";
return false;
@@ -1458,7 +1054,7 @@ namespace mongo {
// try the first user in local
!Helpers::getSingleton("local.system.users", user) ) {
log() << "replauthenticate: no user in local.system.users to use for authentication\n";
- return noauth;
+ return false;
}
}
u = user.getStringField("user");
@@ -1477,13 +1073,24 @@ namespace mongo {
bool replHandshake(DBClientConnection *conn) {
+ string myname = getHostName();
+
BSONObj me;
{
+
dblock l;
// local.me is an identifier for a server for getLastError w:2+
- if ( ! Helpers::getSingleton( "local.me" , me ) ) {
+ if ( ! Helpers::getSingleton( "local.me" , me ) ||
+ ! me.hasField("host") ||
+ me["host"].String() != myname ) {
+
+ // clean out local.me
+ Helpers::emptyCollection("local.me");
+
+ // repopulate
BSONObjBuilder b;
b.appendOID( "_id" , 0 , true );
+ b.append( "host", myname );
me = b.obj();
Helpers::putSingleton( "local.me" , me );
}
@@ -1491,6 +1098,9 @@ namespace mongo {
BSONObjBuilder cmd;
cmd.appendAs( me["_id"] , "handshake" );
+ if (theReplSet) {
+ cmd.append("member", theReplSet->selfId());
+ }
BSONObj res;
bool ok = conn->runCommand( "admin" , cmd.obj() , res );
@@ -1499,14 +1109,13 @@ namespace mongo {
return true;
}
- bool OplogReader::connect(string hostName) {
+ bool OplogReader::commonConnect(const string& hostName) {
if( conn() == 0 ) {
- _conn = auto_ptr<DBClientConnection>(new DBClientConnection( false, 0, replPair ? 20 : 0 /* tcp timeout */));
+ _conn = shared_ptr<DBClientConnection>(new DBClientConnection( false, 0, 0 /* tcp timeout */));
string errmsg;
ReplInfo r("trying to connect to sync source");
if ( !_conn->connect(hostName.c_str(), errmsg) ||
- (!noauth && !replAuthenticate(_conn.get())) ||
- !replHandshake(_conn.get()) ) {
+ (!noauth && !replAuthenticate(_conn.get())) ) {
resetConnection();
log() << "repl: " << errmsg << endl;
return false;
@@ -1514,6 +1123,37 @@ namespace mongo {
}
return true;
}
+
+ bool OplogReader::connect(string hostName) {
+ if (conn() != 0) {
+ return true;
+ }
+
+ if (commonConnect(hostName)) {
+ return replHandshake(_conn.get());
+ }
+ return false;
+ }
+
+ bool OplogReader::connect(const BSONObj& rid, const int from, const string& to) {
+ if (conn() != 0) {
+ return true;
+ }
+ if (commonConnect(to)) {
+ log() << "handshake between " << from << " and " << to << endl;
+ return passthroughHandshake(rid, from);
+ }
+ return false;
+ }
+
+ bool OplogReader::passthroughHandshake(const BSONObj& rid, const int f) {
+ BSONObjBuilder cmd;
+ cmd.appendAs( rid["_id"], "handshake" );
+ cmd.append( "member" , f );
+
+ BSONObj res;
+ return conn()->runCommand( "admin" , cmd.obj() , res );
+ }
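
The two entry points above over the shared commonConnect, side by side (host name and member id illustrative):

    OplogReader r;
    if ( r.connect("master.example.net:27017") ) {
        // commonConnect() succeeded and replHandshake() registered our
        // local.me _id with the upstream node for getLastError w:2+ tracking
    }

    OplogReader fwd;                      // forwarding a handshake for a chained slave
    BSONObj rid;                          // that slave's local.me doc, e.g. { _id: ObjectId(...) }
    if ( fwd.connect(rid, /*from=*/2, "master.example.net:27017") ) {
        // passthroughHandshake ran { handshake: rid["_id"], member: 2 }
        // against the target's admin db
    }
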
/* note: not yet in mutex at this point.
returns >= 0 if ok. return -1 if you want to reconnect.
@@ -1541,22 +1181,9 @@ namespace mongo {
if ( !oplogReader.connect(hostName) ) {
log(4) << "repl: can't connect to sync source" << endl;
- if ( replPair && paired ) {
- assert( startsWith(hostName.c_str(), replPair->remoteHost.c_str()) );
- replPair->arbitrate();
- }
return -1;
}
- if ( paired ) {
- int remote = replPair->negotiate(oplogReader.conn(), "direct");
- int nMasters = ( remote == ReplPair::State_Master ) + ( replPair->state == ReplPair::State_Master );
- if ( getInitialSyncCompleted() && nMasters != 1 ) {
- log() << ( nMasters == 0 ? "no master" : "two masters" ) << ", deferring oplog pull" << endl;
- return 1;
- }
- }
-
/*
// get current mtime at the server.
BSONObj o = conn->findOne("admin.$cmd", opTimeQuery);
@@ -1619,9 +1246,6 @@ namespace mongo {
}
else
sleepAdvice = res;
- if ( res >= 0 && !moreToSync /*&& !s->syncedTo.isNull()*/ ) {
- pairSync->setInitialSyncCompletedLocking();
- }
}
catch ( const SyncException& ) {
log() << "caught SyncException" << endl;
@@ -1662,8 +1286,11 @@ namespace mongo {
{
dblock lk;
if ( replAllDead ) {
- if ( !replSettings.autoresync || !ReplSource::throttledForceResyncDead( "auto" ) )
+ // throttledForceResyncDead can throw
+ if ( !replSettings.autoresync || !ReplSource::throttledForceResyncDead( "auto" ) ) {
+ log() << "all sources dead: " << replAllDead << ", sleeping for 5 seconds" << endl;
break;
+ }
}
assert( syncing == 0 ); // i.e., there is only one sync thread running. we will want to change/fix this.
syncing++;
@@ -1697,7 +1324,7 @@ namespace mongo {
if ( s ) {
stringstream ss;
- ss << "repl: sleep " << s << "sec before next pass";
+ ss << "repl: sleep " << s << " sec before next pass";
string msg = ss.str();
if ( ! cmdLine.quiet )
log() << msg << endl;
@@ -1707,8 +1334,6 @@ namespace mongo {
}
}
- int debug_stop_repl = 0;
-
static void replMasterThread() {
sleepsecs(4);
Client::initThread("replmaster");
@@ -1725,7 +1350,7 @@ namespace mongo {
if ( lk.got() ) {
toSleep = 10;
- cc().getAuthenticationInfo()->authorize("admin");
+ replLocalAuth();
try {
logKeepalive();
@@ -1749,21 +1374,12 @@ namespace mongo {
{
dblock lk;
- cc().getAuthenticationInfo()->authorize("admin");
-
- BSONObj obj;
- if ( Helpers::getSingleton("local.pair.startup", obj) ) {
- // should be: {replacepeer:1}
- replacePeer = true;
- pairSync->setInitialSyncCompleted(); // we are the half that has all the data
- }
+ replLocalAuth();
}
while ( 1 ) {
try {
replMain();
- if ( debug_stop_repl )
- break;
sleepsecs(5);
}
catch ( AssertionException& ) {
@@ -1771,6 +1387,15 @@ namespace mongo {
problem() << "Assertion in replSlaveThread(): sleeping 5 minutes before retry" << endl;
sleepsecs(300);
}
+ catch ( DBException& e ) {
+ problem() << "exception in replSlaveThread(): " << e.what()
+ << ", sleeping 5 minutes before retry" << endl;
+ sleepsecs(300);
+ }
+ catch ( ... ) {
+ problem() << "error in replSlaveThread(): sleeping 5 minutes before retry" << endl;
+ sleepsecs(300);
+ }
}
}
@@ -1783,15 +1408,21 @@ namespace mongo {
void newRepl();
void oldRepl();
+ void startReplSets(ReplSetCmdline*);
void startReplication() {
/* if we are going to be a replica set, we aren't doing other forms of replication. */
if( !cmdLine._replSet.empty() ) {
- if( replSettings.slave || replSettings.master || replPair ) {
+ if( replSettings.slave || replSettings.master ) {
log() << "***" << endl;
log() << "ERROR: can't use --slave or --master replication options with --replSet" << endl;
log() << "***" << endl;
}
newRepl();
+
+ replSet = true;
+ ReplSetCmdline *replSetCmdline = new ReplSetCmdline(cmdLine._replSet);
+ boost::thread t( boost::bind( &startReplSets, replSetCmdline) );
+
return;
}
@@ -1802,28 +1433,22 @@ namespace mongo {
*/
//boost::thread tempt(tempThread);
- if( !replSettings.slave && !replSettings.master && !replPair )
+ if( !replSettings.slave && !replSettings.master )
return;
{
dblock lk;
- cc().getAuthenticationInfo()->authorize("admin");
- pairSync->init();
+ replLocalAuth();
}
- if ( replSettings.slave || replPair ) {
- if ( replSettings.slave ) {
- assert( replSettings.slave == SimpleSlave );
- log(1) << "slave=true" << endl;
- }
- else
- replSettings.slave = ReplPairSlave;
+ if ( replSettings.slave ) {
+ assert( replSettings.slave == SimpleSlave );
+ log(1) << "slave=true" << endl;
boost::thread repl_thread(replSlaveThread);
}
- if ( replSettings.master || replPair ) {
- if ( replSettings.master )
- log(1) << "master=true" << endl;
+ if ( replSettings.master ) {
+ log(1) << "master=true" << endl;
replSettings.master = true;
createOplog();
boost::thread t(replMasterThread);
@@ -1833,11 +1458,6 @@ namespace mongo {
sleepmillis( 50 );
}
- /* called from main at server startup */
- void pairWith(const char *remoteEnd, const char *arb) {
- replPair = new ReplPair(remoteEnd, arb);
- }
-
void testPretouch() {
int nthr = min(8, 8);
nthr = max(nthr, 1);
diff --git a/db/repl.h b/db/repl.h
index 45036fa..9791f14 100644
--- a/db/repl.h
+++ b/db/repl.h
@@ -30,49 +30,44 @@
#include "pdfile.h"
#include "db.h"
#include "dbhelpers.h"
-#include "query.h"
-#include "queryoptimizer.h"
#include "../client/dbclient.h"
#include "../util/optime.h"
#include "oplog.h"
#include "../util/concurrency/thread_pool.h"
#include "oplogreader.h"
+#include "cloner.h"
namespace mongo {
- /* replication slave? (possibly with slave or repl pair nonmaster)
+ /* replication slave? (possibly with slave)
--slave cmd line setting -> SimpleSlave
*/
- typedef enum { NotSlave=0, SimpleSlave, ReplPairSlave } SlaveTypes;
+ typedef enum { NotSlave=0, SimpleSlave } SlaveTypes;
class ReplSettings {
public:
SlaveTypes slave;
- /* true means we are master and doing replication. if we are not writing to oplog (no --master or repl pairing),
- this won't be true.
- */
+ /** true means we are master and doing replication. if we are not writing to oplog, this won't be true. */
bool master;
- int opIdMem;
-
bool fastsync;
bool autoresync;
int slavedelay;
+ set<string> discoveredSeeds;
+ BSONObj reconfig;
+
ReplSettings()
- : slave(NotSlave) , master(false) , opIdMem(100000000) , fastsync() , autoresync(false), slavedelay() {
+ : slave(NotSlave) , master(false) , fastsync() , autoresync(false), slavedelay(), discoveredSeeds() {
}
};
extern ReplSettings replSettings;
- bool cloneFrom(const char *masterHost, string& errmsg, const string& fromdb, bool logForReplication,
- bool slaveOk, bool useReplAuth, bool snapshot);
-
/* A replication exception */
class SyncException : public DBException {
public:
@@ -84,18 +79,18 @@ namespace mongo {
Can be a group of things to replicate for several databases.
- { host: ..., source: ..., only: ..., syncedTo: ..., localLogTs: ..., dbsNextPass: { ... }, incompleteCloneDbs: { ... } }
+ { host: ..., source: ..., only: ..., syncedTo: ..., dbsNextPass: { ... }, incompleteCloneDbs: { ... } }
'source' defaults to 'main'; support for multiple source names is
not done (always use main for now).
*/
class ReplSource {
- auto_ptr<ThreadPool> tp;
+ shared_ptr<ThreadPool> tp;
- bool resync(string db);
+ void resync(string db);
/** @param alreadyLocked caller already put us in write lock if true */
- void sync_pullOpLog_applyOperation(BSONObj& op, OpTime *localLogTail, bool alreadyLocked);
+ void sync_pullOpLog_applyOperation(BSONObj& op, bool alreadyLocked);
/* pull some operations from the master's oplog, and apply them.
calls sync_pullOpLog_applyOperation
@@ -115,28 +110,23 @@ namespace mongo {
// returns the dummy ns used to do the drop
string resyncDrop( const char *db, const char *requester );
- // returns possibly unowned id spec for the operation.
- static BSONObj idForOp( const BSONObj &op, bool &mod );
- static void updateSetsWithOp( const BSONObj &op, bool mayUpdateStorage );
// call without the db mutex
void syncToTailOfRemoteLog();
- // call with the db mutex
- OpTime nextLastSavedLocalTs() const;
- void setLastSavedLocalTs( const OpTime &nextLocalTs );
- // call without the db mutex
- void resetSlave();
- // call with the db mutex
- // returns false if the slave has been reset
- bool updateSetsWithLocalOps( OpTime &localLogTail, bool mayUnlock );
string ns() const { return string( "local.oplog.$" ) + sourceName(); }
unsigned _sleepAdviceTime;
+ /**
+ * If 'db' is a new database and its name would conflict with that of
+ * an existing database, synchronize these database names with the
+ * master.
+ * @return true iff an op with the specified ns may be applied.
+ */
+ bool handleDuplicateDbName( const BSONObj &op, const char *ns, const char *db );
+
public:
OplogReader oplogReader;
static void applyOperation(const BSONObj& op);
- bool replacing; // in "replace mode" -- see CmdReplacePeer
- bool paired; // --pair in use
string hostName; // ip addr or hostname plus optionally, ":<port>"
string _sourceName; // a logical source name.
string sourceName() const { return _sourceName.empty() ? "main" : _sourceName; }
@@ -145,14 +135,6 @@ namespace mongo {
/* the last time point we have already synced up to (in the remote/master's oplog). */
OpTime syncedTo;
- /* This is for repl pairs.
- _lastSavedLocalTs is the most recent point in the local log that we know is consistent
- with the remote log ( ie say the local op log has entries ABCDE and the remote op log
- has ABCXY, then _lastSavedLocalTs won't be greater than C until we have reconciled
- the DE-XY difference.)
- */
- OpTime _lastSavedLocalTs;
-
int nClonedThisPass;
typedef vector< shared_ptr< ReplSource > > SourceVector;
@@ -186,148 +168,24 @@ namespace mongo {
void forceResync( const char *requester );
};
- // class for managing a set of ids in memory
- class MemIds {
- public:
- MemIds() : size_() {}
- friend class IdTracker;
- void reset() {
- imp_.clear();
- size_ = 0;
- }
- bool get( const char *ns, const BSONObj &id ) { return imp_[ ns ].count( id ); }
- void set( const char *ns, const BSONObj &id, bool val ) {
- if ( val ) {
- if ( imp_[ ns ].insert( id.getOwned() ).second ) {
- size_ += id.objsize() + sizeof( BSONObj );
- }
- }
- else {
- if ( imp_[ ns ].erase( id ) == 1 ) {
- size_ -= id.objsize() + sizeof( BSONObj );
- }
- }
- }
- long long roughSize() const {
- return size_;
- }
- private:
- typedef map< string, BSONObjSetDefaultOrder > IdSets;
- IdSets imp_;
- long long size_;
- };
-
- // class for managing a set of ids in a db collection
- // All functions must be called with db mutex held
- class DbIds {
- public:
- DbIds( const string & name ) : impl_( name, BSON( "ns" << 1 << "id" << 1 ) ) {}
- void reset() {
- impl_.reset();
- }
- bool get( const char *ns, const BSONObj &id ) {
- return impl_.get( key( ns, id ) );
- }
- void set( const char *ns, const BSONObj &id, bool val ) {
- impl_.set( key( ns, id ), val );
- }
- private:
- static BSONObj key( const char *ns, const BSONObj &id ) {
- BSONObjBuilder b;
- b << "ns" << ns;
- // rename _id to id since there may be duplicates
- b.appendAs( id.firstElement(), "id" );
- return b.obj();
- }
- DbSet impl_;
- };
+ bool anyReplEnabled();
+ void appendReplicationInfo( BSONObjBuilder& result , bool authed , int level = 0 );
- // class for tracking ids and mod ids, in memory or on disk
- // All functions must be called with db mutex held
- // Kind of sloppy class structure, for now just want to keep the in mem
- // version speedy.
- // see http://www.mongodb.org/display/DOCS/Pairing+Internals
- class IdTracker {
+ /**
+ * Helper class used to set and query an ignore state for a named database.
+ * The ignore state will expire after a specified OpTime.
+ */
+ class DatabaseIgnorer {
public:
- IdTracker() :
- dbIds_( "local.temp.replIds" ),
- dbModIds_( "local.temp.replModIds" ),
- inMem_( true ),
- maxMem_( replSettings.opIdMem ) {
- }
- void reset( int maxMem = replSettings.opIdMem ) {
- memIds_.reset();
- memModIds_.reset();
- dbIds_.reset();
- dbModIds_.reset();
- maxMem_ = maxMem;
- inMem_ = true;
- }
- bool haveId( const char *ns, const BSONObj &id ) {
- if ( inMem_ )
- return get( memIds_, ns, id );
- else
- return get( dbIds_, ns, id );
- }
- bool haveModId( const char *ns, const BSONObj &id ) {
- if ( inMem_ )
- return get( memModIds_, ns, id );
- else
- return get( dbModIds_, ns, id );
- }
- void haveId( const char *ns, const BSONObj &id, bool val ) {
- if ( inMem_ )
- set( memIds_, ns, id, val );
- else
- set( dbIds_, ns, id, val );
- }
- void haveModId( const char *ns, const BSONObj &id, bool val ) {
- if ( inMem_ )
- set( memModIds_, ns, id, val );
- else
- set( dbModIds_, ns, id, val );
- }
- // will release the db mutex
- void mayUpgradeStorage() {
- if ( !inMem_ || memIds_.roughSize() + memModIds_.roughSize() <= maxMem_ )
- return;
- log() << "saving master modified id information to collection" << endl;
- upgrade( memIds_, dbIds_ );
- upgrade( memModIds_, dbModIds_ );
- memIds_.reset();
- memModIds_.reset();
- inMem_ = false;
- }
- bool inMem() const { return inMem_; }
+ /** Indicate that operations for 'db' should be ignored until after 'futureOplogTime' */
+ void doIgnoreUntilAfter( const string &db, const OpTime &futureOplogTime );
+ /**
+ * Query ignore state of 'db'; if 'currentOplogTime' is after the ignore
+ * limit, the ignore state will be cleared.
+ */
+ bool ignoreAt( const string &db, const OpTime &currentOplogTime );
private:
- template< class T >
- bool get( T &ids, const char *ns, const BSONObj &id ) {
- return ids.get( ns, id );
- }
- template< class T >
- void set( T &ids, const char *ns, const BSONObj &id, bool val ) {
- ids.set( ns, id, val );
- }
- void upgrade( MemIds &a, DbIds &b ) {
- for( MemIds::IdSets::const_iterator i = a.imp_.begin(); i != a.imp_.end(); ++i ) {
- for( BSONObjSetDefaultOrder::const_iterator j = i->second.begin(); j != i->second.end(); ++j ) {
- set( b, i->first.c_str(), *j, true );
- RARELY {
- dbtemprelease t;
- }
- }
- }
- }
- MemIds memIds_;
- MemIds memModIds_;
- DbIds dbIds_;
- DbIds dbModIds_;
- bool inMem_;
- int maxMem_;
+ map< string, OpTime > _ignores;
};
- bool anyReplEnabled();
- void appendReplicationInfo( BSONObjBuilder& result , bool authed , int level = 0 );
-
-
} // namespace mongo
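
One plausible implementation of the DatabaseIgnorer interface declared above, keyed on the _ignores map (a sketch, not necessarily the exact repl.cpp body):

    void DatabaseIgnorer::doIgnoreUntilAfter( const string &db, const OpTime &futureOplogTime ) {
        if ( futureOplogTime > _ignores[ db ] )
            _ignores[ db ] = futureOplogTime;   // extend, never shorten, the ignore window
    }

    bool DatabaseIgnorer::ignoreAt( const string &db, const OpTime &currentOplogTime ) {
        map< string, OpTime >::iterator i = _ignores.find( db );
        if ( i == _ignores.end() )
            return false;                       // no ignore state for this db
        if ( i->second >= currentOplogTime )
            return true;                        // still inside the window: skip the op
        _ignores.erase( i );                    // window passed: clear expired state
        return false;
    }
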
diff --git a/db/repl/connections.h b/db/repl/connections.h
index 7e7bfe5..78cfb30 100644
--- a/db/repl/connections.h
+++ b/db/repl/connections.h
@@ -20,7 +20,7 @@
#include <map>
#include "../../client/dbclient.h"
-#include "../security_key.h"
+#include "../security_common.h"
namespace mongo {
@@ -44,13 +44,14 @@ namespace mongo {
public:
/** throws assertions if connect failure etc. */
ScopedConn(string hostport);
- ~ScopedConn();
+ ~ScopedConn() {
+ // conLock releases...
+ }
/* If we were to run a query and not exhaust the cursor, future use of the connection would be problematic.
So here what we do is wrapper known safe methods and not allow cursor-style queries at all. This makes
ScopedConn limited in functionality but very safe. More non-cursor wrappers can be added here if needed.
*/
-
bool runCommand(const string &dbname, const BSONObj& cmd, BSONObj &info, int options=0) {
return conn()->runCommand(dbname, cmd, info, options);
}
@@ -108,12 +109,4 @@ namespace mongo {
}
}
- inline ScopedConn::~ScopedConn() {
- // conLock releases...
- }
-
- /*inline DBClientConnection* ScopedConn::operator->() {
- return &x->cc;
- }*/
-
}
diff --git a/db/repl/consensus.cpp b/db/repl/consensus.cpp
index dadb22e..fd18cdc 100644
--- a/db/repl/consensus.cpp
+++ b/db/repl/consensus.cpp
@@ -25,7 +25,49 @@ namespace mongo {
public:
CmdReplSetFresh() : ReplSetCommand("replSetFresh") { }
private:
- virtual bool run(const string& , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+
+ bool shouldVeto(const BSONObj& cmdObj, string& errmsg) {
+ unsigned id = cmdObj["id"].Int();
+ const Member* primary = theReplSet->box.getPrimary();
+ const Member* hopeful = theReplSet->findById(id);
+ const Member *highestPriority = theReplSet->getMostElectable();
+
+ if( !hopeful ) {
+ errmsg = str::stream() << "replSet couldn't find member with id " << id;
+ return true;
+ }
+ else if( theReplSet->isPrimary() && theReplSet->lastOpTimeWritten >= hopeful->hbinfo().opTime ) {
+ // hbinfo is not updated, so we have to check the primary's last optime separately
+ errmsg = str::stream() << "I am already primary, " << hopeful->fullName() <<
+ " can try again once I've stepped down";
+ return true;
+ }
+ else if( primary && primary->hbinfo().opTime >= hopeful->hbinfo().opTime ) {
+ // other members might be aware of more up-to-date nodes
+ errmsg = str::stream() << hopeful->fullName() << " is trying to elect itself but " <<
+ primary->fullName() << " is already primary and more up-to-date";
+ return true;
+ }
+ else if( highestPriority && highestPriority->config().priority > hopeful->config().priority) {
+ errmsg = str::stream() << hopeful->fullName() << " has lower priority than " << highestPriority->fullName();
+ return true;
+ }
+
+ // don't veto older versions
+ if (cmdObj["id"].eoo()) {
+ // they won't be looking for the veto field
+ return false;
+ }
+
+ if (!hopeful || !theReplSet->isElectable(id) ||
+ (highestPriority && highestPriority->config().priority > hopeful->config().priority)) {
+ return true;
+ }
+
+ return false;
+ }
+
+ virtual bool run(const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
if( !check(errmsg, result) )
return false;
@@ -43,11 +85,15 @@ namespace mongo {
result.append("info", "config version stale");
weAreFresher = true;
}
- else if( opTime < theReplSet->lastOpTimeWritten ) {
+ // check not only our own optime, but any other member we can reach
+ else if( opTime < theReplSet->lastOpTimeWritten ||
+ opTime < theReplSet->lastOtherOpTime()) {
weAreFresher = true;
}
result.appendDate("opTime", theReplSet->lastOpTimeWritten.asDate());
result.append("fresher", weAreFresher);
+ result.append("veto", shouldVeto(cmdObj, errmsg));
+
return true;
}
} cmdReplSetFresh;
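
For reference, the freshness probe a candidate sends (assembled from the weAreFreshest hunk later in this file; the new "id" field is what older nodes, which never read "veto", simply ignore):

    BSONObj cmd = BSON( "replSetFresh" << 1 <<
                        "set"    << rs.name() <<
                        "opTime" << Date_t(ord.asDate()) <<
                        "who"    << rs._self->fullName() <<
                        "cfgver" << rs._cfg->version <<
                        "id"     << rs._self->id() );
    // typical reply: { opTime: <Date>, fresher: <bool>, veto: <bool>, errmsg: <string, when vetoing> }
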
@@ -56,11 +102,9 @@ namespace mongo {
public:
CmdReplSetElect() : ReplSetCommand("replSetElect") { }
private:
- virtual bool run(const string& , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ virtual bool run(const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
if( !check(errmsg, result) )
return false;
- //task::lam f = boost::bind(&Consensus::electCmdReceived, &theReplSet->elect, cmdObj, &result);
- //theReplSet->mgr->call(f);
theReplSet->elect.electCmdReceived(cmdObj, &result);
return true;
}
@@ -91,6 +135,10 @@ namespace mongo {
if( dt < T )
vUp += m->config().votes;
}
+
+ // the manager will handle calling stepdown if another node should be
+ // primary due to priority
+
return !( vUp * 2 > totalVotes() );
}
@@ -98,17 +146,19 @@ namespace mongo {
const time_t LeaseTime = 30;
+ mutex Consensus::lyMutex("ly");
+
unsigned Consensus::yea(unsigned memberId) { /* throws VoteException */
- Atomic<LastYea>::tran t(ly);
- LastYea &ly = t.ref();
+ mutex::scoped_lock lk(lyMutex);
+ LastYea &L = this->ly.ref(lk);
time_t now = time(0);
- if( ly.when + LeaseTime >= now && ly.who != memberId ) {
- log(1) << "replSet not voting yea for " << memberId <<
- " voted for " << ly.who << ' ' << now-ly.when << " secs ago" << rsLog;
+ if( L.when + LeaseTime >= now && L.who != memberId ) {
+ LOG(1) << "replSet not voting yea for " << memberId <<
+ " voted for " << L.who << ' ' << now-L.when << " secs ago" << rsLog;
throw VoteException();
}
- ly.when = now;
- ly.who = memberId;
+ L.when = now;
+ L.who = memberId;
return rs._self->config().votes;
}
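
The lease rule above, distilled into a self-contained sketch (names assumed; the real code serializes access with lyMutex): a voter grants its votes to at most one candidate per 30-second window, so two overlapping candidates cannot both count the same node.

    #include <ctime>

    class VoteLease {
        unsigned _who;                        // candidate last voted for
        time_t _when;                         // when that vote was granted
        static const time_t LeaseTime = 30;
    public:
        VoteLease() : _who(0), _when(0) {}
        bool tryYea( unsigned candidateId ) {
            time_t now = time(0);
            if ( _when + LeaseTime >= now && _who != candidateId )
                return false;                 // lease held by another candidate: no vote
            _when = now;                      // grant (or renew) the lease
            _who = candidateId;
            return true;
        }
        void electionFailed( unsigned candidateId ) {
            if ( _who == candidateId )
                _when = 0;                    // free the lease so a retry is not stalled
        }
    };
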
@@ -116,8 +166,8 @@ namespace mongo {
place instead of leaving it for a long time.
*/
void Consensus::electionFailed(unsigned meid) {
- Atomic<LastYea>::tran t(ly);
- LastYea &L = t.ref();
+ mutex::scoped_lock lk(lyMutex);
+ LastYea &L = ly.ref(lk);
DEV assert( L.who == meid ); // this may not always always hold, so be aware, but adding for now as a quick sanity test
if( L.who == meid )
L.when = 0;
@@ -127,7 +177,7 @@ namespace mongo {
void Consensus::electCmdReceived(BSONObj cmd, BSONObjBuilder* _b) {
BSONObjBuilder& b = *_b;
DEV log() << "replSet received elect msg " << cmd.toString() << rsLog;
- else log(2) << "replSet received elect msg " << cmd.toString() << rsLog;
+ else LOG(2) << "replSet received elect msg " << cmd.toString() << rsLog;
string set = cmd["set"].String();
unsigned whoid = cmd["whoid"].Int();
int cfgver = cmd["cfgver"].Int();
@@ -136,22 +186,22 @@ namespace mongo {
const Member* primary = rs.box.getPrimary();
const Member* hopeful = rs.findById(whoid);
+ const Member* highestPriority = rs.getMostElectable();
int vote = 0;
if( set != rs.name() ) {
log() << "replSet error received an elect request for '" << set << "' but our set name is '" << rs.name() << "'" << rsLog;
-
}
else if( myver < cfgver ) {
// we are stale. don't vote
}
else if( myver > cfgver ) {
// they are stale!
- log() << "replSet info got stale version # during election" << rsLog;
+ log() << "replSet electCmdReceived info got stale version # during election" << rsLog;
vote = -10000;
}
else if( !hopeful ) {
- log() << "couldn't find member with id " << whoid << rsLog;
+ log() << "replSet electCmdReceived couldn't find member with id " << whoid << rsLog;
vote = -10000;
}
else if( primary && primary == rs._self && rs.lastOpTimeWritten >= hopeful->hbinfo().opTime ) {
@@ -166,14 +216,19 @@ namespace mongo {
primary->fullName() << " is already primary and more up-to-date" << rsLog;
vote = -10000;
}
+ else if( highestPriority && highestPriority->config().priority > hopeful->config().priority) {
+ log() << hopeful->fullName() << " has lower priority than " << highestPriority->fullName();
+ vote = -10000;
+ }
else {
try {
vote = yea(whoid);
+ dassert( hopeful->id() == whoid );
rs.relinquish();
- log() << "replSet info voting yea for " << whoid << rsLog;
+ log() << "replSet info voting yea for " << hopeful->fullName() << " (" << whoid << ')' << rsLog;
}
catch(VoteException&) {
- log() << "replSet voting no already voted for another" << rsLog;
+ log() << "replSet voting no for " << hopeful->fullName() << " already voted for another" << rsLog;
}
}
@@ -212,7 +267,8 @@ namespace mongo {
"set" << rs.name() <<
"opTime" << Date_t(ord.asDate()) <<
"who" << rs._self->fullName() <<
- "cfgver" << rs._cfg->version );
+ "cfgver" << rs._cfg->version <<
+ "id" << rs._self->id());
list<Target> L;
int ver;
/* the following queries arbiters, even though they are never fresh. wonder if that makes sense.
@@ -228,19 +284,33 @@ namespace mongo {
for( list<Target>::iterator i = L.begin(); i != L.end(); i++ ) {
if( i->ok ) {
nok++;
- if( i->result["fresher"].trueValue() )
+ if( i->result["fresher"].trueValue() ) {
+ log() << "not electing self, we are not freshest" << rsLog;
return false;
+ }
OpTime remoteOrd( i->result["opTime"].Date() );
if( remoteOrd == ord )
nTies++;
assert( remoteOrd <= ord );
+
+ if( i->result["veto"].trueValue() ) {
+ BSONElement msg = i->result["errmsg"];
+ if (!msg.eoo()) {
+ log() << "not electing self, " << i->toHost << " would veto with '" <<
+ msg.String() << "'" << rsLog;
+ }
+ else {
+ log() << "not electing self, " << i->toHost << " would veto" << rsLog;
+ }
+ return false;
+ }
}
else {
DEV log() << "replSet freshest returns " << i->result.toString() << rsLog;
allUp = false;
}
}
- log(1) << "replSet dev we are freshest of up nodes, nok:" << nok << " nTies:" << nTies << rsLog;
+ LOG(1) << "replSet dev we are freshest of up nodes, nok:" << nok << " nTies:" << nTies << rsLog;
assert( ord <= theReplSet->lastOpTimeWritten ); // <= as this may change while we are working...
return true;
}
@@ -267,7 +337,6 @@ namespace mongo {
bool allUp;
int nTies;
if( !weAreFreshest(allUp, nTies) ) {
- log() << "replSet info not electing self, we are not freshest" << rsLog;
return;
}
@@ -324,7 +393,6 @@ namespace mongo {
multiCommand(electCmd, L);
{
- RSBase::lock lk(&rs);
for( list<Target>::iterator i = L.begin(); i != L.end(); i++ ) {
DEV log() << "replSet elect res: " << i->result.toString() << rsLog;
if( i->ok ) {
diff --git a/db/repl/health.cpp b/db/repl/health.cpp
index 762ca90..711b457 100644
--- a/db/repl/health.cpp
+++ b/db/repl/health.cpp
@@ -32,7 +32,6 @@
#include "../dbhelpers.h"
namespace mongo {
-
/* decls for connections.h */
ScopedConn::M& ScopedConn::_map = *(new ScopedConn::M());
mutex ScopedConn::mapMutex("ScopedConn::mapMutex");
@@ -43,9 +42,9 @@ namespace mongo {
using namespace mongoutils::html;
using namespace bson;
- static RamLog _rsLog;
- Tee *rsLog = &_rsLog;
- extern bool replSetBlind;
+ static RamLog * _rsLog = new RamLog( "rs" );
+ Tee *rsLog = _rsLog;
+ extern bool replSetBlind; // for testing
string ago(time_t t) {
if( t == 0 ) return "";
@@ -126,19 +125,6 @@ namespace mongo {
return "";
}
- string MemberState::toString() const {
- if( s == MemberState::RS_STARTUP ) return "STARTUP";
- if( s == MemberState::RS_PRIMARY ) return "PRIMARY";
- if( s == MemberState::RS_SECONDARY ) return "SECONDARY";
- if( s == MemberState::RS_RECOVERING ) return "RECOVERING";
- if( s == MemberState::RS_FATAL ) return "FATAL";
- if( s == MemberState::RS_STARTUP2 ) return "STARTUP2";
- if( s == MemberState::RS_ARBITER ) return "ARBITER";
- if( s == MemberState::RS_DOWN ) return "DOWN";
- if( s == MemberState::RS_ROLLBACK ) return "ROLLBACK";
- return "";
- }
-
extern time_t started;
// oplogdiags in web ui
@@ -208,8 +194,8 @@ namespace mongo {
ss << "<style type=\"text/css\" media=\"screen\">"
"table { font-size:75% }\n"
-// "th { background-color:#bbb; color:#000 }\n"
-// "td,th { padding:.25em }\n"
+ // "th { background-color:#bbb; color:#000 }\n"
+ // "td,th { padding:.25em }\n"
"</style>\n";
ss << table(h, true);
@@ -306,6 +292,8 @@ namespace mongo {
myMinValid = "exception fetching minvalid";
}
+ const Member *_self = this->_self;
+ assert(_self);
{
stringstream s;
/* self row */
@@ -340,20 +328,40 @@ namespace mongo {
void fillRsLog(stringstream& s) {
- _rsLog.toHTML( s );
+ _rsLog->toHTML( s );
}
const Member* ReplSetImpl::findById(unsigned id) const {
- if( id == _self->id() ) return _self;
+ if( _self && id == _self->id() ) return _self;
+
for( Member *m = head(); m; m = m->next() )
if( m->id() == id )
return m;
return 0;
}
+
+ const OpTime ReplSetImpl::lastOtherOpTime() const {
+ OpTime closest(0,0);
+
+ for( Member *m = _members.head(); m; m=m->next() ) {
+ if (!m->hbinfo().up()) {
+ continue;
+ }
+
+ if (m->hbinfo().opTime > closest) {
+ closest = m->hbinfo().opTime;
+ }
+ }
+
+ return closest;
+ }
void ReplSetImpl::_summarizeStatus(BSONObjBuilder& b) const {
vector<BSONObj> v;
+ const Member *_self = this->_self;
+ assert( _self );
+
// add self
{
BSONObjBuilder bb;
@@ -390,6 +398,7 @@ namespace mongo {
bb.appendTimestamp("optime", m->hbinfo().opTime.asDate());
bb.appendDate("optimeDate", m->hbinfo().opTime.getSecs() * 1000LL);
bb.appendTimeT("lastHeartbeat", m->hbinfo().lastHeartbeat);
+ bb.append("pingMs", m->hbinfo().ping);
string s = m->lhb();
if( !s.empty() )
bb.append("errmsg", s);
@@ -400,6 +409,10 @@ namespace mongo {
b.append("set", name());
b.appendTimeT("date", time(0));
b.append("myState", box.getState().s);
+ const Member *syncTarget = _currentSyncTarget;
+ if (syncTarget) {
+ b.append("syncingTo", syncTarget->fullName());
+ }
b.append("members", v);
if( replSetBlind )
b.append("blind",true); // to avoid confusion if set...normally never set except for testing.
diff --git a/db/repl/health.h b/db/repl/health.h
index a32db00..55cca93 100644
--- a/db/repl/health.h
+++ b/db/repl/health.h
@@ -24,11 +24,11 @@ namespace mongo {
bool requestHeartbeat(string setname, string fromHost, string memberFullName, BSONObj& result, int myConfigVersion, int& theirConfigVersion, bool checkEmpty = false);
struct HealthOptions {
- HealthOptions() {
- heartbeatSleepMillis = 2000;
- heartbeatTimeoutMillis = 10000;
- heartbeatConnRetries = 2;
- }
+ HealthOptions() :
+ heartbeatSleepMillis(2000),
+ heartbeatTimeoutMillis( 10000 ),
+ heartbeatConnRetries(2)
+ { }
bool isDefault() const { return *this == HealthOptions(); }
@@ -43,8 +43,8 @@ namespace mongo {
}
bool operator==(const HealthOptions& r) const {
- return heartbeatSleepMillis==r.heartbeatSleepMillis && heartbeatTimeoutMillis==r.heartbeatTimeoutMillis && heartbeatConnRetries==heartbeatConnRetries;
+ return heartbeatSleepMillis==r.heartbeatSleepMillis && heartbeatTimeoutMillis==r.heartbeatTimeoutMillis && heartbeatConnRetries==r.heartbeatConnRetries;
}
};
-
+
}
diff --git a/db/repl/heartbeat.cpp b/db/repl/heartbeat.cpp
index 3972466..7d3f78c 100644
--- a/db/repl/heartbeat.cpp
+++ b/db/repl/heartbeat.cpp
@@ -30,15 +30,16 @@
#include "connections.h"
#include "../../util/unittest.h"
#include "../instance.h"
+#include "../repl.h"
namespace mongo {
using namespace bson;
extern bool replSetBlind;
+ extern ReplSettings replSettings;
- // hacky
- string *discoveredSeed = 0;
+ unsigned int HeartbeatInfo::numPings;
long long HeartbeatInfo::timeDown() const {
if( up() ) return 0;
@@ -52,7 +53,7 @@ namespace mongo {
public:
virtual bool adminOnly() const { return false; }
CmdReplSetHeartbeat() : ReplSetCommand("replSetHeartbeat") { }
- virtual bool run(const string& , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ virtual bool run(const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
if( replSetBlind )
return false;
@@ -63,9 +64,13 @@ namespace mongo {
return false;
}
+ if (!checkAuth(errmsg, result)) {
+ return false;
+ }
+
/* we want to keep heartbeat connections open when relinquishing primary. tag them here. */
{
- MessagingPort *mp = cc().port();
+ AbstractMessagingPort *mp = cc().port();
if( mp )
mp->tag |= 1;
}
@@ -78,8 +83,8 @@ namespace mongo {
string s = string(cmdObj.getStringField("replSetHeartbeat"));
if( cmdLine.ourSetName() != s ) {
errmsg = "repl set names do not match";
- log() << "cmdline: " << cmdLine._replSet << endl;
- log() << "s: " << s << endl;
+ log() << "replSet set names do not match, our cmdline: " << cmdLine._replSet << rsLog;
+ log() << "replSet s: " << s << rsLog;
result.append("mismatch", true);
return false;
}
@@ -91,8 +96,8 @@ namespace mongo {
}
if( theReplSet == 0 ) {
string from( cmdObj.getStringField("from") );
- if( !from.empty() && discoveredSeed == 0 ) {
- discoveredSeed = new string(from);
+ if( !from.empty() ) {
+ replSettings.discoveredSeeds.insert(from);
}
errmsg = "still initializing";
return false;
@@ -105,6 +110,7 @@ namespace mongo {
}
result.append("set", theReplSet->name());
result.append("state", theReplSet->state().s);
+ result.append("e", theReplSet->iAmElectable());
result.append("hbmsg", theReplSet->hbmsg());
result.append("time", (long long) time(0));
result.appendDate("opTime", theReplSet->lastOpTimeWritten.asDate());
@@ -144,10 +150,10 @@ namespace mongo {
public:
ReplSetHealthPollTask(const HostAndPort& hh, const HeartbeatInfo& mm) : h(hh), m(mm) { }
- string name() const { return "ReplSetHealthPollTask"; }
+ string name() const { return "rsHealthPoll"; }
void doWork() {
if ( !theReplSet ) {
- log(2) << "theReplSet not initialized yet, skipping health poll this round" << rsLog;
+ LOG(2) << "replSet not initialized yet, skipping health poll this round" << rsLog;
return;
}
@@ -157,11 +163,22 @@ namespace mongo {
BSONObj info;
int theirConfigVersion = -10000;
- time_t before = time(0);
+ Timer timer;
bool ok = requestHeartbeat(theReplSet->name(), theReplSet->selfFullName(), h.toString(), info, theReplSet->config().version, theirConfigVersion);
- time_t after = mem.lastHeartbeat = time(0); // we set this on any response - we don't get this far if couldn't connect because exception is thrown
+ mem.ping = (unsigned int)timer.millis();
+
+ time_t before = timer.startTime() / 1000000;
+ // we set this on any response - we don't get this far if
+ // couldn't connect because exception is thrown
+ time_t after = mem.lastHeartbeat = before + (mem.ping / 1000);
+
+ // weight new ping with old pings
+ // on the first ping, just use the ping value
+ if (old.ping != 0) {
+ mem.ping = (unsigned int)((old.ping * .8) + (mem.ping * .2));
+ }
if ( info["time"].isNumber() ) {
long long t = info["time"].numberLong();
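
The weighting above is an exponentially weighted moving average with alpha = 0.2, so one slow round-trip moves the estimate only modestly; a standalone sketch (helper name assumed):

    unsigned smoothedPing( unsigned oldPing, unsigned sample ) {
        if ( oldPing == 0 )
            return sample;                    // first heartbeat: take the raw value
        return (unsigned)( oldPing * 0.8 + sample * 0.2 );
    }
    // e.g. a steady 10 ms link with one 100 ms hiccup: 10*0.8 + 100*0.2 = 28 ms
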
@@ -183,8 +200,10 @@ namespace mongo {
mem.hbstate = MemberState(state.Int());
}
if( ok ) {
+ HeartbeatInfo::numPings++;
+
if( mem.upSince == 0 ) {
- log() << "replSet info " << h.toString() << " is up" << rsLog;
+ log() << "replSet info member " << h.toString() << " is up" << rsLog;
mem.upSince = mem.lastHeartbeat;
}
mem.health = 1.0;
@@ -192,6 +211,30 @@ namespace mongo {
if( info.hasElement("opTime") )
mem.opTime = info["opTime"].Date();
+ // see if this member is in the electable set
+ if( info["e"].eoo() ) {
+ // for backwards compatibility
+ const Member *member = theReplSet->findById(mem.id());
+ if (member && member->config().potentiallyHot()) {
+ theReplSet->addToElectable(mem.id());
+ }
+ else {
+ theReplSet->rmFromElectable(mem.id());
+ }
+ }
+ // add this server to the electable set if it is within 10
+ // seconds of the latest optime we know of
+ else if( info["e"].trueValue() &&
+ mem.opTime >= theReplSet->lastOpTimeWritten.getSecs() - 10) {
+ unsigned lastOp = theReplSet->lastOtherOpTime().getSecs();
+ if (lastOp > 0 && mem.opTime >= lastOp - 10) {
+ theReplSet->addToElectable(mem.id());
+ }
+ }
+ else {
+ theReplSet->rmFromElectable(mem.id());
+ }
+
be cfg = info["config"];
if( cfg.ok() ) {
// received a new config
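
The electability rule the block above applies, reduced to its arithmetic (a simplification; the pre-"e" compatibility branch keyed off config().potentiallyHot() is omitted):

    // a member stays in the electable set only if it reports itself electable
    // and its optime is within 10 seconds of the freshest optime we know of
    bool withinElectableWindow( bool e, unsigned memberOpSecs, unsigned lastOtherOpSecs ) {
        if ( !e )
            return false;
        return lastOtherOpSecs == 0 || memberOpSecs >= lastOtherOpSecs - 10;
    }
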
@@ -208,7 +251,7 @@ namespace mongo {
down(mem, e.what());
}
catch(...) {
- down(mem, "something unusual went wrong");
+ down(mem, "replSet unexpected exception in ReplSetHealthPollTask");
}
m = mem;
@@ -219,7 +262,7 @@ namespace mongo {
bool changed = mem.changed(old);
if( changed ) {
if( old.hbstate != mem.hbstate )
- log() << "replSet member " << h.toString() << ' ' << mem.hbstate.toString() << rsLog;
+ log() << "replSet member " << h.toString() << " is now in state " << mem.hbstate.toString() << rsLog;
}
if( changed || now-last>4 ) {
last = now;
@@ -230,12 +273,15 @@ namespace mongo {
private:
void down(HeartbeatInfo& mem, string msg) {
mem.health = 0.0;
+ mem.ping = 0;
if( mem.upSince || mem.downSince == 0 ) {
mem.upSince = 0;
mem.downSince = jsTime();
+ mem.hbstate = MemberState::RS_DOWN;
log() << "replSet info " << h.toString() << " is down (or slow to respond): " << msg << rsLog;
}
mem.lastHeartbeatMsg = msg;
+ theReplSet->rmFromElectable(mem.id());
}
};
@@ -262,18 +308,13 @@ namespace mongo {
*/
void ReplSetImpl::startThreads() {
task::fork(mgr);
-
- /*Member* m = _members.head();
- while( m ) {
- ReplSetHealthPollTask *task = new ReplSetHealthPollTask(m->h(), m->hbinfo());
- healthTasks.insert(task);
- task::repeat(shared_ptr<task::Task>(task), 2000);
- m = m->next();
- }*/
-
mgr->send( boost::bind(&Manager::msgCheckNewState, theReplSet->mgr) );
boost::thread t(startSyncThread);
+
+ task::fork(ghost);
+
+ // member heartbeats are started in ReplSetImpl::initFromConfig
}
}
diff --git a/db/repl/manager.cpp b/db/repl/manager.cpp
index d2e0764..3c4c0eb 100644
--- a/db/repl/manager.cpp
+++ b/db/repl/manager.cpp
@@ -19,6 +19,7 @@
#include "pch.h"
#include "rs.h"
+#include "connections.h"
#include "../client.h"
namespace mongo {
@@ -50,7 +51,7 @@ namespace mongo {
}
Manager::Manager(ReplSetImpl *_rs) :
- task::Server("rs Manager"), rs(_rs), busyWithElectSelf(false), _primary(NOPRIMARY) {
+ task::Server("rsMgr"), rs(_rs), busyWithElectSelf(false), _primary(NOPRIMARY) {
}
Manager::~Manager() {
@@ -63,10 +64,8 @@ namespace mongo {
}
void Manager::starting() {
- Client::initThread("rs Manager");
- if (!noauth) {
- cc().getAuthenticationInfo()->authorize("local");
- }
+ Client::initThread("rsMgr");
+ replLocalAuth();
}
void Manager::noteARemoteIsPrimary(const Member *m) {
@@ -81,6 +80,45 @@ namespace mongo {
}
}
+ void Manager::checkElectableSet() {
+ unsigned otherOp = rs->lastOtherOpTime().getSecs();
+
+ // make sure the electable set is up-to-date
+ if (rs->elect.aMajoritySeemsToBeUp() &&
+ rs->iAmPotentiallyHot() &&
+ (otherOp == 0 || rs->lastOpTimeWritten.getSecs() >= otherOp - 10)) {
+ theReplSet->addToElectable(rs->selfId());
+ }
+ else {
+ theReplSet->rmFromElectable(rs->selfId());
+ }
+
+ // check if we should ask the primary (possibly ourselves) to step down
+ const Member *highestPriority = theReplSet->getMostElectable();
+ const Member *primary = rs->box.getPrimary();
+
+ if (primary && highestPriority &&
+ highestPriority->config().priority > primary->config().priority) {
+ log() << "stepping down " << primary->fullName() << endl;
+
+ if (primary->h().isSelf()) {
+ // replSetStepDown tries to acquire the same lock
+ // msgCheckNewState takes, so we can't call replSetStepDown on
+ // ourselves.
+ rs->relinquish();
+ }
+ else {
+ BSONObj cmd = BSON( "replSetStepDown" << 1 );
+ ScopedConn conn(primary->fullName());
+ BSONObj result;
+ if (!conn.runCommand("admin", cmd, result, 0)) {
+ log() << "stepping down " << primary->fullName()
+ << " failed: " << result << endl;
+ }
+ }
+ }
+ }
+
/** called as the health threads get new results */
void Manager::msgCheckNewState() {
{
@@ -90,7 +128,9 @@ namespace mongo {
RSBase::lock lk(rs);
if( busyWithElectSelf ) return;
-
+
+ checkElectableSet();
+
const Member *p = rs->box.getPrimary();
if( p && p != rs->_self ) {
if( !p->hbinfo().up() ||
@@ -154,7 +194,7 @@ namespace mongo {
}
if( rs->elect.shouldRelinquish() ) {
- log() << "replSet can't see a majority of the set, relinquishing primary" << rsLog;
+ log() << "can't see a majority of the set, relinquishing primary" << rsLog;
rs->relinquish();
}
@@ -163,9 +203,7 @@ namespace mongo {
if( !rs->iAmPotentiallyHot() ) // if not we never try to be primary
return;
-
- /* TODO : CHECK PRIORITY HERE. can't be elected if priority zero. */
-
+
/* no one seems to be primary. shall we try to elect ourself? */
if( !rs->elect.aMajoritySeemsToBeUp() ) {
static time_t last;
@@ -178,6 +216,10 @@ namespace mongo {
return;
}
+ if( !rs->iAmElectable() ) {
+ return;
+ }
+
busyWithElectSelf = true; // don't try to do further elections & such while we are already working on one.
}
try {
diff --git a/db/repl/multicmd.h b/db/repl/multicmd.h
index df7c4e5..99dabea 100644
--- a/db/repl/multicmd.h
+++ b/db/repl/multicmd.h
@@ -53,16 +53,16 @@ namespace mongo {
};
inline void multiCommand(BSONObj cmd, list<Target>& L) {
- list<BackgroundJob *> jobs;
+ list< shared_ptr<BackgroundJob> > jobs;
for( list<Target>::iterator i = L.begin(); i != L.end(); i++ ) {
Target& d = *i;
_MultiCommandJob *j = new _MultiCommandJob(cmd, d);
+ jobs.push_back( shared_ptr<BackgroundJob>(j) );
j->go();
- jobs.push_back(j);
}
- for( list<BackgroundJob*>::iterator i = jobs.begin(); i != jobs.end(); i++ ) {
+ for( list< shared_ptr<BackgroundJob> >::iterator i = jobs.begin(); i != jobs.end(); i++ ) {
(*i)->wait();
}
}
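
The ownership pattern above in isolation (a sketch; Job stands in for BackgroundJob): hold a shared_ptr to every job before it starts, join them all, and let the list's destruction free them, which the old raw-pointer version never did.

    struct Job {                              // stands in for BackgroundJob
        virtual void go() = 0;                // start the background thread
        virtual void wait() = 0;              // join it
        virtual ~Job() {}
    };

    void runAll( list< shared_ptr<Job> >& jobs ) {
        for ( list< shared_ptr<Job> >::iterator i = jobs.begin(); i != jobs.end(); ++i )
            (*i)->go();                       // fork phase
        for ( list< shared_ptr<Job> >::iterator i = jobs.begin(); i != jobs.end(); ++i )
            (*i)->wait();                     // join phase
    }                                         // jobs freed when the caller's list goes away
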
diff --git a/db/repl/replset_commands.cpp b/db/repl/replset_commands.cpp
index 1d110ac..68dab7e 100644
--- a/db/repl/replset_commands.cpp
+++ b/db/repl/replset_commands.cpp
@@ -17,6 +17,7 @@
#include "pch.h"
#include "../cmdline.h"
#include "../commands.h"
+#include "../repl.h"
#include "health.h"
#include "rs.h"
#include "rs_config.h"
@@ -28,7 +29,7 @@ using namespace bson;
namespace mongo {
- void checkMembersUpForConfigChange(const ReplSetConfig& cfg, bool initial);
+ void checkMembersUpForConfigChange(const ReplSetConfig& cfg, BSONObjBuilder& result, bool initial);
/* commands in other files:
replSetHeartbeat - health.cpp
@@ -44,14 +45,18 @@ namespace mongo {
help << "Just for regression tests.\n";
}
CmdReplSetTest() : ReplSetCommand("replSetTest") { }
- virtual bool run(const string& , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ virtual bool run(const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
log() << "replSet replSetTest command received: " << cmdObj.toString() << rsLog;
+
+ if (!checkAuth(errmsg, result)) {
+ return false;
+ }
+
if( cmdObj.hasElement("forceInitialSyncFailure") ) {
replSetForceInitialSyncFailure = (unsigned) cmdObj["forceInitialSyncFailure"].Number();
return true;
}
- // may not need this, but if removed check all tests still work:
if( !check(errmsg, result) )
return false;
@@ -63,7 +68,10 @@ namespace mongo {
}
} cmdReplSetTest;
- /** get rollback id */
+ /** get rollback id. used to check if a rollback happened during some interval of time.
+ as consumed, the rollback id is not in any particular order; it simply changes on each rollback.
+ @see incRBID()
+ */
class CmdReplSetGetRBID : public ReplSetCommand {
public:
/* todo: ideally this should only change on rollbacks NOT on mongod restarts also. fix... */
@@ -72,9 +80,11 @@ namespace mongo {
help << "internal";
}
CmdReplSetGetRBID() : ReplSetCommand("replSetGetRBID") {
- rbid = (int) curTimeMillis();
+ // this is ok but micros or combo with some rand() and/or 64 bits might be better --
+ // imagine a restart and a clock correction simultaneously (very unlikely but possible...)
+ rbid = (int) curTimeMillis64();
}
- virtual bool run(const string& , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ virtual bool run(const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
if( !check(errmsg, result) )
return false;
result.append("rbid",rbid);
@@ -102,7 +112,7 @@ namespace mongo {
help << "\nhttp://www.mongodb.org/display/DOCS/Replica+Set+Commands";
}
CmdReplSetGetStatus() : ReplSetCommand("replSetGetStatus", true) { }
- virtual bool run(const string& , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ virtual bool run(const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
if ( cmdObj["forShell"].trueValue() )
lastError.disableForCommand();
@@ -122,20 +132,38 @@ namespace mongo {
help << "\nhttp://www.mongodb.org/display/DOCS/Replica+Set+Commands";
}
CmdReplSetReconfig() : ReplSetCommand("replSetReconfig"), mutex("rsreconfig") { }
- virtual bool run(const string& a, BSONObj& b, string& errmsg, BSONObjBuilder& c, bool d) {
+ virtual bool run(const string& a, BSONObj& b, int e, string& errmsg, BSONObjBuilder& c, bool d) {
try {
rwlock_try_write lk(mutex);
- return _run(a,b,errmsg,c,d);
+ return _run(a,b,e,errmsg,c,d);
}
catch(rwlock_try_write::exception&) { }
errmsg = "a replSetReconfig is already in progress";
return false;
}
private:
- bool _run(const string& , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
- if( !check(errmsg, result) )
+ bool _run(const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ if ( !checkAuth(errmsg, result) ) {
return false;
- if( !theReplSet->box.getState().primary() ) {
+ }
+
+ if( cmdObj["replSetReconfig"].type() != Object ) {
+ errmsg = "no configuration specified";
+ return false;
+ }
+
+ bool force = cmdObj.hasField("force") && cmdObj["force"].trueValue();
+ if( force && !theReplSet ) {
+ replSettings.reconfig = cmdObj["replSetReconfig"].Obj().getOwned();
+ result.append("msg", "will try this config momentarily, try running rs.conf() again in a few seconds");
+ return true;
+ }
+
+ if ( !check(errmsg, result) ) {
+ return false;
+ }
+
+ if( !force && !theReplSet->box.getState().primary() ) {
errmsg = "replSetReconfig command must be sent to the current replica set primary.";
return false;
}
@@ -152,18 +180,8 @@ namespace mongo {
}
}
- if( cmdObj["replSetReconfig"].type() != Object ) {
- errmsg = "no configuration specified";
- return false;
- }
-
- /** TODO
- Support changes when a majority, but not all, members of a set are up.
- Determine what changes should not be allowed as they would cause erroneous states.
- What should be possible when a majority is not up?
- */
try {
- ReplSetConfig newConfig(cmdObj["replSetReconfig"].Obj());
+ ReplSetConfig newConfig(cmdObj["replSetReconfig"].Obj(), force);
log() << "replSet replSetReconfig config object parses ok, " << newConfig.members.size() << " members specified" << rsLog;
@@ -171,12 +189,12 @@ namespace mongo {
return false;
}
- checkMembersUpForConfigChange(newConfig,false);
+ checkMembersUpForConfigChange(newConfig, result, false);
log() << "replSet replSetReconfig [2]" << rsLog;
theReplSet->haveNewConfig(newConfig, true);
- ReplSet::startupStatusMsg = "replSetReconfig'd";
+ ReplSet::startupStatusMsg.set("replSetReconfig'd");
}
catch( DBException& e ) {
log() << "replSet replSetReconfig exception: " << e.what() << rsLog;
@@ -199,7 +217,7 @@ namespace mongo {
}
CmdReplSetFreeze() : ReplSetCommand("replSetFreeze") { }
- virtual bool run(const string& , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ virtual bool run(const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
if( !check(errmsg, result) )
return false;
int secs = (int) cmdObj.firstElement().numberInt();
@@ -223,13 +241,38 @@ namespace mongo {
}
CmdReplSetStepDown() : ReplSetCommand("replSetStepDown") { }
- virtual bool run(const string& , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ virtual bool run(const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
if( !check(errmsg, result) )
return false;
if( !theReplSet->box.getState().primary() ) {
errmsg = "not primary so can't step down";
return false;
}
+
+ bool force = cmdObj.hasField("force") && cmdObj["force"].trueValue();
+
+ // only step down if there is another node synced to within 10
+ // seconds of this node
+ if (!force) {
+ long long int lastOp = (long long int)theReplSet->lastOpTimeWritten.getSecs();
+ long long int closest = (long long int)theReplSet->lastOtherOpTime().getSecs();
+
+ long long int diff = lastOp - closest;
+ result.append("closest", closest);
+ result.append("difference", diff);
+
+ if (diff < 0) {
+ // not our problem, but we'll wait until things settle down
+ errmsg = "someone is ahead of the primary?";
+ return false;
+ }
+
+ if (diff > 10) {
+ errmsg = "no secondaries within 10 seconds of my optime";
+ return false;
+ }
+ }
+
int secs = (int) cmdObj.firstElement().numberInt();
if( secs == 0 )
secs = 60;
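
The guard above in arithmetic form (numbers illustrative):

    long long lastOp  = 500;                  // primary's lastOpTimeWritten.getSecs()
    long long closest = 493;                  // freshest secondary, lastOtherOpTime().getSecs()
    long long diff = lastOp - closest;        // 7
    // diff < 0  -> "someone is ahead of the primary?"    : refuse, let things settle
    // diff > 10 -> "no secondaries within 10 seconds..." : refuse, a failover now
    //              would force a large rollback on the old primary
    // otherwise -> proceed; an up-to-date secondary can take over promptly
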
diff --git a/db/repl/rs.cpp b/db/repl/rs.cpp
index bbfb057..1fbbc10 100644
--- a/db/repl/rs.cpp
+++ b/db/repl/rs.cpp
@@ -16,7 +16,7 @@
#include "pch.h"
#include "../cmdline.h"
-#include "../../util/sock.h"
+#include "../../util/net/sock.h"
#include "../client.h"
#include "../../client/dbclient.h"
#include "../dbhelpers.h"
@@ -24,14 +24,20 @@
#include "rs.h"
#include "connections.h"
#include "../repl.h"
+#include "../instance.h"
-namespace mongo {
+using namespace std;
+namespace mongo {
+
using namespace bson;
bool replSet = false;
ReplSet *theReplSet = 0;
- extern string *discoveredSeed;
+
+ bool isCurrentlyAReplSetPrimary() {
+ return theReplSet && theReplSet->isPrimary();
+ }
void ReplSetImpl::sethbmsg(string s, int logLevel) {
static time_t lastLogged;
@@ -57,21 +63,71 @@ namespace mongo {
}
void ReplSetImpl::assumePrimary() {
+ LOG(2) << "replSet assuming primary" << endl;
assert( iAmPotentiallyHot() );
writelock lk("admin."); // so we are synchronized with _logOp()
- box.setSelfPrimary(_self);
- //log() << "replSet PRIMARY" << rsLog; // self (" << _self->id() << ") is now primary" << rsLog;
+
+ // Make sure that new OpTimes are higher than existing ones even with clock skew
+ DBDirectClient c;
+ BSONObj lastOp = c.findOne( "local.oplog.rs", Query().sort(reverseNaturalObj), NULL, QueryOption_SlaveOk );
+ if ( !lastOp.isEmpty() ) {
+ OpTime::setLast( lastOp[ "ts" ].date() );
+ }
+
+ changeState(MemberState::RS_PRIMARY);
}
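
Why assumePrimary now reads the newest local oplog entry first: an OpTime is a (seconds, increment) pair, and seeding it with the last replicated value keeps newly minted optimes strictly increasing even if this node's clock is behind the old primary's. A simplified model (the real setLast internals are assumed, not quoted):

    struct SimpleOpTime {                     // stand-in for mongo::OpTime
        unsigned secs, inc;
        bool operator<( const SimpleOpTime& r ) const {
            return secs < r.secs || ( secs == r.secs && inc < r.inc );
        }
    };

    static SimpleOpTime lastIssued = { 0, 0 };

    void setLast( unsigned secs ) {           // cf. OpTime::setLast above
        if ( lastIssued.secs < secs ) {
            lastIssued.secs = secs;
            lastIssued.inc = 0;
        }
    }

    SimpleOpTime next( unsigned wallSecs ) {  // mint the next optime for _logOp
        SimpleOpTime t = { wallSecs, 1 };
        if ( !( lastIssued < t ) ) {          // wall clock behind: stay monotonic
            t = lastIssued;
            ++t.inc;
        }
        lastIssued = t;
        return t;
    }
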
void ReplSetImpl::changeState(MemberState s) { box.change(s, _self); }
+ void ReplSetImpl::setMaintenanceMode(const bool inc) {
+ lock lk(this);
+
+ if (inc) {
+ log() << "replSet going into maintenance mode (" << _maintenanceMode << " other tasks)" << rsLog;
+
+ _maintenanceMode++;
+ changeState(MemberState::RS_RECOVERING);
+ }
+ else {
+ _maintenanceMode--;
+ // no need to change state, syncTail will try to go live as a secondary soon
+
+ log() << "leaving maintenance mode (" << _maintenanceMode << " other tasks)" << rsLog;
+ }
+ }
+
+ Member* ReplSetImpl::getMostElectable() {
+ lock lk(this);
+
+ Member *max = 0;
+
+ for (set<unsigned>::iterator it = _electableSet.begin(); it != _electableSet.end(); it++) {
+ const Member *temp = findById(*it);
+ if (!temp) {
+ log() << "couldn't find member: " << *it << endl;
+ _electableSet.erase(*it);
+ continue;
+ }
+ if (!max || max->config().priority < temp->config().priority) {
+ max = (Member*)temp;
+ }
+ }
+
+ return max;
+ }
+
const bool closeOnRelinquish = true;
void ReplSetImpl::relinquish() {
+ LOG(2) << "replSet attempting to relinquish" << endl;
if( box.getState().primary() ) {
- log() << "replSet relinquishing primary state" << rsLog;
- changeState(MemberState::RS_SECONDARY);
-
+ {
+ writelock lk("admin."); // so we are synchronized with _logOp()
+
+ log() << "replSet relinquishing primary state" << rsLog;
+ changeState(MemberState::RS_SECONDARY);
+ }
+
if( closeOnRelinquish ) {
/* close sockets that were talking to us so they don't blithely send many writes that will fail
with "not master" (of course client could check result code, but in case they are not)
@@ -173,6 +229,8 @@ namespace mongo {
}
void ReplSetImpl::_fillIsMaster(BSONObjBuilder& b) {
+ lock lk(this);
+
const StateBox::SP sp = box.get();
bool isp = sp.state.primary();
b.append("setName", name());
@@ -203,9 +261,13 @@ namespace mongo {
if( m )
b.append("primary", m->h().toString());
}
+ else {
+ b.append("primary", _self->fullName());
+ }
+
if( myConfig().arbiterOnly )
b.append("arbiterOnly", true);
- if( myConfig().priority == 0 )
+ if( myConfig().priority == 0 && !myConfig().arbiterOnly)
b.append("passive", true);
if( myConfig().slaveDelay )
b.append("slaveDelay", myConfig().slaveDelay);
@@ -213,6 +275,13 @@ namespace mongo {
b.append("hidden", true);
if( !myConfig().buildIndexes )
b.append("buildIndexes", false);
+ if( !myConfig().tags.empty() ) {
+ BSONObjBuilder a;
+ for( map<string,string>::const_iterator i = myConfig().tags.begin(); i != myConfig().tags.end(); i++ )
+ a.append((*i).first, (*i).second);
+ b.append("tags", a.done());
+ }
+ b.append("me", myConfig().h.toString());
}
/** @param cfgString <setname>/<seedhost1>,<seedhost2> */
@@ -259,19 +328,22 @@ namespace mongo {
}
ReplSetImpl::ReplSetImpl(ReplSetCmdline& replSetCmdline) : elect(this),
+ _currentSyncTarget(0),
+ _hbmsgTime(0),
_self(0),
- mgr( new Manager(this) ) {
+ _maintenanceMode(0),
+ mgr( new Manager(this) ),
+ ghost( new GhostSync(this) ) {
+
_cfg = 0;
memset(_hbmsg, 0, sizeof(_hbmsg));
- *_hbmsg = '.'; // temp...just to see
+ strcpy( _hbmsg , "initial startup" );
lastH = 0;
changeState(MemberState::RS_STARTUP);
_seeds = &replSetCmdline.seeds;
- //for( vector<HostAndPort>::iterator i = seeds->begin(); i != seeds->end(); i++ )
- // addMemberIfMissing(*i);
- log(1) << "replSet beginning startup..." << rsLog;
+ LOG(1) << "replSet beginning startup..." << rsLog;
loadConfig();
@@ -282,7 +354,7 @@ namespace mongo {
for( set<HostAndPort>::iterator i = replSetCmdline.seedSet.begin(); i != replSetCmdline.seedSet.end(); i++ ) {
if( i->isSelf() ) {
if( sss == 1 )
- log(1) << "replSet warning self is listed in the seed list and there are no other seeds listed did you intend that?" << rsLog;
+ LOG(1) << "replSet warning self is listed in the seed list and there are no other seeds listed did you intend that?" << rsLog;
}
else
log() << "replSet warning command line seed " << i->toString() << " is not present in the current repl set config" << rsLog;
@@ -291,14 +363,13 @@ namespace mongo {
void newReplUp();
- void ReplSetImpl::loadLastOpTimeWritten() {
- //assert( lastOpTimeWritten.isNull() );
+ void ReplSetImpl::loadLastOpTimeWritten(bool quiet) {
readlock lk(rsoplog);
BSONObj o;
if( Helpers::getLast(rsoplog, o) ) {
lastH = o["h"].numberLong();
lastOpTimeWritten = o["ts"]._opTime();
- uassert(13290, "bad replSet oplog entry?", !lastOpTimeWritten.isNull());
+ uassert(13290, "bad replSet oplog entry?", quiet || !lastOpTimeWritten.isNull());
}
}
@@ -326,7 +397,10 @@ namespace mongo {
extern BSONObj *getLastErrorDefault;
void ReplSetImpl::setSelfTo(Member *m) {
+ // already locked in initFromConfig
_self = m;
+ _id = m->id();
+ _config = m->config();
if( m ) _buildIndexes = m->config().buildIndexes;
else _buildIndexes = true;
}
@@ -345,29 +419,32 @@ namespace mongo {
getLastErrorDefault = new BSONObj( c.getLastErrorDefaults );
}
- list<const ReplSetConfig::MemberCfg*> newOnes;
+ list<ReplSetConfig::MemberCfg*> newOnes;
+ // additive short-cuts the new config setup. If we are just adding a
+ // node/nodes and nothing else is changing, this is additive. If it's
+ // not a reconfig, we're not adding anything
bool additive = reconf;
{
unsigned nfound = 0;
int me = 0;
for( vector<ReplSetConfig::MemberCfg>::iterator i = c.members.begin(); i != c.members.end(); i++ ) {
- const ReplSetConfig::MemberCfg& m = *i;
+
+ ReplSetConfig::MemberCfg& m = *i;
if( m.h.isSelf() ) {
- nfound++;
me++;
- if( !reconf || (_self && _self->id() == (unsigned) m._id) )
- ;
- else {
- log() << "replSet " << _self->id() << ' ' << m._id << rsLog;
+ }
+
+ if( reconf ) {
+ if (m.h.isSelf() && (!_self || (int)_self->id() != m._id)) {
+ log() << "self doesn't match: " << m._id << rsLog;
assert(false);
}
- }
- else if( reconf ) {
+
const Member *old = findById(m._id);
if( old ) {
nfound++;
assert( (int) old->id() == m._id );
- if( old->config() == m ) {
+ if( old->config() != m ) {
additive = false;
}
}
@@ -375,23 +452,21 @@ namespace mongo {
newOnes.push_back(&m);
}
}
-
- // change timeout settings, if necessary
- ScopedConn conn(m.h.toString());
- conn.setTimeout(c.ho.heartbeatTimeoutMillis/1000.0);
}
if( me == 0 ) {
- // initial startup with fastsync
- if (!reconf && replSettings.fastsync) {
- return false;
- }
- // log() << "replSet config : " << _cfg->toString() << rsLog;
+ _members.orphanAll();
+ // hbs must continue to pick up new config
+ // stop sync thread
+ box.set(MemberState::RS_STARTUP, 0);
+
+ // go into holding pattern
log() << "replSet error self not present in the repl set configuration:" << rsLog;
log() << c.toString() << rsLog;
- uasserted(13497, "replSet error self not present in the configuration");
+ return false;
}
uassert( 13302, "replSet error self appears twice in the repl set configuration", me<=1 );
+ // if we found different members than the original config, reload everything
if( reconf && config().members.size() != nfound )
additive = false;
}
@@ -402,10 +477,11 @@ namespace mongo {
_name = _cfg->_id;
assert( !_name.empty() );
+ // this is a shortcut for simple changes
if( additive ) {
log() << "replSet info : additive change to configuration" << rsLog;
- for( list<const ReplSetConfig::MemberCfg*>::const_iterator i = newOnes.begin(); i != newOnes.end(); i++ ) {
- const ReplSetConfig::MemberCfg* m = *i;
+ for( list<ReplSetConfig::MemberCfg*>::const_iterator i = newOnes.begin(); i != newOnes.end(); i++ ) {
+ ReplSetConfig::MemberCfg *m = *i;
Member *mi = new Member(m->h, m->_id, m, false);
/** we will indicate that new members are up() initially so that we don't relinquish our
@@ -417,6 +493,11 @@ namespace mongo {
_members.push(mi);
startHealthTaskFor(mi);
}
+
+ // if we aren't creating new members, we may have to update the
+ // groups for the current ones
+ _cfg->updateMembers(_members);
+
return true;
}
@@ -433,21 +514,21 @@ namespace mongo {
}
forgetPrimary();
- bool iWasArbiterOnly = _self ? iAmArbiterOnly() : false;
- setSelfTo(0);
+ // not setting _self to 0 as other threads use _self w/o locking
+ int me = 0;
+
+ // For logging
+ string members = "";
+
for( vector<ReplSetConfig::MemberCfg>::iterator i = _cfg->members.begin(); i != _cfg->members.end(); i++ ) {
- const ReplSetConfig::MemberCfg& m = *i;
+ ReplSetConfig::MemberCfg& m = *i;
Member *mi;
+ members += ( members == "" ? "" : ", " ) + m.h.toString();
if( m.h.isSelf() ) {
- assert( _self == 0 );
+ assert( me++ == 0 );
mi = new Member(m.h, m._id, &m, true);
setSelfTo(mi);
- // if the arbiter status changed
- if (iWasArbiterOnly ^ iAmArbiterOnly()) {
- _changeArbiterState();
- }
-
if( (int)mi->id() == oldPrimaryId )
box.setSelfPrimary(mi);
}
@@ -459,38 +540,12 @@ namespace mongo {
box.setOtherPrimary(mi);
}
}
- return true;
- }
- void startSyncThread();
-
- void ReplSetImpl::_changeArbiterState() {
- if (iAmArbiterOnly()) {
- changeState(MemberState::RS_ARBITER);
-
- // if there is an oplog, free it
- // not sure if this is necessary, maybe just leave the oplog and let
- // the user delete it if they want the space?
- writelock lk(rsoplog);
- Client::Context c(rsoplog);
- NamespaceDetails *d = nsdetails(rsoplog);
- if (d) {
- string errmsg;
- bob res;
- dropCollection(rsoplog, errmsg, res);
-
- // clear last op time to force initial sync (if the arbiter
- // becomes a "normal" server again)
- lastOpTimeWritten = OpTime();
- }
+ if( me == 0 ){
+ log() << "replSet warning did not detect own host in full reconfig, members " << members << " config: " << c << rsLog;
}
- else {
- changeState(MemberState::RS_RECOVERING);
- // oplog will be allocated when sync begins
- /* TODO : could this cause two sync threads to exist (race condition)? */
- boost::thread t(startSyncThread);
- }
+ return true;
}
// Our own config must be the first one.
@@ -514,7 +569,6 @@ namespace mongo {
if( highest->version > myVersion && highest->version >= 0 ) {
log() << "replSet got config version " << highest->version << " from a remote, saving locally" << rsLog;
- writelock lk("admin.");
highest->saveConfigLocally(BSONObj());
}
return true;
@@ -523,7 +577,7 @@ namespace mongo {
void ReplSetImpl::loadConfig() {
while( 1 ) {
startupStatus = LOADINGCONFIG;
- startupStatusMsg = "loading " + rsConfigNs + " config (LOADINGCONFIG)";
+ startupStatusMsg.set("loading " + rsConfigNs + " config (LOADINGCONFIG)");
try {
vector<ReplSetConfig> configs;
try {
@@ -531,7 +585,6 @@ namespace mongo {
}
catch(DBException& e) {
log() << "replSet exception loading our local replset configuration object : " << e.toString() << rsLog;
- throw;
}
for( vector<HostAndPort>::const_iterator i = _seeds->begin(); i != _seeds->end(); i++ ) {
try {
@@ -542,12 +595,25 @@ namespace mongo {
}
}
- if( discoveredSeed ) {
+ if( replSettings.discoveredSeeds.size() > 0 ) {
+ for (set<string>::iterator i = replSettings.discoveredSeeds.begin(); i != replSettings.discoveredSeeds.end(); i++) {
+ try {
+ configs.push_back( ReplSetConfig(HostAndPort(*i)) );
+ }
+ catch( DBException& ) {
+ log(1) << "replSet exception trying to load config from discovered seed " << *i << rsLog;
+ replSettings.discoveredSeeds.erase(*i);
+ }
+ }
+ }
+
+ if (!replSettings.reconfig.isEmpty()) {
try {
- configs.push_back( ReplSetConfig(HostAndPort(*discoveredSeed)) );
+ configs.push_back(ReplSetConfig(replSettings.reconfig, true));
}
- catch( DBException& ) {
- log(1) << "replSet exception trying to load config from discovered seed " << *discoveredSeed << rsLog;
+ catch( DBException& re) {
+ log() << "couldn't load reconfig: " << re.what() << endl;
+ replSettings.reconfig = BSONObj();
}
}
@@ -563,17 +629,17 @@ namespace mongo {
if( nempty == (int) configs.size() ) {
startupStatus = EMPTYCONFIG;
- startupStatusMsg = "can't get " + rsConfigNs + " config from self or any seed (EMPTYCONFIG)";
+ startupStatusMsg.set("can't get " + rsConfigNs + " config from self or any seed (EMPTYCONFIG)");
log() << "replSet can't get " << rsConfigNs << " config from self or any seed (EMPTYCONFIG)" << rsLog;
static unsigned once;
if( ++once == 1 )
log() << "replSet info you may need to run replSetInitiate -- rs.initiate() in the shell -- if that is not already done" << rsLog;
if( _seeds->size() == 0 )
- log(1) << "replSet info no seed hosts were specified on the --replSet command line" << rsLog;
+ LOG(1) << "replSet info no seed hosts were specified on the --replSet command line" << rsLog;
}
else {
startupStatus = EMPTYUNREACHABLE;
- startupStatusMsg = "can't currently get " + rsConfigNs + " config from self or any seed (EMPTYUNREACHABLE)";
+ startupStatusMsg.set("can't currently get " + rsConfigNs + " config from self or any seed (EMPTYUNREACHABLE)");
log() << "replSet can't get " << rsConfigNs << " config from self or any seed (yet)" << rsLog;
}
@@ -589,7 +655,7 @@ namespace mongo {
}
catch(DBException& e) {
startupStatus = BADCONFIG;
- startupStatusMsg = "replSet error loading set config (BADCONFIG)";
+ startupStatusMsg.set("replSet error loading set config (BADCONFIG)");
log() << "replSet error loading configurations " << e.toString() << rsLog;
log() << "replSet error replication will not start" << rsLog;
sethbmsg("error loading set config");
@@ -598,27 +664,26 @@ namespace mongo {
}
break;
}
- startupStatusMsg = "? started";
+ startupStatusMsg.set("? started");
startupStatus = STARTED;
}
void ReplSetImpl::_fatal() {
- //lock l(this);
box.set(MemberState::RS_FATAL, 0);
- //sethbmsg("fatal error");
log() << "replSet error fatal, stopping replication" << rsLog;
}
void ReplSet::haveNewConfig(ReplSetConfig& newConfig, bool addComment) {
- lock l(this); // convention is to lock replset before taking the db rwlock
- writelock lk("");
bo comment;
if( addComment )
comment = BSON( "msg" << "Reconfig set" << "version" << newConfig.version );
+
newConfig.saveConfigLocally(comment);
+
try {
- initFromConfig(newConfig, true);
- log() << "replSet replSetReconfig new config saved locally" << rsLog;
+ if (initFromConfig(newConfig, true)) {
+ log() << "replSet replSetReconfig new config saved locally" << rsLog;
+ }
}
catch(DBException& e) {
if( e.getCode() == 13497 /* removed from set */ ) {
@@ -652,16 +717,14 @@ namespace mongo {
terminates.
*/
void startReplSets(ReplSetCmdline *replSetCmdline) {
- Client::initThread("startReplSets");
+ Client::initThread("rsStart");
try {
assert( theReplSet == 0 );
if( replSetCmdline == 0 ) {
assert(!replSet);
return;
}
- if( !noauth ) {
- cc().getAuthenticationInfo()->authorize("local");
- }
+ replLocalAuth();
(theReplSet = new ReplSet(*replSetCmdline))->go();
}
catch(std::exception& e) {
@@ -672,6 +735,13 @@ namespace mongo {
cc().shutdown();
}
+ void replLocalAuth() {
+ if ( noauth )
+ return;
+ cc().getAuthenticationInfo()->authorize("local","_repl");
+ }
+
+
}
namespace boost {
diff --git a/db/repl/rs.h b/db/repl/rs.h
index ea9aef1..61041a6 100644
--- a/db/repl/rs.h
+++ b/db/repl/rs.h
@@ -21,13 +21,26 @@
#include "../../util/concurrency/list.h"
#include "../../util/concurrency/value.h"
#include "../../util/concurrency/msg.h"
-#include "../../util/hostandport.h"
+#include "../../util/net/hostandport.h"
#include "../commands.h"
+#include "../oplogreader.h"
#include "rs_exception.h"
#include "rs_optime.h"
#include "rs_member.h"
#include "rs_config.h"
+/**
+ * Order of Events
+ *
+ * On startup, if the --replSet option is present, startReplSets is called.
+ * startReplSets forks off a new thread for replica set activities. It creates
+ * the global theReplSet variable and calls go() on it.
+ *
+ * theReplSet's constructor changes the replica set's state to RS_STARTUP,
+ * starts the replica set manager, and loads the config (if the replica set
+ * has been initialized).
+ */
+
namespace mongo {
struct HowToFixUp;
@@ -41,11 +54,15 @@ namespace mongo {
/* member of a replica set */
class Member : public List1<Member>::Base {
+ private:
+ ~Member(); // intentionally unimplemented as should never be called -- see List1<>::Base.
+ Member(const Member&);
public:
- Member(HostAndPort h, unsigned ord, const ReplSetConfig::MemberCfg *c, bool self);
+ Member(HostAndPort h, unsigned ord, ReplSetConfig::MemberCfg *c, bool self);
string fullName() const { return h().toString(); }
const ReplSetConfig::MemberCfg& config() const { return _config; }
+ ReplSetConfig::MemberCfg& configw() { return _config; }
const HeartbeatInfo& hbinfo() const { return _hbinfo; }
HeartbeatInfo& get_hbinfo() { return _hbinfo; }
string lhb() const { return _hbinfo.lastHeartbeatMsg; }
@@ -58,7 +75,7 @@ namespace mongo {
private:
friend class ReplSetImpl;
- const ReplSetConfig::MemberCfg _config;
+ ReplSetConfig::MemberCfg _config;
const HostAndPort _h;
HeartbeatInfo _hbinfo;
};
@@ -75,6 +92,7 @@ namespace mongo {
const Member* findOtherPrimary(bool& two);
void noteARemoteIsPrimary(const Member *);
+ void checkElectableSet();
virtual void starting();
public:
Manager(ReplSetImpl *rs);
@@ -83,6 +101,47 @@ namespace mongo {
void msgCheckNewState();
};
+ class GhostSync : public task::Server {
+ struct GhostSlave {
+ GhostSlave() : last(0), slave(0), init(false) {}
+ OplogReader reader;
+ OpTime last;
+ Member* slave;
+ bool init;
+ };
+ /**
+ * This is a cache of ghost slaves
+ */
+ typedef map<mongo::OID,GhostSlave> MAP;
+ MAP _ghostCache;
+ RWLock _lock; // protects _ghostCache
+ ReplSetImpl *rs;
+ virtual void starting();
+ public:
+ GhostSync(ReplSetImpl *_rs) : task::Server("rsGhostSync"), _lock("GhostSync"), rs(_rs) {}
+ ~GhostSync() {
+ log() << "~GhostSync() called" << rsLog;
+ }
+
+ /**
+ * Replica sets can sync in a hierarchical fashion, which throws off w
+ * calculation on the master. percolate() faux-syncs from an upstream
+ * node so that the primary will know what the slaves are up to.
+ *
+ * We can't just directly sync to the primary because it could be
+ * unreachable, e.g., S1--->S2--->S3--->P. S2 should ghost sync from S3
+ * and S3 can ghost sync from the primary.
+ *
+ * Say we have an S1--->S2--->P situation and this node is S2. rid
+ * would refer to S1. S2 would create a ghost slave of S1 and connect
+ * it to P (_currentSyncTarget). Then it would use this connection to
+ * pretend to be S1, replicating off of P.
+ */
+ void percolate(const BSONObj& rid, const OpTime& last);
+ void associateSlave(const BSONObj& rid, const int memberId);
+ void updateSlave(const mongo::OID& id, const OpTime& last);
+ };
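
A minimal model of the percolate() flow documented above may help. This is a sketch, not the mongod implementation: Rid, OpTime, and reportUpstream are simplified stand-ins for mongo::OID, mongo::OpTime, and the OplogReader plumbing that connects to _currentSyncTarget.

    #include <iostream>
    #include <map>
    #include <string>

    typedef std::string Rid;     // stands in for mongo::OID
    typedef long long   OpTime;  // stands in for mongo::OpTime

    // S2 in an S1--->S2--->P chain: remembers each downstream slave's
    // progress and replays it toward its own sync target.
    class GhostSyncModel {
        std::map<Rid, OpTime> ghostCache;  // one faux-slave per downstream rid
    public:
        void percolate(const Rid& rid, OpTime last) {
            ghostCache[rid] = last;        // remember the ghost slave's progress
            reportUpstream(rid, last);     // pretend to be rid against P
        }
    private:
        void reportUpstream(const Rid& rid, OpTime last) {
            // mongod would advance an OplogReader cursor here; the model
            // just shows what would be acknowledged upstream.
            std::cout << "acking for " << rid << " up to optime " << last << "\n";
        }
    };

    int main() {
        GhostSyncModel s2;
        s2.percolate("S1", 42);  // S1 reached optime 42; P learns of it via S2
        return 0;
    }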
+
struct Target;
class Consensus {
@@ -92,7 +151,8 @@ namespace mongo {
time_t when;
unsigned who;
};
- Atomic<LastYea> ly;
+ static mutex lyMutex;
+ Guarded<LastYea,lyMutex> ly;
unsigned yea(unsigned memberId); // throws VoteException
void electionFailed(unsigned meid);
void _electSelf();
@@ -117,7 +177,12 @@ namespace mongo {
void multiCommand(BSONObj cmd, list<Target>& L);
};
- /** most operations on a ReplSet object should be done while locked. that logic implemented here. */
+ /**
+ * Most operations on a ReplSet object should be done while locked; that
+ * logic is implemented here.
+ *
+ * Order of locking: lock the replica set, then take a rwlock.
+ */
class RSBase : boost::noncopyable {
public:
const unsigned magic;
@@ -133,6 +198,7 @@ namespace mongo {
log() << "replSet ~RSBase called" << rsLog;
}
+ public:
class lock {
RSBase& rsbase;
auto_ptr<scoped_lock> sl;
@@ -156,7 +222,6 @@ namespace mongo {
}
};
- public:
/* for asserts */
bool locked() const { return _locked != 0; }
@@ -178,13 +243,19 @@ namespace mongo {
const Member *primary;
};
const SP get() {
- scoped_lock lk(m);
+ rwlock lk(m, false);
return sp;
}
- MemberState getState() const { return sp.state; }
- const Member* getPrimary() const { return sp.primary; }
+ MemberState getState() const {
+ rwlock lk(m, false);
+ return sp.state;
+ }
+ const Member* getPrimary() const {
+ rwlock lk(m, false);
+ return sp.primary;
+ }
void change(MemberState s, const Member *self) {
- scoped_lock lk(m);
+ rwlock lk(m, true);
if( sp.state != s ) {
log() << "replSet " << s.toString() << rsLog;
}
@@ -198,24 +269,25 @@ namespace mongo {
}
}
void set(MemberState s, const Member *p) {
- scoped_lock lk(m);
- sp.state = s; sp.primary = p;
+ rwlock lk(m, true);
+ sp.state = s;
+ sp.primary = p;
}
void setSelfPrimary(const Member *self) { change(MemberState::RS_PRIMARY, self); }
void setOtherPrimary(const Member *mem) {
- scoped_lock lk(m);
+ rwlock lk(m, true);
assert( !sp.state.primary() );
sp.primary = mem;
}
void noteRemoteIsPrimary(const Member *remote) {
- scoped_lock lk(m);
+ rwlock lk(m, true);
if( !sp.state.secondary() && !sp.state.fatal() )
sp.state = MemberState::RS_RECOVERING;
sp.primary = remote;
}
StateBox() : m("StateBox") { }
private:
- mongo::mutex m;
+ RWLock m;
SP sp;
};
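
The StateBox changes above also swap the plain mutex for a reader-writer lock, so the frequent getState()/getPrimary() reads can run concurrently and only writers serialize. The same pattern in standard C++, as a sketch (std::shared_mutex stands in for the tree's RWLock, whose interface differs):

    #include <mutex>
    #include <shared_mutex>  // C++17

    struct Snapshot { int state; const void* primary; };

    class StateBoxModel {
        mutable std::shared_mutex m;
        Snapshot sp;
    public:
        Snapshot get() const {             // readers proceed concurrently
            std::shared_lock<std::shared_mutex> lk(m);
            return sp;
        }
        void set(int s, const void* p) {   // writers get exclusive access
            std::unique_lock<std::shared_mutex> lk(m);
            sp.state = s;
            sp.primary = p;
        }
    };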
@@ -267,10 +339,17 @@ namespace mongo {
bool _freeze(int secs);
private:
void assumePrimary();
- void loadLastOpTimeWritten();
+ void loadLastOpTimeWritten(bool quiet=false);
void changeState(MemberState s);
+
+ /**
+ * Find the closest member (using ping time) with a higher latest optime.
+ */
const Member* getMemberToSyncTo();
- void _changeArbiterState();
+ Member* _currentSyncTarget;
+
+ // set of electable members' _ids
+ set<unsigned> _electableSet;
protected:
// "heartbeat message"
// sent in requestHeartbeat respond in field "hbm"
@@ -278,8 +357,54 @@ namespace mongo {
time_t _hbmsgTime; // when it was logged
public:
void sethbmsg(string s, int logLevel = 0);
+
+ /**
+ * Election with Priorities
+ *
+ * Each node (n) keeps a set of nodes that could be elected primary.
+ * Each node in this set:
+ *
+ * 1. can connect to a majority of the set
+ * 2. has a priority greater than 0
+ * 3. has an optime within 10 seconds of the most up-to-date node
+ * that n can reach
+ *
+ * If a node fails to meet one or more of these criteria, it is removed
+ * from the list. This list is updated whenever the node receives a
+ * heartbeat.
+ *
+ * When a node sends an "am I freshest?" query, the node receiving the
+ * query checks its electable list to make sure that no one else is
+ * electable AND higher priority. If this check passes, the node will
+ * return an "ok" response; if not, it will veto.
+ *
+ * If a node is primary and there is another node with higher priority
+ * on the electable list (i.e., it must be synced to within 10 seconds
+ * of the current primary), the node (or nodes) with connections to both
+ * the primary and the secondary with higher priority will issue
+ * replSetStepDown requests to the primary to allow the higher-priority
+ * node to take over.
+ */
+ void addToElectable(const unsigned m) { lock lk(this); _electableSet.insert(m); }
+ void rmFromElectable(const unsigned m) { lock lk(this); _electableSet.erase(m); }
+ bool iAmElectable() { lock lk(this); return _electableSet.find(_self->id()) != _electableSet.end(); }
+ bool isElectable(const unsigned id) { lock lk(this); return _electableSet.find(id) != _electableSet.end(); }
+ Member* getMostElectable();
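
A rough sketch of the pruning these criteria imply, run whenever a heartbeat arrives (the real logic lives in Manager::checkElectableSet(); MemberInfo and canReachMajority here are hypothetical stand-ins for heartbeat-derived state):

    #include <set>
    #include <vector>

    struct MemberInfo {
        unsigned  id;
        bool      canReachMajority;  // hypothetical: derived from heartbeats
        double    priority;
        long long opTimeSecs;        // last applied op, in seconds
    };

    void refreshElectable(std::set<unsigned>& electable,
                          const std::vector<MemberInfo>& members,
                          long long freshestSecs) {
        for (size_t i = 0; i < members.size(); i++) {
            const MemberInfo& m = members[i];
            bool ok = m.canReachMajority
                      && m.priority > 0
                      && freshestSecs - m.opTimeSecs <= 10;  // within 10 seconds
            if (ok) electable.insert(m.id);
            else    electable.erase(m.id);
        }
    }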
protected:
- bool initFromConfig(ReplSetConfig& c, bool reconf=false); // true if ok; throws if config really bad; false if config doesn't include self
+ /**
+ * Load a new config as the replica set's main config.
+ *
+ * If there is a "simple" change (just adding a node), this shortcuts
+ * the config setup. Returns true if the config was changed. Returns false
+ * if the config doesn't include this node. Throws an exception if
+ * something goes very wrong.
+ *
+ * Behavior to note:
+ * - locks this
+ * - intentionally leaks the old _cfg and any old _members (if the
+ * change isn't strictly additive)
+ */
+ bool initFromConfig(ReplSetConfig& c, bool reconf=false);
void _fillIsMaster(BSONObjBuilder&);
void _fillIsMasterHost(const Member*, vector<string>&, vector<string>&, vector<string>&);
const ReplSetConfig& config() { return *_cfg; }
@@ -301,27 +426,48 @@ namespace mongo {
const vector<HostAndPort> *_seeds;
ReplSetConfig *_cfg;
- /** load our configuration from admin.replset. try seed machines too.
- @return true if ok; throws if config really bad; false if config doesn't include self
- */
+ /**
+ * Finds the configuration with the highest version number and attempts
+ * to load it.
+ */
bool _loadConfigFinish(vector<ReplSetConfig>& v);
+ /**
+ * Gather all possible configs (from command line seeds, our own config
+ * doc, and any hosts listed therein) and try to initiate from the most
+ * recent config we find.
+ */
void loadConfig();
list<HostAndPort> memberHostnames() const;
- const ReplSetConfig::MemberCfg& myConfig() const { return _self->config(); }
+ const ReplSetConfig::MemberCfg& myConfig() const { return _config; }
bool iAmArbiterOnly() const { return myConfig().arbiterOnly; }
- bool iAmPotentiallyHot() const { return myConfig().potentiallyHot(); }
+ bool iAmPotentiallyHot() const {
+ return myConfig().potentiallyHot() && // not an arbiter
+ elect.steppedDown <= time(0) && // not stepped down/frozen
+ state() == MemberState::RS_SECONDARY; // not stale
+ }
protected:
Member *_self;
bool _buildIndexes; // = _self->config().buildIndexes
void setSelfTo(Member *); // use this as it sets buildIndexes var
private:
- List1<Member> _members; /* all members of the set EXCEPT self. */
+ List1<Member> _members; // all members of the set EXCEPT _self.
+ ReplSetConfig::MemberCfg _config; // config of _self
+ unsigned _id; // _id of _self
+ int _maintenanceMode; // if we should stay in recovering state
public:
- unsigned selfId() const { return _self->id(); }
+ // this is called from within a writelock in logOpRS
+ unsigned selfId() const { return _id; }
Manager *mgr;
-
+ GhostSync *ghost;
+ /**
+ * This forces a secondary to go into recovering state and stay there
+ * until this is called again, passing in "false". Multiple threads can
+ * call this and it will leave maintenance mode once all of the callers
+ * have called it again, passing in false.
+ */
+ void setMaintenanceMode(const bool inc);
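
Since several threads may request maintenance concurrently, _maintenanceMode above acts as a counter rather than a flag. A sketch of the intended semantics, with locking and the actual state transitions elided:

    // inc=true takes a maintenance hold; inc=false releases one hold.
    // The node may leave RECOVERING only once the count drops to zero.
    void setMaintenanceModeModel(int& maintenanceMode, bool inc) {
        if (inc) {
            ++maintenanceMode;   // enter (or stay in) RECOVERING
        }
        else if (--maintenanceMode == 0) {
            // last holder released; the normal sync path may restore SECONDARY
        }
    }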
private:
Member* head() const { return _members.head(); }
public:
@@ -334,6 +480,7 @@ namespace mongo {
friend class CmdReplSetElect;
friend class Member;
friend class Manager;
+ friend class GhostSync;
friend class Consensus;
private:
@@ -352,6 +499,7 @@ namespace mongo {
bool _isStale(OplogReader& r, const string& hn);
public:
void syncThread();
+ const OpTime lastOtherOpTime() const;
};
class ReplSet : public ReplSetImpl {
@@ -365,7 +513,7 @@ namespace mongo {
bool freeze(int secs) { return _freeze(secs); }
string selfFullName() {
- lock lk(this);
+ assert( _self );
return _self->fullName();
}
@@ -385,12 +533,20 @@ namespace mongo {
void summarizeStatus(BSONObjBuilder& b) const { _summarizeStatus(b); }
void fillIsMaster(BSONObjBuilder& b) { _fillIsMaster(b); }
- /* we have a new config (reconfig) - apply it.
- @param comment write a no-op comment to the oplog about it. only makes sense if one is primary and initiating the reconf.
- */
+ /**
+ * We have a new config (reconfig) - apply it.
+ * @param comment write a no-op comment to the oplog about it. only
+ * makes sense if one is primary and initiating the reconf.
+ *
+ * The slaves are updated when they get a heartbeat indicating the new
+ * config. The comment is a no-op.
+ */
void haveNewConfig(ReplSetConfig& c, bool comment);
- /* if we delete old configs, this needs to assure locking. currently we don't so it is ok. */
+ /**
+ * Pointer assignment isn't necessarily atomic, so this needs to ensure
+ * locking, even though we don't delete old configs.
+ */
const ReplSetConfig& getConfig() { return config(); }
bool lockedByMe() { return RSBase::lockedByMe(); }
@@ -402,9 +558,10 @@ namespace mongo {
}
};
- /** base class for repl set commands. checks basic things such as in rs mode before the command
- does its real work
- */
+ /**
+ * Base class for repl set commands. Checks basic things, such as whether
+ * we're in rs mode, before the command does its real work.
+ */
class ReplSetCommand : public Command {
protected:
ReplSetCommand(const char * s, bool show=false) : Command(s, show) { }
@@ -413,26 +570,53 @@ namespace mongo {
virtual bool logTheOp() { return false; }
virtual LockType locktype() const { return NONE; }
virtual void help( stringstream &help ) const { help << "internal"; }
+
+ /**
+ * Some replica set commands call this and then call check(). This is
+ * intentional, as they might do things before theReplSet is initialized
+ * that still need to be checked for auth.
+ */
+ bool checkAuth(string& errmsg, BSONObjBuilder& result) {
+ if( !noauth && adminOnly() ) {
+ AuthenticationInfo *ai = cc().getAuthenticationInfo();
+ if (!ai->isAuthorizedForLock("admin", locktype())) {
+ errmsg = "replSet command unauthorized";
+ return false;
+ }
+ }
+ return true;
+ }
+
bool check(string& errmsg, BSONObjBuilder& result) {
if( !replSet ) {
errmsg = "not running with --replSet";
return false;
}
+
if( theReplSet == 0 ) {
result.append("startupStatus", ReplSet::startupStatus);
+ string s;
errmsg = ReplSet::startupStatusMsg.empty() ? "replset unknown error 2" : ReplSet::startupStatusMsg.get();
if( ReplSet::startupStatus == 3 )
result.append("info", "run rs.initiate(...) if not yet done for the set");
return false;
}
- return true;
+
+ return checkAuth(errmsg, result);
}
};
+ /**
+ * Does local authentication; authorizes directly against
+ * AuthenticationInfo.
+ */
+ void replLocalAuth();
+
/** inlines ----------------- */
- inline Member::Member(HostAndPort h, unsigned ord, const ReplSetConfig::MemberCfg *c, bool self) :
+ inline Member::Member(HostAndPort h, unsigned ord, ReplSetConfig::MemberCfg *c, bool self) :
_config(*c), _h(h), _hbinfo(ord) {
+ assert(c);
if( self )
_hbinfo.health = 1.0;
}
diff --git a/db/repl/rs_config.cpp b/db/repl/rs_config.cpp
index 2341fe9..13352b1 100644
--- a/db/repl/rs_config.cpp
+++ b/db/repl/rs_config.cpp
@@ -20,10 +20,11 @@
#include "rs.h"
#include "../../client/dbclient.h"
#include "../../client/syncclusterconnection.h"
-#include "../../util/hostandport.h"
+#include "../../util/net/hostandport.h"
#include "../dbhelpers.h"
#include "connections.h"
#include "../oplog.h"
+#include "../instance.h"
using namespace bson;
@@ -36,7 +37,7 @@ namespace mongo {
while( i.more() ) {
BSONElement e = i.next();
if( !fields.count( e.fieldName() ) ) {
- uasserted(13434, str::stream() << "unexpected field '" << e.fieldName() << "'in object");
+ uasserted(13434, str::stream() << "unexpected field '" << e.fieldName() << "' in object");
}
}
}
@@ -63,27 +64,14 @@ namespace mongo {
//rather than above, do a logOp()? probably
BSONObj o = asBson();
Helpers::putSingletonGod(rsConfigNs.c_str(), o, false/*logOp=false; local db so would work regardless...*/);
- if( !comment.isEmpty() )
+ if( !comment.isEmpty() && (!theReplSet || theReplSet->isPrimary()) )
logOpInitiate(comment);
cx.db()->flushFiles(true);
}
- DEV log() << "replSet saveConfigLocally done" << rsLog;
+ log() << "replSet saveConfigLocally done" << rsLog;
}
- /*static*/
- /*void ReplSetConfig::receivedNewConfig(BSONObj cfg) {
- if( theReplSet )
- return; // this is for initial setup only, so far. todo
-
- ReplSetConfig c(cfg);
-
- writelock lk("admin.");
- if( theReplSet )
- return;
- c.saveConfigLocally(bo());
- }*/
-
bo ReplSetConfig::MemberCfg::asBson() const {
bob b;
b << "_id" << _id;
@@ -95,36 +83,52 @@ namespace mongo {
if( hidden ) b << "hidden" << hidden;
if( !buildIndexes ) b << "buildIndexes" << buildIndexes;
if( !tags.empty() ) {
- BSONArrayBuilder a;
- for( set<string>::const_iterator i = tags.begin(); i != tags.end(); i++ )
- a.append(*i);
- b.appendArray("tags", a.done());
- }
- if( !initialSync.isEmpty() ) {
- b << "initialSync" << initialSync;
+ BSONObjBuilder a;
+ for( map<string,string>::const_iterator i = tags.begin(); i != tags.end(); i++ )
+ a.append((*i).first, (*i).second);
+ b.append("tags", a.done());
}
return b.obj();
}
+ void ReplSetConfig::updateMembers(List1<Member> &dest) {
+ for (vector<MemberCfg>::iterator source = members.begin(); source < members.end(); source++) {
+ for( Member *d = dest.head(); d; d = d->next() ) {
+ if (d->fullName() == (*source).h.toString()) {
+ d->configw().groupsw() = (*source).groups();
+ }
+ }
+ }
+ }
+
bo ReplSetConfig::asBson() const {
bob b;
b.append("_id", _id).append("version", version);
- if( !ho.isDefault() || !getLastErrorDefaults.isEmpty() ) {
- bob settings;
- if( !ho.isDefault() )
- settings << "heartbeatConnRetries " << ho.heartbeatConnRetries <<
- "heartbeatSleep" << ho.heartbeatSleepMillis / 1000.0 <<
- "heartbeatTimeout" << ho.heartbeatTimeoutMillis / 1000.0;
- if( !getLastErrorDefaults.isEmpty() )
- settings << "getLastErrorDefaults" << getLastErrorDefaults;
- b << "settings" << settings.obj();
- }
BSONArrayBuilder a;
for( unsigned i = 0; i < members.size(); i++ )
a.append( members[i].asBson() );
b.append("members", a.arr());
+ if( !ho.isDefault() || !getLastErrorDefaults.isEmpty() || !rules.empty()) {
+ bob settings;
+ if( !rules.empty() ) {
+ bob modes;
+ for (map<string,TagRule*>::const_iterator it = rules.begin(); it != rules.end(); it++) {
+ bob clauses;
+ vector<TagClause*> r = (*it).second->clauses;
+ for (vector<TagClause*>::iterator it2 = r.begin(); it2 < r.end(); it2++) {
+ clauses << (*it2)->name << (*it2)->target;
+ }
+ modes << (*it).first << clauses.obj();
+ }
+ settings << "getLastErrorModes" << modes.obj();
+ }
+ if( !getLastErrorDefaults.isEmpty() )
+ settings << "getLastErrorDefaults" << getLastErrorDefaults;
+ b << "settings" << settings.obj();
+ }
+
return b.obj();
}
@@ -135,38 +139,87 @@ namespace mongo {
void ReplSetConfig::MemberCfg::check() const {
mchk(_id >= 0 && _id <= 255);
mchk(priority >= 0 && priority <= 1000);
- mchk(votes >= 0 && votes <= 100);
- uassert(13419, "this version of mongod only supports priorities 0 and 1", priority == 0 || priority == 1);
+ mchk(votes <= 100); // votes >= 0 because it is unsigned
+ uassert(13419, "priorities must be between 0.0 and 100.0", priority >= 0.0 && priority <= 100.0);
uassert(13437, "slaveDelay requires priority be zero", slaveDelay == 0 || priority == 0);
uassert(13438, "bad slaveDelay value", slaveDelay >= 0 && slaveDelay <= 3600 * 24 * 366);
uassert(13439, "priority must be 0 when hidden=true", priority == 0 || !hidden);
uassert(13477, "priority must be 0 when buildIndexes=false", buildIndexes || priority == 0);
+ }
+/*
+ string ReplSetConfig::TagSubgroup::toString() const {
+ bool first = true;
+ string result = "\""+name+"\": [";
+ for (set<const MemberCfg*>::const_iterator i = m.begin(); i != m.end(); i++) {
+ if (!first) {
+ result += ", ";
+ }
+ first = false;
+ result += (*i)->h.toString();
+ }
+ return result+"]";
+ }
+ */
+ string ReplSetConfig::TagClause::toString() const {
+ string result = name+": {";
+ for (map<string,TagSubgroup*>::const_iterator i = subgroups.begin(); i != subgroups.end(); i++) {
+//TEMP? result += (*i).second->toString()+", ";
+ }
+ result += "TagClause toString TEMPORARILY DISABLED";
+ return result + "}";
+ }
- if (!initialSync.isEmpty()) {
- static const string legal[] = {"state", "name", "_id","optime"};
- static const set<string> legals(legal, legal + 4);
- assertOnlyHas(initialSync, legals);
+ string ReplSetConfig::TagRule::toString() const {
+ string result = "{";
+ for (vector<TagClause*>::const_iterator it = clauses.begin(); it < clauses.end(); it++) {
+ result += ((TagClause*)(*it))->toString()+",";
+ }
+ return result+"}";
+ }
- if (initialSync.hasElement("state")) {
- uassert(13525, "initialSync source state must be 1 or 2",
- initialSync["state"].isNumber() &&
- (initialSync["state"].Number() == 1 ||
- initialSync["state"].Number() == 2));
- }
- if (initialSync.hasElement("name")) {
- uassert(13526, "initialSync source name must be a string",
- initialSync["name"].type() == mongo::String);
+ void ReplSetConfig::TagSubgroup::updateLast(const OpTime& op) {
+ if (last < op) {
+ last = op;
+
+ for (vector<TagClause*>::iterator it = clauses.begin(); it < clauses.end(); it++) {
+ (*it)->updateLast(op);
}
- if (initialSync.hasElement("_id")) {
- uassert(13527, "initialSync source _id must be a number",
- initialSync["_id"].isNumber());
+ }
+ }
+
+ void ReplSetConfig::TagClause::updateLast(const OpTime& op) {
+ if (last >= op) {
+ return;
+ }
+
+ // check that at least actualTarget subgroups have reached this op
+ int count = 0;
+ map<string,TagSubgroup*>::iterator it;
+ for (it = subgroups.begin(); it != subgroups.end(); it++) {
+ if ((*it).second->last >= op) {
+ count++;
}
- if (initialSync.hasElement("optime")) {
- uassert(13528, "initialSync source optime must be a timestamp",
- initialSync["optime"].type() == mongo::Timestamp ||
- initialSync["optime"].type() == mongo::Date);
+ }
+
+ if (count >= actualTarget) {
+ last = op;
+ rule->updateLast(op);
+ }
+ }
+
+ void ReplSetConfig::TagRule::updateLast(const OpTime& op) {
+ OpTime *earliest = (OpTime*)&op;
+ vector<TagClause*>::iterator it;
+
+ for (it = clauses.begin(); it < clauses.end(); it++) {
+ if ((*it)->last < *earliest) {
+ earliest = &(*it)->last;
}
}
+
+ // rules are simply AND-ed clauses, so a rule has only reached
+ // whatever optime its most-behind clause has reached
+ last = *earliest;
}
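
The three updateLast() methods above chain together: an acknowledgement advances a subgroup; a clause advances once at least actualTarget of its subgroups have reached the op; and a rule advances to the minimum over its clauses, since clauses are AND-ed. A toy run of the counting step, assuming a rule { "dc" : 2 } with subgroups ny and sf:

    #include <iostream>
    #include <map>
    #include <string>

    typedef long long OpTime;

    int main() {
        std::map<std::string, OpTime> sub;  // subgroup -> last acked optime
        sub["ny"] = 0;
        sub["sf"] = 0;
        const int target = 2;               // from the rule { "dc" : 2 }
        const OpTime w = 100;               // the write we are waiting on

        sub["ny"] = w;                      // ny acks first
        int caughtUp = 0;
        for (std::map<std::string, OpTime>::const_iterator it = sub.begin();
             it != sub.end(); ++it)
            if (it->second >= w) caughtUp++;
        std::cout << caughtUp << "/" << target << "\n";  // 1/2: clause waits

        sub["sf"] = w;                      // sf acks too: 2/2, so the clause
                                            // (and this one-clause rule) reach w
        return 0;
    }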
/** @param o old config
@@ -184,18 +237,28 @@ namespace mongo {
if someone had some intermediate config this node doesn't have, that could be
necessary. but then how did we become primary? so perhaps we are fine as-is.
*/
- if( o.version + 1 != n.version ) {
- errmsg = "version number wrong";
+ if( o.version >= n.version ) {
+ errmsg = str::stream() << "version number must increase, old: "
+ << o.version << " new: " << n.version;
return false;
}
map<HostAndPort,const ReplSetConfig::MemberCfg*> old;
+ bool isLocalHost = false;
for( vector<ReplSetConfig::MemberCfg>::const_iterator i = o.members.begin(); i != o.members.end(); i++ ) {
+ if (i->h.isLocalHost()) {
+ isLocalHost = true;
+ }
old[i->h] = &(*i);
}
int me = 0;
for( vector<ReplSetConfig::MemberCfg>::const_iterator i = n.members.begin(); i != n.members.end(); i++ ) {
const ReplSetConfig::MemberCfg& m = *i;
+ if ( (isLocalHost && !m.h.isLocalHost()) || (!isLocalHost && m.h.isLocalHost())) {
+ log() << "reconfig error, cannot switch between localhost and hostnames: "
+ << m.h.toString() << rsLog;
+ uasserted(13645, "hosts cannot switch between localhost and hostname");
+ }
if( old.count(m.h) ) {
const ReplSetConfig::MemberCfg& oldCfg = *old[m.h];
if( oldCfg._id != m._id ) {
@@ -212,6 +275,7 @@ namespace mongo {
log() << "replSet reconfig error with member: " << m.h.toString() << " arbiterOnly cannot change. remove and readd the member instead " << rsLog;
uasserted(13510, "arbiterOnly may not change for members");
}
+ uassert(14827, "arbiters cannot have tags", !m.arbiterOnly || m.tags.size() == 0 );
}
if( m.h.isSelf() )
me++;
@@ -250,6 +314,122 @@ namespace mongo {
}
}
+ void ReplSetConfig::_populateTagMap(map<string,TagClause> &tagMap) {
+ // create subgroups for each server corresponding to each of
+ // its tags. E.g.:
+ //
+ // A is tagged with {"server" : "A", "dc" : "ny"}
+ // B is tagged with {"server" : "B", "dc" : "ny"}
+ //
+ // At the end of this step, tagMap will contain:
+ //
+ // "server" => {"A" : [A], "B" : [B]}
+ // "dc" => {"ny" : [A,B]}
+
+ for (unsigned i=0; i<members.size(); i++) {
+ MemberCfg member = members[i];
+
+ for (map<string,string>::iterator tag = member.tags.begin(); tag != member.tags.end(); tag++) {
+ string label = (*tag).first;
+ string value = (*tag).second;
+
+ TagClause& clause = tagMap[label];
+ clause.name = label;
+
+ TagSubgroup* subgroup;
+ // search for "ny" in "dc"'s clause
+ if (clause.subgroups.find(value) == clause.subgroups.end()) {
+ clause.subgroups[value] = subgroup = new TagSubgroup(value);
+ }
+ else {
+ subgroup = clause.subgroups[value];
+ }
+
+ subgroup->m.insert(&members[i]);
+ }
+ }
+ }
+
+ void ReplSetConfig::parseRules(const BSONObj& modes) {
+ map<string,TagClause> tagMap;
+ _populateTagMap(tagMap);
+
+ for (BSONObj::iterator i = modes.begin(); i.more(); ) {
+ unsigned int primaryOnly = 0;
+
+ // ruleName : {dc : 2, m : 3}
+ BSONElement rule = i.next();
+ uassert(14046, "getLastErrorMode rules must be objects", rule.type() == mongo::Object);
+
+ TagRule* r = new TagRule();
+
+ BSONObj clauseObj = rule.Obj();
+ for (BSONObj::iterator c = clauseObj.begin(); c.more(); ) {
+ BSONElement clauseElem = c.next();
+ uassert(14829, "getLastErrorMode criteria must be numeric", clauseElem.isNumber());
+
+ // get the clause, e.g., "x.y" : 3
+ const char *criteria = clauseElem.fieldName();
+ int value = clauseElem.numberInt();
+ uassert(14828, str::stream() << "getLastErrorMode criteria must be greater than 0: " << clauseElem, value > 0);
+
+ TagClause* node = new TagClause(tagMap[criteria]);
+
+ int numGroups = node->subgroups.size();
+ uassert(14831, str::stream() << "mode " << clauseObj << " requires "
+ << value << " tagged with " << criteria << ", but only "
+ << numGroups << " with this tag were found", numGroups >= value);
+
+ node->name = criteria;
+ node->target = value;
+ // if any subgroups contain "me", we can decrease the target
+ node->actualTarget = node->target;
+
+ // then we want to add pointers between clause & subgroup
+ for (map<string,TagSubgroup*>::iterator sgs = node->subgroups.begin();
+ sgs != node->subgroups.end(); sgs++) {
+ bool foundMe = false;
+ (*sgs).second->clauses.push_back(node);
+
+ // if this subgroup contains the primary, it's automatically always up-to-date
+ for( set<MemberCfg*>::const_iterator cfg = (*sgs).second->m.begin();
+ cfg != (*sgs).second->m.end();
+ cfg++)
+ {
+ if ((*cfg)->h.isSelf()) {
+ node->actualTarget--;
+ foundMe = true;
+ }
+ }
+
+ for (set<MemberCfg *>::iterator cfg = (*sgs).second->m.begin();
+ !foundMe && cfg != (*sgs).second->m.end(); cfg++) {
+ (*cfg)->groupsw().insert((*sgs).second);
+ }
+ }
+
+ // if all of the members of this clause involve the primary, it's always up-to-date
+ if (node->actualTarget == 0) {
+ node->last = OpTime(INT_MAX, INT_MAX);
+ primaryOnly++;
+ }
+
+ // this is a valid clause, so we want to add it to its rule
+ node->rule = r;
+ r->clauses.push_back(node);
+ }
+
+ // if all of the clauses are satisfied by the primary, this rule is trivially true
+ if (primaryOnly == r->clauses.size()) {
+ r->last = OpTime(INT_MAX, INT_MAX);
+ }
+
+ // if we got here, this is a valid rule
+ LOG(1) << "replSet new rule " << rule.fieldName() << ": " << r->toString() << rsLog;
+ rules[rule.fieldName()] = r;
+ }
+ }
+
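
For reference, the settings.getLastErrorModes object that parseRules() consumes can be built with the tree's BSON builder macros (assuming the usual bson headers are in scope); the mode names and counts below are invented for illustration:

    // settings : { getLastErrorModes : { multiDC  : { dc : 2 },
    //                                    rackSafe : { dc : 2, rack : 3 } } }
    BSONObj modes = BSON( "multiDC"  << BSON( "dc" << 2 )
                       << "rackSafe" << BSON( "dc" << 2 << "rack" << 3 ) );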
void ReplSetConfig::from(BSONObj o) {
static const string legal[] = {"_id","version", "members","settings"};
static const set<string> legals(legal, legal + 4);
@@ -262,19 +442,6 @@ namespace mongo {
uassert(13115, "bad " + rsConfigNs + " config: version", version > 0);
}
- if( o["settings"].ok() ) {
- BSONObj settings = o["settings"].Obj();
- if( settings["heartbeatConnRetries "].ok() )
- ho.heartbeatConnRetries = settings["heartbeatConnRetries "].numberInt();
- if( settings["heartbeatSleep"].ok() )
- ho.heartbeatSleepMillis = (unsigned) (settings["heartbeatSleep"].Number() * 1000);
- if( settings["heartbeatTimeout"].ok() )
- ho.heartbeatTimeoutMillis = (unsigned) (settings["heartbeatTimeout"].Number() * 1000);
- ho.check();
- try { getLastErrorDefaults = settings["getLastErrorDefaults"].Obj().copy(); }
- catch(...) { }
- }
-
set<string> hosts;
set<int> ords;
vector<BSONElement> members;
@@ -292,7 +459,7 @@ namespace mongo {
try {
static const string legal[] = {
"_id","votes","priority","host", "hidden","slaveDelay",
- "arbiterOnly","buildIndexes","tags","initialSync"
+ "arbiterOnly","buildIndexes","tags","initialSync" // deprecated
};
static const set<string> legals(legal, legal + 10);
assertOnlyHas(mobj, legals);
@@ -304,10 +471,12 @@ namespace mongo {
/* TODO: use of string exceptions may be problematic for reconfig case! */
throw "_id must be numeric";
}
- string s;
try {
- s = mobj["host"].String();
+ string s = mobj["host"].String();
m.h = HostAndPort(s);
+ if (!m.h.hasPort()) {
+ m.h.setPort(m.h.port());
+ }
}
catch(...) {
throw string("bad or missing host field? ") + mobj.toString();
@@ -325,12 +494,10 @@ namespace mongo {
if( mobj.hasElement("votes") )
m.votes = (unsigned) mobj["votes"].Number();
if( mobj.hasElement("tags") ) {
- vector<BSONElement> v = mobj["tags"].Array();
- for( unsigned i = 0; i < v.size(); i++ )
- m.tags.insert( v[i].String() );
- }
- if( mobj.hasElement("initialSync")) {
- m.initialSync = mobj["initialSync"].Obj().getOwned();
+ const BSONObj &t = mobj["tags"].Obj();
+ for (BSONObj::iterator c = t.begin(); c.more(); c.next()) {
+ m.tags[(*c).fieldName()] = (*c).String();
+ }
}
m.check();
}
@@ -356,22 +523,38 @@ namespace mongo {
}
uassert(13393, "can't use localhost in repl set member names except when using it for all members", localhosts == 0 || localhosts == members.size());
uassert(13117, "bad " + rsConfigNs + " config", !_id.empty());
+
+ if( o["settings"].ok() ) {
+ BSONObj settings = o["settings"].Obj();
+ if( settings["getLastErrorModes"].ok() ) {
+ parseRules(settings["getLastErrorModes"].Obj());
+ }
+ ho.check();
+ try { getLastErrorDefaults = settings["getLastErrorDefaults"].Obj().copy(); }
+ catch(...) { }
+ }
}
static inline void configAssert(bool expr) {
uassert(13122, "bad repl set config?", expr);
}
- ReplSetConfig::ReplSetConfig(BSONObj cfg) {
+ ReplSetConfig::ReplSetConfig(BSONObj cfg, bool force) {
+ _constructed = false;
clear();
from(cfg);
- configAssert( version < 0 /*unspecified*/ || (version >= 1 && version <= 5000) );
+ if( force ) {
+ version += rand() % 100000 + 10000;
+ }
+ configAssert( version < 0 /*unspecified*/ || (version >= 1) );
if( version < 1 )
version = 1;
_ok = true;
+ _constructed = true;
}
ReplSetConfig::ReplSetConfig(const HostAndPort& h) {
+ _constructed = false;
clear();
int level = 2;
DEV level = 0;
@@ -447,6 +630,7 @@ namespace mongo {
checkRsConfig();
_ok = true;
log(level) << "replSet load config ok from " << (h.isSelf() ? "self" : h.toString()) << rsLog;
+ _constructed = true;
}
}
diff --git a/db/repl/rs_config.h b/db/repl/rs_config.h
index 7d43fe6..f69052a 100644
--- a/db/repl/rs_config.h
+++ b/db/repl/rs_config.h
@@ -20,26 +20,37 @@
#pragma once
-#include "../../util/hostandport.h"
+#include "../../util/net/hostandport.h"
+#include "../../util/concurrency/race.h"
#include "health.h"
namespace mongo {
-
- /* singleton config object is stored here */
+ class Member;
const string rsConfigNs = "local.system.replset";
class ReplSetConfig {
enum { EMPTYCONFIG = -2 };
+ struct TagSubgroup;
public:
- /* if something is misconfigured, throws an exception.
- if couldn't be queried or is just blank, ok() will be false.
- */
+ /**
+ * This contacts the given host and tries to get a config from them.
+ *
+ * This sends a test heartbeat to the host and, if all goes well and the
+ * host has a more recent config, fetches the config and loads it (see
+ * from()).
+ *
+ * If it's contacting itself, it skips the heartbeat (for obvious
+ * reasons). If something is misconfigured, throws an exception. If the
+ * host couldn't be queried or is just blank, ok() will be false.
+ */
ReplSetConfig(const HostAndPort& h);
- ReplSetConfig(BSONObj cfg);
+ ReplSetConfig(BSONObj cfg, bool force=false);
bool ok() const { return _ok; }
+ struct TagRule;
+
struct MemberCfg {
MemberCfg() : _id(-1), votes(1), priority(1.0), arbiterOnly(false), slaveDelay(0), hidden(false), buildIndexes(true) { }
int _id; /* ordinal */
@@ -50,12 +61,24 @@ namespace mongo {
int slaveDelay; /* seconds. int rather than unsigned for convenient to/from bson conversion. */
bool hidden; /* if set, don't advertise to drivers in isMaster. for non-primaries (priority 0) */
bool buildIndexes; /* if false, do not create any non-_id indexes */
- set<string> tags; /* tagging for data center, rack, etc. */
- BSONObj initialSync; /* directions for initial sync source */
-
+ map<string,string> tags; /* tagging for data center, rack, etc. */
+ private:
+ set<TagSubgroup*> _groups; // the subgroups this member belongs to
+ public:
+ const set<TagSubgroup*>& groups() const {
+ return _groups;
+ }
+ set<TagSubgroup*>& groupsw() {
+ return _groups;
+ }
void check() const; /* check validity, assert if not. */
BSONObj asBson() const;
bool potentiallyHot() const { return !arbiterOnly && priority > 0; }
+ void updateGroups(const OpTime& last) {
+ for (set<TagSubgroup*>::iterator it = _groups.begin(); it != _groups.end(); it++) {
+ ((TagSubgroup*)(*it))->updateLast(last);
+ }
+ }
bool operator==(const MemberCfg& r) const {
return _id==r._id && votes == r.votes && h == r.h && priority == r.priority &&
arbiterOnly == r.arbiterOnly && slaveDelay == r.slaveDelay && hidden == r.hidden &&
@@ -70,6 +93,7 @@ namespace mongo {
HealthOptions ho;
string md5;
BSONObj getLastErrorDefaults;
+ map<string,TagRule*> rules;
list<HostAndPort> otherMemberHostnames() const; // except self
@@ -88,12 +112,112 @@ namespace mongo {
void saveConfigLocally(BSONObj comment); // to local db
string saveConfigEverywhere(); // returns textual info on what happened
+ /**
+ * Update members' groups when the config changes but members stay the same.
+ */
+ void updateMembers(List1<Member> &dest);
+
BSONObj asBson() const;
+ bool _constructed;
private:
bool _ok;
void from(BSONObj);
void clear();
+
+ struct TagClause;
+
+ /**
+ * This is a logical grouping of servers. It is pointed to by a set of
+ * servers with a certain tag.
+ *
+ * For example, suppose servers A, B, and C have the tag "dc" : "nyc". If we
+ * have a rule {"dc" : 2}, then we want A _or_ B _or_ C to have the
+ * write for one of the "dc" criteria to be fulfilled, so all three will
+ * point to this subgroup. When one of their oplog-tailing cursors is
+ * updated, this subgroup is updated.
+ */
+ struct TagSubgroup : boost::noncopyable {
+ ~TagSubgroup(); // never called; not defined
+ TagSubgroup(string nm) : name(nm) { }
+ const string name;
+ OpTime last;
+ vector<TagClause*> clauses;
+
+ // this probably won't actually point to valid members after the
+ // subgroup is created, as initFromConfig() makes a copy of the
+ // config
+ set<MemberCfg*> m;
+
+ void updateLast(const OpTime& op);
+
+ //string toString() const;
+
+ /**
+ * If two tags have the same name, they should compare as equal so
+ * that members don't have to update two identical groups on writes.
+ */
+ bool operator() (TagSubgroup& lhs, TagSubgroup& rhs) const {
+ return lhs.name < rhs.name;
+ }
+ };
+
+ /**
+ * An argument in a rule. For example, if we had the rule {dc : 2,
+ * machines : 3}, "dc" : 2 and "machines" : 3 would be two TagClauses.
+ *
+ * Each tag clause has a set of associated subgroups. For example, if
+ * we had "dc" : 2, our subgroups might be "nyc", "sf", and "hk".
+ */
+ struct TagClause {
+ OpTime last;
+ map<string,TagSubgroup*> subgroups;
+ TagRule *rule;
+ string name;
+ /**
+ * If we get a clause like {machines : 3} and this server is
+ * tagged with "machines", then it's really {machines : 2}, as we
+ * will always be up-to-date. So, target would be 3 and
+ * actualTarget would be 2, in that example.
+ */
+ int target;
+ int actualTarget;
+
+ void updateLast(const OpTime& op);
+ string toString() const;
+ };
+
+ /**
+ * Parses getLastErrorModes.
+ */
+ void parseRules(const BSONObj& modes);
+
+ /**
+ * Create a hash containing every possible clause that could be used in a
+ * rule and the servers related to that clause.
+ *
+ * For example, suppose we have the following servers:
+ * A {"dc" : "ny", "ny" : "rk1"}
+ * B {"dc" : "ny", "ny" : "rk1"}
+ * C {"dc" : "ny", "ny" : "rk2"}
+ * D {"dc" : "sf", "sf" : "rk1"}
+ * E {"dc" : "sf", "sf" : "rk2"}
+ *
+ * This would give us the possible criteria:
+ * "dc" -> {A, B, C},{D, E}
+ * "ny" -> {A, B},{C}
+ * "sf" -> {D},{E}
+ */
+ void _populateTagMap(map<string,TagClause> &tagMap);
+
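
A stripped-down model of that population step, using plain standard-library types in place of TagClause/TagSubgroup (a simplification for illustration, not the real structures):

    #include <map>
    #include <set>
    #include <string>
    #include <vector>

    typedef std::map<std::string, std::string>    Tags;       // label -> value
    typedef std::map<std::string, std::set<int> > Subgroups;  // value -> member ids
    typedef std::map<std::string, Subgroups>      TagMap;     // label -> subgroups

    TagMap populate(const std::vector<Tags>& members) {
        TagMap tm;
        for (size_t i = 0; i < members.size(); i++)
            for (Tags::const_iterator t = members[i].begin();
                 t != members[i].end(); ++t)
                tm[t->first][t->second].insert((int)i);
        // With the five servers from the comment above, tm["dc"] ends up
        // holding the subgroups {"ny" : {A,B,C}, "sf" : {D,E}}.
        return tm;
    }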
+ public:
+ struct TagRule {
+ vector<TagClause*> clauses;
+ OpTime last;
+
+ void updateLast(const OpTime& op);
+ string toString() const;
+ };
};
}
diff --git a/db/repl/rs_initialsync.cpp b/db/repl/rs_initialsync.cpp
index 5a54059..101b03a 100644
--- a/db/repl/rs_initialsync.cpp
+++ b/db/repl/rs_initialsync.cpp
@@ -34,7 +34,7 @@ namespace mongo {
// add try/catch with sleep
- void isyncassert(const char *msg, bool expr) {
+ void isyncassert(const string& msg, bool expr) {
if( !expr ) {
string m = str::stream() << "initial sync " << msg;
theReplSet->sethbmsg(m, 0);
@@ -57,20 +57,15 @@ namespace mongo {
}
}
- bool cloneFrom(const char *masterHost, string& errmsg, const string& fromdb, bool logForReplication,
- bool slaveOk, bool useReplAuth, bool snapshot);
-
/* todo : progress metering to sethbmsg. */
static bool clone(const char *master, string db) {
string err;
return cloneFrom(master, err, db, false,
- /* slave_ok */ true, true, false);
+ /* slave_ok */ true, true, false, /*mayYield*/true, /*mayBeInterrupted*/false);
}
void _logOpObjRS(const BSONObj& op);
- bool copyCollectionFromRemote(const string& host, const string& ns, const BSONObj& query, string &errmsg, bool logforrepl);
-
static void emptyOplog() {
writelock lk(rsoplog);
Client::Context ctx(rsoplog);
@@ -80,104 +75,47 @@ namespace mongo {
if( d && d->stats.nrecords == 0 )
return; // already empty, ok.
- log(1) << "replSet empty oplog" << rsLog;
+ LOG(1) << "replSet empty oplog" << rsLog;
d->emptyCappedCollection(rsoplog);
-
- /*
- string errmsg;
- bob res;
- dropCollection(rsoplog, errmsg, res);
- log() << "replSet recreated oplog so it is empty. todo optimize this..." << rsLog;
- createOplog();*/
-
- // TEMP: restart to recreate empty oplog
- //log() << "replSet FATAL error during initial sync. mongod restart required." << rsLog;
- //dbexit( EXIT_CLEAN );
-
- /*
- writelock lk(rsoplog);
- Client::Context c(rsoplog, dbpath, 0, doauth/false);
- NamespaceDetails *oplogDetails = nsdetails(rsoplog);
- uassert(13412, str::stream() << "replSet error " << rsoplog << " is missing", oplogDetails != 0);
- oplogDetails->cappedTruncateAfter(rsoplog, h.commonPointOurDiskloc, false);
- */
}
- /**
- * Choose a member to sync from.
- *
- * The initalSync option is an object with 1 k/v pair:
- *
- * "state" : 1|2
- * "name" : "host"
- * "_id" : N
- * "optime" : t
- *
- * All except optime are exact matches. "optime" will find a secondary with
- * an optime >= to the optime given.
- */
const Member* ReplSetImpl::getMemberToSyncTo() {
- BSONObj sync = myConfig().initialSync;
- bool secondaryOnly = false, isOpTime = false;
- char *name = 0;
- int id = -1;
- OpTime optime;
-
- StateBox::SP sp = box.get();
- assert( !sp.state.primary() ); // wouldn't make sense if we were.
-
- // if it exists, we've already checked that these fields are valid in
- // rs_config.cpp
- if ( !sync.isEmpty() ) {
- if (sync.hasElement("state")) {
- if (sync["state"].Number() == 1) {
- if (sp.primary) {
- sethbmsg( str::stream() << "syncing to primary: " << sp.primary->fullName(), 0);
- return const_cast<Member*>(sp.primary);
- }
- else {
- sethbmsg("couldn't clone from primary");
- return NULL;
- }
- }
- else {
- secondaryOnly = true;
- }
- }
- if (sync.hasElement("name")) {
- name = (char*)sync["name"].valuestr();
- }
- if (sync.hasElement("_id")) {
- id = (int)sync["_id"].Number();
- }
- if (sync.hasElement("optime")) {
- isOpTime = true;
- optime = sync["optime"]._opTime();
+ Member *closest = 0;
+
+ // wait for 2N pings before choosing a sync target
+ if (_cfg) {
+ int needMorePings = config().members.size()*2 - HeartbeatInfo::numPings;
+
+ if (needMorePings > 0) {
+ OCCASIONALLY log() << "waiting for " << needMorePings << " pings from other members before syncing" << endl;
+ return NULL;
}
}
- for( Member *m = head(); m; m = m->next() ) {
- if (!m->hbinfo().up() ||
- (m->state() != MemberState::RS_SECONDARY &&
- m->state() != MemberState::RS_PRIMARY) ||
- (secondaryOnly && m->state() != MemberState::RS_SECONDARY) ||
- (id != -1 && (int)m->id() != id) ||
- (name != 0 && strcmp(name, m->fullName().c_str()) != 0) ||
- (isOpTime && optime >= m->hbinfo().opTime)) {
- continue;
+ // find the member with the lowest ping time that has more data than me
+ for (Member *m = _members.head(); m; m = m->next()) {
+ if (m->hbinfo().up() &&
+ (m->state() == MemberState::RS_PRIMARY ||
+ (m->state() == MemberState::RS_SECONDARY && m->hbinfo().opTime > lastOpTimeWritten)) &&
+ (!closest || m->hbinfo().ping < closest->hbinfo().ping)) {
+ closest = m;
}
+ }
+
+ {
+ lock lk(this);
- sethbmsg( str::stream() << "syncing to: " << m->fullName(), 0);
- return const_cast<Member*>(m);
+ if (!closest) {
+ _currentSyncTarget = NULL;
+ return NULL;
+ }
+
+ _currentSyncTarget = closest;
}
- sethbmsg( str::stream() << "couldn't find a member matching the sync criteria: " <<
- "\nstate? " << (secondaryOnly ? "2" : "none") <<
- "\nname? " << (name ? name : "none") <<
- "\n_id? " << id <<
- "\noptime? " << optime.toStringPretty() );
+ sethbmsg( str::stream() << "syncing to: " << closest->fullName(), 0);
- return NULL;
+ return const_cast<Member*>(closest);
}
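
Summarizing the new selection rule: the node first waits until 2N pings have been observed (six in a three-member set, for example) so the ping times are meaningful, then syncs from the lowest-ping member that is either primary or a secondary strictly ahead of its own lastOpTimeWritten.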
/**
@@ -186,6 +124,12 @@ namespace mongo {
void ReplSetImpl::_syncDoInitialSync() {
sethbmsg("initial sync pending",0);
+ // if this is the first node, it may have already become primary
+ if ( box.getState().primary() ) {
+ sethbmsg("I'm already primary, no need for initial sync",0);
+ return;
+ }
+
const Member *source = getMemberToSyncTo();
if (!source) {
sethbmsg("initial sync need a member to be primary or secondary to do our initial sync", 0);
@@ -252,13 +196,14 @@ namespace mongo {
/* apply relevant portion of the oplog
*/
{
- sethbmsg("initial sync initial oplog application");
- isyncassert( "initial sync source must remain readable throughout our initial sync [2]", source->state().readable() );
+ isyncassert( str::stream() << "initial sync source must remain readable throughout our initial sync [2] state now: " << source->state().toString() , source->state().readable() );
if( ! initialSyncOplogApplication(source, /*applyGTE*/startingTS, /*minValid*/mvoptime) ) { // note we assume here that this call does not throw
log() << "replSet initial sync failed during applyoplog" << rsLog;
emptyOplog(); // otherwise we'll be up!
+
lastOpTimeWritten = OpTime();
lastH = 0;
+
log() << "replSet cleaning up [1]" << rsLog;
{
writelock lk("local.");
diff --git a/db/repl/rs_initiate.cpp b/db/repl/rs_initiate.cpp
index cf1941f..3d998a8 100644
--- a/db/repl/rs_initiate.cpp
+++ b/db/repl/rs_initiate.cpp
@@ -37,8 +37,8 @@ namespace mongo {
throws
@param initial true when initiating
*/
- void checkMembersUpForConfigChange(const ReplSetConfig& cfg, bool initial) {
- int failures = 0;
+ void checkMembersUpForConfigChange(const ReplSetConfig& cfg, BSONObjBuilder& result, bool initial) {
+ int failures = 0, allVotes = 0, allowableFailures = 0;
int me = 0;
stringstream selfs;
for( vector<ReplSetConfig::MemberCfg>::const_iterator i = cfg.members.begin(); i != cfg.members.end(); i++ ) {
@@ -51,7 +51,10 @@ namespace mongo {
uasserted(13420, "initiation and reconfiguration of a replica set must be sent to a node that can become primary");
}
}
+ allVotes += i->votes;
}
+ allowableFailures = allVotes - (allVotes/2 + 1);
+
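
Worked out: with allVotes = 5 the majority is 5/2 + 1 = 3, so up to two votes' worth of members may be down; with allVotes = 4 the majority is also 3 and only one may be down. This replaces the old rule of tolerating exactly one down node on a reconfig.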
uassert(13278, "bad config: isSelf is true for multiple hosts: " + selfs.str(), me <= 1); // dups?
if( me != 1 ) {
stringstream ss;
@@ -61,6 +64,7 @@ namespace mongo {
uasserted(13279, ss.str());
}
+ vector<string> down;
for( vector<ReplSetConfig::MemberCfg>::const_iterator i = cfg.members.begin(); i != cfg.members.end(); i++ ) {
// we know we're up
if (i->h.isSelf()) {
@@ -100,27 +104,27 @@ namespace mongo {
}
}
if( !ok && !res["rs"].trueValue() ) {
+ down.push_back(i->h.toString());
+
if( !res.isEmpty() ) {
/* strange. got a response, but not "ok". log it. */
log() << "replSet warning " << i->h.toString() << " replied: " << res.toString() << rsLog;
}
bool allowFailure = false;
- failures++;
- if( res.isEmpty() && !initial && failures == 1 ) {
- /* for now we are only allowing 1 node to be down on a reconfig. this can be made to be a minority
- trying to keep change small as release is near.
- */
+ failures += i->votes;
+ if( !initial && failures <= allowableFailures ) {
const Member* m = theReplSet->findById( i->_id );
if( m ) {
- // ok, so this was an existing member (wouldn't make sense to add to config a new member that is down)
assert( m->h().toString() == i->h.toString() );
- allowFailure = true;
}
+ // it's okay if the down member isn't part of the config,
+ // we might be adding a new member that isn't up yet
+ allowFailure = true;
}
if( !allowFailure ) {
- string msg = string("need members up to initiate, not ok : ") + i->h.toString();
+ string msg = string("need all members up to initiate, not ok : ") + i->h.toString();
if( !initial )
msg = string("need most members up to reconfigure, not ok : ") + i->h.toString();
uasserted(13144, msg);
@@ -133,6 +137,9 @@ namespace mongo {
!hasData || i->h.isSelf());
}
}
+ if (down.size() > 0) {
+ result.append("down", down);
+ }
}
class CmdReplSetInitiate : public ReplSetCommand {
@@ -143,7 +150,7 @@ namespace mongo {
h << "Initiate/christen a replica set.";
h << "\nhttp://www.mongodb.org/display/DOCS/Replica+Set+Commands";
}
- virtual bool run(const string& , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ virtual bool run(const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
log() << "replSet replSetInitiate admin command received from client" << rsLog;
if( !replSet ) {
@@ -179,7 +186,7 @@ namespace mongo {
if( ReplSet::startupStatus == ReplSet::BADCONFIG ) {
errmsg = "server already in BADCONFIG state (check logs); not initiating";
- result.append("info", ReplSet::startupStatusMsg);
+ result.append("info", ReplSet::startupStatusMsg.get());
return false;
}
if( ReplSet::startupStatus != ReplSet::EMPTYCONFIG ) {
@@ -204,6 +211,7 @@ namespace mongo {
b.append("_id", name);
bob members;
members.append("0", BSON( "_id" << 0 << "host" << HostAndPort::Me().toString() ));
+ result.append("me", HostAndPort::Me().toString());
for( unsigned i = 0; i < seeds.size(); i++ )
members.append(bob::numStr(i+1), BSON( "_id" << i+1 << "host" << seeds[i].toString()));
b.appendArray("members", members.obj());
@@ -226,7 +234,7 @@ namespace mongo {
log() << "replSet replSetInitiate config object parses ok, " << newConfig.members.size() << " members specified" << rsLog;
- checkMembersUpForConfigChange(newConfig, true);
+ checkMembersUpForConfigChange(newConfig, result, true);
log() << "replSet replSetInitiate all members seem up" << rsLog;
@@ -238,7 +246,7 @@ namespace mongo {
log() << "replSet replSetInitiate config now saved locally. Should come online in about a minute." << rsLog;
result.append("info", "Config now saved locally. Should come online in about a minute.");
ReplSet::startupStatus = ReplSet::SOON;
- ReplSet::startupStatusMsg = "Received replSetInitiate - should come online shortly.";
+ ReplSet::startupStatusMsg.set("Received replSetInitiate - should come online shortly.");
}
catch( DBException& e ) {
log() << "replSet replSetInitiate exception: " << e.what() << rsLog;
@@ -248,6 +256,11 @@ namespace mongo {
errmsg = string("couldn't initiate : ") + e.what();
return false;
}
+ catch( string& e2 ) {
+ log() << e2 << rsLog;
+ errmsg = e2;
+ return false;
+ }
return true;
}
diff --git a/db/repl/rs_member.h b/db/repl/rs_member.h
index b685c04..d60bb52 100644
--- a/db/repl/rs_member.h
+++ b/db/repl/rs_member.h
@@ -49,6 +49,7 @@ namespace mongo {
MemberState(MS ms = RS_UNKNOWN) : s(ms) { }
explicit MemberState(int ms) : s((MS) ms) { }
+ bool startup() const { return s == RS_STARTUP; }
bool primary() const { return s == RS_PRIMARY; }
bool secondary() const { return s == RS_SECONDARY; }
bool recovering() const { return s == RS_RECOVERING; }
@@ -79,6 +80,8 @@ namespace mongo {
DiagStr lastHeartbeatMsg;
OpTime opTime;
int skew;
+ unsigned int ping; // milliseconds
+ static unsigned int numPings;
bool up() const { return health > 0; }
@@ -104,4 +107,20 @@ namespace mongo {
hbstate != old.hbstate;
}
+ inline string MemberState::toString() const {
+ switch ( s ) {
+ case RS_STARTUP: return "STARTUP";
+ case RS_PRIMARY: return "PRIMARY";
+ case RS_SECONDARY: return "SECONDARY";
+ case RS_RECOVERING: return "RECOVERING";
+ case RS_FATAL: return "FATAL";
+ case RS_STARTUP2: return "STARTUP2";
+ case RS_ARBITER: return "ARBITER";
+ case RS_DOWN: return "DOWN";
+ case RS_ROLLBACK: return "ROLLBACK";
+ case RS_UNKNOWN: return "UNKNOWN";
+ }
+ return "";
+ }
+
}
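
The new ping and numPings fields track heartbeat round-trip times in milliseconds. One plausible use is a running average folded in on every heartbeat; the sketch below is hypothetical (the averaging scheme is assumed, not shown in this diff):

    #include <iostream>

    // Hypothetical sketch: cumulative average of heartbeat round trips.
    class PingTracker {
        int _avgMillis;
        unsigned _numPings;
    public:
        PingTracker() : _avgMillis(0), _numPings(0) {}
        void record(int millis) {
            // incremental average: avg += (x - avg) / n
            ++_numPings;
            _avgMillis += (millis - _avgMillis) / (int)_numPings;
        }
        int avg() const { return _avgMillis; }
    };

    int main() {
        PingTracker t;
        t.record(10); t.record(20); t.record(30);
        std::cout << "avg ping: " << t.avg() << "ms" << std::endl;  // 20ms
    }
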
diff --git a/db/repl/rs_rollback.cpp b/db/repl/rs_rollback.cpp
index 0b4cc28..f012e65 100644
--- a/db/repl/rs_rollback.cpp
+++ b/db/repl/rs_rollback.cpp
@@ -20,7 +20,10 @@
#include "../../client/dbclient.h"
#include "rs.h"
#include "../repl.h"
-#include "../query.h"
+#include "../ops/query.h"
+#include "../cloner.h"
+#include "../ops/update.h"
+#include "../ops/delete.h"
/* Scenarios
@@ -62,7 +65,6 @@ namespace mongo {
using namespace bson;
- bool copyCollectionFromRemote(const string& host, const string& ns, const BSONObj& query, string& errmsg, bool logforrepl);
void incRBID();
class rsfatal : public std::exception {
@@ -227,9 +229,9 @@ namespace mongo {
log() << "replSet info rollback our last optime: " << ourTime.toStringPretty() << rsLog;
log() << "replSet info rollback their last optime: " << theirTime.toStringPretty() << rsLog;
log() << "replSet info rollback diff in end of log times: " << diff << " seconds" << rsLog;
- if( diff > 3600 ) {
+ if( diff > 1800 ) {
log() << "replSet rollback too long a time period for a rollback." << rsLog;
- throw "error not willing to roll back more than one hour of data";
+ throw "error not willing to roll back more than 30 minutes of data";
}
}
@@ -339,7 +341,7 @@ namespace mongo {
{
/* TODO : slow. lots of round trips. */
n++;
- bo good= them->findOne(d.ns, d._id.wrap()).getOwned();
+ bo good= them->findOne(d.ns, d._id.wrap(), NULL, QueryOption_SlaveOk).getOwned();
totSize += good.objsize();
uassert( 13410, "replSet too much data to roll back", totSize < 300 * 1024 * 1024 );
@@ -393,7 +395,7 @@ namespace mongo {
dropCollection(ns, errmsg, res);
{
dbtemprelease r;
- bool ok = copyCollectionFromRemote(them->getServerAddress(), ns, bo(), errmsg, false);
+ bool ok = copyCollectionFromRemote(them->getServerAddress(), ns, bo(), errmsg, false, true, false);
if( !ok ) {
log() << "replSet rollback error resyncing collection " << ns << ' ' << errmsg << rsLog;
throw "rollback error resyncing rollection [1]";
@@ -572,7 +574,7 @@ namespace mongo {
sethbmsg("rollback 6");
// clean up oplog
- log(2) << "replSet rollback truncate oplog after " << h.commonPoint.toStringPretty() << rsLog;
+ LOG(2) << "replSet rollback truncate oplog after " << h.commonPoint.toStringPretty() << rsLog;
// todo: fatal error if this throws?
oplogDetails->cappedTruncateAfter(rsoplog, h.commonPointOurDiskloc, false);
@@ -607,26 +609,20 @@ namespace mongo {
return 2;
}
- if( box.getState().secondary() ) {
+ if( state().secondary() ) {
/* by doing this, we will not service reads (return an error as we aren't in secondary state.
that perhaps is moot because of the write lock above, but that write lock probably gets deferred
or removed or yielded later anyway.
also, this is better for status reporting - we know what is happening.
*/
- box.change(MemberState::RS_ROLLBACK, _self);
+ changeState(MemberState::RS_ROLLBACK);
}
HowToFixUp how;
sethbmsg("rollback 1");
{
r.resetCursor();
- /*DBClientConnection us(false, 0, 0);
- string errmsg;
- if( !us.connect(HostAndPort::me().toString(),errmsg) ) {
- sethbmsg("rollback connect to self failure" + errmsg);
- return;
- }*/
sethbmsg("rollback 2 FindCommonPoint");
try {
@@ -668,7 +664,7 @@ namespace mongo {
/* success - leave "ROLLBACK" state
can go to SECONDARY once minvalid is achieved
*/
- box.change(MemberState::RS_RECOVERING, _self);
+ changeState(MemberState::RS_RECOVERING);
}
return 0;
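
The window check above tightened from one hour (3600s) to 30 minutes (1800s) of end-of-log divergence. A standalone sketch of the guard, reducing optimes to epoch seconds (an assumption; real optimes pair a timestamp with an increment):

    #include <iostream>
    #include <stdexcept>

    // Sketch: refuse a rollback when the two oplogs' newest entries are
    // more than 30 minutes apart, mirroring the diff > 1800 check above.
    void checkRollbackWindow(long long ourTimeSecs, long long theirTimeSecs) {
        long long diff = ourTimeSecs - theirTimeSecs;
        if (diff < 0)
            diff = -diff;
        if (diff > 1800)
            throw std::runtime_error(
                "error not willing to roll back more than 30 minutes of data");
    }

    int main() {
        checkRollbackWindow(1000000000, 1000001000);      // 1000s apart: ok
        try {
            checkRollbackWindow(1000000000, 1000002000);  // 2000s apart: refused
        }
        catch (std::exception& e) {
            std::cout << e.what() << std::endl;
        }
    }
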
diff --git a/db/repl/rs_sync.cpp b/db/repl/rs_sync.cpp
index 8d06fcc..b29328b 100644
--- a/db/repl/rs_sync.cpp
+++ b/db/repl/rs_sync.cpp
@@ -20,28 +20,28 @@
#include "rs.h"
#include "../repl.h"
#include "connections.h"
+
namespace mongo {
using namespace bson;
extern unsigned replSetForceInitialSyncFailure;
+ void NOINLINE_DECL blank(const BSONObj& o) {
+ if( *o.getStringField("op") != 'n' ) {
+ log() << "replSet skipping bad op in oplog: " << o.toString() << rsLog;
+ }
+ }
+
/* apply the log op that is in param o */
void ReplSetImpl::syncApply(const BSONObj &o) {
- char db[MaxDatabaseNameLen];
const char *ns = o.getStringField("ns");
- nsToDatabase(ns, db);
-
if ( *ns == '.' || *ns == 0 ) {
- if( *o.getStringField("op") == 'n' )
- return;
- log() << "replSet skipping bad op in oplog: " << o.toString() << endl;
+ blank(o);
return;
}
Client::Context ctx(ns);
ctx.getClient()->curop()->reset();
-
- /* todo : if this asserts, do we want to ignore or not? */
applyOperation_inlock(o);
}
@@ -63,15 +63,11 @@ namespace mongo {
return false;
}
- {
- BSONObjBuilder q;
- q.appendDate("$gte", applyGTE.asDate());
- BSONObjBuilder query;
- query.append("ts", q.done());
- BSONObj queryObj = query.done();
- r.query(rsoplog, queryObj);
+ r.tailingQueryGTE( rsoplog, applyGTE );
+ if ( !r.haveCursor() ) {
+ log() << "replSet initial sync oplog query error" << rsLog;
+ return false;
}
- assert( r.haveCursor() );
{
if( !r.more() ) {
@@ -83,7 +79,7 @@ namespace mongo {
OpTime t = op["ts"]._opTime();
r.putBack(op);
- if( op.firstElement().fieldName() == string("$err") ) {
+ if( op.firstElementFieldName() == string("$err") ) {
log() << "replSet initial sync error querying " << rsoplog << " on " << hn << " : " << op.toString() << rsLog;
return false;
}
@@ -95,6 +91,9 @@ namespace mongo {
log() << "replSet initial sync but received a first optime of " << t << " from " << hn << rsLog;
return false;
}
+
+ sethbmsg(str::stream() << "initial oplog application from " << hn << " starting at "
+ << t.toStringPretty() << " to " << minValid.toStringPretty());
}
}
catch(DBException& e) {
@@ -107,6 +106,7 @@ namespace mongo {
// todo : use exhaust
OpTime ts;
+ time_t start = time(0);
unsigned long long n = 0;
while( 1 ) {
try {
@@ -139,18 +139,35 @@ namespace mongo {
}
_logOpObjRS(o); /* with repl sets we write the ops to our oplog too */
}
- if( ++n % 100000 == 0 ) {
- // simple progress metering
- log() << "replSet initialSyncOplogApplication " << n << rsLog;
+
+ if ( ++n % 1000 == 0 ) {
+ time_t now = time(0);
+ if (now - start > 10) {
+ // simple progress metering
+ log() << "replSet initialSyncOplogApplication applied " << n << " operations, synced to "
+ << ts.toStringPretty() << rsLog;
+ start = now;
+ }
}
getDur().commitIfNeeded();
}
catch (DBException& e) {
+ // skip duplicate key exceptions
if( e.getCode() == 11000 || e.getCode() == 11001 ) {
- // skip duplicate key exceptions
continue;
}
+
+ // handle cursor not found (just requery)
+ if( e.getCode() == 13127 ) {
+ r.resetCursor();
+ r.tailingQueryGTE(rsoplog, ts);
+ if( r.haveCursor() ) {
+ continue;
+ }
+ }
+
+ // TODO: handle server restart
if( ts <= minValid ) {
// didn't make it far enough
@@ -171,6 +188,16 @@ namespace mongo {
*/
bool ReplSetImpl::tryToGoLiveAsASecondary(OpTime& /*out*/ minvalid) {
bool golive = false;
+
+ {
+ lock lk( this );
+
+ if (_maintenanceMode > 0) {
+ // we're not actually going live
+ return true;
+ }
+ }
+
{
readlock lk("local.replset.minvalid");
BSONObj mv;
@@ -190,35 +217,35 @@ namespace mongo {
return golive;
}
- /**
- * Checks if the oplog given is too far ahead to read from.
- *
- * @param r the oplog
- * @param hn the hostname (for log messages)
- *
- * @return if we are stale compared to the oplog on hn
- */
bool ReplSetImpl::_isStale(OplogReader& r, const string& hn) {
BSONObj remoteOldestOp = r.findOne(rsoplog, Query());
OpTime ts = remoteOldestOp["ts"]._opTime();
DEV log() << "replSet remoteOldestOp: " << ts.toStringLong() << rsLog;
- else log(3) << "replSet remoteOldestOp: " << ts.toStringLong() << rsLog;
+ else LOG(3) << "replSet remoteOldestOp: " << ts.toStringLong() << rsLog;
DEV {
- // debugging sync1.js...
log() << "replSet lastOpTimeWritten: " << lastOpTimeWritten.toStringLong() << rsLog;
log() << "replSet our state: " << state().toString() << rsLog;
}
- if( lastOpTimeWritten < ts ) {
- log() << "replSet error RS102 too stale to catch up, at least from " << hn << rsLog;
- log() << "replSet our last optime : " << lastOpTimeWritten.toStringLong() << rsLog;
- log() << "replSet oldest at " << hn << " : " << ts.toStringLong() << rsLog;
- log() << "replSet See http://www.mongodb.org/display/DOCS/Resyncing+a+Very+Stale+Replica+Set+Member" << rsLog;
- sethbmsg("error RS102 too stale to catch up");
- changeState(MemberState::RS_RECOVERING);
- sleepsecs(120);
- return true;
- }
- return false;
+ if( lastOpTimeWritten >= ts ) {
+ return false;
+ }
+
+ // we're stale
+ log() << "replSet error RS102 too stale to catch up, at least from " << hn << rsLog;
+ log() << "replSet our last optime : " << lastOpTimeWritten.toStringLong() << rsLog;
+ log() << "replSet oldest at " << hn << " : " << ts.toStringLong() << rsLog;
+ log() << "replSet See http://www.mongodb.org/display/DOCS/Resyncing+a+Very+Stale+Replica+Set+Member" << rsLog;
+
+ // reset minvalid so that we can't become primary prematurely
+ {
+ writelock lk("local.replset.minvalid");
+ Helpers::putSingleton("local.replset.minvalid", remoteOldestOp);
+ }
+
+ sethbmsg("error RS102 too stale to catch up");
+ changeState(MemberState::RS_RECOVERING);
+ sleepsecs(120);
+ return true;
}
/**
@@ -234,7 +261,7 @@ namespace mongo {
assert(r.conn() == 0);
if( !r.connect(hn) ) {
- log(2) << "replSet can't connect to " << hn << " to read operations" << rsLog;
+ LOG(2) << "replSet can't connect to " << hn << " to read operations" << rsLog;
r.resetConnection();
return false;
}
@@ -250,8 +277,11 @@ namespace mongo {
// todo : locking vis a vis the mgr...
OplogReader r;
string hn;
+ const Member *target = 0;
- const Member *target = box.getPrimary();
+ // if we cannot reach the master but someone else is more up-to-date
+ // than we are, sync from them.
+ target = getMemberToSyncTo();
if (target != 0) {
hn = target->h().toString();
if (!_getOplogReader(r, hn)) {
@@ -260,32 +290,21 @@ namespace mongo {
target = 0;
}
}
-
- // if we cannot reach the master but someone else is more up-to-date
- // than we are, sync from them.
- if( target == 0 ) {
- for(Member *m = head(); m; m=m->next()) {
- hn = m->h().toString();
- if (m->hbinfo().up() && m->state().readable() &&
- (m->hbinfo().opTime > lastOpTimeWritten) &&
- m->config().slaveDelay == 0 &&
- _getOplogReader(r, hn)) {
- target = m;
- break;
- }
- }
-
- // no server found
- if (target == 0) {
- // if there is no one to sync from
- OpTime minvalid;
- tryToGoLiveAsASecondary(minvalid);
- return;
- }
+
+ // no server found
+ if (target == 0) {
+ // if there is no one to sync from
+ OpTime minvalid;
+ tryToGoLiveAsASecondary(minvalid);
+ return;
}
-
+
r.tailingQueryGTE(rsoplog, lastOpTimeWritten);
- assert( r.haveCursor() );
+ // if target cut connections between connecting and querying (for
+ // example, because it stepped down) we might not have a cursor
+ if ( !r.haveCursor() ) {
+ return;
+ }
uassert(1000, "replSet source for syncing doesn't seem to be await capable -- is it an older version of mongodb?", r.awaitCapable() );
@@ -314,22 +333,14 @@ namespace mongo {
sleepsecs(2);
}
return;
- /*
- log() << "replSet syncTail error querying oplog >= " << lastOpTimeWritten.toString() << " from " << hn << rsLog;
- try {
- log() << "replSet " << hn << " last op: " << r.getLastOp(rsoplog).toString() << rsLog;
- }
- catch(...) { }
- sleepsecs(1);
- return;*/
}
BSONObj o = r.nextSafe();
OpTime ts = o["ts"]._opTime();
long long h = o["h"].numberLong();
if( ts != lastOpTimeWritten || h != lastH ) {
- log() << "replSet our last op time written: " << lastOpTimeWritten.toStringPretty() << endl;
- log() << "replset source's GTE: " << ts.toStringPretty() << endl;
+ log() << "replSet our last op time written: " << lastOpTimeWritten.toStringPretty() << rsLog;
+ log() << "replset source's GTE: " << ts.toStringPretty() << rsLog;
syncRollback(r);
return;
}
@@ -362,15 +373,8 @@ namespace mongo {
/* todo: too stale capability */
}
- {
- const Member *primary = box.getPrimary();
-
- if( !target->hbinfo().hbstate.readable() ||
- // if we are not syncing from the primary, return (if
- // it's up) so that we can try accessing it again
- (target != primary && primary != 0)) {
- return;
- }
+ if( !target->hbinfo().hbstate.readable() ) {
+ return;
}
}
if( !r.more() )
@@ -389,20 +393,22 @@ namespace mongo {
long long sleeptime = sd - lag;
if( sleeptime > 0 ) {
uassert(12000, "rs slaveDelay differential too big check clocks and systems", sleeptime < 0x40000000);
- log() << "replSet temp slavedelay sleep:" << sleeptime << rsLog;
if( sleeptime < 60 ) {
sleepsecs((int) sleeptime);
}
else {
+ log() << "replSet slavedelay sleep long time: " << sleeptime << rsLog;
// sleep(hours) would prevent reconfigs from taking effect & such!
long long waitUntil = b + sleeptime;
while( 1 ) {
sleepsecs(6);
if( time(0) >= waitUntil )
break;
+
if( !target->hbinfo().hbstate.readable() ) {
break;
}
+
if( myConfig().slaveDelay != sd ) // reconf
break;
}
@@ -411,7 +417,7 @@ namespace mongo {
}
- {
+ try {
writelock lk("");
/* if we have become primary, we don't want to apply things from elsewhere
@@ -423,16 +429,22 @@ namespace mongo {
}
syncApply(o);
- _logOpObjRS(o); /* with repl sets we write the ops to our oplog too: */
+ _logOpObjRS(o); // with repl sets we write the ops to our oplog too
+ }
+ catch (DBException& e) {
+ sethbmsg(str::stream() << "syncTail: " << e.toString() << ", syncing: " << o);
+ sleepsecs(30);
+ return;
}
}
}
r.tailCheck();
if( !r.haveCursor() ) {
- log(1) << "replSet end syncTail pass with " << hn << rsLog;
+ LOG(1) << "replSet end syncTail pass with " << hn << rsLog;
// TODO : reuse our connection to the primary.
return;
}
+
if( !target->hbinfo().hbstate.readable() ) {
return;
}
@@ -446,7 +458,7 @@ namespace mongo {
sleepsecs(1);
return;
}
- if( sp.state.fatal() ) {
+ if( sp.state.fatal() || sp.state.startup() ) {
sleepsecs(5);
return;
}
@@ -462,32 +474,23 @@ namespace mongo {
}
void ReplSetImpl::syncThread() {
- /* test here was to force a receive timeout
- ScopedConn c("localhost");
- bo info;
- try {
- log() << "this is temp" << endl;
- c.runCommand("admin", BSON("sleep"<<120), info);
- log() << info.toString() << endl;
- c.runCommand("admin", BSON("sleep"<<120), info);
- log() << "temp" << endl;
- }
- catch( DBException& e ) {
- log() << e.toString() << endl;
- c.runCommand("admin", BSON("sleep"<<120), info);
- log() << "temp" << endl;
- }
- */
-
while( 1 ) {
- if( myConfig().arbiterOnly )
+ // After a reconfig, we may not be in the replica set anymore, so
+ // check that we are in the set (and not an arbiter) before
+ // trying to sync with other replicas.
+ if( ! _self ) {
+ log() << "replSet warning did not detect own host and port, not syncing, config: " << theReplSet->config() << rsLog;
+ return;
+ }
+ if( myConfig().arbiterOnly ) {
return;
+ }
try {
_syncThread();
}
catch(DBException& e) {
- sethbmsg("syncThread: " + e.toString());
+ sethbmsg(str::stream() << "syncThread: " << e.toString());
sleepsecs(10);
}
catch(...) {
@@ -501,7 +504,9 @@ namespace mongo {
are no heartbeat threads, so we do it here to be sure. this is relevant if the singleton
member has done a stepDown() and needs to come back up.
*/
- OCCASIONALLY mgr->send( boost::bind(&Manager::msgCheckNewState, theReplSet->mgr) );
+ OCCASIONALLY {
+ mgr->send( boost::bind(&Manager::msgCheckNewState, theReplSet->mgr) );
+ }
}
}
@@ -513,13 +518,115 @@ namespace mongo {
}
n++;
- Client::initThread("replica set sync");
- cc().iAmSyncThread();
- if (!noauth) {
- cc().getAuthenticationInfo()->authorize("local");
- }
+ Client::initThread("rsSync");
+ cc().iAmSyncThread(); // for isSyncThread(), which is not used much; used in the secondary create-index code
+ replLocalAuth();
theReplSet->syncThread();
cc().shutdown();
}
+ void GhostSync::starting() {
+ Client::initThread("rsGhostSync");
+ replLocalAuth();
+ }
+
+ void GhostSync::associateSlave(const BSONObj& id, const int memberId) {
+ const OID rid = id["_id"].OID();
+ rwlock lk( _lock , true );
+ GhostSlave &slave = _ghostCache[rid];
+ if (slave.init) {
+ LOG(1) << "tracking " << slave.slave->h().toString() << " as " << rid << rsLog;
+ return;
+ }
+
+ slave.slave = (Member*)rs->findById(memberId);
+ if (slave.slave != 0) {
+ slave.init = true;
+ }
+ else {
+ log() << "replset couldn't find a slave with id " << memberId
+ << ", not tracking " << rid << rsLog;
+ }
+ }
+
+ void GhostSync::updateSlave(const mongo::OID& rid, const OpTime& last) {
+ rwlock lk( _lock , false );
+ MAP::iterator i = _ghostCache.find( rid );
+ if ( i == _ghostCache.end() ) {
+ OCCASIONALLY warning() << "couldn't update slave " << rid << " no entry" << rsLog;
+ return;
+ }
+
+ GhostSlave& slave = i->second;
+ if (!slave.init) {
+ OCCASIONALLY log() << "couldn't update slave " << rid << " not init" << rsLog;
+ return;
+ }
+
+ ((ReplSetConfig::MemberCfg)slave.slave->config()).updateGroups(last);
+ }
+
+ void GhostSync::percolate(const BSONObj& id, const OpTime& last) {
+ const OID rid = id["_id"].OID();
+ GhostSlave* slave;
+ {
+ rwlock lk( _lock , false );
+
+ MAP::iterator i = _ghostCache.find( rid );
+ if ( i == _ghostCache.end() ) {
+ OCCASIONALLY log() << "couldn't percolate slave " << rid << " no entry" << rsLog;
+ return;
+ }
+
+ slave = &(i->second);
+ if (!slave->init) {
+ OCCASIONALLY log() << "couldn't percolate slave " << rid << " not init" << rsLog;
+ return;
+ }
+ }
+
+ assert(slave->slave);
+
+ const Member *target = rs->_currentSyncTarget;
+ if (!target || rs->box.getState().primary()
+ // we are currently syncing from someone who's syncing from us
+ // the target might end up with a new Member, but slave->slave never
+ // changes, so we'll compare the names
+ || target == slave->slave || target->fullName() == slave->slave->fullName()) {
+ LOG(1) << "replica set ghost target no good" << endl;
+ return;
+ }
+
+ try {
+ if (!slave->reader.haveCursor()) {
+ if (!slave->reader.connect(id, slave->slave->id(), target->fullName())) {
+ // error message logged in OplogReader::connect
+ return;
+ }
+ slave->reader.ghostQueryGTE(rsoplog, last);
+ }
+
+ LOG(1) << "replSet last: " << slave->last.toString() << " to " << last.toString() << rsLog;
+ if (slave->last > last) {
+ return;
+ }
+
+ while (slave->last <= last) {
+ if (!slave->reader.more()) {
+ // we'll be back
+ return;
+ }
+
+ BSONObj o = slave->reader.nextSafe();
+ slave->last = o["ts"]._opTime();
+ }
+ LOG(2) << "now last is " << slave->last.toString() << rsLog;
+ }
+ catch (DBException& e) {
+ // we'll be back
+ LOG(2) << "replSet ghost sync error: " << e.what() << " for "
+ << slave->slave->fullName() << rsLog;
+ slave->reader.resetConnection();
+ }
+ }
}
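
One pattern above is worth isolating: the initial-sync loop now meters progress by wall-clock time (a cheap modulo check every 1000 ops, a log line only if 10 seconds have passed) instead of logging every 100000 ops. A generic standalone sketch of that throttling:

    #include <ctime>
    #include <iostream>

    int main() {
        // Sketch: hot-loop progress metering throttled by wall-clock time;
        // with real per-op work this prints at most once per 10 seconds.
        std::time_t start = std::time(0);
        unsigned long long n = 0;
        for (int op = 0; op < 10000000; ++op) {
            // ... apply one operation here ...
            if (++n % 1000 == 0) {
                std::time_t now = std::time(0);
                if (now - start > 10) {
                    std::cout << "applied " << n << " operations" << std::endl;
                    start = now;
                }
            }
        }
        return 0;
    }
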
diff --git a/db/repl_block.cpp b/db/repl_block.cpp
index 05be343..dcac121 100644
--- a/db/repl_block.cpp
+++ b/db/repl_block.cpp
@@ -24,7 +24,7 @@
#include "../util/background.h"
#include "../util/mongoutils/str.h"
#include "../client/dbclient.h"
-#include "replpair.h"
+#include "replutil.h"
//#define REPLDEBUG(x) log() << "replBlock: " << x << endl;
#define REPLDEBUG(x)
@@ -41,7 +41,7 @@ namespace mongo {
struct Ident {
- Ident(BSONObj r,string h,string n) {
+ Ident(const BSONObj& r, const string& h, const string& n) {
BSONObjBuilder b;
b.appendElements( r );
b.append( "host" , h );
@@ -50,7 +50,7 @@ namespace mongo {
}
bool operator<( const Ident& other ) const {
- return obj.woCompare( other.obj ) < 0;
+ return obj["_id"].OID() < other.obj["_id"].OID();
}
BSONObj obj;
@@ -122,6 +122,11 @@ namespace mongo {
Ident ident(rid,host,ns);
Info& i = _slaves[ ident ];
+
+ if (theReplSet && theReplSet->isPrimary()) {
+ theReplSet->ghost->updateSlave(ident.obj["_id"].OID(), last);
+ }
+
if ( i.loc ) {
if( i.owned )
i.loc[0] = last;
@@ -153,11 +158,34 @@ namespace mongo {
}
- bool opReplicatedEnough( OpTime op , int w ) {
+ bool opReplicatedEnough( OpTime op , BSONElement w ) {
RARELY {
REPLDEBUG( "looking for : " << op << " w=" << w );
}
+ if (w.isNumber()) {
+ return replicatedToNum(op, w.numberInt());
+ }
+
+ if (!theReplSet) {
+ return false;
+ }
+
+ string wStr = w.String();
+ if (wStr == "majority") {
+ // use the entire set, including arbiters, to prevent writing
+ // to a majority of the set but not a majority of voters
+ return replicatedToNum(op, theReplSet->config().members.size()/2+1);
+ }
+
+ map<string,ReplSetConfig::TagRule*>::const_iterator it = theReplSet->config().rules.find(wStr);
+ uassert(14830, str::stream() << "unrecognized getLastError mode: " << wStr,
+ it != theReplSet->config().rules.end());
+
+ return op <= (*it).second->last;
+ }
+
+ bool replicatedToNum(OpTime& op, int w) {
if ( w <= 1 || ! _isMaster() )
return true;
@@ -203,12 +231,23 @@ namespace mongo {
return;
slaveTracking.update( rid , curop.getRemoteString( false ) , ns , lastOp );
+
+ if (theReplSet && !theReplSet->isPrimary()) {
+ // we don't know the slave's port, so we make the replica set keep
+ // a map of rids to slaves
+ log(2) << "percolating " << lastOp.toString() << " from " << rid << endl;
+ theReplSet->ghost->send( boost::bind(&GhostSync::percolate, theReplSet->ghost, rid, lastOp) );
+ }
}
- bool opReplicatedEnough( OpTime op , int w ) {
+ bool opReplicatedEnough( OpTime op , BSONElement w ) {
return slaveTracking.opReplicatedEnough( op , w );
}
+ bool opReplicatedEnough( OpTime op , int w ) {
+ return slaveTracking.replicatedToNum( op , w );
+ }
+
void resetSlaveCache() {
slaveTracking.reset();
}
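
opReplicatedEnough now accepts either a numeric w or a mode string, and "majority" is computed over the entire member list, arbiters included, so a write cannot satisfy a majority of the set without a majority of voters. A minimal sketch of the dispatch, simplifying the tag-rule lookup to precomputed member counts (an assumption for illustration):

    #include <cassert>
    #include <map>
    #include <string>

    // Sketch: dispatch getLastError's w, which may be a number or a mode
    // name; thresholds are plain counts here, upstream resolves tag rules.
    struct WCheck {
        int memberCount;                   // full set size, arbiters included
        std::map<std::string, int> modes;  // mode name -> required count

        bool replicatedToNum(int acked, int w) const { return acked >= w; }

        bool opReplicatedEnough(int acked, const std::string& w) const {
            if (w == "majority")
                return replicatedToNum(acked, memberCount / 2 + 1);
            std::map<std::string, int>::const_iterator it = modes.find(w);
            assert(it != modes.end());     // upstream uasserts on unknown modes
            return replicatedToNum(acked, it->second);
        }
    };

    int main() {
        WCheck c;
        c.memberCount = 5;                 // e.g. 4 data nodes + 1 arbiter
        assert(c.opReplicatedEnough(3, "majority"));   // 3 >= 5/2+1
        assert(!c.opReplicatedEnough(2, "majority"));
    }
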
diff --git a/db/repl_block.h b/db/repl_block.h
index 978932d..bb74dee 100644
--- a/db/repl_block.h
+++ b/db/repl_block.h
@@ -32,6 +32,7 @@ namespace mongo {
/** @return true if op has made it to w servers */
bool opReplicatedEnough( OpTime op , int w );
+ bool opReplicatedEnough( OpTime op , BSONElement w );
void resetSlaveCache();
unsigned getSlaveCount();
diff --git a/db/replpair.h b/db/replpair.h
deleted file mode 100644
index a551308..0000000
--- a/db/replpair.h
+++ /dev/null
@@ -1,238 +0,0 @@
-/**
-* Copyright (C) 2008 10gen Inc.
-*
-* This program is free software: you can redistribute it and/or modify
-* it under the terms of the GNU Affero General Public License, version 3,
-* as published by the Free Software Foundation.
-*
-* This program is distributed in the hope that it will be useful,
-* but WITHOUT ANY WARRANTY; without even the implied warranty of
-* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-* GNU Affero General Public License for more details.
-*
-* You should have received a copy of the GNU Affero General Public License
-* along with this program. If not, see <http://www.gnu.org/licenses/>.
-*/
-
-#pragma once
-
-#include "db.h"
-#include "dbhelpers.h"
-#include "json.h"
-#include "../client/dbclient.h"
-#include "repl.h"
-#include "cmdline.h"
-#include "repl/rs.h"
-
-namespace mongo {
-
- extern const char *replAllDead;
-
- /* ReplPair is a pair of db servers replicating to one another and cooperating.
-
- Only one member of the pair is active at a time; so this is a smart master/slave
- configuration basically.
-
- You may read from the slave at anytime though (if you don't mind the slight lag).
-
- todo: Could be extended to be more than a pair, thus the name 'Set' -- for example,
- a set of 3...
- */
-
- class ReplPair {
- public:
- enum ReplState {
- State_CantArb = -3,
- State_Confused = -2,
- State_Negotiating = -1,
- State_Slave = 0,
- State_Master = 1
- };
-
- int state;
- ThreadSafeString info; // commentary about our current state
- string arbHost; // "-" for no arbiter. "host[:port]"
- int remotePort;
- string remoteHost;
- string remote; // host:port if port specified.
- // int date; // -1 not yet set; 0=slave; 1=master
-
- string getInfo() {
- stringstream ss;
- ss << " state: ";
- if ( state == 1 ) ss << "1 State_Master ";
- else if ( state == 0 ) ss << "0 State_Slave";
- else
- ss << "<b>" << state << "</b>";
- ss << '\n';
- ss << " info: " << info << '\n';
- ss << " arbhost: " << arbHost << '\n';
- ss << " remote: " << remoteHost << ':' << remotePort << '\n';
-// ss << " date: " << date << '\n';
- return ss.str();
- }
-
- ReplPair(const char *remoteEnd, const char *arbiter);
- virtual ~ReplPair() {}
-
- bool dominant(const string& myname) {
- if ( myname == remoteHost )
- return cmdLine.port > remotePort;
- return myname > remoteHost;
- }
-
- void setMasterLocked( int n, const char *_comment = "" ) {
- dblock p;
- setMaster( n, _comment );
- }
-
- void setMaster(int n, const char *_comment = "");
-
- /* negotiate with our peer who is master; returns state of peer */
- int negotiate(DBClientConnection *conn, string method);
-
- /* peer unreachable, try our arbitrator */
- void arbitrate();
-
- virtual
- DBClientConnection *newClientConnection() const {
- return new DBClientConnection();
- }
- };
-
- extern ReplPair *replPair;
-
- /* note we always return true for the "local" namespace.
-
- we should not allow most operations when not the master
- also we report not master if we are "dead".
-
- See also CmdIsMaster.
-
- If 'client' is not specified, the current client is used.
- */
- inline bool _isMaster() {
- if( replSet ) {
- if( theReplSet )
- return theReplSet->isPrimary();
- return false;
- }
-
- if( ! replSettings.slave )
- return true;
-
- if ( replAllDead )
- return false;
-
- if ( replPair ) {
- if( replPair->state == ReplPair::State_Master )
- return true;
- }
- else {
- if( replSettings.master ) {
- // if running with --master --slave, allow. note that master is also true
- // for repl pairs so the check for replPair above is important.
- return true;
- }
- }
-
- if ( cc().isGod() )
- return true;
-
- return false;
- }
- inline bool isMaster(const char *client = 0) {
- if( _isMaster() )
- return true;
- if ( !client ) {
- Database *database = cc().database();
- assert( database );
- client = database->name.c_str();
- }
- return strcmp( client, "local" ) == 0;
- }
-
- inline void notMasterUnless(bool expr) {
- uassert( 10107 , "not master" , expr );
- }
-
- /* we allow queries to SimpleSlave's -- but not to the slave (nonmaster) member of a replica pair
- so that queries to a pair are realtime consistent as much as possible. use setSlaveOk() to
- query the nonmaster member of a replica pair.
- */
- inline void replVerifyReadsOk(ParsedQuery& pq) {
- if( replSet ) {
- /* todo: speed up the secondary case. as written here there are 2 mutex entries, it can be 1. */
- if( isMaster() ) return;
- uassert(13435, "not master and slaveok=false", pq.hasOption(QueryOption_SlaveOk));
- uassert(13436, "not master or secondary, can't read", theReplSet && theReplSet->isSecondary() );
- }
- else {
- notMasterUnless(isMaster() || pq.hasOption(QueryOption_SlaveOk) || replSettings.slave == SimpleSlave );
- }
- }
-
- inline bool isMasterNs( const char *ns ) {
- char cl[ 256 ];
- nsToDatabase( ns, cl );
- return isMaster( cl );
- }
-
- inline ReplPair::ReplPair(const char *remoteEnd, const char *arb) {
- state = -1;
- remote = remoteEnd;
- remotePort = CmdLine::DefaultDBPort;
- remoteHost = remoteEnd;
- const char *p = strchr(remoteEnd, ':');
- if ( p ) {
- remoteHost = string(remoteEnd, p-remoteEnd);
- remotePort = atoi(p+1);
- uassert( 10125 , "bad port #", remotePort > 0 && remotePort < 0x10000 );
- if ( remotePort == CmdLine::DefaultDBPort )
- remote = remoteHost; // don't include ":27017" as it is default; in case run in diff ways over time, to normalize the hostname format in sources collection
- }
-
- uassert( 10126 , "arbiter parm is missing, use '-' for none", arb);
- arbHost = arb;
- uassert( 10127 , "arbiter parm is empty", !arbHost.empty());
- }
-
- /* This is set to true if we have EVER been up to date -- this way a new pair member
- which is a replacement won't go online as master until we have initially fully synced.
- */
- class PairSync {
- int initialsynccomplete;
- public:
- PairSync() {
- initialsynccomplete = -1;
- }
-
- /* call before using the class. from dbmutex */
- void init() {
- BSONObj o;
- initialsynccomplete = 0;
- if ( Helpers::getSingleton("local.pair.sync", o) )
- initialsynccomplete = 1;
- }
-
- bool initialSyncCompleted() {
- return initialsynccomplete != 0;
- }
-
- void setInitialSyncCompleted() {
- BSONObj o = fromjson("{\"initialsynccomplete\":1}");
- Helpers::putSingleton("local.pair.sync", o);
- initialsynccomplete = 1;
- tlog() << "pair: initial sync complete" << endl;
- }
-
- void setInitialSyncCompletedLocking() {
- if ( initialsynccomplete == 1 )
- return;
- dblock lk;
- setInitialSyncCompleted();
- }
- };
-
-
-} // namespace mongo
diff --git a/db/replutil.h b/db/replutil.h
new file mode 100644
index 0000000..f2bea23
--- /dev/null
+++ b/db/replutil.h
@@ -0,0 +1,98 @@
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+#include "db.h"
+#include "dbhelpers.h"
+#include "json.h"
+#include "../client/dbclient.h"
+#include "repl.h"
+#include "cmdline.h"
+#include "repl/rs.h"
+#include "ops/query.h"
+
+namespace mongo {
+
+ extern const char *replAllDead;
+
+ /* note we always return true for the "local" namespace.
+
+ we should not allow most operations when not the master
+ also we report not master if we are "dead".
+
+ See also CmdIsMaster.
+
+ If 'client' is not specified, the current client is used.
+ */
+ inline bool _isMaster() {
+ if( replSet ) {
+ if( theReplSet )
+ return theReplSet->isPrimary();
+ return false;
+ }
+
+ if( ! replSettings.slave )
+ return true;
+
+ if ( replAllDead )
+ return false;
+
+ if( replSettings.master ) {
+ // if running with --master --slave, allow.
+ return true;
+ }
+
+ if ( cc().isGod() )
+ return true;
+
+ return false;
+ }
+ inline bool isMaster(const char *client = 0) {
+ if( _isMaster() )
+ return true;
+ if ( !client ) {
+ Database *database = cc().database();
+ assert( database );
+ client = database->name.c_str();
+ }
+ return strcmp( client, "local" ) == 0;
+ }
+
+ inline void notMasterUnless(bool expr) {
+ uassert( 10107 , "not master" , expr );
+ }
+
+ /** we allow queries to SimpleSlaves */
+ inline void replVerifyReadsOk(ParsedQuery& pq) {
+ if( replSet ) {
+ /* todo: speed up the secondary case. as written here there are 2 mutex entries, it can be 1. */
+ if( isMaster() ) return;
+ uassert(13435, "not master and slaveok=false", pq.hasOption(QueryOption_SlaveOk));
+ uassert(13436, "not master or secondary, can't read", theReplSet && theReplSet->isSecondary() );
+ }
+ else {
+ notMasterUnless(isMaster() || pq.hasOption(QueryOption_SlaveOk) || replSettings.slave == SimpleSlave );
+ }
+ }
+
+ inline bool isMasterNs( const char *ns ) {
+ char cl[ 256 ];
+ nsToDatabase( ns, cl );
+ return isMaster( cl );
+ }
+
+} // namespace mongo
diff --git a/db/restapi.cpp b/db/restapi.cpp
index 7460c94..b29521e 100644
--- a/db/restapi.cpp
+++ b/db/restapi.cpp
@@ -18,14 +18,14 @@
*/
#include "pch.h"
-#include "../util/miniwebserver.h"
+#include "../util/net/miniwebserver.h"
#include "../util/mongoutils/html.h"
#include "../util/md5.hpp"
#include "instance.h"
#include "dbwebserver.h"
#include "dbhelpers.h"
#include "repl.h"
-#include "replpair.h"
+#include "replutil.h"
#include "clientcursor.h"
#include "background.h"
@@ -279,14 +279,6 @@ namespace mongo {
else {
ss << "\nmaster: " << replSettings.master << '\n';
ss << "slave: " << replSettings.slave << '\n';
- if ( replPair ) {
- ss << "replpair:\n";
- ss << replPair->getInfo();
- }
- bool seemCaughtUp = getInitialSyncCompleted();
- if ( !seemCaughtUp ) ss << "<b>";
- ss << "initialSyncCompleted: " << seemCaughtUp;
- if ( !seemCaughtUp ) ss << "</b>";
ss << '\n';
}
diff --git a/db/scanandorder.cpp b/db/scanandorder.cpp
new file mode 100644
index 0000000..efa9c8d
--- /dev/null
+++ b/db/scanandorder.cpp
@@ -0,0 +1,93 @@
+/* scanandorder.cpp
+ Order results (that aren't already indexed and in order).
+*/
+
+/**
+ * Copyright (C) 2008 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "pch.h"
+#include "scanandorder.h"
+
+namespace mongo {
+
+ const unsigned ScanAndOrder::MaxScanAndOrderBytes = 32 * 1024 * 1024;
+
+ void ScanAndOrder::_add(BSONObj& k, BSONObj o, DiskLoc* loc) {
+ if (!loc) {
+ _best.insert(make_pair(k.getOwned(),o.getOwned()));
+ }
+ else {
+ BSONObjBuilder b;
+ b.appendElements(o);
+ b.append("$diskLoc", loc->toBSONObj());
+ _best.insert(make_pair(k.getOwned(), b.obj().getOwned()));
+ }
+ }
+
+ void ScanAndOrder::_addIfBetter(BSONObj& k, BSONObj o, BestMap::iterator i, DiskLoc* loc) {
+ /* todo : we don't correct _approxSize here. */
+ const BSONObj& worstBestKey = i->first;
+ int c = worstBestKey.woCompare(k, _order._spec.keyPattern);
+ if ( c > 0 ) {
+ // k is better, 'upgrade'
+ _best.erase(i);
+ _add(k, o, loc);
+ }
+ }
+
+
+ void ScanAndOrder::add(BSONObj o, DiskLoc* loc) {
+ assert( o.isValid() );
+ BSONObj k = _order.getKeyFromObject(o);
+ if ( k.isEmpty() ) {
+ return;
+ }
+ if ( (int) _best.size() < _limit ) {
+ _approxSize += k.objsize();
+ _approxSize += o.objsize();
+
+ /* note : adjust when bson return limit adjusts. note this limit should be a bit higher. */
+ uassert( 10128 , "too much data for sort() with no index. add an index or specify a smaller limit", _approxSize < MaxScanAndOrderBytes );
+
+ _add(k, o, loc);
+ return;
+ }
+ BestMap::iterator i;
+ assert( _best.end() != _best.begin() );
+ i = _best.end();
+ i--;
+ _addIfBetter(k, o, i, loc);
+ }
+
+
+ void ScanAndOrder::fill(BufBuilder& b, Projection *filter, int& nout ) const {
+ int n = 0;
+ int nFilled = 0;
+ for ( BestMap::const_iterator i = _best.begin(); i != _best.end(); i++ ) {
+ n++;
+ if ( n <= _startFrom )
+ continue;
+ const BSONObj& o = i->second;
+ fillQueryResultFromObj(b, filter, o);
+ nFilled++;
+ if ( nFilled >= _limit )
+ break;
+ uassert( 10129 , "too much data for sort() with no index", b.len() < (int)MaxScanAndOrderBytes ); // appserver limit
+ }
+ nout = nFilled;
+ }
+
+} // namespace mongo
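
ScanAndOrder's core structure, an ordered multimap capped at limit entries where a new key can only displace the current worst (last) element, is a standard bounded top-k. A generic standalone sketch with int keys (the BSON comparator, $diskLoc decoration, and size accounting are elided):

    #include <cassert>
    #include <map>
    #include <string>

    // Sketch: keep the 'limit' smallest keys seen so far; once full, a new
    // key is admitted only if it beats the worst kept (the last element).
    class TopK {
        std::multimap<int, std::string> _best;
        size_t _limit;
    public:
        explicit TopK(size_t limit) : _limit(limit) {}
        void add(int key, const std::string& doc) {
            if (_best.size() < _limit) {
                _best.insert(std::make_pair(key, doc));
                return;
            }
            std::multimap<int, std::string>::iterator worst = --_best.end();
            if (key < worst->first) {      // better than the worst kept
                _best.erase(worst);
                _best.insert(std::make_pair(key, doc));
            }
        }
        int worstKey() const { return (--_best.end())->first; }
    };

    int main() {
        TopK t(2);
        t.add(5, "e"); t.add(1, "a"); t.add(3, "c"); t.add(2, "b");
        assert(t.worstKey() == 2);         // kept keys {1, 2}
    }
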
diff --git a/db/scanandorder.h b/db/scanandorder.h
index 4c491fa..33e76f6 100644
--- a/db/scanandorder.h
+++ b/db/scanandorder.h
@@ -20,27 +20,33 @@
#pragma once
+#include "indexkey.h"
+#include "queryutil.h"
+#include "projection.h"
+
namespace mongo {
/* todo:
- _ handle compound keys with differing directions. we don't handle this yet: neither here nor in indexes i think!!!
_ limit amount of data
*/
- /* see also IndexDetails::getKeysFromObject, which needs some merging with this. */
-
class KeyType : boost::noncopyable {
public:
- BSONObj pattern; // e.g., { ts : -1 }
+ IndexSpec _spec;
+ FieldRangeVector _keyCutter;
public:
- KeyType(BSONObj _keyPattern) {
- pattern = _keyPattern;
- assert( !pattern.isEmpty() );
+ KeyType(BSONObj pattern, const FieldRangeSet &frs):
+ _spec((assert(!pattern.isEmpty()),pattern)),
+ _keyCutter(frs, _spec, 1) {
}
- // returns the key value for o
+ /**
+ * @return first key of the object that would be encountered while
+ * scanning index with keySpec 'pattern' using constraints 'frs', or
+ * BSONObj() if no such key.
+ */
BSONObj getKeyFromObject(BSONObj o) {
- return o.extractFields(pattern,true);
+ return _keyCutter.firstMatch(o);
}
};
@@ -71,88 +77,34 @@ namespace mongo {
typedef multimap<BSONObj,BSONObj,BSONObjCmp> BestMap;
class ScanAndOrder {
- BestMap best; // key -> full object
- int startFrom;
- int limit; // max to send back.
- KeyType order;
- unsigned approxSize;
-
- void _add(BSONObj& k, BSONObj o, DiskLoc* loc) {
- if (!loc) {
- best.insert(make_pair(k.getOwned(),o.getOwned()));
- }
- else {
- BSONObjBuilder b;
- b.appendElements(o);
- b.append("$diskLoc", loc->toBSONObj());
- best.insert(make_pair(k.getOwned(), b.obj().getOwned()));
- }
- }
-
- void _addIfBetter(BSONObj& k, BSONObj o, BestMap::iterator i, DiskLoc* loc) {
- /* todo : we don't correct approxSize here. */
- const BSONObj& worstBestKey = i->first;
- int c = worstBestKey.woCompare(k, order.pattern);
- if ( c > 0 ) {
- // k is better, 'upgrade'
- best.erase(i);
- _add(k, o, loc);
- }
- }
-
public:
- ScanAndOrder(int _startFrom, int _limit, BSONObj _order) :
- best( BSONObjCmp( _order ) ),
- startFrom(_startFrom), order(_order) {
- limit = _limit > 0 ? _limit + startFrom : 0x7fffffff;
- approxSize = 0;
- }
+ static const unsigned MaxScanAndOrderBytes;
- int size() const {
- return best.size();
+ ScanAndOrder(int startFrom, int limit, BSONObj order, const FieldRangeSet &frs) :
+ _best( BSONObjCmp( order ) ),
+ _startFrom(startFrom), _order(order, frs) {
+ _limit = limit > 0 ? limit + _startFrom : 0x7fffffff;
+ _approxSize = 0;
}
- void add(BSONObj o, DiskLoc* loc) {
- assert( o.isValid() );
- BSONObj k = order.getKeyFromObject(o);
- if ( (int) best.size() < limit ) {
- approxSize += k.objsize();
- approxSize += o.objsize();
-
- /* note : adjust when bson return limit adjusts. note this limit should be a bit higher. */
- uassert( 10128 , "too much data for sort() with no index. add an index or specify a smaller limit", approxSize < 32 * 1024 * 1024 );
-
- _add(k, o, loc);
- return;
- }
- BestMap::iterator i;
- assert( best.end() != best.begin() );
- i = best.end();
- i--;
- _addIfBetter(k, o, i, loc);
- }
+ int size() const { return _best.size(); }
- void _fill(BufBuilder& b, Projection *filter, int& nout, BestMap::iterator begin, BestMap::iterator end) {
- int n = 0;
- int nFilled = 0;
- for ( BestMap::iterator i = begin; i != end; i++ ) {
- n++;
- if ( n <= startFrom )
- continue;
- BSONObj& o = i->second;
- fillQueryResultFromObj(b, filter, o);
- nFilled++;
- if ( nFilled >= limit )
- break;
- uassert( 10129 , "too much data for sort() with no index", b.len() < 4000000 ); // appserver limit
- }
- nout = nFilled;
- }
+ void add(BSONObj o, DiskLoc* loc);
/* scanning complete. stick the query result in b for n objects. */
- void fill(BufBuilder& b, Projection *filter, int& nout) {
- _fill(b, filter, nout, best.begin(), best.end());
- }
+ void fill(BufBuilder& b, Projection *filter, int& nout ) const;
+
+ private:
+
+ void _add(BSONObj& k, BSONObj o, DiskLoc* loc);
+
+ void _addIfBetter(BSONObj& k, BSONObj o, BestMap::iterator i, DiskLoc* loc);
+
+ BestMap _best; // key -> full object
+ int _startFrom;
+ int _limit; // max to send back.
+ KeyType _order;
+ unsigned _approxSize;
};
diff --git a/db/security.cpp b/db/security.cpp
index 1ec4218..ae14770 100644
--- a/db/security.cpp
+++ b/db/security.cpp
@@ -18,29 +18,42 @@
#include "pch.h"
#include "security.h"
+#include "security_common.h"
#include "instance.h"
#include "client.h"
#include "curop-inl.h"
#include "db.h"
#include "dbhelpers.h"
-namespace mongo {
+// this is the _mongod only_ implementation of security.h
- int AuthenticationInfo::warned = 0;
+namespace mongo {
- void AuthenticationInfo::print() {
+ bool AuthenticationInfo::_warned = false;
+ /*
+ void AuthenticationInfo::print() const {
cout << "AuthenticationInfo: " << this << '\n';
- for ( map<string,Auth>::iterator i=m.begin(); i!=m.end(); i++ ) {
+ for ( MA::const_iterator i=_dbs.begin(); i!=_dbs.end(); i++ ) {
cout << "\t" << i->first << "\t" << i->second.level << '\n';
}
cout << "END" << endl;
}
+ */
+
+ string AuthenticationInfo::getUser( const string& dbname ) const {
+ scoped_spinlock lk(_lock);
+
+ MA::const_iterator i = _dbs.find(dbname);
+ if ( i == _dbs.end() )
+ return "";
+
+ return i->second.user;
+ }
- bool AuthenticationInfo::_isAuthorizedSpecialChecks( const string& dbname ) {
- if ( cc().isGod() ) {
+ bool AuthenticationInfo::_isAuthorizedSpecialChecks( const string& dbname ) const {
+ if ( cc().isGod() )
return true;
- }
if ( isLocalHost ) {
atleastreadlock l("");
@@ -48,15 +61,58 @@ namespace mongo {
Client::Context c("admin.system.users");
BSONObj result;
if( ! Helpers::getSingleton("admin.system.users", result) ) {
- if( warned == 0 ) {
- warned++;
+ if( ! _warned ) {
+ // you could get a few of these in a race, but that's ok
+ _warned = true;
log() << "note: no users configured in admin.system.users, allowing localhost access" << endl;
}
return true;
}
}
+
return false;
}
+ bool CmdAuthenticate::getUserObj(const string& dbname, const string& user, BSONObj& userObj, string& pwd) {
+ if (user == internalSecurity.user) {
+ uassert(15889, "key file must be used to log in with internal user", cmdLine.keyFile);
+ pwd = internalSecurity.pwd;
+ }
+ else {
+ // static BSONObj userPattern = fromjson("{\"user\":1}");
+ string systemUsers = dbname + ".system.users";
+ // OCCASIONALLY Helpers::ensureIndex(systemUsers.c_str(), userPattern, false, "user_1");
+ {
+ BSONObjBuilder b;
+ b << "user" << user;
+ BSONObj query = b.done();
+ if( !Helpers::findOne(systemUsers.c_str(), query, userObj) ) {
+ log() << "auth: couldn't find user " << user << ", " << systemUsers << endl;
+ return false;
+ }
+ }
+
+ pwd = userObj.getStringField("pwd");
+ }
+ return true;
+ }
+
+ void CmdAuthenticate::authenticate(const string& dbname, const string& user, const bool readOnly) {
+ AuthenticationInfo *ai = cc().getAuthenticationInfo();
+
+ if ( readOnly ) {
+ ai->authorizeReadOnly( cc().database()->name.c_str() , user );
+ }
+ else {
+ ai->authorize( cc().database()->name.c_str() , user );
+ }
+ }
+
+ bool CmdLogout::run(const string& dbname , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ AuthenticationInfo *ai = cc().getAuthenticationInfo();
+ ai->logout(dbname);
+ return true;
+ }
+
} // namespace mongo
diff --git a/db/security.h b/db/security.h
index 2b947c1..2937ef2 100644..100755
--- a/db/security.h
+++ b/db/security.h
@@ -20,53 +20,84 @@
#include "nonce.h"
#include "concurrency.h"
-#include "security_key.h"
+#include "security_common.h"
+#include "../util/concurrency/spin_lock.h"
+
+// this is used by both mongos and mongod
namespace mongo {
- /* for a particular db */
+ /*
+ * for a particular db
+ * levels
+ * 0 : none
+ * 1 : read
+ * 2 : write
+ */
struct Auth {
Auth() { level = 0; }
int level;
+ string user;
};
class AuthenticationInfo : boost::noncopyable {
- mongo::mutex _lock;
- map<string, Auth> m; // dbname -> auth
- static int warned;
public:
bool isLocalHost;
- AuthenticationInfo() : _lock("AuthenticationInfo") { isLocalHost = false; }
- ~AuthenticationInfo() {
- }
+
+ AuthenticationInfo(){ isLocalHost = false; }
+ ~AuthenticationInfo() {}
+
+ // -- modifiers ----
+
void logout(const string& dbname ) {
- scoped_lock lk(_lock);
- m.erase(dbname);
+ scoped_spinlock lk(_lock);
+ _dbs.erase(dbname);
+ }
+ void authorize(const string& dbname , const string& user ) {
+ scoped_spinlock lk(_lock);
+ _dbs[dbname].level = 2;
+ _dbs[dbname].user = user;
+ }
+ void authorizeReadOnly(const string& dbname , const string& user ) {
+ scoped_spinlock lk(_lock);
+ _dbs[dbname].level = 1;
+ _dbs[dbname].user = user;
+ }
+
+ // -- accessors ---
+
+ bool isAuthorized(const string& dbname) const {
+ return _isAuthorized( dbname, 2 );
}
- void authorize(const string& dbname ) {
- scoped_lock lk(_lock);
- m[dbname].level = 2;
+
+ bool isAuthorizedReads(const string& dbname) const {
+ return _isAuthorized( dbname, 1 );
}
- void authorizeReadOnly(const string& dbname) {
- scoped_lock lk(_lock);
- m[dbname].level = 1;
+
+ bool isAuthorizedForLock(const string& dbname, int lockType ) const {
+ return _isAuthorized( dbname , lockType > 0 ? 2 : 1 );
}
- bool isAuthorized(const string& dbname) { return _isAuthorized( dbname, 2 ); }
- bool isAuthorizedReads(const string& dbname) { return _isAuthorized( dbname, 1 ); }
- bool isAuthorizedForLock(const string& dbname, int lockType ) { return _isAuthorized( dbname , lockType > 0 ? 2 : 1 ); }
- void print();
+ string getUser( const string& dbname ) const;
+
+ void print() const;
protected:
- bool _isAuthorized(const string& dbname, int level) {
- if( m[dbname].level >= level ) return true;
- if( noauth ) return true;
- if( m["admin"].level >= level ) return true;
- if( m["local"].level >= level ) return true;
- return _isAuthorizedSpecialChecks( dbname );
- }
+ /** takes a lock */
+ bool _isAuthorized(const string& dbname, int level) const;
+
+ bool _isAuthorizedSingle_inlock(const string& dbname, int level) const;
+
+ /** cannot call this locked */
+ bool _isAuthorizedSpecialChecks( const string& dbname ) const ;
+
+ private:
+ mutable SpinLock _lock;
+
+ typedef map<string,Auth> MA;
+ MA _dbs; // dbname -> auth
- bool _isAuthorizedSpecialChecks( const string& dbname );
+ static bool _warned;
};
} // namespace mongo
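
The reworked AuthenticationInfo keeps a per-database (level, user) record behind a spinlock and layers its checks: the database itself, then the noauth escape, then the blanket admin and local grants. A minimal sketch of that layering (locking, noauth, and the localhost special case elided):

    #include <cassert>
    #include <map>
    #include <string>

    // Sketch: per-db auth levels (0 none, 1 read, 2 write); admin/local
    // credentials authorize everywhere, mirroring _isAuthorized's layering.
    class MiniAuth {
        struct Auth { int level; std::string user; Auth() : level(0) {} };
        std::map<std::string, Auth> _dbs;
        bool single(const std::string& db, int level) const {
            std::map<std::string, Auth>::const_iterator i = _dbs.find(db);
            return i != _dbs.end() && i->second.level >= level;
        }
    public:
        void authorize(const std::string& db, const std::string& user, int level) {
            _dbs[db].level = level;
            _dbs[db].user = user;
        }
        bool isAuthorized(const std::string& db, int level) const {
            return single(db, level) || single("admin", level) || single("local", level);
        }
    };

    int main() {
        MiniAuth a;
        a.authorize("test", "bob", 1);       // read-only on test
        assert(a.isAuthorized("test", 1));
        assert(!a.isAuthorized("test", 2));  // no write access
        a.authorize("admin", "root", 2);
        assert(a.isAuthorized("test", 2));   // admin grant applies everywhere
    }
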
diff --git a/db/security_commands.cpp b/db/security_commands.cpp
index 67605aa..2db9680 100644
--- a/db/security_commands.cpp
+++ b/db/security_commands.cpp
@@ -39,12 +39,12 @@ namespace mongo {
getnonce sends nonce to client
- client then sends { authenticate:1, nonce:<nonce_str>, user:<username>, key:<key> }
+ client then sends { authenticate:1, nonce64:<nonce_str>, user:<username>, key:<key> }
where <key> is md5(<nonce_str><username><pwd_digest_str>) as a string
*/
- boost::thread_specific_ptr<nonce> lastNonce;
+ boost::thread_specific_ptr<nonce64> lastNonce;
class CmdGetNonce : public Command {
public:
@@ -56,8 +56,8 @@ namespace mongo {
void help(stringstream& h) const { h << "internal"; }
virtual LockType locktype() const { return NONE; }
CmdGetNonce() : Command("getnonce") {}
- bool run(const string&, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
- nonce *n = new nonce(security.getNonce());
+ bool run(const string&, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ nonce64 *n = new nonce64(Security::getNonce());
stringstream ss;
ss << hex << *n;
result.append("nonce", ss.str() );
@@ -66,129 +66,78 @@ namespace mongo {
}
} cmdGetNonce;
- class CmdLogout : public Command {
- public:
- virtual bool logTheOp() {
- return false;
- }
- virtual bool slaveOk() const {
- return true;
- }
- void help(stringstream& h) const { h << "de-authenticate"; }
- virtual LockType locktype() const { return NONE; }
- CmdLogout() : Command("logout") {}
- bool run(const string& dbname , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
- AuthenticationInfo *ai = cc().getAuthenticationInfo();
- ai->logout(dbname);
- return true;
- }
- } cmdLogout;
+ CmdLogout cmdLogout;
- class CmdAuthenticate : public Command {
- public:
- virtual bool requiresAuth() { return false; }
- virtual bool logTheOp() {
+ bool CmdAuthenticate::run(const string& dbname , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ log() << " authenticate: " << cmdObj << endl;
+
+ string user = cmdObj.getStringField("user");
+ string key = cmdObj.getStringField("key");
+ string received_nonce = cmdObj.getStringField("nonce");
+
+ if( user.empty() || key.empty() || received_nonce.empty() ) {
+ log() << "field missing/wrong type in received authenticate command "
+ << dbname
+ << endl;
+ errmsg = "auth fails";
+ sleepmillis(10);
return false;
}
- virtual bool slaveOk() const {
- return true;
- }
- virtual LockType locktype() const { return WRITE; }
- virtual void help(stringstream& ss) const { ss << "internal"; }
- CmdAuthenticate() : Command("authenticate") {}
- bool run(const string& dbname , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
- log(1) << " authenticate: " << cmdObj << endl;
-
- string user = cmdObj.getStringField("user");
- string key = cmdObj.getStringField("key");
- string received_nonce = cmdObj.getStringField("nonce");
-
- if( user.empty() || key.empty() || received_nonce.empty() ) {
- log() << "field missing/wrong type in received authenticate command "
- << dbname
- << endl;
- errmsg = "auth fails";
- sleepmillis(10);
- return false;
- }
-
- stringstream digestBuilder;
-
- {
- bool reject = false;
- nonce *ln = lastNonce.release();
- if ( ln == 0 ) {
- reject = true;
- log(1) << "auth: no lastNonce" << endl;
- }
- else {
- digestBuilder << hex << *ln;
- reject = digestBuilder.str() != received_nonce;
- if ( reject ) log(1) << "auth: different lastNonce" << endl;
- }
-
- if ( reject ) {
- log() << "auth: bad nonce received or getnonce not called. could be a driver bug or a security attack. db:" << cc().database()->name << endl;
- errmsg = "auth fails";
- sleepmillis(30);
- return false;
- }
- }
- BSONObj userObj;
- string pwd;
+ stringstream digestBuilder;
- if (user == internalSecurity.user) {
- pwd = internalSecurity.pwd;
+ {
+ bool reject = false;
+ nonce64 *ln = lastNonce.release();
+ if ( ln == 0 ) {
+ reject = true;
+ log(1) << "auth: no lastNonce" << endl;
}
else {
- static BSONObj userPattern = fromjson("{\"user\":1}");
- string systemUsers = dbname + ".system.users";
- OCCASIONALLY Helpers::ensureIndex(systemUsers.c_str(), userPattern, false, "user_1");
- {
- BSONObjBuilder b;
- b << "user" << user;
- BSONObj query = b.done();
- if( !Helpers::findOne(systemUsers.c_str(), query, userObj) ) {
- log() << "auth: couldn't find user " << user << ", " << systemUsers << endl;
- errmsg = "auth fails";
- return false;
- }
- }
-
- pwd = userObj.getStringField("pwd");
+ digestBuilder << hex << *ln;
+ reject = digestBuilder.str() != received_nonce;
+ if ( reject ) log(1) << "auth: different lastNonce" << endl;
}
-
- md5digest d;
- {
- digestBuilder << user << pwd;
- string done = digestBuilder.str();
-
- md5_state_t st;
- md5_init(&st);
- md5_append(&st, (const md5_byte_t *) done.c_str(), done.size());
- md5_finish(&st, d);
- }
-
- string computed = digestToString( d );
-
- if ( key != computed ) {
- log() << "auth: key mismatch " << user << ", ns:" << dbname << endl;
+ if ( reject ) {
+ log() << "auth: bad nonce received or getnonce not called. could be a driver bug or a security attack. db:" << dbname << endl;
errmsg = "auth fails";
+ sleepmillis(30);
return false;
}
+ }
- AuthenticationInfo *ai = cc().getAuthenticationInfo();
+ BSONObj userObj;
+ string pwd;
+ if (!getUserObj(dbname, user, userObj, pwd)) {
+ errmsg = "auth fails";
+ return false;
+ }
- if ( userObj[ "readOnly" ].isBoolean() && userObj[ "readOnly" ].boolean() ) {
- ai->authorizeReadOnly( cc().database()->name.c_str() );
- }
- else {
- ai->authorize( cc().database()->name.c_str() );
- }
- return true;
+ md5digest d;
+ {
+ digestBuilder << user << pwd;
+ string done = digestBuilder.str();
+
+ md5_state_t st;
+ md5_init(&st);
+ md5_append(&st, (const md5_byte_t *) done.c_str(), done.size());
+ md5_finish(&st, d);
+ }
+
+ string computed = digestToString( d );
+
+ if ( key != computed ) {
+ log() << "auth: key mismatch " << user << ", ns:" << dbname << endl;
+ errmsg = "auth fails";
+ return false;
}
- } cmdAuthenticate;
+
+ authenticate(dbname, user, userObj[ "readOnly" ].isBoolean() && userObj[ "readOnly" ].boolean());
+
+ return true;
+ }
+
+ CmdAuthenticate cmdAuthenticate;
} // namespace mongo
diff --git a/db/security_key.cpp b/db/security_common.cpp
index 1ea7021..04cea99 100644
--- a/db/security_key.cpp
+++ b/db/security_common.cpp
@@ -1,4 +1,4 @@
-// security_key.cpp
+// security_common.cpp
/*
* Copyright (C) 2010 10gen Inc.
*
@@ -22,8 +22,12 @@
*/
#include "pch.h"
-#include "security_key.h"
+#include "security.h"
+#include "security_common.h"
#include "../client/dbclient.h"
+#include "commands.h"
+#include "nonce.h"
+#include "../util/md5.hpp"
#include <sys/stat.h>
@@ -41,7 +45,7 @@ namespace mongo {
return false;
}
-#if !defined(WIN32)
+#if !defined(_WIN32)
// check permissions: must be X00, where X is >= 4
if ((stats.st_mode & (S_IRWXG|S_IRWXO)) != 0) {
log() << "permissions on " << filename << " are too open" << endl;
@@ -102,4 +106,29 @@ namespace mongo {
return true;
}
+
+ bool AuthenticationInfo::_isAuthorized(const string& dbname, int level) const {
+ {
+ scoped_spinlock lk(_lock);
+
+ if ( _isAuthorizedSingle_inlock( dbname , level ) )
+ return true;
+
+ if ( noauth )
+ return true;
+
+ if ( _isAuthorizedSingle_inlock( "admin" , level ) )
+ return true;
+
+ if ( _isAuthorizedSingle_inlock( "local" , level ) )
+ return true;
+ }
+ return _isAuthorizedSpecialChecks( dbname );
+ }
+
+ bool AuthenticationInfo::_isAuthorizedSingle_inlock(const string& dbname, int level) const {
+ MA::const_iterator i = _dbs.find(dbname);
+ return i != _dbs.end() && i->second.level >= level;
+ }
+
} // namespace mongo
diff --git a/db/security_common.h b/db/security_common.h
new file mode 100644
index 0000000..2f2565f
--- /dev/null
+++ b/db/security_common.h
@@ -0,0 +1,83 @@
+// security_common.h
+
+/**
+* Copyright (C) 2009 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+#include "commands.h"
+#include "concurrency.h"
+#include "../util/concurrency/spin_lock.h"
+
+namespace mongo {
+
+ /**
+ * Internal secret key info.
+ */
+ struct AuthInfo {
+ AuthInfo() {
+ user = "__system";
+ }
+ string user;
+ string pwd;
+ };
+
+ // --noauth cmd line option
+ extern bool noauth;
+ extern AuthInfo internalSecurity;
+
+ /**
+ * This method checks the validity of filename as a security key, hashes its
+ * contents, and stores it in the internalSecurity variable. Prints an
+ * error message to the logs if there's an error.
+ * @param filename the file containing the key
+ * @return if the key was successfully stored
+ */
+ bool setUpSecurityKey(const string& filename);
+
+ class CmdAuthenticate : public Command {
+ public:
+ virtual bool requiresAuth() { return false; }
+ virtual bool logTheOp() {
+ return false;
+ }
+ virtual bool slaveOk() const {
+ return true;
+ }
+ virtual LockType locktype() const { return READ; }
+ virtual void help(stringstream& ss) const { ss << "internal"; }
+ CmdAuthenticate() : Command("authenticate") {}
+ bool run(const string& dbname , BSONObj& cmdObj, int options, string& errmsg, BSONObjBuilder& result, bool fromRepl);
+ private:
+ bool getUserObj(const string& dbname, const string& user, BSONObj& userObj, string& pwd);
+ void authenticate(const string& dbname, const string& user, const bool readOnly);
+ };
+
+ class CmdLogout : public Command {
+ public:
+ virtual bool logTheOp() {
+ return false;
+ }
+ virtual bool slaveOk() const {
+ return true;
+ }
+ void help(stringstream& h) const { h << "de-authenticate"; }
+ virtual LockType locktype() const { return NONE; }
+ CmdLogout() : Command("logout") {}
+ bool run(const string& dbname , BSONObj& cmdObj, int options, string& errmsg, BSONObjBuilder& result, bool fromRepl);
+ };
+
+} // namespace mongo
diff --git a/db/security_key.h b/db/security_key.h
deleted file mode 100644
index 86f1307..0000000
--- a/db/security_key.h
+++ /dev/null
@@ -1,47 +0,0 @@
-// security_key.h
-
-/**
-* Copyright (C) 2009 10gen Inc.
-*
-* This program is free software: you can redistribute it and/or modify
-* it under the terms of the GNU Affero General Public License, version 3,
-* as published by the Free Software Foundation.
-*
-* This program is distributed in the hope that it will be useful,
-* but WITHOUT ANY WARRANTY; without even the implied warranty of
-* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-* GNU Affero General Public License for more details.
-*
-* You should have received a copy of the GNU Affero General Public License
-* along with this program. If not, see <http://www.gnu.org/licenses/>.
-*/
-
-#pragma once
-
-namespace mongo {
-
- /**
- * Internal secret key info.
- */
- struct AuthInfo {
- AuthInfo() {
- user = "__system";
- }
- string user;
- string pwd;
- };
-
- // --noauth cmd line option
- extern bool noauth;
- extern AuthInfo internalSecurity;
-
- /**
- * This method checks the validity of filename as a security key, hashes its
- * contents, and stores it in the internalSecurity variable. Prints an
- * error message to the logs if there's an error.
- * @param filename the file containing the key
- * @return if the key was successfully stored
- */
- bool setUpSecurityKey(const string& filename);
-
-} // namespace mongo
diff --git a/db/stats/counters.h b/db/stats/counters.h
index b5cad85..d514a0f 100644
--- a/db/stats/counters.h
+++ b/db/stats/counters.h
@@ -19,7 +19,7 @@
#include "../../pch.h"
#include "../jsobj.h"
-#include "../../util/message.h"
+#include "../../util/net/message.h"
#include "../../util/processinfo.h"
#include "../../util/concurrency/spin_lock.h"
diff --git a/db/stats/snapshots.cpp b/db/stats/snapshots.cpp
index a81568d..ca5491b 100644
--- a/db/stats/snapshots.cpp
+++ b/db/stats/snapshots.cpp
@@ -38,19 +38,21 @@ namespace mongo {
: _older( older ) , _newer( newer ) {
assert( _newer._created > _older._created );
_elapsed = _newer._created - _older._created;
-
}
Top::CollectionData SnapshotDelta::globalUsageDiff() {
return Top::CollectionData( _older._globalUsage , _newer._globalUsage );
}
Top::UsageMap SnapshotDelta::collectionUsageDiff() {
+ assert( _newer._created > _older._created );
Top::UsageMap u;
for ( Top::UsageMap::const_iterator i=_newer._usage.begin(); i != _newer._usage.end(); i++ ) {
Top::UsageMap::const_iterator j = _older._usage.find(i->first);
if (j != _older._usage.end())
u[i->first] = Top::CollectionData( j->second , i->second );
+ else
+ u[i->first] = i->second;
}
return u;
}
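
The added else branch closes a gap: a collection present only in the newer snapshot used to vanish from the delta, so a freshly created or freshly active collection reported nothing until the next interval. Reduced to a sketch over plain counters (a simplification; the real code uses CollectionData's delta constructor):

    #include <map>
    #include <string>

    // Two-snapshot delta: keys missing from 'older' are reported at their
    // full newer value, since everything they accumulated happened within
    // this interval. Mirrors the else branch added above.
    std::map<std::string, long long>
    usageDiff(const std::map<std::string, long long>& older,
              const std::map<std::string, long long>& newer) {
        std::map<std::string, long long> d;
        typedef std::map<std::string, long long>::const_iterator It;
        for (It i = newer.begin(); i != newer.end(); ++i) {
            It j = older.find(i->first);
            d[i->first] = (j == older.end()) ? i->second : i->second - j->second;
        }
        return d;
    }
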
@@ -112,14 +114,10 @@ namespace mongo {
try {
const SnapshotData* s = statsSnapshots.takeSnapshot();
- if ( prev ) {
+ if ( prev && cmdLine.cpu ) {
unsigned long long elapsed = s->_created - prev->_created;
-
- if ( cmdLine.cpu ) {
- SnapshotDelta d( *prev , *s );
- log() << "cpu: elapsed:" << (elapsed/1000) <<" writelock: " << (int)(100*d.percentWriteLocked()) << "%" << endl;
- }
-
+ SnapshotDelta d( *prev , *s );
+ log() << "cpu: elapsed:" << (elapsed/1000) <<" writelock: " << (int)(100*d.percentWriteLocked()) << "%" << endl;
}
prev = s;
diff --git a/db/stats/top.cpp b/db/stats/top.cpp
index 77aef0d..f5b6ee4 100644
--- a/db/stats/top.cpp
+++ b/db/stats/top.cpp
@@ -18,16 +18,15 @@
#include "pch.h"
#include "top.h"
-#include "../../util/message.h"
+#include "../../util/net/message.h"
#include "../commands.h"
namespace mongo {
Top::UsageData::UsageData( const UsageData& older , const UsageData& newer ) {
// this won't be 100% accurate on rollovers and drop(), but at least it won't be negative
- time = (newer.time > older.time) ? (newer.time - older.time) : newer.time;
- count = (newer.count > older.count) ? (newer.count - older.count) : newer.count;
-
+ time = (newer.time >= older.time) ? (newer.time - older.time) : newer.time;
+ count = (newer.count >= older.count) ? (newer.count - older.count) : newer.count;
}
Top::CollectionData::CollectionData( const CollectionData& older , const CollectionData& newer )
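
Switching > to >= fixes the boundary case of an unchanged counter: under the old test, newer == older fell through to the fallback and reported the full absolute value instead of 0. Isolated as a sketch, the reset-tolerant delta is:

    // Delta for a monotonically increasing counter that can be reset
    // (e.g. by drop()): if the counter went backwards, the best available
    // answer is the newer absolute value; an unchanged counter yields 0.
    unsigned long long counterDelta(unsigned long long older,
                                    unsigned long long newer) {
        return (newer >= older) ? (newer - older) : newer;
    }
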
@@ -155,11 +154,12 @@ namespace mongo {
virtual bool slaveOk() const { return true; }
virtual bool adminOnly() const { return true; }
virtual LockType locktype() const { return READ; }
- virtual void help( stringstream& help ) const { help << "usage by collection"; }
+        virtual void help( stringstream& help ) const { help << "usage by collection, in microseconds"; }
- virtual bool run(const string& , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ virtual bool run(const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
{
BSONObjBuilder b( result.subobjStart( "totals" ) );
+ b.append( "note" , "all times in microseconds" );
Top::global.append( b );
b.done();
}
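
The subobjStart()/done() pair is the builder idiom for nesting one document inside another without building it separately and copying it in: the inner BSONObjBuilder writes directly into the parent's buffer. A sketch of the same pattern with this era's builder API (the "totals"/"note" keys mirror the hunk above):

    BSONObjBuilder result;
    {
        // Open a subobject at key "totals"; appends via b land inside it.
        BSONObjBuilder b( result.subobjStart( "totals" ) );
        b.append( "note" , "all times in microseconds" );
        b.done();   // closes the subobject; required before using 'result'
    }
    BSONObj out = result.obj();   // { totals: { note: "..." } }
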