summaryrefslogtreecommitdiff
path: root/db
diff options
context:
space:
mode:
Diffstat (limited to 'db')
-rw-r--r--db/btree.cpp1050
-rw-r--r--db/btree.h405
-rw-r--r--db/btreecursor.cpp204
-rw-r--r--db/client.cpp99
-rw-r--r--db/client.h193
-rw-r--r--db/clientcursor.cpp278
-rw-r--r--db/clientcursor.h216
-rw-r--r--db/cloner.cpp724
-rw-r--r--db/cmdline.h57
-rw-r--r--db/commands.cpp102
-rw-r--r--db/commands.h114
-rw-r--r--db/concurrency.h272
-rw-r--r--db/curop.h157
-rw-r--r--db/cursor.cpp159
-rw-r--r--db/cursor.h198
-rw-r--r--db/database.cpp64
-rw-r--r--db/database.h207
-rw-r--r--db/db.cpp1101
-rw-r--r--db/db.h219
-rw-r--r--db/db.rc61
-rw-r--r--db/db.sln57
-rw-r--r--db/db.vcproj1891
-rw-r--r--db/db.vcxproj489
-rw-r--r--db/db_10.sln45
-rw-r--r--db/dbcommands.cpp1465
-rw-r--r--db/dbcommands_admin.cpp356
-rw-r--r--db/dbeval.cpp120
-rw-r--r--db/dbhelpers.cpp253
-rw-r--r--db/dbhelpers.h122
-rw-r--r--db/dbmessage.h267
-rw-r--r--db/dbstats.cpp43
-rw-r--r--db/dbstats.h44
-rw-r--r--db/dbwebserver.cpp499
-rw-r--r--db/extsort.cpp227
-rw-r--r--db/extsort.h123
-rw-r--r--db/filever.h30
-rw-r--r--db/flushtest.cpp134
-rw-r--r--db/index.cpp306
-rw-r--r--db/index.h198
-rw-r--r--db/instance.cpp767
-rw-r--r--db/instance.h179
-rw-r--r--db/introspect.cpp41
-rw-r--r--db/introspect.h35
-rw-r--r--db/javatest.cpp24
-rw-r--r--db/jsobj.cpp1636
-rw-r--r--db/jsobj.h1869
-rw-r--r--db/jsobjmanipulator.h78
-rw-r--r--db/json.cpp569
-rw-r--r--db/json.h40
-rw-r--r--db/lasterror.cpp193
-rw-r--r--db/lasterror.h130
-rw-r--r--db/matcher.cpp672
-rw-r--r--db/matcher.h184
-rw-r--r--db/minilex.h160
-rw-r--r--db/module.cpp52
-rw-r--r--db/module.h70
-rw-r--r--db/modules/mms.cpp144
-rw-r--r--db/mr.cpp596
-rw-r--r--db/namespace.cpp753
-rw-r--r--db/namespace.h653
-rw-r--r--db/nonce.cpp74
-rw-r--r--db/nonce.h42
-rw-r--r--db/pcre.txt15
-rw-r--r--db/pdfile.cpp1649
-rw-r--r--db/pdfile.h448
-rw-r--r--db/query.cpp921
-rw-r--r--db/query.h115
-rw-r--r--db/queryoptimizer.cpp624
-rw-r--r--db/queryoptimizer.h161
-rw-r--r--db/queryutil.cpp594
-rw-r--r--db/queryutil.h210
-rw-r--r--db/rec.h119
-rw-r--r--db/reccache.cpp401
-rw-r--r--db/reccache.h242
-rw-r--r--db/reci.h45
-rw-r--r--db/recstore.h108
-rw-r--r--db/repl.cpp1769
-rw-r--r--db/repl.h315
-rw-r--r--db/replset.h207
-rw-r--r--db/resource.h34
-rw-r--r--db/scanandorder.h148
-rw-r--r--db/security.cpp32
-rw-r--r--db/security.h77
-rw-r--r--db/security_commands.cpp160
-rw-r--r--db/storage.cpp61
-rw-r--r--db/storage.h155
-rw-r--r--db/tests.cpp68
-rw-r--r--db/update.cpp736
-rw-r--r--db/update.h382
89 files changed, 31306 insertions, 0 deletions
diff --git a/db/btree.cpp b/db/btree.cpp
new file mode 100644
index 0000000..8b910f5
--- /dev/null
+++ b/db/btree.cpp
@@ -0,0 +1,1050 @@
+// btree.cpp
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "stdafx.h"
+#include "db.h"
+#include "btree.h"
+#include "pdfile.h"
+#include "json.h"
+#include "clientcursor.h"
+#include "client.h"
+#include "dbhelpers.h"
+#include "curop.h"
+
+namespace mongo {
+
+#define VERIFYTHISLOC dassert( thisLoc.btree() == this );
+
+ KeyNode::KeyNode(const BucketBasics& bb, const _KeyNode &k) :
+ prevChildBucket(k.prevChildBucket),
+ recordLoc(k.recordLoc), key(bb.data+k.keyDataOfs())
+ { }
+
+    // Largest key we will store in the index: one tenth of a bucket, so any
+    // bucket can always hold several keys.  Oversized keys are skipped by
+    // bt_insert()/unindex() rather than indexed.
+    const int KeyMax = BucketSize / 10;
+
+    extern int otherTraceLevel;
+    // Debug trace switches for the split and insert paths (0 = off).
+    const int split_debug = 0;
+    const int insert_debug = 0;
+
+ /* BucketBasics --------------------------------------------------- */
+
+    // Tell the btree record store that the bucket at thisLoc has been written,
+    // after asserting that thisLoc really maps to this in-memory object.
+    inline void BucketBasics::modified(const DiskLoc& thisLoc) {
+        VERIFYTHISLOC
+        btreeStore->modified(thisLoc);
+    }
+
+    // On-disk size of this bucket.  _Size is persisted in the bucket header
+    // and must always equal the compile-time BucketSize.
+    int BucketBasics::Size() const {
+        assert( _Size == BucketSize );
+        return _Size;
+    }
+    // Clear the Packed flag: the key data area may now contain holes and
+    // needs pack() before free space accounting is tight again.
+    inline void BucketBasics::setNotPacked() {
+        flags &= ~Packed;
+    }
+    // Mark the key data area as fully compacted (no holes).
+    inline void BucketBasics::setPacked() {
+        flags |= Packed;
+    }
+
+    // Recursively render the tree's shape into ss: one '*' per bucket,
+    // indented by depth; visits each key's left child, then the rightmost
+    // (nextChild) subtree.
+    void BucketBasics::_shape(int level, stringstream& ss) {
+        for ( int i = 0; i < level; i++ ) ss << ' ';
+        ss << "*\n";
+        for ( int i = 0; i < n; i++ )
+            if ( !k(i).prevChildBucket.isNull() )
+                k(i).prevChildBucket.btree()->_shape(level+1,ss);
+        if ( !nextChild.isNull() )
+            nextChild.btree()->_shape(level+1,ss);
+    }
+
+    // Global debug switches consumed by fullValidate(): bt_dmp makes it print
+    // each bucket it visits.
+    int bt_fv=0;
+    int bt_dmp=0;
+
+    // Dump the entire subtree rooted at thisLoc by running fullValidate()
+    // with the bt_dmp flag temporarily enabled.
+    void BucketBasics::dumpTree(DiskLoc thisLoc, const BSONObj &order) {
+        bt_dmp=1;
+        fullValidate(thisLoc, order);
+        bt_dmp=0;
+    }
+
+    // Recursively validate the subtree rooted at thisLoc and return the number
+    // of *used* keys it contains.  Also wassert()s that every child's parent
+    // pointer refers back to this bucket.  Honors the interrupt flag so a long
+    // validate can be killed.
+    int BucketBasics::fullValidate(const DiskLoc& thisLoc, const BSONObj &order) {
+        {
+            // sanity-check that assert() side effects run (i.e. NDEBUG-style
+            // stripping hasn't removed the expression).
+            bool f = false;
+            assert( f = true );
+            massert( 10281 , "assert is misdefined", f);
+        }
+
+        killCurrentOp.checkForInterrupt();
+        assertValid(order, true);
+// if( bt_fv==0 )
+// return;
+
+        if ( bt_dmp ) {
+            out() << thisLoc.toString() << ' ';
+            ((BtreeBucket *) this)->dump();
+        }
+
+        // keycount
+        int kc = 0;
+
+        for ( int i = 0; i < n; i++ ) {
+            _KeyNode& kn = k(i);
+
+            if ( kn.isUsed() ) kc++;
+            if ( !kn.prevChildBucket.isNull() ) {
+                DiskLoc left = kn.prevChildBucket;
+                BtreeBucket *b = left.btree();
+                wassert( b->parent == thisLoc );
+                kc += b->fullValidate(kn.prevChildBucket, order);
+            }
+        }
+        if ( !nextChild.isNull() ) {
+            BtreeBucket *b = nextChild.btree();
+            wassert( b->parent == thisLoc );
+            kc += b->fullValidate(nextChild, order);
+        }
+
+        return kc;
+    }
+
+    // Cap on how many corrupt buckets we fully dump, to avoid flooding the log.
+    int nDumped = 0;
+
+    // Check this bucket's internal invariants (counts, sizes, key ordering).
+    // In DEV builds every adjacent key pair is compared (slow); otherwise only
+    // first vs. last key is compared (fast).  No-op unless debug or force.
+    void BucketBasics::assertValid(const BSONObj &order, bool force) {
+        if ( !debug && !force )
+            return;
+        wassert( n >= 0 && n < Size() );
+        wassert( emptySize >= 0 && emptySize < BucketSize );
+        wassert( topSize >= n && topSize <= BucketSize );
+        DEV {
+            // slow:
+            for ( int i = 0; i < n-1; i++ ) {
+                BSONObj k1 = keyNode(i).key;
+                BSONObj k2 = keyNode(i+1).key;
+                int z = k1.woCompare(k2, order); //OK
+                if ( z > 0 ) {
+                    out() << "ERROR: btree key order corrupt.  Keys:" << endl;
+                    if ( ++nDumped < 5 ) {
+                        for ( int j = 0; j < n; j++ ) {
+                            out() << "  " << keyNode(j).key.toString() << endl;
+                        }
+                        ((BtreeBucket *) this)->dump();
+                    }
+                    wassert(false);
+                    break;
+                }
+                else if ( z == 0 ) {
+                    // equal keys must be ordered by recordLoc (recordLoc acts
+                    // as a key suffix for duplicates).
+                    if ( !(k(i).recordLoc < k(i+1).recordLoc) ) {
+                        out() << "ERROR: btree key order corrupt (recordloc's wrong).  Keys:" << endl;
+                        out() << " k(" << i << "):" << keyNode(i).key.toString() << " RL:" << k(i).recordLoc.toString() << endl;
+                        out() << " k(" << i+1 << "):" << keyNode(i+1).key.toString() << " RL:" << k(i+1).recordLoc.toString() << endl;
+                        wassert( k(i).recordLoc < k(i+1).recordLoc );
+                    }
+                }
+            }
+        }
+        else {
+            //faster:
+            if ( n > 1 ) {
+                BSONObj k1 = keyNode(0).key;
+                BSONObj k2 = keyNode(n-1).key;
+                int z = k1.woCompare(k2, order);
+                //wassert( z <= 0 );
+                if ( z > 0 ) {
+                    problem() << "btree keys out of order" << '\n';
+                    ONCE {
+                        ((BtreeBucket *) this)->dump();
+                    }
+                    assert(false);
+                }
+            }
+        }
+    }
+
+    // Flag the key at keypos as unused (logically deleted) without removing it;
+    // the slot is reclaimed later by pack()/reuse in _insert().
+    inline void BucketBasics::markUnused(int keypos) {
+        assert( keypos >= 0 && keypos < n );
+        k(keypos).setUnused();
+    }
+
+    // Bytes available for key nodes + key data, i.e. bucket size minus the
+    // fixed header that precedes the data[] area.
+    inline int BucketBasics::totalDataSize() const {
+        return Size() - (data-(char*)this);
+    }
+
+    // Initialize a freshly allocated bucket: no parent, no children, no keys,
+    // all data space free, marked Packed.
+    void BucketBasics::init() {
+        parent.Null();
+        nextChild.Null();
+        _Size = BucketSize;
+        flags = Packed;
+        n = 0;
+        emptySize = totalDataSize();
+        topSize = 0;
+        reserved = 0;
+    }
+
+    /* see _alloc */
+    // Return `bytes` of key-data space from the top region to the free pool.
+    // Only valid for data at the very top of the used region (LIFO).
+    inline void BucketBasics::_unalloc(int bytes) {
+        topSize -= bytes;
+        emptySize += bytes;
+    }
+
+    /* we allocate space from the end of the buffer for data.
+       the keynodes grow from the front.
+    */
+    // Reserve `bytes` of key data at the top of the data area and return its
+    // offset within data[].  Caller must have verified there is room.
+    inline int BucketBasics::_alloc(int bytes) {
+        topSize += bytes;
+        emptySize -= bytes;
+        int ofs = totalDataSize() - topSize;
+        assert( ofs > 0 );
+        return ofs;
+    }
+
+    // Remove the key node at keypos by shifting later nodes down.  The key's
+    // data bytes are NOT reclaimed here; the bucket is just marked unpacked so
+    // pack() compacts it later.  The key's left child must already be null.
+    // NOTE(review): the assert permits keypos == n, which names no key;
+    // callers appear to always pass keypos < n -- confirm.
+    void BucketBasics::_delKeyAtPos(int keypos) {
+        assert( keypos >= 0 && keypos <= n );
+        assert( childForPos(keypos).isNull() );
+        n--;
+        assert( n > 0 || nextChild.isNull() );
+        for ( int j = keypos; j < n; j++ )
+            k(j) = k(j+1);
+        emptySize += sizeof(_KeyNode);
+        setNotPacked();
+    }
+
+    /* pull rightmost key from the bucket.  this version requires its right child to be null so it
+       does not bother returning that value.
+    */
+    // Used by BtreeBuilder when promoting keys a level up.  On return,
+    // nextChild holds the popped key's left child (the new rightmost down
+    // pointer).  NOTE(review): the returned BSONObj still points into this
+    // bucket's data area, whose space was just _unalloc'd -- the bytes stay
+    // intact only until something overwrites them; confirm callers copy or
+    // consume the key before further writes to this bucket.
+    void BucketBasics::popBack(DiskLoc& recLoc, BSONObj& key) {
+        massert( 10282 ,  "n==0 in btree popBack()", n > 0 );
+        assert( k(n-1).isUsed() ); // no unused skipping in this function at this point - btreebuilder doesn't require that
+        KeyNode kn = keyNode(n-1);
+        recLoc = kn.recordLoc;
+        key = kn.key;
+        int keysize = kn.key.objsize();
+
+        massert( 10283 , "rchild not null in btree popBack()", nextChild.isNull());
+
+        /* weirdly, we also put the rightmost down pointer in nextchild, even when bucket isn't full. */
+        nextChild = kn.prevChildBucket;
+
+        n--;
+        emptySize += sizeof(_KeyNode);
+        _unalloc(keysize);
+    }
+
+    /* add a key.  must be > all existing.  be careful to set next ptr right. */
+    // Append a key node at position n with the given left child.  Returns
+    // false (and changes nothing) if the bucket lacks room; callers then start
+    // a new bucket.  The key bytes are copied into the bucket's data area.
+    bool BucketBasics::_pushBack(const DiskLoc& recordLoc, BSONObj& key, const BSONObj &order, DiskLoc prevChild) {
+        int bytesNeeded = key.objsize() + sizeof(_KeyNode);
+        if ( bytesNeeded > emptySize )
+            return false;
+        assert( bytesNeeded <= emptySize );
+        assert( n == 0 || keyNode(n-1).key.woCompare(key, order) <= 0 );
+        emptySize -= sizeof(_KeyNode);
+        _KeyNode& kn = k(n++);
+        kn.prevChildBucket = prevChild;
+        kn.recordLoc = recordLoc;
+        kn.setKeyDataOfs( (short) _alloc(key.objsize()) );
+        char *p = dataAt(kn.keyDataOfs());
+        memcpy(p, key.objdata(), key.objsize());
+        return true;
+    }
+    /*void BucketBasics::pushBack(const DiskLoc& recordLoc, BSONObj& key, const BSONObj &order, DiskLoc prevChild, DiskLoc nextChild) {
+        pushBack(recordLoc, key, order, prevChild);
+        childForPos(n) = nextChild;
+    }*/
+
+    /* insert a key in a bucket with no complexity -- no splits required */
+    // Returns false when the key won't fit even after pack(), in which case
+    // the caller (insertHere) must split the bucket.  The new key's left child
+    // is set to null; insertHere fixes up child pointers afterwards.
+    bool BucketBasics::basicInsert(const DiskLoc& thisLoc, int keypos, const DiskLoc& recordLoc, const BSONObj& key, const BSONObj &order) {
+        modified(thisLoc);
+        assert( keypos >= 0 && keypos <= n );
+        int bytesNeeded = key.objsize() + sizeof(_KeyNode);
+        if ( bytesNeeded > emptySize ) {
+            pack( order );
+            if ( bytesNeeded > emptySize )
+                return false;
+        }
+        for ( int j = n; j > keypos; j-- ) // make room
+            k(j) = k(j-1);
+        n++;
+        emptySize -= sizeof(_KeyNode);
+        _KeyNode& kn = k(keypos);
+        kn.prevChildBucket.Null();
+        kn.recordLoc = recordLoc;
+        kn.setKeyDataOfs((short) _alloc(key.objsize()) );
+        char *p = dataAt(kn.keyDataOfs());
+        memcpy(p, key.objdata(), key.objsize());
+        return true;
+    }
+
+    /* when we delete things we just leave empty space until the node is
+       full and then we repack it.
+    */
+    // Compact the key data area: copy every key's bytes contiguously to the
+    // top of the bucket (via a stack temp to handle overlap), update each key
+    // node's data offset, and recompute emptySize.  No-op if already Packed.
+    void BucketBasics::pack( const BSONObj &order ) {
+        if ( flags & Packed )
+            return;
+
+        int tdz = totalDataSize();
+        char temp[BucketSize];
+        int ofs = tdz;
+        topSize = 0;
+        for ( int j = 0; j < n; j++ ) {
+            short ofsold = k(j).keyDataOfs();
+            int sz = keyNode(j).key.objsize();
+            ofs -= sz;
+            topSize += sz;
+            memcpy(temp+ofs, dataAt(ofsold), sz);
+            k(j).setKeyDataOfsSavingUse( ofs );
+        }
+        int dataUsed = tdz - ofs;
+        memcpy(data + ofs, temp + ofs, dataUsed);
+        emptySize = tdz - dataUsed - n * sizeof(_KeyNode);
+        assert( emptySize >= 0 );
+
+        setPacked();
+        assertValid( order );
+    }
+
+    // Drop all keys at positions >= N, then repack to reclaim their data
+    // space.  Used after a split moves the upper half of the keys away.
+    inline void BucketBasics::truncateTo(int N, const BSONObj &order) {
+        n = N;
+        setNotPacked();
+        pack( order );
+    }
+
+ /* - BtreeBucket --------------------------------------------------- */
+
+    /* return largest key in the subtree. */
+    // Walk rightmost (nextChild) pointers to the deepest bucket; outputs that
+    // bucket's location and the index of its last key.
+    void BtreeBucket::findLargestKey(const DiskLoc& thisLoc, DiskLoc& largestLoc, int& largestKey) {
+        DiskLoc loc = thisLoc;
+        while ( 1 ) {
+            BtreeBucket *b = loc.btree();
+            if ( !b->nextChild.isNull() ) {
+                loc = b->nextChild;
+                continue;
+            }
+
+            assert(b->n>0);
+            largestLoc = loc;
+            largestKey = b->n-1;
+
+            break;
+        }
+    }
+
+    // Return true iff a *used* entry equal to `key` (ignoring recordLoc)
+    // exists anywhere in the index rooted at thisLoc.  Locates the first
+    // candidate via minDiskLoc then advances past unused entries.
+    bool BtreeBucket::exists(const IndexDetails& idx, DiskLoc thisLoc, const BSONObj& key, BSONObj order) {
+        int pos;
+        bool found;
+        DiskLoc b = locate(idx, thisLoc, key, order, pos, found, minDiskLoc);
+
+        // skip unused keys
+        while ( 1 ) {
+            if( b.isNull() )
+                break;
+            BtreeBucket *bucket = b.btree();
+            _KeyNode& kn = bucket->k(pos);
+            if ( kn.isUsed() )
+                return bucket->keyAt(pos).woEqual(key);
+            b = bucket->advance(b, pos, 1, "BtreeBucket::exists");
+        }
+        return false;
+    }
+
+ string BtreeBucket::dupKeyError( const IndexDetails& idx , const BSONObj& key ){
+ stringstream ss;
+ ss << "E11000 duplicate key error";
+ ss << "index: " << idx.indexNamespace() << " ";
+ ss << "dup key: " << key;
+ return ss.str();
+ }
+
+    /* Find a key within this btree bucket.
+
+       When duplicate keys are allowed, we use the DiskLoc of the record as if it were part of the
+       key.  That assures that even when there are many duplicates (e.g., 1 million) for a key,
+       our performance is still good.
+
+       assertIfDup: if the key exists (ignoring the recordLoc), uassert
+
+       pos: for existing keys k0...kn-1.
+            returns # it goes BEFORE.  so key[pos-1] < key < key[pos]
+            returns n if it goes after the last existing key.
+            note result might be an Unused location!
+    */
+    // foo is a sink for the _EXPERIMENT1 page-touch loop below; it exists only
+    // so the reads are not optimized away.
+    char foo;
+    bool BtreeBucket::find(const IndexDetails& idx, const BSONObj& key, DiskLoc recordLoc, const BSONObj &order, int& pos, bool assertIfDup) {
+#if defined(_EXPERIMENT1)
+        {
+            // touch each 4k page of the bucket up front (experiment).
+            char *z = (char *) this;
+            int i = 0;
+            while( 1 ) {
+                i += 4096;
+                if( i >= BucketSize )
+                    break;
+                foo += z[i];
+            }
+        }
+#endif
+        /* binary search for this key */
+        bool dupsChecked = false;
+        int l=0;
+        int h=n-1;
+        while ( l <= h ) {
+            int m = (l+h)/2;
+            KeyNode M = keyNode(m);
+            int x = key.woCompare(M.key, order);
+            if ( x == 0 ) {
+                if( assertIfDup ) {
+                    if( k(m).isUnused() ) {
+                        // ok that key is there if unused.  but we need to check that there aren't other
+                        // entries for the key then.  as it is very rare that we get here, we don't put any
+                        // coding effort in here to make this particularly fast
+                        if( !dupsChecked ) {
+                            dupsChecked = true;
+                            if( idx.head.btree()->exists(idx, idx.head, key, order) )
+                                uasserted( ASSERT_ID_DUPKEY , dupKeyError( idx , key ) );
+                        }
+                    }
+                    else
+                        uasserted( ASSERT_ID_DUPKEY , dupKeyError( idx , key ) );
+                }
+
+                // dup keys allowed.  use recordLoc as if it is part of the key
+                DiskLoc unusedRL = M.recordLoc;
+                unusedRL.GETOFS() &= ~1; // so we can test equality without the used bit messing us up
+                x = recordLoc.compare(unusedRL);
+            }
+            if ( x < 0 ) // key < M.key
+                h = m-1;
+            else if ( x > 0 )
+                l = m+1;
+            else {
+                // found it.
+                pos = m;
+                return true;
+            }
+        }
+        // not found
+        pos = l;
+        if ( pos != n ) {
+            // debug check: the insertion point really is within the sorted order.
+            BSONObj keyatpos = keyNode(pos).key;
+            wassert( key.woCompare(keyatpos, order) <= 0 );
+            if ( pos > 0 ) {
+                wassert( keyNode(pos-1).key.woCompare(key, order) <= 0 );
+            }
+        }
+
+        return false;
+    }
+
+    // Unlink this (non-head) bucket from its parent and zero its contents.
+    // Open cursors are notified first so they can reposition.  As a defensive
+    // measure the bucket is wiped but NOT returned to the store for reuse.
+    void BtreeBucket::delBucket(const DiskLoc& thisLoc, IndexDetails& id) {
+        ClientCursor::informAboutToDeleteBucket(thisLoc);
+        assert( !isHead() );
+
+        BtreeBucket *p = parent.btreemod();
+        if ( p->nextChild == thisLoc ) {
+            p->nextChild.Null();
+        }
+        else {
+            for ( int i = 0; i < p->n; i++ ) {
+                if ( p->k(i).prevChildBucket == thisLoc ) {
+                    p->k(i).prevChildBucket.Null();
+                    goto found;
+                }
+            }
+            out() << "ERROR: can't find ref to deleted bucket.\n";
+            out() << "To delete:\n";
+            dump();
+            out() << "Parent:\n";
+            p->dump();
+            assert(false);
+        }
+found:
+#if 1
+        /* as a temporary defensive measure, we zap the whole bucket, AND don't truly delete
+           it (meaning it is ineligible for reuse).
+        */
+        memset(this, 0, Size());
+        modified(thisLoc);
+#else
+        //defensive:
+        n = -1;
+        parent.Null();
+        massert( 10284 , "todo: use RecStoreInterface instead", false);
+        // TODO: this was broken anyway as deleteRecord does unindexRecord() call which assumes the data is a BSONObj,
+        //       and it isn't.
+        assert(false);
+// theDataFileMgr.deleteRecord(id.indexNamespace().c_str(), thisLoc.rec(), thisLoc);
+#endif
+    }
+
+    /* note: may delete the entire bucket!  this invalid upon return sometimes. */
+    // Delete the key at position p.  Keys with a non-null left child are only
+    // marked unused (so the tree structure is preserved); leaf keys are
+    // physically removed, and an empty non-head leaf bucket is deleted.
+    void BtreeBucket::delKeyAtPos(const DiskLoc& thisLoc, IndexDetails& id, int p) {
+        modified(thisLoc);
+        assert(n>0);
+        DiskLoc left = childForPos(p);
+
+        if ( n == 1 ) {
+            if ( left.isNull() && nextChild.isNull() ) {
+                if ( isHead() )
+                    _delKeyAtPos(p); // we don't delete the top bucket ever
+                else
+                    delBucket(thisLoc, id);
+                return;
+            }
+            markUnused(p);
+            return;
+        }
+
+        if ( left.isNull() )
+            _delKeyAtPos(p);
+        else
+            markUnused(p);
+    }
+
+    int qqq = 0;
+
+    /* remove a key from the index */
+    // Returns true if (key, recordLoc) was found and deleted.  Keys larger
+    // than KeyMax were never indexed, so they are skipped here symmetrically.
+    bool BtreeBucket::unindex(const DiskLoc& thisLoc, IndexDetails& id, BSONObj& key, const DiskLoc& recordLoc ) {
+        if ( key.objsize() > KeyMax ) {
+            OCCASIONALLY problem() << "unindex: key too large to index, skipping " << id.indexNamespace() << /* ' ' << key.toString() << */ '\n';
+            return false;
+        }
+
+        int pos;
+        bool found;
+        DiskLoc loc = locate(id, thisLoc, key, id.keyPattern(), pos, found, recordLoc, 1);
+        if ( found ) {
+            loc.btree()->delKeyAtPos(loc, id, pos);
+            return true;
+        }
+        return false;
+    }
+
+    // Allocate an initialized scratch bucket on the heap (not in the record
+    // store).  Caller owns the memory and must free() it.
+    BtreeBucket* BtreeBucket::allocTemp() {
+        BtreeBucket *b = (BtreeBucket*) malloc(BucketSize);
+        b->init();
+        return b;
+    }
+
+    // Helper for fixParentPtrs(): point child's parent pointer back at thisLoc
+    // (no-op for a null child).
+    inline void fix(const DiskLoc& thisLoc, const DiskLoc& child) {
+        if ( !child.isNull() ) {
+            if ( insert_debug )
+                out() << "      " << child.toString() << ".parent=" << thisLoc.toString() << endl;
+            child.btreemod()->parent = thisLoc;
+        }
+    }
+
+    /* this sucks.  maybe get rid of parent ptrs. */
+    // After keys move between buckets (split), repoint every child's parent
+    // pointer at this bucket.
+    void BtreeBucket::fixParentPtrs(const DiskLoc& thisLoc) {
+        VERIFYTHISLOC
+        fix(thisLoc, nextChild);
+        for ( int i = 0; i < n; i++ )
+            fix(thisLoc, k(i).prevChildBucket);
+    }
+
+    /* insert a key in this bucket, splitting if necessary.
+       keypos - where to insert the key in range 0..n.  0=make leftmost, n=make rightmost.
+    */
+    // Fast path: basicInsert() succeeds and we just wire up lchild/rchild.
+    // Slow path: split this bucket around the median key, promote the median
+    // into the parent (creating a new root if this was the head), then retry
+    // the insert into whichever half the key now belongs in.
+    void BtreeBucket::insertHere(DiskLoc thisLoc, int keypos,
+                                 DiskLoc recordLoc, const BSONObj& key, const BSONObj& order,
+                                 DiskLoc lchild, DiskLoc rchild, IndexDetails& idx)
+    {
+        modified(thisLoc);
+        if ( insert_debug )
+            out() << "   " << thisLoc.toString() << ".insertHere " << key.toString() << '/' << recordLoc.toString() << ' '
+                  << lchild.toString() << ' ' << rchild.toString() << " keypos:" << keypos << endl;
+
+        DiskLoc oldLoc = thisLoc;
+
+        if ( basicInsert(thisLoc, keypos, recordLoc, key, order) ) {
+            // fit without splitting; now repair the child pointers around the
+            // new key (basicInsert left its left child null).
+            _KeyNode& kn = k(keypos);
+            if ( keypos+1 == n ) { // last key
+                if ( nextChild != lchild ) {
+                    out() << "ERROR nextChild != lchild" << endl;
+                    out() << "  thisLoc: " << thisLoc.toString() << ' ' << idx.indexNamespace() << endl;
+                    out() << "  keyPos: " << keypos << " n:" << n << endl;
+                    out() << "  nextChild: " << nextChild.toString() << " lchild: " << lchild.toString() << endl;
+                    out() << "  recordLoc: " << recordLoc.toString() << " rchild: " << rchild.toString() << endl;
+                    out() << "  key: " << key.toString() << endl;
+                    dump();
+#if 0
+                    out() << "\n\nDUMPING FULL INDEX" << endl;
+                    bt_dmp=1;
+                    bt_fv=1;
+                    idx.head.btree()->fullValidate(idx.head);
+#endif
+                    assert(false);
+                }
+                kn.prevChildBucket = nextChild;
+                assert( kn.prevChildBucket == lchild );
+                nextChild = rchild;
+                if ( !rchild.isNull() )
+                    rchild.btreemod()->parent = thisLoc;
+            }
+            else {
+                k(keypos).prevChildBucket = lchild;
+                if ( k(keypos+1).prevChildBucket != lchild ) {
+                    out() << "ERROR k(keypos+1).prevChildBucket != lchild" << endl;
+                    out() << "  thisLoc: " << thisLoc.toString() << ' ' << idx.indexNamespace() << endl;
+                    out() << "  keyPos: " << keypos << " n:" << n << endl;
+                    out() << "  k(keypos+1).pcb: " << k(keypos+1).prevChildBucket.toString() << " lchild: " << lchild.toString() << endl;
+                    out() << "  recordLoc: " << recordLoc.toString() << " rchild: " << rchild.toString() << endl;
+                    out() << "  key: " << key.toString() << endl;
+                    dump();
+#if 0
+                    out() << "\n\nDUMPING FULL INDEX" << endl;
+                    bt_dmp=1;
+                    bt_fv=1;
+                    idx.head.btree()->fullValidate(idx.head);
+#endif
+                    assert(false);
+                }
+                k(keypos+1).prevChildBucket = rchild;
+                if ( !rchild.isNull() )
+                    rchild.btreemod()->parent = thisLoc;
+            }
+            return;
+        }
+
+        /* ---------- split ---------------- */
+
+        if ( split_debug )
+            out() << "    " << thisLoc.toString() << ".split" << endl;
+
+        int mid = n / 2;
+
+        // move keys above the median into a fresh right sibling.
+        DiskLoc rLoc = addBucket(idx);
+        BtreeBucket *r = rLoc.btreemod();
+        if ( split_debug )
+            out() << "     mid:" << mid << ' ' << keyNode(mid).key.toString() << " n:" << n << endl;
+        for ( int i = mid+1; i < n; i++ ) {
+            KeyNode kn = keyNode(i);
+            r->pushBack(kn.recordLoc, kn.key, order, kn.prevChildBucket);
+        }
+        r->nextChild = nextChild;
+        r->assertValid( order );
+
+        if ( split_debug )
+            out() << "     new rLoc:" << rLoc.toString() << endl;
+        r = 0;
+        rLoc.btree()->fixParentPtrs(rLoc);
+
+        {
+            KeyNode middle = keyNode(mid);
+            nextChild = middle.prevChildBucket; // middle key gets promoted, its children will be thisLoc (l) and rLoc (r)
+            if ( split_debug ) {
+                out() << "    middle key:" << middle.key.toString() << endl;
+            }
+
+            // promote middle to a parent node
+            if ( parent.isNull() ) {
+                // make a new parent if we were the root
+                DiskLoc L = addBucket(idx);
+                BtreeBucket *p = L.btreemod();
+                p->pushBack(middle.recordLoc, middle.key, order, thisLoc);
+                p->nextChild = rLoc;
+                p->assertValid( order );
+                parent = idx.head = L;
+                if ( split_debug )
+                    out() << "    we were root, making new root:" << hex << parent.getOfs() << dec << endl;
+                rLoc.btreemod()->parent = parent;
+            }
+            else {
+                /* set this before calling _insert - if it splits it will do fixParent() logic and change the value.
+                */
+                rLoc.btreemod()->parent = parent;
+                if ( split_debug )
+                    out() << "    promoting middle key " << middle.key.toString() << endl;
+                parent.btree()->_insert(parent, middle.recordLoc, middle.key, order, /*dupsallowed*/true, thisLoc, rLoc, idx);
+            }
+        }
+
+        truncateTo(mid, order);  // note this may trash middle.key.  thus we had to promote it before finishing up here.
+
+        // add our new key, there is room now
+        {
+
+            if ( keypos <= mid ) {
+                if ( split_debug )
+                    out() << "  keypos<mid, insertHere() the new key" << endl;
+                insertHere(thisLoc, keypos, recordLoc, key, order, lchild, rchild, idx);
+            } else {
+                int kp = keypos-mid-1;
+                assert(kp>=0);
+                rLoc.btree()->insertHere(rLoc, kp, recordLoc, key, order, lchild, rchild, idx);
+            }
+        }
+
+        if ( split_debug )
+            out() << "     split end " << hex << thisLoc.getOfs() << dec << endl;
+    }
+
+    /* start a new index off, empty */
+    // Allocate and initialize a fresh empty bucket in the index's record
+    // store; returns its disk location.
+    DiskLoc BtreeBucket::addBucket(IndexDetails& id) {
+        DiskLoc loc = btreeStore->insert(id.indexNamespace().c_str(), 0, BucketSize, true);
+        BtreeBucket *b = loc.btreemod();
+        b->init();
+        return loc;
+    }
+
+    // Rename the underlying record-store namespace holding this index's buckets.
+    void BtreeBucket::renameIndexNamespace(const char *oldNs, const char *newNs) {
+        btreeStore->rename( oldNs, newNs );
+    }
+
+    // Follow parent pointers from thisLoc up to the root bucket of the index.
+    DiskLoc BtreeBucket::getHead(const DiskLoc& thisLoc) {
+        DiskLoc p = thisLoc;
+        while ( !p.btree()->isHead() )
+            p = p.btree()->parent;
+        return p;
+    }
+
+    // Step the cursor position (thisLoc, keyOfs) one key in `direction`
+    // (+1 forward / -1 backward) in tree order.  Descends into a child
+    // subtree when one exists in that direction, otherwise moves within the
+    // bucket, otherwise climbs to the first ancestor with a key in that
+    // direction.  Returns the new bucket (keyOfs updated) or a null DiskLoc
+    // at the end of the tree.
+    DiskLoc BtreeBucket::advance(const DiskLoc& thisLoc, int& keyOfs, int direction, const char *caller) {
+        if ( keyOfs < 0 || keyOfs >= n ) {
+            out() << "ASSERT failure BtreeBucket::advance, caller: " << caller << endl;
+            out() << "  thisLoc: " << thisLoc.toString() << endl;
+            out() << "  keyOfs: " << keyOfs << " n:" << n << " direction: " << direction << endl;
+            out() << bucketSummary() << endl;
+            assert(false);
+        }
+        int adj = direction < 0 ? 1 : 0;
+        int ko = keyOfs + direction;
+        DiskLoc nextDown = childForPos(ko+adj);
+        if ( !nextDown.isNull() ) {
+            // descend to the extreme (first/last) key of the child subtree.
+            while ( 1 ) {
+                keyOfs = direction>0 ? 0 : nextDown.btree()->n - 1;
+                DiskLoc loc = nextDown.btree()->childForPos(keyOfs + adj);
+                if ( loc.isNull() )
+                    break;
+                nextDown = loc;
+            }
+            return nextDown;
+        }
+
+        if ( ko < n && ko >= 0 ) {
+            keyOfs = ko;
+            return thisLoc;
+        }
+
+        // end of bucket.  traverse back up.
+        DiskLoc childLoc = thisLoc;
+        DiskLoc ancestor = parent;
+        while ( 1 ) {
+            if ( ancestor.isNull() )
+                break;
+            BtreeBucket *an = ancestor.btree();
+            for ( int i = 0; i < an->n; i++ ) {
+                if ( an->childForPos(i+adj) == childLoc ) {
+                    keyOfs = i;
+                    return ancestor;
+                }
+            }
+            assert( direction<0 || an->nextChild == childLoc );
+            // parent exhausted also, keep going up
+            childLoc = ancestor;
+            ancestor = an->parent;
+        }
+
+        return DiskLoc();
+    }
+
+    // Recursively locate (key, recordLoc) starting at this bucket.  On exact
+    // match sets found=true and pos; otherwise returns the bucket/pos where a
+    // scan in `direction` should begin, or a null DiskLoc at the tree's end.
+    DiskLoc BtreeBucket::locate(const IndexDetails& idx, const DiskLoc& thisLoc, const BSONObj& key, const BSONObj &order, int& pos, bool& found, DiskLoc recordLoc, int direction) {
+        int p;
+        found = find(idx, key, recordLoc, order, p, /*assertIfDup*/ false);
+        if ( found ) {
+            pos = p;
+            return thisLoc;
+        }
+
+        DiskLoc child = childForPos(p);
+
+        if ( !child.isNull() ) {
+            DiskLoc l = child.btree()->locate(idx, child, key, order, pos, found, recordLoc, direction);
+            if ( !l.isNull() )
+                return l;
+        }
+
+        pos = p;
+        if ( direction < 0 )
+            return --pos == -1 ? DiskLoc() /*theend*/ : thisLoc;
+        else
+            return pos == n ? DiskLoc() /*theend*/ : thisLoc;
+    }
+
+    /* @thisLoc disk location of *this
+    */
+    // Recursive insert worker.  Returns 0 on success, 2 if the key is too
+    // large to index.  A non-null rChild marks an 'internal' insert coming
+    // from a child split, which must land in this bucket.  An exact
+    // (key, recordLoc) match on an unused slot revives that slot; a match on
+    // a used slot is a fatal invariant violation.
+    int BtreeBucket::_insert(DiskLoc thisLoc, DiskLoc recordLoc,
+                             const BSONObj& key, const BSONObj &order, bool dupsAllowed,
+                             DiskLoc lChild, DiskLoc rChild, IndexDetails& idx) {
+        if ( key.objsize() > KeyMax ) {
+            problem() << "ERROR: key too large len:" << key.objsize() << " max:" << KeyMax << ' ' << key.objsize() << ' ' << idx.indexNamespace() << endl;
+            return 2;
+        }
+        assert( key.objsize() > 0 );
+
+        int pos;
+        bool found = find(idx, key, recordLoc, order, pos, !dupsAllowed);
+        if ( insert_debug ) {
+            out() << "  " << thisLoc.toString() << '.' << "_insert " <<
+                  key.toString() << '/' << recordLoc.toString() <<
+                  " l:" << lChild.toString() << " r:" << rChild.toString() << endl;
+            out() << "    found:" << found << " pos:" << pos << " n:" << n << endl;
+        }
+
+        if ( found ) {
+            _KeyNode& kn = k(pos);
+            if ( kn.isUnused() ) {
+                log(4) << "btree _insert: reusing unused key" << endl;
+                massert( 10285 , "_insert: reuse key but lchild is not null", lChild.isNull());
+                massert( 10286 , "_insert: reuse key but rchild is not null", rChild.isNull());
+                kn.setUsed();
+                return 0;
+            }
+
+            out() << "_insert(): key already exists in index\n";
+            out() << "  " << idx.indexNamespace().c_str() << " thisLoc:" << thisLoc.toString() << '\n';
+            out() << "  " << key.toString() << '\n';
+            out() << "  " << "recordLoc:" << recordLoc.toString() << " pos:" << pos << endl;
+            out() << "  old l r: " << childForPos(pos).toString() << ' ' << childForPos(pos+1).toString() << endl;
+            out() << "  new l r: " << lChild.toString() << ' ' << rChild.toString() << endl;
+            massert( 10287 , "btree: key+recloc already in index", false);
+        }
+
+        DEBUGGING out() << "TEMP: key: " << key.toString() << endl;
+        DiskLoc& child = childForPos(pos);
+        if ( insert_debug )
+            out() << "    getChild(" << pos << "): " << child.toString() << endl;
+        if ( child.isNull() || !rChild.isNull() /* means an 'internal' insert */ ) {
+            insertHere(thisLoc, pos, recordLoc, key, order, lChild, rChild, idx);
+            return 0;
+        }
+
+        return child.btree()->bt_insert(child, recordLoc, key, order, dupsAllowed, idx, /*toplevel*/false);
+    }
+
+    // Print this bucket's header, every key (with its left child and record
+    // location, flagging unused slots), and the rightmost child pointer.
+    void BtreeBucket::dump() {
+        out() << "DUMP btreebucket n:" << n;
+        out() << " parent:" << hex << parent.getOfs() << dec;
+        for ( int i = 0; i < n; i++ ) {
+            out() << '\n';
+            KeyNode k = keyNode(i);
+            out() << '\t' << i << '\t' << k.key.toString() << "\tleft:" << hex <<
+                  k.prevChildBucket.getOfs() << "\tRecLoc:" << k.recordLoc.toString() << dec;
+            if ( this->k(i).isUnused() )
+                out() << " UNUSED";
+        }
+        out() << " right:" << hex << nextChild.getOfs() << dec << endl;
+    }
+
+    /* todo: meaning of return code unclear clean up */
+    // Public insert entry point.  At top level, silently skips keys larger
+    // than KeyMax (returns 3); otherwise delegates to _insert() and validates
+    // this bucket afterwards.  Returns _insert()'s code (0 ok, 2 key too big).
+    int BtreeBucket::bt_insert(DiskLoc thisLoc, DiskLoc recordLoc,
+                               const BSONObj& key, const BSONObj &order, bool dupsAllowed,
+                               IndexDetails& idx, bool toplevel)
+    {
+        if ( toplevel ) {
+            if ( key.objsize() > KeyMax ) {
+                problem() << "Btree::insert: key too large to index, skipping " << idx.indexNamespace().c_str() << ' ' << key.objsize() << ' ' << key.toString() << '\n';
+                return 3;
+            }
+        }
+
+        int x = _insert(thisLoc, recordLoc, key, order, dupsAllowed, DiskLoc(), DiskLoc(), idx);
+        assertValid( order );
+
+        return x;
+    }
+
+    // Render the whole tree's shape into ss (see _shape()).
+    void BtreeBucket::shape(stringstream& ss) {
+        _shape(0, ss);
+    }
+
+    // Look up a single exact key (default BSONObj ordering) and return the
+    // record location of the first *used* matching entry, or a null DiskLoc
+    // if none.  Skips past unused slots at the located position.
+    DiskLoc BtreeBucket::findSingle( const IndexDetails& indexdetails , const DiskLoc& thisLoc, const BSONObj& key ){
+        int pos;
+        bool found;
+        DiskLoc bucket = locate( indexdetails , indexdetails.head , key , BSONObj() , pos , found , minDiskLoc );
+        if ( bucket.isNull() )
+            return bucket;
+
+        BtreeBucket *b = bucket.btree();
+        while ( 1 ){
+            _KeyNode& knraw = b->k(pos);
+            if ( knraw.isUsed() )
+                break;
+            bucket = b->advance( bucket , pos , 1 , "findSingle" );
+            if ( bucket.isNull() )
+                return bucket;
+            b = bucket.btree();
+        }
+        KeyNode kn = b->keyNode( pos );
+        if ( key.woCompare( kn.key ) != 0 )
+            return DiskLoc();
+        return kn.recordLoc;
+    }
+
+} // namespace mongo
+
+#include "db.h"
+#include "dbhelpers.h"
+
+namespace mongo {
+
+    // Ad-hoc developer test: inserts duplicate keys at synthetic record
+    // locations, marks some unused, and dumps the tree.  Not part of normal
+    // server operation.
+    void BtreeBucket::a_test(IndexDetails& id) {
+        BtreeBucket *b = id.head.btree();
+
+        // record locs for testing
+        DiskLoc A(1, 20);
+        DiskLoc B(1, 30);
+        DiskLoc C(1, 40);
+
+        DiskLoc rl;
+        BSONObj key = fromjson("{x:9}");
+        BSONObj order = fromjson("{}");
+
+        b->bt_insert(id.head, A, key, order, true, id);
+        A.GETOFS() += 2;
+        b->bt_insert(id.head, A, key, order, true, id);
+        A.GETOFS() += 2;
+        b->bt_insert(id.head, A, key, order, true, id);
+        A.GETOFS() += 2;
+        b->bt_insert(id.head, A, key, order, true, id);
+        A.GETOFS() += 2;
+        assert( b->k(0).isUsed() );
+// b->k(0).setUnused();
+        b->k(1).setUnused();
+        b->k(2).setUnused();
+        b->k(3).setUnused();
+
+        b->dumpTree(id.head, order);
+
+        /* b->bt_insert(id.head, B, key, order, false, id);
+        b->k(1).setUnused();
+
+        b->dumpTree(id.head, order);
+        cout << "---\n";
+
+        b->bt_insert(id.head, A, key, order, false, id);
+
+        b->dumpTree(id.head, order);
+        cout << "---\n";*/
+
+        // this should assert.  does it? (it might "accidentally" though, not asserting proves a problem, asserting proves nothing)
+        b->bt_insert(id.head, C, key, order, false, id);
+
+        b->dumpTree(id.head, order);
+    }
+
+ /* --- BtreeBuilder --- */
+
+    // Bottom-up bulk builder for an index: keys are appended in sorted order
+    // via addKey(), chained leaf buckets are linked through tempNext(), and
+    // commit() builds the upper levels.  Starts with one empty leaf bucket.
+    BtreeBuilder::BtreeBuilder(bool _dupsAllowed, IndexDetails& _idx) :
+      dupsAllowed(_dupsAllowed), idx(_idx), n(0)
+    {
+        first = cur = BtreeBucket::addBucket(idx);
+        b = cur.btreemod();
+        order = idx.keyPattern();
+        committed = false;
+    }
+
+    // Current leaf is full: allocate the next leaf and link it via tempNext()
+    // (a temporary sibling chain used only during the build).
+    void BtreeBuilder::newBucket() {
+        DiskLoc L = BtreeBucket::addBucket(idx);
+        b->tempNext() = L;
+        cur = L;
+        b = cur.btreemod();
+    }
+
+    // Append the next key (callers must supply keys in sorted order).  When
+    // duplicates are disallowed, uasserts on a key equal to the previous one.
+    // Oversized keys (> KeyMax) are logged and skipped; a full bucket rolls
+    // over to a new one via newBucket().
+    void BtreeBuilder::addKey(BSONObj& key, DiskLoc loc) {
+        if( !dupsAllowed ) {
+            if( n > 0 ) {
+                int cmp = keyLast.woCompare(key, order);
+                massert( 10288 ,  "bad key order in BtreeBuilder - server internal error", cmp <= 0 );
+                if( cmp == 0 ) {
+                    //if( !dupsAllowed )
+                    uasserted( ASSERT_ID_DUPKEY , BtreeBucket::dupKeyError( idx , keyLast ) );
+                }
+            }
+            keyLast = key;
+        }
+
+        if ( ! b->_pushBack(loc, key, order, DiskLoc()) ){
+            // no room
+            if ( key.objsize() > KeyMax ) {
+                problem() << "Btree::insert: key too large to index, skipping " << idx.indexNamespace().c_str() << ' ' << key.objsize() << ' ' << key.toString() << '\n';
+            }
+            else {
+                // bucket was full
+                newBucket();
+                b->pushBack(loc, key, order, DiskLoc());
+            }
+        }
+        n++;
+    }
+
+    // Build parent levels over the chain of buckets starting at loc: for each
+    // bucket at the current level, pop its last key up into the level above
+    // (the popped bucket becomes that key's left child).  Repeat until a level
+    // has a single bucket, which becomes the index head.
+    void BtreeBuilder::buildNextLevel(DiskLoc loc) {
+        int levels = 1;
+        while( 1 ) {
+            if( loc.btree()->tempNext().isNull() ) {
+                // only 1 bucket at this level. we are done.
+                idx.head = loc;
+                break;
+            }
+            levels++;
+
+            DiskLoc upLoc = BtreeBucket::addBucket(idx);
+            DiskLoc upStart = upLoc;
+            BtreeBucket *up = upLoc.btreemod();
+
+            DiskLoc xloc = loc;
+            while( !xloc.isNull() ) {
+                BtreeBucket *x = xloc.btreemod();
+                BSONObj k;
+                DiskLoc r;
+                x->popBack(r,k);
+                if( x->n == 0 )
+                    log() << "warning: empty bucket on BtreeBuild " << k.toString() << endl;
+
+                if ( ! up->_pushBack(r, k, order, xloc) ){
+                    // current bucket full
+                    DiskLoc n = BtreeBucket::addBucket(idx);
+                    up->tempNext() = n;
+                    upLoc = n;
+                    up = upLoc.btreemod();
+                    up->pushBack(r, k, order, xloc);
+                }
+
+                xloc = x->tempNext(); /* get next in chain at current level */
+                x->parent = upLoc;
+            }
+
+            loc = upStart;
+        }
+
+        if( levels > 1 )
+            log(2) << "btree levels: " << levels << endl;
+    }
+
+    /* when all addKeys are done, we then build the higher levels of the tree */
+    // Finalize the bulk build; after this the destructor will not roll back.
+    void BtreeBuilder::commit() {
+        buildNextLevel(first);
+        committed = true;
+    }
+
+    // If commit() never ran (e.g. an exception aborted the build), delete
+    // every bucket along the tempNext() chain so no orphaned index space is
+    // left behind.
+    BtreeBuilder::~BtreeBuilder() {
+        if( !committed ) {
+            log(2) << "Rolling back partially built index space" << endl;
+            DiskLoc x = first;
+            while( !x.isNull() ) {
+                DiskLoc next = x.btree()->tempNext();
+                btreeStore->deleteRecord(idx.indexNamespace().c_str(), x);
+                x = next;
+            }
+            assert( idx.head.isNull() );
+            log(2) << "done rollback" << endl;
+        }
+    }
+
+}
diff --git a/db/btree.h b/db/btree.h
new file mode 100644
index 0000000..2c2ab81
--- /dev/null
+++ b/db/btree.h
@@ -0,0 +1,405 @@
+// btree.h
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+#include "../stdafx.h"
+#include "jsobj.h"
+#include "storage.h"
+#include "pdfile.h"
+
+namespace mongo {
+
+#pragma pack(1)
+
+ struct _KeyNode {
+ DiskLoc prevChildBucket;
+ DiskLoc recordLoc;
+ short keyDataOfs() const {
+ return (short) _kdo;
+ }
+ unsigned short _kdo;
+ void setKeyDataOfs(short s) {
+ _kdo = s;
+ assert(s>=0);
+ }
+ void setKeyDataOfsSavingUse(short s) {
+ _kdo = s;
+ assert(s>=0);
+ }
+ void setUsed() {
+ recordLoc.GETOFS() &= ~1;
+ }
+ void setUnused() {
+ /* Setting ofs to odd is the sentinel for unused, as real recordLoc's are always
+ even numbers.
+ Note we need to keep its value basically the same as we use the recordLoc
+ as part of the key in the index (to handle duplicate keys efficiently).
+ */
+ recordLoc.GETOFS() |= 1;
+ }
+ int isUnused() {
+ return recordLoc.getOfs() & 1;
+ }
+ int isUsed() {
+ return !isUnused();
+ }
+ };
+
+#pragma pack()
+
+ class BucketBasics;
+
+ /* wrapper - this is our in memory representation of the key. _KeyNode is the disk representation. */
+ class KeyNode {
+ public:
+ KeyNode(const BucketBasics& bb, const _KeyNode &k);
+ const DiskLoc& prevChildBucket;
+ const DiskLoc& recordLoc;
+ BSONObj key;
+ };
+
+#pragma pack(1)
+
+ /* this class is all about the storage management */
+    class BucketBasics {
+        friend class BtreeBuilder;
+        friend class KeyNode;
+    public:
+        void dumpTree(DiskLoc thisLoc, const BSONObj &order);
+        bool isHead() { return parent.isNull(); }
+        void assertValid(const BSONObj &order, bool force = false);
+        int fullValidate(const DiskLoc& thisLoc, const BSONObj &order); /* traverses everything */
+    protected:
+        void modified(const DiskLoc& thisLoc);
+        KeyNode keyNode(int i) const {
+            assert( i < n );
+            return KeyNode(*this, k(i));
+        }
+
+        char * dataAt(short ofs) {
+            return data + ofs;
+        }
+
+        void init(); // initialize a new node
+
+        /* returns false if node is full and must be split
+           keypos is where to insert -- inserted after that key #.  so keypos=0 is the leftmost one.
+        */
+        bool basicInsert(const DiskLoc& thisLoc, int keypos, const DiskLoc& recordLoc, const BSONObj& key, const BSONObj &order);
+
+        /**
+         * @return true if works, false if not enough space
+         */
+        bool _pushBack(const DiskLoc& recordLoc, BSONObj& key, const BSONObj &order, DiskLoc prevChild);
+        void pushBack(const DiskLoc& recordLoc, BSONObj& key, const BSONObj &order, DiskLoc prevChild){
+            bool ok = _pushBack( recordLoc , key , order , prevChild );
+            assert(ok);
+        }
+        void popBack(DiskLoc& recLoc, BSONObj& key);
+        void _delKeyAtPos(int keypos); // low level version that doesn't deal with child ptrs.
+
+        /* !Packed means there is deleted fragment space within the bucket.
+           We "repack" when we run out of space before considering the node
+           to be full.
+        */
+        enum Flags { Packed=1 };
+
+        DiskLoc& childForPos(int p) {
+            return p == n ? nextChild : k(p).prevChildBucket;
+        }
+
+        int totalDataSize() const;
+        void pack( const BSONObj &order );
+        void setNotPacked();
+        void setPacked();
+        int _alloc(int bytes);
+        void _unalloc(int bytes);
+        void truncateTo(int N, const BSONObj &order);
+        void markUnused(int keypos);
+
+        /* BtreeBuilder uses the parent var as a temp place to maintain a linked list chain.
+           we use tempNext() when we do that to be less confusing.  (one might have written a union in C)
+        */
+        DiskLoc& tempNext() { return parent; }
+
+    public:
+        DiskLoc parent;
+
+        string bucketSummary() const {
+            stringstream ss;
+            ss << "  Bucket info:" << endl;
+            ss << "    n: " << n << endl;
+            ss << "    parent: " << parent.toString() << endl;
+            ss << "    nextChild: " << nextChild.toString() << endl;
+            ss << "    Size: " << _Size << " flags:" << flags << endl;
+            ss << "    emptySize: " << emptySize << " topSize: " << topSize << endl;
+            return ss.str();
+        }
+
+    protected:
+        void _shape(int level, stringstream&);
+        DiskLoc nextChild; // child bucket off and to the right of the highest key.
+        int _Size; // total size of this btree node in bytes. constant.
+        int Size() const;
+        int flags;
+        int emptySize; // size of the empty region
+        int topSize; // size of the data at the top of the bucket (keys are at the beginning or 'bottom')
+        int n; // # of keys so far.
+        int reserved;
+        const _KeyNode& k(int i) const {
+            return ((_KeyNode*)data)[i];
+        }
+        _KeyNode& k(int i) {
+            return ((_KeyNode*)data)[i];
+        }
+        char data[4];
+    };
+
+ class BtreeBucket : public BucketBasics {
+ friend class BtreeCursor;
+ public:
+ void dump();
+
+ /* @return true if key exists in index
+
+ order - indicates order of keys in the index. this is basically the index's key pattern, e.g.:
+ BSONObj order = ((IndexDetails&)idx).keyPattern();
+ likewise below in bt_insert() etc.
+ */
+ bool exists(const IndexDetails& idx, DiskLoc thisLoc, const BSONObj& key, BSONObj order);
+
+ static DiskLoc addBucket(IndexDetails&); /* start a new index off, empty */
+
+ static void renameIndexNamespace(const char *oldNs, const char *newNs);
+
+ int bt_insert(DiskLoc thisLoc, DiskLoc recordLoc,
+ const BSONObj& key, const BSONObj &order, bool dupsAllowed,
+ IndexDetails& idx, bool toplevel = true);
+
+ bool unindex(const DiskLoc& thisLoc, IndexDetails& id, BSONObj& key, const DiskLoc& recordLoc);
+
+ /* locate may return an "unused" key that is just a marker. so be careful.
+ looks for a key:recordloc pair.
+
+ found - returns true if exact match found. note you can get back a position
+ result even if found is false.
+ */
+ DiskLoc locate(const IndexDetails& , const DiskLoc& thisLoc, const BSONObj& key, const BSONObj &order,
+ int& pos, bool& found, DiskLoc recordLoc, int direction=1);
+
+ /**
+ * find the first instance of the key
+ * does not handle dups
+     * returned DiskLoc isNull if it can't find anything with that key
+ */
+ DiskLoc findSingle( const IndexDetails& , const DiskLoc& thisLoc, const BSONObj& key );
+
+ /* advance one key position in the index: */
+ DiskLoc advance(const DiskLoc& thisLoc, int& keyOfs, int direction, const char *caller);
+ DiskLoc getHead(const DiskLoc& thisLoc);
+
+ /* get tree shape */
+ void shape(stringstream&);
+
+ static void a_test(IndexDetails&);
+
+ private:
+ void fixParentPtrs(const DiskLoc& thisLoc);
+ void delBucket(const DiskLoc& thisLoc, IndexDetails&);
+ void delKeyAtPos(const DiskLoc& thisLoc, IndexDetails& id, int p);
+ BSONObj keyAt(int keyOfs) {
+ return keyOfs >= n ? BSONObj() : keyNode(keyOfs).key;
+ }
+ static BtreeBucket* allocTemp(); /* caller must release with free() */
+ void insertHere(DiskLoc thisLoc, int keypos,
+ DiskLoc recordLoc, const BSONObj& key, const BSONObj &order,
+ DiskLoc lchild, DiskLoc rchild, IndexDetails&);
+ int _insert(DiskLoc thisLoc, DiskLoc recordLoc,
+ const BSONObj& key, const BSONObj &order, bool dupsAllowed,
+ DiskLoc lChild, DiskLoc rChild, IndexDetails&);
+ bool find(const IndexDetails& idx, const BSONObj& key, DiskLoc recordLoc, const BSONObj &order, int& pos, bool assertIfDup);
+ static void findLargestKey(const DiskLoc& thisLoc, DiskLoc& largestLoc, int& largestKey);
+ public:
+ // simply builds and returns a dup key error message string
+ static string dupKeyError( const IndexDetails& idx , const BSONObj& key );
+ };
+
+ class BtreeCursor : public Cursor {
+ public:
+ BtreeCursor( NamespaceDetails *_d, int _idxNo, const IndexDetails&, const BSONObj &startKey, const BSONObj &endKey, bool endKeyInclusive, int direction );
+
+ BtreeCursor( NamespaceDetails *_d, int _idxNo, const IndexDetails& _id, const BoundList &_bounds, int _direction );
+
+ virtual bool ok() {
+ return !bucket.isNull();
+ }
+ bool eof() {
+ return !ok();
+ }
+ virtual bool advance();
+
+ virtual void noteLocation(); // updates keyAtKeyOfs...
+ virtual void checkLocation();
+
+ /* used for multikey index traversal to avoid sending back dups. see Matcher::matches().
+ if a multikey index traversal:
+ if loc has already been sent, returns true.
+ otherwise, marks loc as sent.
+           @return true if the loc has already been seen (caller should skip it as a dup)
+ */
+ set<DiskLoc> dups;
+ virtual bool getsetdup(DiskLoc loc) {
+ if( multikey ) {
+ pair<set<DiskLoc>::iterator, bool> p = dups.insert(loc);
+ return !p.second;
+ }
+ return false;
+ }
+
+ _KeyNode& _currKeyNode() {
+ assert( !bucket.isNull() );
+ _KeyNode& kn = bucket.btree()->k(keyOfs);
+ assert( kn.isUsed() );
+ return kn;
+ }
+ KeyNode currKeyNode() const {
+ assert( !bucket.isNull() );
+ return bucket.btree()->keyNode(keyOfs);
+ }
+ virtual BSONObj currKey() const {
+ return currKeyNode().key;
+ }
+
+ virtual BSONObj indexKeyPattern() {
+ return indexDetails.keyPattern();
+ }
+
+ virtual void aboutToDeleteBucket(const DiskLoc& b) {
+ if ( bucket == b )
+ keyOfs = -1;
+ }
+
+ virtual DiskLoc currLoc() {
+ return !bucket.isNull() ? _currKeyNode().recordLoc : DiskLoc();
+ }
+ virtual DiskLoc refLoc() {
+ return currLoc();
+ }
+ virtual Record* _current() {
+ return currLoc().rec();
+ }
+ virtual BSONObj current() {
+ return BSONObj(_current());
+ }
+ virtual string toString() {
+ string s = string("BtreeCursor ") + indexDetails.indexName();
+ if ( direction < 0 ) s += " reverse";
+ if ( bounds_.size() > 1 ) s += " multi";
+ return s;
+ }
+
+ BSONObj prettyKey( const BSONObj &key ) const {
+ return key.replaceFieldNames( indexDetails.keyPattern() ).clientReadable();
+ }
+
+ virtual BSONObj prettyStartKey() const {
+ return prettyKey( startKey );
+ }
+ virtual BSONObj prettyEndKey() const {
+ return prettyKey( endKey );
+ }
+
+ void forgetEndKey() { endKey = BSONObj(); }
+
+ private:
+ /* Our btrees may (rarely) have "unused" keys when items are deleted.
+ Skip past them.
+ */
+ void skipUnusedKeys();
+
+ /* Check if the current key is beyond endKey. */
+ void checkEnd();
+
+ // selective audits on construction
+ void audit();
+
+ // set initial bucket
+ void init();
+
+ // init start / end keys with a new range
+ void initInterval();
+
+ friend class BtreeBucket;
+ NamespaceDetails *d;
+ int idxNo;
+ BSONObj startKey;
+ BSONObj endKey;
+ bool endKeyInclusive_;
+ bool multikey; // note this must be updated every getmore batch in case someone added a multikey...
+
+ const IndexDetails& indexDetails;
+ BSONObj order;
+ DiskLoc bucket;
+ int keyOfs;
+ int direction; // 1=fwd,-1=reverse
+ BSONObj keyAtKeyOfs; // so we can tell if things moved around on us between the query and the getMore call
+ DiskLoc locAtKeyOfs;
+ BoundList bounds_;
+ unsigned boundIndex_;
+ };
+
+#pragma pack()
+
+ inline bool IndexDetails::hasKey(const BSONObj& key) {
+ return head.btree()->exists(*this, head, key, keyPattern());
+ }
+
+ /* build btree from the bottom up */
+ /* _ TODO dropDups */
+ class BtreeBuilder {
+ bool dupsAllowed;
+ IndexDetails& idx;
+ unsigned long long n;
+ BSONObj keyLast;
+ BSONObj order;
+ bool committed;
+
+ DiskLoc cur, first;
+ BtreeBucket *b;
+
+ void newBucket();
+ void buildNextLevel(DiskLoc);
+
+ public:
+ ~BtreeBuilder();
+
+ BtreeBuilder(bool _dupsAllowed, IndexDetails& _idx);
+
+ /* keys must be added in order */
+ void addKey(BSONObj& key, DiskLoc loc);
+
+ /* commit work. if not called, destructor will clean up partially completed work
+ (in case exception has happened).
+ */
+ void commit();
+
+ unsigned long long getn() { return n; }
+ };
+
+} // namespace mongo
diff --git a/db/btreecursor.cpp b/db/btreecursor.cpp
new file mode 100644
index 0000000..bb477d6
--- /dev/null
+++ b/db/btreecursor.cpp
@@ -0,0 +1,204 @@
+// btreecursor.cpp
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "stdafx.h"
+#include "btree.h"
+#include "pdfile.h"
+#include "jsobj.h"
+#include "curop.h"
+
+namespace mongo {
+
+ extern int otherTraceLevel;
+
+ BtreeCursor::BtreeCursor( NamespaceDetails *_d, int _idxNo, const IndexDetails &_id,
+ const BSONObj &_startKey, const BSONObj &_endKey, bool endKeyInclusive, int _direction ) :
+ d(_d), idxNo(_idxNo),
+ startKey( _startKey ),
+ endKey( _endKey ),
+ endKeyInclusive_( endKeyInclusive ),
+ multikey( d->isMultikey( idxNo ) ),
+ indexDetails( _id ),
+ order( _id.keyPattern() ),
+ direction( _direction ),
+ boundIndex_()
+ {
+ audit();
+ init();
+ }
+
+ BtreeCursor::BtreeCursor( NamespaceDetails *_d, int _idxNo, const IndexDetails& _id, const vector< pair< BSONObj, BSONObj > > &_bounds, int _direction )
+ :
+ d(_d), idxNo(_idxNo),
+ endKeyInclusive_( true ),
+ multikey( d->isMultikey( idxNo ) ),
+ indexDetails( _id ),
+ order( _id.keyPattern() ),
+ direction( _direction ),
+ bounds_( _bounds ),
+ boundIndex_()
+ {
+ assert( !bounds_.empty() );
+ audit();
+ initInterval();
+ }
+
+ void BtreeCursor::audit() {
+ dassert( d->idxNo((IndexDetails&) indexDetails) == idxNo );
+
+ if ( otherTraceLevel >= 12 ) {
+ if ( otherTraceLevel >= 200 ) {
+ out() << "::BtreeCursor() qtl>200. validating entire index." << endl;
+ indexDetails.head.btree()->fullValidate(indexDetails.head, order);
+ }
+ else {
+ out() << "BTreeCursor(). dumping head bucket" << endl;
+ indexDetails.head.btree()->dump();
+ }
+ }
+ }
+
+ void BtreeCursor::init() {
+ bool found;
+ bucket = indexDetails.head.btree()->
+ locate(indexDetails, indexDetails.head, startKey, order, keyOfs, found, direction > 0 ? minDiskLoc : maxDiskLoc, direction);
+ skipUnusedKeys();
+ checkEnd();
+ }
+
+ void BtreeCursor::initInterval() {
+ do {
+ startKey = bounds_[ boundIndex_ ].first;
+ endKey = bounds_[ boundIndex_ ].second;
+ init();
+ } while ( !ok() && ++boundIndex_ < bounds_.size() );
+ }
+
+ /* skip unused keys. */
+ void BtreeCursor::skipUnusedKeys() {
+ int u = 0;
+ while ( 1 ) {
+ if ( !ok() )
+ break;
+ BtreeBucket *b = bucket.btree();
+ _KeyNode& kn = b->k(keyOfs);
+ if ( kn.isUsed() )
+ break;
+ bucket = b->advance(bucket, keyOfs, direction, "skipUnusedKeys");
+ u++;
+ }
+ if ( u > 10 )
+ OCCASIONALLY log() << "btree unused skipped:" << u << '\n';
+ }
+
+// Return a value in the set {-1, 0, 1} to represent the sign of parameter i.
+ int sgn( int i ) {
+ if ( i == 0 )
+ return 0;
+ return i > 0 ? 1 : -1;
+ }
+
+ // Check if the current key is beyond endKey.
+ void BtreeCursor::checkEnd() {
+ if ( bucket.isNull() )
+ return;
+ if ( !endKey.isEmpty() ) {
+ int cmp = sgn( endKey.woCompare( currKey(), order ) );
+ if ( ( cmp != 0 && cmp != direction ) ||
+ ( cmp == 0 && !endKeyInclusive_ ) )
+ bucket = DiskLoc();
+ }
+ }
+
+ bool BtreeCursor::advance() {
+ killCurrentOp.checkForInterrupt();
+ if ( bucket.isNull() )
+ return false;
+ bucket = bucket.btree()->advance(bucket, keyOfs, direction, "BtreeCursor::advance");
+ skipUnusedKeys();
+ checkEnd();
+ if( !ok() && ++boundIndex_ < bounds_.size() )
+ initInterval();
+ return !bucket.isNull();
+ }
+
+ void BtreeCursor::noteLocation() {
+ if ( !eof() ) {
+ BSONObj o = bucket.btree()->keyAt(keyOfs).copy();
+ keyAtKeyOfs = o;
+ locAtKeyOfs = bucket.btree()->k(keyOfs).recordLoc;
+ }
+ }
+
+ /* Since the last noteLocation(), our key may have moved around, and that old cached
+ information may thus be stale and wrong (although often it is right). We check
+ that here; if we have moved, we have to search back for where we were at.
+
+ i.e., after operations on the index, the BtreeCursor's cached location info may
+ be invalid. This function ensures validity, so you should call it before using
+ the cursor if other writers have used the database since the last noteLocation
+ call.
+ */
+ void BtreeCursor::checkLocation() {
+ if ( eof() )
+ return;
+
+ multikey = d->isMultikey(idxNo);
+
+ if ( keyOfs >= 0 ) {
+ BtreeBucket *b = bucket.btree();
+
+ assert( !keyAtKeyOfs.isEmpty() );
+
+ // Note keyAt() returns an empty BSONObj if keyOfs is now out of range,
+ // which is possible as keys may have been deleted.
+ if ( b->keyAt(keyOfs).woEqual(keyAtKeyOfs) &&
+ b->k(keyOfs).recordLoc == locAtKeyOfs ) {
+ if ( !b->k(keyOfs).isUsed() ) {
+ /* we were deleted but still exist as an unused
+ marker key. advance.
+ */
+ skipUnusedKeys();
+ }
+ return;
+ }
+ }
+
+ /* normally we don't get to here. when we do, old position is no longer
+ valid and we must refind where we left off (which is expensive)
+ */
+
+ bool found;
+
+ /* TODO: Switch to keep indexdetails and do idx.head! */
+ bucket = indexDetails.head.btree()->locate(indexDetails, indexDetails.head, keyAtKeyOfs, order, keyOfs, found, locAtKeyOfs, direction);
+ RARELY log() << " key seems to have moved in the index, refinding. found:" << found << endl;
+ if ( ! bucket.isNull() )
+ skipUnusedKeys();
+
+ }
+
+ /* ----------------------------------------------------------------------------- */
+
+ struct BtreeCursorUnitTest {
+ BtreeCursorUnitTest() {
+ assert( minDiskLoc.compare(maxDiskLoc) < 0 );
+ }
+ } btut;
+
+} // namespace mongo
diff --git a/db/client.cpp b/db/client.cpp
new file mode 100644
index 0000000..68a0c9e
--- /dev/null
+++ b/db/client.cpp
@@ -0,0 +1,99 @@
+// client.cpp
+
+/**
+* Copyright (C) 2009 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/* Client represents a connection to the database (the server-side) and corresponds
+ to an open socket (or logical connection if pooling on sockets) from a client.
+*/
+
+#include "stdafx.h"
+#include "db.h"
+#include "client.h"
+#include "curop.h"
+#include "json.h"
+
+namespace mongo {
+
+ boost::mutex Client::clientsMutex;
+ set<Client*> Client::clients; // always be in clientsMutex when manipulating this
+ boost::thread_specific_ptr<Client> currentClient;
+
+ Client::Client(const char *desc) :
+ _curOp(new CurOp()),
+ _database(0), _ns("")/*, _nsstr("")*/
+ ,_shutdown(false),
+ _desc(desc),
+ _god(0)
+ {
+ ai = new AuthenticationInfo();
+ boostlock bl(clientsMutex);
+ clients.insert(this);
+ }
+
+ Client::~Client() {
+ delete _curOp;
+ delete ai;
+ ai = 0;
+ _god = 0;
+ if ( !_shutdown ) {
+ cout << "ERROR: Client::shutdown not called!" << endl;
+ }
+ }
+
+ bool Client::shutdown(){
+ _shutdown = true;
+
+ {
+ boostlock bl(clientsMutex);
+ clients.erase(this);
+ }
+
+ bool didAnything = false;
+
+ if ( _tempCollections.size() ){
+ didAnything = true;
+ for ( list<string>::iterator i = _tempCollections.begin(); i!=_tempCollections.end(); i++ ){
+ string ns = *i;
+ dblock l;
+ setClient( ns.c_str() );
+ if ( ! nsdetails( ns.c_str() ) )
+ continue;
+ try {
+ string err;
+ BSONObjBuilder b;
+ dropCollection( ns , err , b );
+ }
+ catch ( ... ){
+ log() << "error dropping temp collection: " << ns << endl;
+ }
+ }
+ _tempCollections.clear();
+ }
+
+ return didAnything;
+ }
+
+ BSONObj CurOp::_tooBig = fromjson("{\"$msg\":\"query not recording (too large)\"}");
+ WrappingInt CurOp::_nextOpNum;
+
+ Client::Context::Context( string ns , Database * db )
+ : _client( currentClient.get() ) {
+ assert( db && db->isOk() );
+ _client->setns( ns.c_str() , db );
+ }
+
+}
diff --git a/db/client.h b/db/client.h
new file mode 100644
index 0000000..99092ca
--- /dev/null
+++ b/db/client.h
@@ -0,0 +1,193 @@
+// client.h
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/* Client represents a connection to the database (the server-side) and corresponds
+ to an open socket (or logical connection if pooling on sockets) from a client.
+
+ todo: switch to asio...this will fit nicely with that.
+*/
+
+#pragma once
+
+#include "../stdafx.h"
+#include "namespace.h"
+#include "lasterror.h"
+#include "../util/top.h"
+
+namespace mongo {
+
+ class AuthenticationInfo;
+ class Database;
+ class CurOp;
+ class Command;
+ class Client;
+
+ extern boost::thread_specific_ptr<Client> currentClient;
+
+ bool setClient(const char *ns, const string& path=dbpath, mongolock *lock = 0);
+
+
+ class Client : boost::noncopyable {
+ public:
+ static boost::mutex clientsMutex;
+ static set<Client*> clients; // always be in clientsMutex when manipulating this
+
+ class GodScope {
+ bool _prev;
+ public:
+ GodScope();
+ ~GodScope();
+ };
+
+ /* Set database we want to use, then, restores when we finish (are out of scope)
+           Note this is also helpful if an exception happens, as the state is fixed up.
+ */
+ class Context {
+ Client * _client;
+ Database * _olddb;
+ string _oldns;
+ public:
+ Context(const char *ns)
+ : _client( currentClient.get() ) {
+ _olddb = _client->_database;
+ _oldns = _client->_ns;
+ setClient(ns);
+ }
+ Context(string ns)
+ : _client( currentClient.get() ){
+ _olddb = _client->_database;
+ _oldns = _client->_ns;
+ setClient(ns.c_str());
+ }
+
+ /* this version saves the context but doesn't yet set the new one: */
+ Context()
+ : _client( currentClient.get() ) {
+ _olddb = _client->database();
+ _oldns = _client->ns();
+
+ }
+
+ /**
+ * if you are doing this after allowing a write there could be a race condition
+ * if someone closes that db. this checks that the DB is still valid
+ */
+ Context( string ns , Database * db );
+
+ ~Context() {
+ DEV assert( _client == currentClient.get() );
+ _client->setns( _oldns.c_str(), _olddb );
+ }
+
+ };
+
+ private:
+ CurOp * const _curOp;
+ Database *_database;
+ Namespace _ns;
+ //NamespaceString _nsstr;
+ bool _shutdown;
+ list<string> _tempCollections;
+ const char *_desc;
+ bool _god;
+ public:
+ AuthenticationInfo *ai;
+ Top top;
+
+ CurOp* curop() { return _curOp; }
+ Database* database() {
+ return _database;
+ }
+ const char *ns() { return _ns.buf; }
+
+ void setns(const char *ns, Database *db) {
+ _database = db;
+ _ns = ns;
+ //_nsstr = ns;
+ }
+ void clearns() { setns("", 0); }
+
+ Client(const char *desc);
+ ~Client();
+
+ const char *desc() const { return _desc; }
+
+ void addTempCollection( const string& ns ){
+ _tempCollections.push_back( ns );
+ }
+
+ /* each thread which does db operations has a Client object in TLS.
+ call this when your thread starts.
+ */
+ static void initThread(const char *desc);
+
+ /*
+ this has to be called as the client goes away, but before thread termination
+ @return true if anything was done
+ */
+ bool shutdown();
+
+ bool isGod() const { return _god; }
+ };
+
+ inline Client& cc() {
+ return *currentClient.get();
+ }
+
+ /* each thread which does db operations has a Client object in TLS.
+ call this when your thread starts.
+ */
+ inline void Client::initThread(const char *desc) {
+ assert( currentClient.get() == 0 );
+ currentClient.reset( new Client(desc) );
+ }
+
+ inline Client::GodScope::GodScope(){
+ _prev = cc()._god;
+ cc()._god = true;
+ }
+
+ inline Client::GodScope::~GodScope(){
+ cc()._god = _prev;
+ }
+
+ /* this unlocks, does NOT upgrade. that works for our current usage */
+ inline void mongolock::releaseAndWriteLock() {
+ if( !_writelock ) {
+
+#if BOOST_VERSION >= 103500
+ int s = dbMutex.getState();
+ if( s != -1 ) {
+ log() << "error: releaseAndWriteLock() s == " << s << endl;
+ msgasserted( 12600, "releaseAndWriteLock: unlock_shared failed, probably recursive" );
+ }
+#endif
+
+ _writelock = true;
+ dbMutex.unlock_shared();
+ dbMutex.lock();
+
+ /* this is defensive; as we were unlocked for a moment above,
+ the Database object we reference could have been deleted:
+ */
+ cc().clearns();
+ }
+ }
+
+};
+
diff --git a/db/clientcursor.cpp b/db/clientcursor.cpp
new file mode 100644
index 0000000..0de0b2e
--- /dev/null
+++ b/db/clientcursor.cpp
@@ -0,0 +1,278 @@
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/* clientcursor.cpp
+
+ ClientCursor is a wrapper that represents a cursorid from our database
+ application's perspective.
+
+ Cursor -- and its derived classes -- are our internal cursors.
+*/
+
+#include "stdafx.h"
+#include "query.h"
+#include "introspect.h"
+#include <time.h>
+#include "db.h"
+#include "commands.h"
+
+namespace mongo {
+
+ CCById ClientCursor::clientCursorsById;
+ CCByLoc ClientCursor::byLoc;
+ boost::recursive_mutex ClientCursor::ccmutex;
+
+ unsigned ClientCursor::byLocSize() {
+ recursive_boostlock lock(ccmutex);
+ return byLoc.size();
+ }
+
+ void ClientCursor::setLastLoc_inlock(DiskLoc L) {
+ if ( L == _lastLoc )
+ return;
+
+ if ( !_lastLoc.isNull() ) {
+ CCByLoc::iterator i = kv_find(byLoc, _lastLoc, this);
+ if ( i != byLoc.end() )
+ byLoc.erase(i);
+ }
+
+ if ( !L.isNull() )
+ byLoc.insert( make_pair(L, this) );
+ _lastLoc = L;
+ }
+
+ /* ------------------------------------------- */
+
+ /* must call this when a btree node is updated */
+ //void removedKey(const DiskLoc& btreeLoc, int keyPos) {
+ //}
+
+ /* todo: this implementation is incomplete. we use it as a prefix for dropDatabase, which
+ works fine as the prefix will end with '.'. however, when used with drop and
+ deleteIndexes, this could take out cursors that belong to something else -- if you
+ drop "foo", currently, this will kill cursors for "foobar".
+ */
+ void ClientCursor::invalidate(const char *nsPrefix) {
+ vector<ClientCursor*> toDelete;
+
+ int len = strlen(nsPrefix);
+ assert( len > 0 && strchr(nsPrefix, '.') );
+
+ {
+ recursive_boostlock lock(ccmutex);
+
+ for ( CCByLoc::iterator i = byLoc.begin(); i != byLoc.end(); ++i ) {
+ ClientCursor *cc = i->second;
+ if ( strncmp(nsPrefix, cc->ns.c_str(), len) == 0 )
+ toDelete.push_back(i->second);
+ }
+
+ for ( vector<ClientCursor*>::iterator i = toDelete.begin(); i != toDelete.end(); ++i )
+ delete (*i);
+ }
+ }
+
+ /* called every 4 seconds. millis is amount of idle time passed since the last call -- could be zero */
+ void ClientCursor::idleTimeReport(unsigned millis) {
+ recursive_boostlock lock(ccmutex);
+ for ( CCByLoc::iterator i = byLoc.begin(); i != byLoc.end(); ) {
+ CCByLoc::iterator j = i;
+ i++;
+ if( j->second->shouldTimeout( millis ) ){
+ log(1) << "killing old cursor " << j->second->cursorid << ' ' << j->second->ns
+ << " idle:" << j->second->idleTime() << "ms\n";
+ delete j->second;
+ }
+ }
+ }
+
+ /* must call when a btree bucket going away.
+ note this is potentially slow
+ */
+ void ClientCursor::informAboutToDeleteBucket(const DiskLoc& b) {
+ recursive_boostlock lock(ccmutex);
+ RARELY if ( byLoc.size() > 70 ) {
+ log() << "perf warning: byLoc.size=" << byLoc.size() << " in aboutToDeleteBucket\n";
+ }
+ for ( CCByLoc::iterator i = byLoc.begin(); i != byLoc.end(); i++ )
+ i->second->c->aboutToDeleteBucket(b);
+ }
+ void aboutToDeleteBucket(const DiskLoc& b) {
+ ClientCursor::informAboutToDeleteBucket(b);
+ }
+
+ /* must call this on a delete so we clean up the cursors. */
+ void ClientCursor::aboutToDelete(const DiskLoc& dl) {
+ recursive_boostlock lock(ccmutex);
+
+ CCByLoc::iterator j = byLoc.lower_bound(dl);
+ CCByLoc::iterator stop = byLoc.upper_bound(dl);
+ if ( j == stop )
+ return;
+
+ vector<ClientCursor*> toAdvance;
+
+ while ( 1 ) {
+ toAdvance.push_back(j->second);
+ WIN assert( j->first == dl );
+ ++j;
+ if ( j == stop )
+ break;
+ }
+
+ wassert( toAdvance.size() < 5000 );
+
+ for ( vector<ClientCursor*>::iterator i = toAdvance.begin(); i != toAdvance.end(); ++i ){
+ ClientCursor* cc = *i;
+
+ if ( cc->_doingDeletes ) continue;
+
+ Cursor *c = cc->c.get();
+ if ( c->capped() ){
+ delete cc;
+ continue;
+ }
+
+ c->checkLocation();
+ DiskLoc tmp1 = c->refLoc();
+ if ( tmp1 != dl ) {
+ /* this might indicate a failure to call ClientCursor::updateLocation() */
+ problem() << "warning: cursor loc " << tmp1 << " does not match byLoc position " << dl << " !" << endl;
+ }
+ c->advance();
+ if ( c->eof() ) {
+ // advanced to end -- delete cursor
+ delete cc;
+ }
+ else {
+ wassert( c->refLoc() != dl );
+ cc->updateLocation();
+ }
+ }
+ }
+ void aboutToDelete(const DiskLoc& dl) { ClientCursor::aboutToDelete(dl); }
+
+ ClientCursor::~ClientCursor() {
+ assert( pos != -2 );
+
+ {
+ recursive_boostlock lock(ccmutex);
+ setLastLoc_inlock( DiskLoc() ); // removes us from bylocation multimap
+ clientCursorsById.erase(cursorid);
+
+ // defensive:
+ (CursorId&) cursorid = -1;
+ pos = -2;
+ }
+ }
+
+ /* call when cursor's location changes so that we can update the
+ cursorsbylocation map. if you are locked and internally iterating, only
+ need to call when you are ready to "unlock".
+ */
+ void ClientCursor::updateLocation() {
+ assert( cursorid );
+ _idleAgeMillis = 0;
+ DiskLoc cl = c->refLoc();
+ if ( lastLoc() == cl ) {
+ //log() << "info: lastloc==curloc " << ns << '\n';
+ return;
+ }
+ {
+ recursive_boostlock lock(ccmutex);
+ setLastLoc_inlock(cl);
+ c->noteLocation();
+ }
+ }
+
+ bool ClientCursor::yield() {
+ // need to store on the stack in case this gets deleted
+ CursorId id = cursorid;
+
+ bool doingDeletes = _doingDeletes;
+ _doingDeletes = false;
+
+ updateLocation();
+
+ {
+ /* a quick test that our temprelease is safe.
+ todo: make a YieldingCursor class
+ and then make the following code part of a unit test.
+ */
+ const int test = 0;
+ static bool inEmpty = false;
+ if( test && !inEmpty ) {
+ inEmpty = true;
+ log() << "TEST: manipulate collection during remove" << endl;
+ if( test == 1 )
+ Helpers::emptyCollection(ns.c_str());
+ else if( test == 2 ) {
+ BSONObjBuilder b; string m;
+ dropCollection(ns.c_str(), m, b);
+ }
+ else {
+ dropDatabase(ns.c_str());
+ }
+ }
+ }
+
+ {
+ dbtempreleasecond unlock;
+ }
+
+ if ( ClientCursor::find( id , false ) == 0 ){
+ // i was deleted
+ return false;
+ }
+
+ _doingDeletes = doingDeletes;
+ return true;
+ }
+
+ int ctmLast = 0; // so we don't have to do find() which is a little slow very often.
+ long long ClientCursor::allocCursorId_inlock() {
+ long long x;
+ int ctm = (int) curTimeMillis();
+ while ( 1 ) {
+ x = (((long long)rand()) << 32);
+ x = x | ctm | 0x80000000; // OR to make sure not zero
+ if ( ctm != ctmLast || ClientCursor::find_inlock(x, false) == 0 )
+ break;
+ }
+ ctmLast = ctm;
+ DEV out() << " alloccursorid " << x << endl;
+ return x;
+ }
+
+ // QUESTION: Restrict to the namespace from which this command was issued?
+ // Alternatively, make this command admin-only?
+ class CmdCursorInfo : public Command {
+ public:
+ CmdCursorInfo() : Command( "cursorInfo" ) {}
+ virtual bool slaveOk() { return true; }
+ virtual void help( stringstream& help ) const {
+ help << " example: { cursorInfo : 1 }";
+ }
+ bool run(const char *dbname, BSONObj& jsobj, string& errmsg, BSONObjBuilder& result, bool fromRepl ){
+ recursive_boostlock lock(ClientCursor::ccmutex);
+ result.append("byLocation_size", unsigned( ClientCursor::byLoc.size() ) );
+ result.append("clientCursors_size", unsigned( ClientCursor::clientCursorsById.size() ) );
+ return true;
+ }
+ } cmdCursorInfo;
+
+} // namespace mongo
diff --git a/db/clientcursor.h b/db/clientcursor.h
new file mode 100644
index 0000000..03f20e9
--- /dev/null
+++ b/db/clientcursor.h
@@ -0,0 +1,216 @@
+/* clientcursor.h */
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/* Cursor -- and its derived classes -- are our internal cursors.
+
+ ClientCursor is a wrapper that represents a cursorid from our database
+ application's perspective.
+*/
+
+#pragma once
+
+#include "../stdafx.h"
+#include "cursor.h"
+#include "jsobj.h"
+#include "../util/message.h"
+#include "storage.h"
+#include "dbhelpers.h"
+#include "matcher.h"
+
+namespace mongo {
+
+ typedef long long CursorId; /* passed to the client so it can send back on getMore */
+ class Cursor; /* internal server cursor base class */
+ class ClientCursor;
+
+ /* todo: make this map be per connection. this will prevent cursor hijacking security attacks perhaps.
+ */
+ typedef map<CursorId, ClientCursor*> CCById;
+
+ typedef multimap<DiskLoc, ClientCursor*> CCByLoc;
+
+ extern BSONObj id_obj;
+
+ class ClientCursor {
+ friend class CmdCursorInfo;
+ DiskLoc _lastLoc; // use getter and setter not this (important)
+ unsigned _idleAgeMillis; // how long has the cursor been around, relative to server idle time
+
+ /* 0 = normal
+ 1 = no timeout allowed
+ 100 = in use (pinned) -- see Pointer class
+ */
+ unsigned _pinValue;
+
+ bool _doingDeletes;
+
+ static CCById clientCursorsById;
+ static CCByLoc byLoc;
+ static boost::recursive_mutex ccmutex; // must use this for all statics above!
+
+ static CursorId allocCursorId_inlock();
+
+ public:
+ /* use this to ensure we don't time out a cursor in the background while it is in use.
+ if you are using noTimeout() already, there is no risk anyway.
+ Further, this mechanism guards against two getMore requests on the same cursor executing
+ at the same time - which might be bad. That should never happen, but if a client driver
+ had a bug, it could (or perhaps some sort of attack situation).
+ */
+ class Pointer : boost::noncopyable {
+ public:
+ ClientCursor *_c;
+ void release() {
+ if( _c ) {
+ assert( _c->_pinValue >= 100 );
+ _c->_pinValue -= 100;
+ }
+ _c = 0;
+ }
+ Pointer(long long cursorid) {
+ recursive_boostlock lock(ccmutex);
+ _c = ClientCursor::find_inlock(cursorid, true);
+ if( _c ) {
+ if( _c->_pinValue >= 100 ) {
+ _c = 0;
+ uassert(12051, "clientcursor already in use? driver problem?", false);
+ }
+ _c->_pinValue += 100;
+ }
+ }
+ ~Pointer() {
+ release();
+ }
+ };
+
+ /*const*/ CursorId cursorid;
+ string ns;
+ auto_ptr<CoveredIndexMatcher> matcher;
+ auto_ptr<Cursor> c;
+ int pos; // # objects into the cursor so far
+ BSONObj query;
+
+ ClientCursor() : _idleAgeMillis(0), _pinValue(0), _doingDeletes(false), pos(0) {
+ recursive_boostlock lock(ccmutex);
+ cursorid = allocCursorId_inlock();
+ clientCursorsById.insert( make_pair(cursorid, this) );
+ }
+ ~ClientCursor();
+
+ DiskLoc lastLoc() const {
+ return _lastLoc;
+ }
+
+ auto_ptr< FieldMatcher > filter; // which fields query wants returned
+ Message originalMessage; // this is effectively an auto ptr for data the matcher points to
+
+ /* Get rid of cursors for namespaces that begin with nsprefix.
+ Used by drop, deleteIndexes, dropDatabase.
+ */
+ static void invalidate(const char *nsPrefix);
+
+ /**
+ * do a dbtemprelease
+ * note: caller should check matcher.docMatcher().atomic() first and not yield if atomic -
+ * we don't do so here, as this->matcher (above) is only initialized for true queries/getmore.
+ * (ie not set for remote/update)
+ * @return if the cursor is still valid.
+ * if false is returned, then this ClientCursor should be considered deleted
+ */
+ bool yield();
+ private:
+ void setLastLoc_inlock(DiskLoc);
+
+ static ClientCursor* find_inlock(CursorId id, bool warn = true) {
+ CCById::iterator it = clientCursorsById.find(id);
+ if ( it == clientCursorsById.end() ) {
+ if ( warn )
+ OCCASIONALLY out() << "ClientCursor::find(): cursor not found in map " << id << " (ok after a drop)\n";
+ return 0;
+ }
+ return it->second;
+ }
+ public:
+ static ClientCursor* find(CursorId id, bool warn = true) {
+ recursive_boostlock lock(ccmutex);
+ ClientCursor *c = find_inlock(id, warn);
+ // if this asserts, your code was not thread safe - you either need to set no timeout
+ // for the cursor or keep a ClientCursor::Pointer in scope for it.
+ massert( 12521, "internal error: use of an unlocked ClientCursor", c->_pinValue );
+ return c;
+ }
+
+ static bool erase(CursorId id) {
+ recursive_boostlock lock(ccmutex);
+ ClientCursor *cc = find_inlock(id);
+ if ( cc ) {
+ assert( cc->_pinValue < 100 ); // you can't still have an active ClientCursor::Pointer
+ delete cc;
+ return true;
+ }
+ return false;
+ }
+
+ /* call when cursor's location changes so that we can update the
+ cursorsbylocation map. if you are locked and internally iterating, only
+ need to call when you are ready to "unlock".
+ */
+ void updateLocation();
+
+ void cleanupByLocation(DiskLoc loc);
+
+ void mayUpgradeStorage() {
+ /* if ( !ids_.get() )
+ return;
+ stringstream ss;
+ ss << ns << "." << cursorid;
+ ids_->mayUpgradeStorage( ss.str() );*/
+ }
+
+ /**
+ * @param millis amount of idle time elapsed since the last call
+ */
+ bool shouldTimeout( unsigned millis ){
+ _idleAgeMillis += millis;
+ return _idleAgeMillis > 600000 && _pinValue == 0;
+ }
+
+ unsigned idleTime(){
+ return _idleAgeMillis;
+ }
+
+ static void idleTimeReport(unsigned millis);
+
+ // cursors normally time out after an inactivity period to prevent excess memory use
+ // setting this prevents timeout of the cursor in question.
+ void noTimeout() {
+ _pinValue++;
+ }
+
+ void setDoingDeletes( bool doingDeletes ){
+ _doingDeletes = doingDeletes;
+ }
+
+ static unsigned byLocSize(); // just for diagnostics
+
+ static void informAboutToDeleteBucket(const DiskLoc& b);
+ static void aboutToDelete(const DiskLoc& dl);
+ };
+
+
+} // namespace mongo
diff --git a/db/cloner.cpp b/db/cloner.cpp
new file mode 100644
index 0000000..862f37c
--- /dev/null
+++ b/db/cloner.cpp
@@ -0,0 +1,724 @@
+// cloner.cpp - copy a database (export/import basically)
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "stdafx.h"
+#include "pdfile.h"
+#include "../client/dbclient.h"
+#include "../util/builder.h"
+#include "jsobj.h"
+#include "query.h"
+#include "commands.h"
+#include "db.h"
+#include "instance.h"
+#include "repl.h"
+
+namespace mongo {
+
+ void ensureHaveIdIndex(const char *ns);
+
+ bool replAuthenticate(DBClientConnection *);
+
+ class Cloner: boost::noncopyable {
+ auto_ptr< DBClientWithCommands > conn;
+ void copy(const char *from_ns, const char *to_ns, bool isindex, bool logForRepl,
+ bool masterSameProcess, bool slaveOk, Query q = Query());
+ void replayOpLog( DBClientCursor *c, const BSONObj &query );
+ public:
+ Cloner() { }
+
+ /* slaveOk - if true it is ok if the source of the data is !ismaster.
+ useReplAuth - use the credentials we normally use as a replication slave for the cloning
+ snapshot - use $snapshot mode for copying collections. note this should not be used when it isn't required, as it will be slower.
+ for example repairDatabase need not use it.
+ */
+ bool go(const char *masterHost, string& errmsg, const string& fromdb, bool logForRepl, bool slaveOk, bool useReplAuth, bool snapshot);
+ bool startCloneCollection( const char *fromhost, const char *ns, const BSONObj &query, string& errmsg, bool logForRepl, bool copyIndexes, int logSizeMb, long long &cursorId );
+ bool finishCloneCollection( const char *fromhost, const char *ns, const BSONObj &query, long long cursorId, string &errmsg );
+ };
+
+ /* for index info object:
+ { "name" : "name_1" , "ns" : "foo.index3" , "key" : { "name" : 1.0 } }
+ we need to fix up the value in the "ns" parameter so that the name prefix is correct on a
+ copy to a new name.
+ */
+ BSONObj fixindex(BSONObj o) {
+ BSONObjBuilder b;
+ BSONObjIterator i(o);
+ while ( i.moreWithEOO() ) {
+ BSONElement e = i.next();
+ if ( e.eoo() )
+ break;
+ if ( string("ns") == e.fieldName() ) {
+ uassert( 10024 , "bad ns field for index during dbcopy", e.type() == String);
+ const char *p = strchr(e.valuestr(), '.');
+ uassert( 10025 , "bad ns field for index during dbcopy [2]", p);
+ string newname = cc().database()->name + p;
+ b.append("ns", newname);
+ }
+ else
+ b.append(e);
+ }
+ BSONObj res= b.obj();
+
+ /* if( mod ) {
+ out() << "before: " << o.toString() << endl;
+ o.dump();
+ out() << "after: " << res.toString() << endl;
+ res.dump();
+ }*/
+
+ return res;
+ }
+
+ /* copy the specified collection
+ isindex - if true, this is system.indexes collection, in which we do some transformation when copying.
+ */
+ void Cloner::copy(const char *from_collection, const char *to_collection, bool isindex, bool logForRepl, bool masterSameProcess, bool slaveOk, Query query) {
+ auto_ptr<DBClientCursor> c;
+ {
+ dbtemprelease r;
+ c = conn->query( from_collection, query, 0, 0, 0, QueryOption_NoCursorTimeout | ( slaveOk ? QueryOption_SlaveOk : 0 ) );
+ }
+
+ list<BSONObj> storedForLater;
+
+ assert( c.get() );
+ long long n = 0;
+ time_t saveLast = time( 0 );
+ while ( 1 ) {
+ {
+ dbtemprelease r;
+ if ( !c->more() )
+ break;
+ }
+ BSONObj tmp = c->next();
+
+ /* assure object is valid. note this will slow us down a little. */
+ if ( !tmp.valid() ) {
+ stringstream ss;
+ ss << "skipping corrupt object from " << from_collection;
+ BSONElement e = tmp.firstElement();
+ try {
+ e.validate();
+ ss << " firstElement: " << e;
+ }
+ catch( ... ){
+ ss << " firstElement corrupt";
+ }
+ out() << ss.str() << endl;
+ continue;
+ }
+
+ ++n;
+
+ BSONObj js = tmp;
+ if ( isindex ) {
+ assert( strstr(from_collection, "system.indexes") );
+ js = fixindex(tmp);
+ storedForLater.push_back( js.getOwned() );
+ continue;
+ }
+
+ try {
+ theDataFileMgr.insert(to_collection, js);
+ if ( logForRepl )
+ logOp("i", to_collection, js);
+ }
+ catch( UserException& e ) {
+ log() << "warning: exception cloning object in " << from_collection << ' ' << e.what() << " obj:" << js.toString() << '\n';
+ }
+
+ RARELY if ( time( 0 ) - saveLast > 60 ) {
+ log() << n << " objects cloned so far from collection " << from_collection << endl;
+ saveLast = time( 0 );
+ }
+ }
+
+ if ( storedForLater.size() ){
+ for ( list<BSONObj>::iterator i = storedForLater.begin(); i!=storedForLater.end(); i++ ){
+ BSONObj js = *i;
+ try {
+ theDataFileMgr.insert(to_collection, js);
+ if ( logForRepl )
+ logOp("i", to_collection, js);
+ }
+ catch( UserException& e ) {
+ log() << "warning: exception cloning object in " << from_collection << ' ' << e.what() << " obj:" << js.toString() << '\n';
+ }
+ }
+ }
+ }
+
+ bool Cloner::go(const char *masterHost, string& errmsg, const string& fromdb, bool logForRepl, bool slaveOk, bool useReplAuth, bool snapshot) {
+
+ massert( 10289 , "useReplAuth is not written to replication log", !useReplAuth || !logForRepl );
+
+ string todb = cc().database()->name;
+ stringstream a,b;
+ a << "localhost:" << cmdLine.port;
+ b << "127.0.0.1:" << cmdLine.port;
+ bool masterSameProcess = ( a.str() == masterHost || b.str() == masterHost );
+ if ( masterSameProcess ) {
+ if ( fromdb == todb && cc().database()->path == dbpath ) {
+ // guard against an "infinite" loop
+ /* if you are replicating, the local.sources config may be wrong if you get this */
+ errmsg = "can't clone from self (localhost).";
+ return false;
+ }
+ }
+ /* todo: we can put these releases inside dbclient or a dbclient specialization.
+ or just wait until we get rid of global lock anyway.
+ */
+ string ns = fromdb + ".system.namespaces";
+ list<BSONObj> toClone;
+ {
+ dbtemprelease r;
+
+ auto_ptr<DBClientCursor> c;
+ {
+ if ( !masterSameProcess ) {
+ auto_ptr< DBClientConnection > c( new DBClientConnection() );
+ if ( !c->connect( masterHost, errmsg ) )
+ return false;
+ if( !replAuthenticate(c.get()) )
+ return false;
+
+ conn = c;
+ } else {
+ conn.reset( new DBDirectClient() );
+ }
+ c = conn->query( ns.c_str(), BSONObj(), 0, 0, 0, slaveOk ? QueryOption_SlaveOk : 0 );
+ }
+
+ if ( c.get() == 0 ) {
+ errmsg = "query failed " + ns;
+ return false;
+ }
+
+ while ( c->more() ){
+ BSONObj collection = c->next();
+
+ log(2) << "\t cloner got " << collection << endl;
+
+ BSONElement e = collection.findElement("name");
+ if ( e.eoo() ) {
+ string s = "bad system.namespaces object " + collection.toString();
+ massert( 10290 , s.c_str(), false);
+ }
+ assert( !e.eoo() );
+ assert( e.type() == String );
+ const char *from_name = e.valuestr();
+
+ if( strstr(from_name, ".system.") ) {
+ /* system.users is cloned -- but nothing else from system. */
+ if( legalClientSystemNS( from_name , true ) == 0 ){
+ log(2) << "\t\t not cloning because system collection" << endl;
+ continue;
+ }
+ }
+ else if( strchr(from_name, '$') ) {
+ // don't clone index namespaces -- we take care of those separately below.
+ log(2) << "\t\t not cloning because has $ " << endl;
+ continue;
+ }
+
+ toClone.push_back( collection.getOwned() );
+ }
+ }
+
+ for ( list<BSONObj>::iterator i=toClone.begin(); i != toClone.end(); i++ ){
+ {
+ dbtemprelease r;
+ }
+ BSONObj collection = *i;
+ log(2) << " really will clone: " << collection << endl;
+ const char * from_name = collection["name"].valuestr();
+ BSONObj options = collection.getObjectField("options");
+
+ /* change name "<fromdb>.collection" -> <todb>.collection */
+ const char *p = strchr(from_name, '.');
+ assert(p);
+ string to_name = todb + p;
+
+ {
+ string err;
+ const char *toname = to_name.c_str();
+ userCreateNS(toname, options, err, logForRepl);
+ }
+ log(1) << "\t\t cloning " << from_name << " -> " << to_name << endl;
+ Query q;
+ if( snapshot )
+ q.snapshot();
+ copy(from_name, to_name.c_str(), false, logForRepl, masterSameProcess, slaveOk, q);
+ }
+
+ // now build the indexes
+ string system_indexes_from = fromdb + ".system.indexes";
+ string system_indexes_to = todb + ".system.indexes";
+ /* [dm]: is the ID index sometimes not called "_id_"? There is other code in the system that looks for a "_id" prefix
+ rather than this exact value. we should standardize. OR, remove names - which is in the bugdb. Anyway, this
+ is dubious here at the moment.
+ */
+ copy(system_indexes_from.c_str(), system_indexes_to.c_str(), true, logForRepl, masterSameProcess, slaveOk, BSON( "name" << NE << "_id_" ) );
+
+ return true;
+ }
+
+ bool Cloner::startCloneCollection( const char *fromhost, const char *ns, const BSONObj &query, string &errmsg, bool logForRepl, bool copyIndexes, int logSizeMb, long long &cursorId ) {
+ char db[256];
+ nsToDatabase( ns, db );
+
+ NamespaceDetails *nsd = nsdetails( ns );
+ if ( nsd ){
+ /** note: it's ok to clone into a collection, but only if the range you're copying
+ doesn't exist on this server */
+ string err;
+ if ( runCount( ns , BSON( "query" << query ) , err ) > 0 ){
+ log() << "WARNING: data already exists for: " << ns << " in range : " << query << " deleting..." << endl;
+ deleteObjects( ns , query , false , logForRepl , false );
+ }
+ }
+
+ {
+ dbtemprelease r;
+ auto_ptr< DBClientConnection > c( new DBClientConnection() );
+ if ( !c->connect( fromhost, errmsg ) )
+ return false;
+ if( !replAuthenticate(c.get()) )
+ return false;
+ conn = c;
+
+ // Start temporary op log
+ BSONObjBuilder cmdSpec;
+ cmdSpec << "logCollection" << ns << "start" << 1;
+ if ( logSizeMb != INT_MIN )
+ cmdSpec << "logSizeMb" << logSizeMb;
+ BSONObj info;
+ if ( !conn->runCommand( db, cmdSpec.done(), info ) ) {
+ errmsg = "logCollection failed: " + (string)info;
+ return false;
+ }
+ }
+
+ if ( ! nsd ) {
+ BSONObj spec = conn->findOne( string( db ) + ".system.namespaces", BSON( "name" << ns ) );
+ if ( !userCreateNS( ns, spec.getObjectField( "options" ), errmsg, true ) )
+ return false;
+ }
+
+ copy( ns, ns, false, logForRepl, false, false, query );
+
+ if ( copyIndexes ) {
+ string indexNs = string( db ) + ".system.indexes";
+ copy( indexNs.c_str(), indexNs.c_str(), true, logForRepl, false, false, BSON( "ns" << ns << "name" << NE << "_id_" ) );
+ }
+
+ auto_ptr< DBClientCursor > c;
+ {
+ dbtemprelease r;
+ string logNS = "local.temp.oplog." + string( ns );
+ c = conn->query( logNS.c_str(), Query(), 0, 0, 0, QueryOption_CursorTailable );
+ }
+ if ( c->more() ) {
+ replayOpLog( c.get(), query );
+ cursorId = c->getCursorId();
+ massert( 10291 , "Expected valid tailing cursor", cursorId != 0 );
+ } else {
+ massert( 10292 , "Did not expect valid cursor for empty query result", c->getCursorId() == 0 );
+ cursorId = 0;
+ }
+ c->decouple();
+ return true;
+ }
+
+ void Cloner::replayOpLog( DBClientCursor *c, const BSONObj &query ) {
+ Matcher matcher( query );
+ while( 1 ) {
+ BSONObj op;
+ {
+ dbtemprelease t;
+ if ( !c->more() )
+ break;
+ op = c->next();
+ }
+ // For sharding v1.0, we don't allow shard key updates -- so just
+ // filter each insert by value.
+ if ( op.getStringField( "op" )[ 0 ] != 'i' || matcher.matches( op.getObjectField( "o" ) ) )
+ ReplSource::applyOperation( op );
+ }
+ }
+
+ bool Cloner::finishCloneCollection( const char *fromhost, const char *ns, const BSONObj &query, long long cursorId, string &errmsg ) {
+ char db[256];
+ nsToDatabase( ns, db );
+
+ auto_ptr< DBClientCursor > cur;
+ {
+ dbtemprelease r;
+ auto_ptr< DBClientConnection > c( new DBClientConnection() );
+ if ( !c->connect( fromhost, errmsg ) )
+ return false;
+ if( !replAuthenticate(c.get()) )
+ return false;
+ conn = c;
+ string logNS = "local.temp.oplog." + string( ns );
+ if ( cursorId != 0 )
+ cur = conn->getMore( logNS.c_str(), cursorId );
+ else
+ cur = conn->query( logNS.c_str(), Query() );
+ }
+ replayOpLog( cur.get(), query );
+ {
+ dbtemprelease t;
+ BSONObj info;
+ if ( !conn->runCommand( db, BSON( "logCollection" << ns << "validateComplete" << 1 ), info ) ) {
+ errmsg = "logCollection failed: " + (string)info;
+ return false;
+ }
+ }
+ return true;
+ }
+
+ /* slaveOk - if true it is ok if the source of the data is !ismaster.
+ useReplAuth - use the credentials we normally use as a replication slave for the cloning
+ snapshot - use $snapshot mode for copying collections. note this should not be used when it isn't required, as it will be slower.
+ for example repairDatabase need not use it.
+ */
+ bool cloneFrom(const char *masterHost, string& errmsg, const string& fromdb, bool logForReplication,
+ bool slaveOk, bool useReplAuth, bool snapshot)
+ {
+ Cloner c;
+ return c.go(masterHost, errmsg, fromdb, logForReplication, slaveOk, useReplAuth, snapshot);
+ }
+
+ /* Usage:
+ mydb.$cmd.findOne( { clone: "fromhost" } );
+ */
+ class CmdClone : public Command {
+ public:
+ virtual bool slaveOk() {
+ return false;
+ }
+ virtual void help( stringstream &help ) const {
+ help << "clone this database from an instance of the db on another host\n";
+ help << "example: { clone : \"host13\" }";
+ }
+ CmdClone() : Command("clone") { }
+ virtual bool run(const char *ns, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ string from = cmdObj.getStringField("clone");
+ if ( from.empty() )
+ return false;
+ /* replication note: we must logOp() not the command, but the cloned data -- if the slave
+ were to clone it would get a different point-in-time and not match.
+ */
+ return cloneFrom(from.c_str(), errmsg, cc().database()->name,
+ /*logForReplication=*/!fromRepl, /*slaveok*/false, /*usereplauth*/false, /*snapshot*/true);
+ }
+ } cmdclone;
+
+ class CmdCloneCollection : public Command {
+ public:
+ virtual bool slaveOk() {
+ return false;
+ }
+ CmdCloneCollection() : Command("cloneCollection") { }
+ virtual void help( stringstream &help ) const {
+ help << " example: { cloneCollection: <collection ns>, from: <hostname>, query: <query> }";
+ }
+ virtual bool run(const char *ns, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ string fromhost = cmdObj.getStringField("from");
+ if ( fromhost.empty() ) {
+ errmsg = "missing from spec";
+ return false;
+ }
+ string collection = cmdObj.getStringField("cloneCollection");
+ if ( collection.empty() ) {
+ errmsg = "missing cloneCollection spec";
+ return false;
+ }
+ BSONObj query = cmdObj.getObjectField("query");
+ if ( query.isEmpty() )
+ query = BSONObj();
+ BSONElement copyIndexesSpec = cmdObj.getField("copyindexes");
+ bool copyIndexes = copyIndexesSpec.isBoolean() ? copyIndexesSpec.boolean() : true;
+ // Will not be used if doesn't exist.
+ int logSizeMb = cmdObj.getIntField( "logSizeMb" );
+
+ /* replication note: we must logOp() not the command, but the cloned data -- if the slave
+ were to clone it would get a different point-in-time and not match.
+ */
+ setClient( collection.c_str() );
+
+ log() << "cloneCollection. db:" << ns << " collection:" << collection << " from: " << fromhost << " query: " << query << " logSizeMb: " << logSizeMb << ( copyIndexes ? "" : ", not copying indexes" ) << endl;
+
+ Cloner c;
+ long long cursorId;
+ if ( !c.startCloneCollection( fromhost.c_str(), collection.c_str(), query, errmsg, !fromRepl, copyIndexes, logSizeMb, cursorId ) )
+ return false;
+ return c.finishCloneCollection( fromhost.c_str(), collection.c_str(), query, cursorId, errmsg);
+ }
+ } cmdclonecollection;
+
+ class CmdStartCloneCollection : public Command {
+ public:
+ virtual bool slaveOk() {
+ return false;
+ }
+ CmdStartCloneCollection() : Command("startCloneCollection") { }
+ virtual void help( stringstream &help ) const {
+ help << " example: { startCloneCollection: <collection ns>, from: <hostname>, query: <query> }";
+ help << ", returned object includes a finishToken field, the value of which may be passed to the finishCloneCollection command";
+ }
+ virtual bool run(const char *ns, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ string fromhost = cmdObj.getStringField("from");
+ if ( fromhost.empty() ) {
+ errmsg = "missing from spec";
+ return false;
+ }
+ string collection = cmdObj.getStringField("startCloneCollection");
+ if ( collection.empty() ) {
+ errmsg = "missing startCloneCollection spec";
+ return false;
+ }
+ BSONObj query = cmdObj.getObjectField("query");
+ if ( query.isEmpty() )
+ query = BSONObj();
+ BSONElement copyIndexesSpec = cmdObj.getField("copyindexes");
+ bool copyIndexes = copyIndexesSpec.isBoolean() ? copyIndexesSpec.boolean() : true;
+ // Will not be used if doesn't exist.
+ int logSizeMb = cmdObj.getIntField( "logSizeMb" );
+
+ /* replication note: we must logOp() not the command, but the cloned data -- if the slave
+ were to clone it would get a different point-in-time and not match.
+ */
+ setClient( collection.c_str() );
+
+ log() << "startCloneCollection. db:" << ns << " collection:" << collection << " from: " << fromhost << " query: " << query << endl;
+
+ Cloner c;
+ long long cursorId;
+ bool res = c.startCloneCollection( fromhost.c_str(), collection.c_str(), query, errmsg, !fromRepl, copyIndexes, logSizeMb, cursorId );
+
+ if ( res ) {
+ BSONObjBuilder b;
+ b << "fromhost" << fromhost;
+ b << "collection" << collection;
+ b << "query" << query;
+ b.appendDate( "cursorId", cursorId );
+ BSONObj token = b.done();
+ result << "finishToken" << token;
+ }
+ return res;
+ }
+ } cmdstartclonecollection;
+
+ class CmdFinishCloneCollection : public Command {
+ public:
+ virtual bool slaveOk() {
+ return false;
+ }
+ CmdFinishCloneCollection() : Command("finishCloneCollection") { }
+ virtual void help( stringstream &help ) const {
+ help << " example: { finishCloneCollection: <finishToken> }";
+ }
+ virtual bool run(const char *ns, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ BSONObj fromToken = cmdObj.getObjectField("finishCloneCollection");
+ if ( fromToken.isEmpty() ) {
+ errmsg = "missing finishCloneCollection finishToken spec";
+ return false;
+ }
+ string fromhost = fromToken.getStringField( "fromhost" );
+ if ( fromhost.empty() ) {
+ errmsg = "missing fromhost spec";
+ return false;
+ }
+ string collection = fromToken.getStringField("collection");
+ if ( collection.empty() ) {
+ errmsg = "missing collection spec";
+ return false;
+ }
+ BSONObj query = fromToken.getObjectField("query");
+ if ( query.isEmpty() ) {
+ query = BSONObj();
+ }
+ long long cursorId = 0;
+ BSONElement cursorIdToken = fromToken.getField( "cursorId" );
+ if ( cursorIdToken.type() == Date ) {
+ cursorId = cursorIdToken._numberLong();
+ }
+
+ setClient( collection.c_str() );
+
+ log() << "finishCloneCollection. db:" << ns << " collection:" << collection << " from: " << fromhost << " query: " << query << endl;
+
+ Cloner c;
+ return c.finishCloneCollection( fromhost.c_str(), collection.c_str(), query, cursorId, errmsg );
+ }
+ } cmdfinishclonecollection;
+
+ /* Usage:
+ admindb.$cmd.findOne( { copydb: 1, fromhost: <hostname>, fromdb: <db>, todb: <db> } );
+ */
+ class CmdCopyDb : public Command {
+ public:
+ CmdCopyDb() : Command("copydb") { }
+ virtual bool adminOnly() {
+ return true;
+ }
+ virtual bool slaveOk() {
+ return false;
+ }
+ virtual void help( stringstream &help ) const {
+ help << "copy a database from antoher host to this host\n";
+ help << "usage: {copydb: 1, fromhost: <hostname>, fromdb: <db>, todb: <db>}";
+ }
+ virtual bool run(const char *ns, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ string fromhost = cmdObj.getStringField("fromhost");
+ if ( fromhost.empty() ) {
+ /* copy from self */
+ stringstream ss;
+ ss << "localhost:" << cmdLine.port;
+ fromhost = ss.str();
+ }
+ string fromdb = cmdObj.getStringField("fromdb");
+ string todb = cmdObj.getStringField("todb");
+ if ( fromhost.empty() || todb.empty() || fromdb.empty() ) {
+ errmsg = "parms missing - {copydb: 1, fromhost: <hostname>, fromdb: <db>, todb: <db>}";
+ return false;
+ }
+ setClient(todb.c_str());
+ bool res = cloneFrom(fromhost.c_str(), errmsg, fromdb, /*logForReplication=*/!fromRepl, /*slaveok*/false, /*replauth*/false, /*snapshot*/true);
+ cc().clearns();
+ return res;
+ }
+ } cmdcopydb;
+
+ class CmdRenameCollection : public Command {
+ public:
+ CmdRenameCollection() : Command( "renameCollection" ) {}
+ virtual bool adminOnly() {
+ return true;
+ }
+ virtual bool slaveOk() {
+ return false;
+ }
+ virtual bool logTheOp() {
+ return true; // can't log steps when doing fast rename within a db, so always log the op rather than individual steps comprising it.
+ }
+ virtual void help( stringstream &help ) const {
+ help << " example: { renameCollection: foo.a, to: bar.b }";
+ }
+ virtual bool run(const char *ns, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ string source = cmdObj.getStringField( name.c_str() );
+ string target = cmdObj.getStringField( "to" );
+ if ( source.empty() || target.empty() ) {
+ errmsg = "invalid command syntax";
+ return false;
+ }
+
+ setClient( source.c_str() );
+ NamespaceDetails *nsd = nsdetails( source.c_str() );
+ uassert( 10026 , "source namespace does not exist", nsd );
+ bool capped = nsd->capped;
+ long long size = 0;
+ if ( capped )
+ for( DiskLoc i = nsd->firstExtent; !i.isNull(); i = i.ext()->xnext )
+ size += i.ext()->length;
+
+ setClient( target.c_str() );
+
+ if ( nsdetails( target.c_str() ) ){
+ uassert( 10027 , "target namespace exists", cmdObj["dropTarget"].trueValue() );
+ BSONObjBuilder bb( result.subobjStart( "dropTarget" ) );
+ dropCollection( target , errmsg , bb );
+ bb.done();
+ if ( errmsg.size() > 0 )
+ return false;
+ }
+
+ {
+ char from[256];
+ nsToDatabase( source.c_str(), from );
+ char to[256];
+ nsToDatabase( target.c_str(), to );
+ if ( strcmp( from, to ) == 0 ) {
+ renameNamespace( source.c_str(), target.c_str() );
+ return true;
+ }
+ }
+
+ BSONObjBuilder spec;
+ if ( capped ) {
+ spec.appendBool( "capped", true );
+ spec.append( "size", double( size ) );
+ }
+ if ( !userCreateNS( target.c_str(), spec.done(), errmsg, false ) )
+ return false;
+
+ auto_ptr< DBClientCursor > c;
+ DBDirectClient bridge;
+
+ {
+ c = bridge.query( source, BSONObj() );
+ }
+ while( 1 ) {
+ {
+ if ( !c->more() )
+ break;
+ }
+ BSONObj o = c->next();
+ theDataFileMgr.insert( target.c_str(), o );
+ }
+
+ char cl[256];
+ nsToDatabase( source.c_str(), cl );
+ string sourceIndexes = string( cl ) + ".system.indexes";
+ nsToDatabase( target.c_str(), cl );
+ string targetIndexes = string( cl ) + ".system.indexes";
+ {
+ c = bridge.query( sourceIndexes, QUERY( "ns" << source ) );
+ }
+ while( 1 ) {
+ {
+ if ( !c->more() )
+ break;
+ }
+ BSONObj o = c->next();
+ BSONObjBuilder b;
+ BSONObjIterator i( o );
+ while( i.moreWithEOO() ) {
+ BSONElement e = i.next();
+ if ( e.eoo() )
+ break;
+ if ( strcmp( e.fieldName(), "ns" ) == 0 ) {
+ b.append( "ns", target );
+ } else {
+ b.append( e );
+ }
+ }
+ BSONObj n = b.done();
+ theDataFileMgr.insert( targetIndexes.c_str(), n );
+ }
+
+ setClient( source.c_str() );
+ dropCollection( source, errmsg, result );
+ return true;
+ }
+ } cmdrenamecollection;
+
+} // namespace mongo
diff --git a/db/cmdline.h b/db/cmdline.h
new file mode 100644
index 0000000..b071259
--- /dev/null
+++ b/db/cmdline.h
@@ -0,0 +1,57 @@
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+namespace mongo {
+
+ /* command line options
+ */
+ /* concurrency: OK/READ */
+ struct CmdLine {
+ int port; // --port
+
+ string source; // --source
+ string only; // --only
+
+ bool quiet; // --quiet
+ bool notablescan; // --notablescan
+ bool prealloc; // --noprealloc
+ bool smallfiles; // --smallfiles
+
+ bool quota; // --quota
+ int quotaFiles; // --quotaFiles
+ bool cpu; // --cpu show cpu time periodically
+
+ long long oplogSize; // --oplogSize
+ int defaultProfile; // --profile
+ int slowMS; // --time in ms that is "slow"
+
+ enum {
+ DefaultDBPort = 27017,
+ ConfigServerPort = 27019,
+ ShardServerPort = 27018
+ };
+
+ CmdLine() :
+ port(DefaultDBPort), quiet(false), notablescan(false), prealloc(true), smallfiles(false),
+ quota(false), quotaFiles(8), cpu(false), oplogSize(0), defaultProfile(0), slowMS(100)
+ { }
+
+ };
+
+ extern CmdLine cmdLine;
+}
diff --git a/db/commands.cpp b/db/commands.cpp
new file mode 100644
index 0000000..3078ea1
--- /dev/null
+++ b/db/commands.cpp
@@ -0,0 +1,102 @@
+/* commands.cpp
+ db "commands" (sent via db.$cmd.findOne(...))
+ */
+
+/* Copyright 2009 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "stdafx.h"
+#include "jsobj.h"
+#include "commands.h"
+
+namespace mongo {
+
+ map<string,Command*> * Command::_commands;
+
+ Command::Command(const char *_name) : name(_name) {
+ // register ourself.
+ if ( _commands == 0 )
+ _commands = new map<string,Command*>;
+ (*_commands)[name] = this;
+ }
+
+ void Command::help( stringstream& help ) const {
+ help << "no help defined";
+ }
+
+ bool Command::runAgainstRegistered(const char *ns, BSONObj& jsobj, BSONObjBuilder& anObjBuilder) {
+ const char *p = strchr(ns, '.');
+ if ( !p ) return false;
+ if ( strcmp(p, ".$cmd") != 0 ) return false;
+
+ bool ok = false;
+ bool valid = false;
+
+ BSONElement e;
+ e = jsobj.firstElement();
+
+ map<string,Command*>::iterator i;
+
+ if ( e.eoo() )
+ ;
+ /* check for properly registered command objects. Note that all the commands below should be
+ migrated over to the command object format.
+ */
+ else if ( (i = _commands->find(e.fieldName())) != _commands->end() ) {
+ valid = true;
+ string errmsg;
+ Command *c = i->second;
+ if ( c->adminOnly() && strncmp(ns, "admin", 5) != 0 ) {
+ ok = false;
+ errmsg = "access denied";
+ }
+ else if ( jsobj.getBoolField( "help" ) ){
+ stringstream help;
+ help << "help for: " << e.fieldName() << " ";
+ c->help( help );
+ anObjBuilder.append( "help" , help.str() );
+ }
+ else {
+ ok = c->run(ns, jsobj, errmsg, anObjBuilder, false);
+ }
+
+ anObjBuilder.append( "ok" , ok ? 1.0 : 0.0 );
+
+ if ( !ok ) {
+ anObjBuilder.append("errmsg", errmsg);
+ uassert_nothrow(errmsg.c_str());
+ }
+ return true;
+ }
+
+ return false;
+ }
+
+ Command* Command::findCommand( const string& name ){
+ map<string,Command*>::iterator i = _commands->find( name );
+ if ( i == _commands->end() )
+ return 0;
+ return i->second;
+ }
+
+
+ bool Command::readOnly( const string& name ){
+ Command * c = findCommand( name );
+ if ( ! c )
+ return false;
+ return c->readOnly();
+ }
+
+} // namespace mongo
diff --git a/db/commands.h b/db/commands.h
new file mode 100644
index 0000000..20fb98c
--- /dev/null
+++ b/db/commands.h
@@ -0,0 +1,114 @@
+// commands.h
+
+/* Copyright 2009 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "../stdafx.h"
+#include "jsobj.h"
+
+namespace mongo {
+
+ class BSONObj;
+ class BSONObjBuilder;
+ class BufBuilder;
+
+// db "commands" (sent via db.$cmd.findOne(...))
+// subclass to make a command.
+ class Command {
+ public:
+ string name;
+
+ /* run the given command
+ implement this...
+
+ fromRepl - command is being invoked as part of replication syncing. In this situation you
+ normally do not want to log the command to the local oplog.
+
+ return value is true if succeeded. if false, set errmsg text.
+ */
+ virtual bool run(const char *ns, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) = 0;
+
+ /* true if a read lock is sufficient
+ note: logTheTop() MUST be false if readOnly
+ */
+ virtual bool readOnly() {
+ return false;
+ }
+
+ /* Return true if only the admin ns has privileges to run this command. */
+ virtual bool adminOnly() {
+ return false;
+ }
+
+ /* Like adminOnly, but even stricter: we must either be authenticated for admin db,
+ or, if running without auth, on the local interface.
+
+ When localHostOnlyIfNoAuth() is true, adminOnly() must also be true.
+ */
+ virtual bool localHostOnlyIfNoAuth(const BSONObj& cmdObj) { return false; }
+
+ /* Return true if slaves of a replication pair are allowed to execute the command
+ (the command directly from a client -- if fromRepl, always allowed).
+ */
+ virtual bool slaveOk() = 0;
+
+        /* Return true if the client can force a command to be run on a slave by
+           turning on the 'slaveok' option in the command query.
+        */
+ virtual bool slaveOverrideOk() {
+ return false;
+ }
+
+        /* Override and return true if you want the operation logged (logOp()) to the replication log.
+ (not done if fromRepl of course)
+
+ Note if run() returns false, we do NOT log.
+ */
+ virtual bool logTheOp() {
+ return false;
+ }
+
+ virtual void help( stringstream& help ) const;
+
+ /* Return true if authentication and security applies to the commands. Some commands
+ (e.g., getnonce, authenticate) can be done by anyone even unauthorized.
+ */
+ virtual bool requiresAuth() { return true; }
+
+ Command(const char *_name);
+ virtual ~Command() {}
+
+ protected:
+ BSONObj getQuery( const BSONObj& cmdObj ){
+ if ( cmdObj["query"].type() == Object )
+ return cmdObj["query"].embeddedObject();
+ if ( cmdObj["q"].type() == Object )
+ return cmdObj["q"].embeddedObject();
+ return BSONObj();
+ }
+
+ static map<string,Command*> * _commands;
+
+ public:
+ static bool runAgainstRegistered(const char *ns, BSONObj& jsobj, BSONObjBuilder& anObjBuilder);
+ static bool readOnly( const string& name );
+ static Command * findCommand( const string& name );
+ };
+
+ bool _runCommands(const char *ns, BSONObj& jsobj, BufBuilder &b, BSONObjBuilder& anObjBuilder, bool fromRepl, int queryOptions);
+
+} // namespace mongo
diff --git a/db/concurrency.h b/db/concurrency.h
new file mode 100644
index 0000000..daf09b6
--- /dev/null
+++ b/db/concurrency.h
@@ -0,0 +1,272 @@
+/* concurrency.h
+
+ mongod concurrency rules & notes will be placed here.
+
+ Mutex hierarchy (1 = "leaf")
+ name level
+ Logstream::mutex 1
+ ClientCursor::ccmutex 2
+ dblock 3
+
+ End func name with _inlock to indicate "caller must lock before calling".
+*/
+
+#pragma once
+
+#if BOOST_VERSION >= 103500
+#include <boost/thread/shared_mutex.hpp>
+#undef assert
+#define assert xassert
+#else
+#warning built with boost version 1.34 or older limited concurrency
+#endif
+
+namespace mongo {
+
+ /* mutex time stats */
+ class MutexInfo {
+ unsigned long long start, enter, timeLocked; // all in microseconds
+ int locked;
+
+ public:
+ MutexInfo() : locked(0) {
+ start = curTimeMicros64();
+ }
+ void entered() {
+ if ( locked == 0 )
+ enter = curTimeMicros64();
+ locked++;
+ assert( locked >= 1 );
+ }
+ void leaving() {
+ locked--;
+ assert( locked >= 0 );
+ if ( locked == 0 )
+ timeLocked += curTimeMicros64() - enter;
+ }
+ int isLocked() const {
+ return locked;
+ }
+ void getTimingInfo(unsigned long long &s, unsigned long long &tl) const {
+ s = start;
+ tl = timeLocked;
+ }
+ };
+
+#if BOOST_VERSION >= 103500
+//#if 0
+ class MongoMutex {
+ MutexInfo _minfo;
+ boost::shared_mutex _m;
+ ThreadLocalValue<int> _state;
+
+        /* we use a separate TLS value for releasedEarly - that is ok since
+           on our normal/common code path we never even touch it.
+        */
+ ThreadLocalValue<bool> _releasedEarly;
+ public:
+ /**
+ * @return
+ * > 0 write lock
+ * = 0 no lock
+ * < 0 read lock
+ */
+ int getState(){ return _state.get(); }
+ void assertWriteLocked() {
+ assert( getState() > 0 );
+ DEV assert( !_releasedEarly.get() );
+ }
+ bool atLeastReadLocked() { return _state.get() != 0; }
+ void assertAtLeastReadLocked() { assert(atLeastReadLocked()); }
+
+ void lock() {
+ DEV cout << "LOCK" << endl;
+ int s = _state.get();
+ if( s > 0 ) {
+ _state.set(s+1);
+ return;
+ }
+ massert( 10293 , "internal error: locks are not upgradeable", s == 0 );
+ _state.set(1);
+ _m.lock();
+ _minfo.entered();
+ }
+ void unlock() {
+ DEV cout << "UNLOCK" << endl;
+ int s = _state.get();
+ if( s > 1 ) {
+ _state.set(s-1);
+ return;
+ }
+ if( s != 1 ) {
+ if( _releasedEarly.get() ) {
+ _releasedEarly.set(false);
+ return;
+ }
+ assert(false); // attempt to unlock when wasn't in a write lock
+ }
+ _state.set(0);
+ _minfo.leaving();
+ _m.unlock();
+ }
+
+ /* unlock (write lock), and when unlock() is called later,
+ be smart then and don't unlock it again.
+ */
+ void releaseEarly() {
+ assert( getState() == 1 ); // must not be recursive
+ assert( !_releasedEarly.get() );
+ _releasedEarly.set(true);
+ unlock();
+ }
+
+ void lock_shared() {
+ DEV cout << " LOCKSHARED" << endl;
+ int s = _state.get();
+ if( s ) {
+ if( s > 0 ) {
+ // already in write lock - just be recursive and stay write locked
+ _state.set(s+1);
+ return;
+ }
+ else {
+ // already in read lock - recurse
+ _state.set(s-1);
+ return;
+ }
+ }
+ _state.set(-1);
+ _m.lock_shared();
+ }
+ void unlock_shared() {
+ DEV cout << " UNLOCKSHARED" << endl;
+ int s = _state.get();
+ if( s > 0 ) {
+ assert( s > 1 ); /* we must have done a lock write first to have s > 1 */
+ _state.set(s-1);
+ return;
+ }
+ if( s < -1 ) {
+ _state.set(s+1);
+ return;
+ }
+ assert( s == -1 );
+ _state.set(0);
+ _m.unlock_shared();
+ }
+ MutexInfo& info() { return _minfo; }
+ };
+#else
+ /* this will be for old versions of boost */
+ class MongoMutex {
+ MutexInfo _minfo;
+ boost::recursive_mutex m;
+ ThreadLocalValue<bool> _releasedEarly;
+ public:
+ MongoMutex() { }
+ void lock() {
+#if BOOST_VERSION >= 103500
+ m.lock();
+#else
+ boost::detail::thread::lock_ops<boost::recursive_mutex>::lock(m);
+#endif
+ _minfo.entered();
+ }
+
+ void releaseEarly() {
+            assertWriteLocked(); // also must not be recursive, although we don't verify that in the old boost version
+ assert( !_releasedEarly.get() );
+ _releasedEarly.set(true);
+ _unlock();
+ }
+
+ void _unlock() {
+ _minfo.leaving();
+#if BOOST_VERSION >= 103500
+ m.unlock();
+#else
+ boost::detail::thread::lock_ops<boost::recursive_mutex>::unlock(m);
+#endif
+ }
+ void unlock() {
+ if( _releasedEarly.get() ) {
+ _releasedEarly.set(false);
+ return;
+ }
+ _unlock();
+ }
+
+ void lock_shared() { lock(); }
+ void unlock_shared() { unlock(); }
+ MutexInfo& info() { return _minfo; }
+ void assertWriteLocked() {
+ assert( info().isLocked() );
+ }
+ void assertAtLeastReadLocked() {
+ assert( info().isLocked() );
+ }
+ bool atLeastReadLocked() { return info().isLocked(); }
+ int getState(){ return info().isLocked() ? 1 : 0; }
+ };
+#endif
+
+ extern MongoMutex &dbMutex;
+
+ void dbunlocking_write();
+ void dbunlocking_read();
+
+ struct writelock {
+ writelock(const string& ns) {
+ dbMutex.lock();
+ }
+ ~writelock() {
+ dbunlocking_write();
+ dbMutex.unlock();
+ }
+ };
+
+ struct readlock {
+ readlock(const string& ns) {
+ dbMutex.lock_shared();
+ }
+ ~readlock() {
+ dbunlocking_read();
+ dbMutex.unlock_shared();
+ }
+ };
+
+ class mongolock {
+ bool _writelock;
+ public:
+ mongolock(bool write) : _writelock(write) {
+ if( _writelock ) {
+ dbMutex.lock();
+ }
+ else
+ dbMutex.lock_shared();
+ }
+ ~mongolock() {
+ if( _writelock ) {
+ dbunlocking_write();
+ dbMutex.unlock();
+ }
+ else {
+ dbunlocking_read();
+ dbMutex.unlock_shared();
+ }
+ }
+ /* this unlocks, does NOT upgrade. that works for our current usage */
+ void releaseAndWriteLock();
+ };
+
+ /* use writelock and readlock instead */
+ struct dblock : public writelock {
+ dblock() : writelock("") { }
+ ~dblock() {
+ }
+ };
+
+ // eliminate
+ inline void assertInWriteLock() { dbMutex.assertWriteLocked(); }
+
+}
diff --git a/db/curop.h b/db/curop.h
new file mode 100644
index 0000000..8a28f4f
--- /dev/null
+++ b/db/curop.h
@@ -0,0 +1,157 @@
+// curop.h
+
+#pragma once
+
+#include "namespace.h"
+#include "security.h"
+#include "client.h"
+
+namespace mongo {
+
+ class OpDebug {
+ public:
+ StringBuilder str;
+
+ void reset(){
+ str.reset();
+ }
+ };
+
+ /* Current operation (for the current Client).
+ an embedded member of Client class, and typically used from within the mutex there. */
+ class CurOp : boost::noncopyable {
+ static WrappingInt _nextOpNum;
+ static BSONObj _tooBig; // { $msg : "query not recording (too large)" }
+
+ bool _active;
+ Timer _timer;
+ int _op;
+ WrappingInt _opNum;
+ char _ns[Namespace::MaxNsLen+2];
+ struct sockaddr_in client;
+
+ char _queryBuf[256];
+ bool haveQuery() const { return *((int *) _queryBuf) != 0; }
+ void resetQuery(int x=0) { *((int *)_queryBuf) = x; }
+ BSONObj query() {
+ if( *((int *) _queryBuf) == 1 ) {
+ return _tooBig;
+ }
+ BSONObj o(_queryBuf);
+ return o;
+ }
+
+ OpDebug _debug;
+ public:
+ void reset( const sockaddr_in &_client) {
+ _active = true;
+ _opNum = _nextOpNum.atomicIncrement();
+ _timer.reset();
+ _ns[0] = '?'; // just in case not set later
+ _debug.reset();
+ resetQuery();
+ client = _client;
+ }
+
+ OpDebug& debug(){
+ return _debug;
+ }
+
+ WrappingInt opNum() const { return _opNum; }
+ bool active() const { return _active; }
+
+ int elapsedMillis(){ return _timer.millis(); }
+
+ /** micros */
+ unsigned long long startTime(){
+ return _timer.startTime();
+ }
+
+ void setActive(bool active) { _active = active; }
+ void setNS(const char *ns) {
+ strncpy(_ns, ns, Namespace::MaxNsLen);
+ }
+ void setOp(int op) { _op = op; }
+ void setQuery(const BSONObj& query) {
+ if( query.objsize() > (int) sizeof(_queryBuf) ) {
+ resetQuery(1); // flag as too big and return
+ return;
+ }
+ memcpy(_queryBuf, query.objdata(), query.objsize());
+ }
+
+ CurOp() {
+ _active = false;
+// opNum = 0;
+ _op = 0;
+ // These addresses should never be written to again. The zeroes are
+ // placed here as a precaution because currentOp may be accessed
+ // without the db mutex.
+ memset(_ns, 0, sizeof(_ns));
+ memset(_queryBuf, 0, sizeof(_queryBuf));
+ }
+
+ BSONObj info() {
+ AuthenticationInfo *ai = currentClient.get()->ai;
+ if( !ai->isAuthorized("admin") ) {
+ BSONObjBuilder b;
+ b.append("err", "unauthorized");
+ return b.obj();
+ }
+ return infoNoauth();
+ }
+
+ BSONObj infoNoauth() {
+ BSONObjBuilder b;
+ b.append("opid", _opNum);
+ b.append("active", _active);
+ if( _active )
+ b.append("secs_running", _timer.seconds() );
+ if( _op == 2004 )
+ b.append("op", "query");
+ else if( _op == 2005 )
+ b.append("op", "getMore");
+ else if( _op == 2001 )
+ b.append("op", "update");
+ else if( _op == 2002 )
+ b.append("op", "insert");
+ else if( _op == 2006 )
+ b.append("op", "delete");
+ else
+ b.append("op", _op);
+ b.append("ns", _ns);
+
+ if( haveQuery() ) {
+ b.append("query", query());
+ }
+ // b.append("inLock", ??
+ stringstream clientStr;
+ clientStr << inet_ntoa( client.sin_addr ) << ":" << ntohs( client.sin_port );
+ b.append("client", clientStr.str());
+ return b.obj();
+ }
+ };
+
+ /* 0 = ok
+ 1 = kill current operation and reset this to 0
+ future: maybe use this as a "going away" thing on process termination with a higher flag value
+ */
+ extern class KillCurrentOp {
+ enum { Off, On, All } state;
+ WrappingInt toKill;
+ public:
+ void killAll() { state = All; }
+ void kill(WrappingInt i) { toKill = i; state = On; }
+
+ void checkForInterrupt() {
+ if( state != Off ) {
+ if( state == All )
+ uasserted(11600,"interrupted at shutdown");
+ if( cc().curop()->opNum() == toKill ) {
+ state = Off;
+ uasserted(11601,"interrupted");
+ }
+ }
+ }
+ } killCurrentOp;
+}
diff --git a/db/cursor.cpp b/db/cursor.cpp
new file mode 100644
index 0000000..29f9c97
--- /dev/null
+++ b/db/cursor.cpp
@@ -0,0 +1,159 @@
+/**
+ * Copyright (C) 2008 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "stdafx.h"
+#include "pdfile.h"
+#include "curop.h"
+
+namespace mongo {
+
+ bool BasicCursor::advance() {
+ killCurrentOp.checkForInterrupt();
+ if ( eof() ) {
+ if ( tailable_ && !last.isNull() ) {
+ curr = s->next( last );
+ } else {
+ return false;
+ }
+ } else {
+ last = curr;
+ curr = s->next( curr );
+ }
+ return ok();
+ }
+
+ /* these will be used outside of mutexes - really functors - thus the const */
+ class Forward : public AdvanceStrategy {
+ virtual DiskLoc next( const DiskLoc &prev ) const {
+ return prev.rec()->getNext( prev );
+ }
+ } _forward;
+
+ class Reverse : public AdvanceStrategy {
+ virtual DiskLoc next( const DiskLoc &prev ) const {
+ return prev.rec()->getPrev( prev );
+ }
+ } _reverse;
+
+ const AdvanceStrategy *forward() {
+ return &_forward;
+ }
+ const AdvanceStrategy *reverse() {
+ return &_reverse;
+ }
+
+ DiskLoc nextLoop( NamespaceDetails *nsd, const DiskLoc &prev ) {
+ assert( nsd->capLooped() );
+ DiskLoc next = forward()->next( prev );
+ if ( !next.isNull() )
+ return next;
+ return nsd->firstRecord();
+ }
+
+ DiskLoc prevLoop( NamespaceDetails *nsd, const DiskLoc &curr ) {
+ assert( nsd->capLooped() );
+ DiskLoc prev = reverse()->next( curr );
+ if ( !prev.isNull() )
+ return prev;
+ return nsd->lastRecord();
+ }
+
+ ForwardCappedCursor::ForwardCappedCursor( NamespaceDetails *_nsd, const DiskLoc &startLoc ) :
+ nsd( _nsd ) {
+ if ( !nsd )
+ return;
+ DiskLoc start = startLoc;
+ if ( start.isNull() ) {
+ if ( !nsd->capLooped() )
+ start = nsd->firstRecord();
+ else {
+ start = nsd->capExtent.ext()->firstRecord;
+ if ( !start.isNull() && start == nsd->capFirstNewRecord ) {
+ start = nsd->capExtent.ext()->lastRecord;
+ start = nextLoop( nsd, start );
+ }
+ }
+ }
+ curr = start;
+ s = this;
+ }
+
+ DiskLoc ForwardCappedCursor::next( const DiskLoc &prev ) const {
+ assert( nsd );
+ if ( !nsd->capLooped() )
+ return forward()->next( prev );
+
+ DiskLoc i = prev;
+ // Last record
+ if ( i == nsd->capExtent.ext()->lastRecord )
+ return DiskLoc();
+ i = nextLoop( nsd, i );
+ // If we become capFirstNewRecord from same extent, advance to next extent.
+ if ( i == nsd->capFirstNewRecord &&
+ i != nsd->capExtent.ext()->firstRecord )
+ i = nextLoop( nsd, nsd->capExtent.ext()->lastRecord );
+ // If we have just gotten to beginning of capExtent, skip to capFirstNewRecord
+ if ( i == nsd->capExtent.ext()->firstRecord )
+ i = nsd->capFirstNewRecord;
+ return i;
+ }
+
+ ReverseCappedCursor::ReverseCappedCursor( NamespaceDetails *_nsd, const DiskLoc &startLoc ) :
+ nsd( _nsd ) {
+ if ( !nsd )
+ return;
+ DiskLoc start = startLoc;
+ if ( start.isNull() ) {
+ if ( !nsd->capLooped() ) {
+ start = nsd->lastRecord();
+ } else {
+ start = nsd->capExtent.ext()->lastRecord;
+ }
+ }
+ curr = start;
+ s = this;
+ }
+
+ DiskLoc ReverseCappedCursor::next( const DiskLoc &prev ) const {
+ assert( nsd );
+ if ( !nsd->capLooped() )
+ return reverse()->next( prev );
+
+ DiskLoc i = prev;
+ // Last record
+ if ( nsd->capFirstNewRecord == nsd->capExtent.ext()->firstRecord ) {
+ if ( i == nextLoop( nsd, nsd->capExtent.ext()->lastRecord ) ) {
+ return DiskLoc();
+ }
+ } else {
+ if ( i == nsd->capExtent.ext()->firstRecord ) {
+ return DiskLoc();
+ }
+ }
+ // If we are capFirstNewRecord, advance to prev extent, otherwise just get prev.
+ if ( i == nsd->capFirstNewRecord )
+ i = prevLoop( nsd, nsd->capExtent.ext()->firstRecord );
+ else
+ i = prevLoop( nsd, i );
+ // If we just became last in cap extent, advance past capFirstNewRecord
+ // (We know capExtent.ext()->firstRecord != capFirstNewRecord, since would
+ // have returned DiskLoc() earlier otherwise.)
+ if ( i == nsd->capExtent.ext()->lastRecord )
+ i = reverse()->next( nsd->capFirstNewRecord );
+
+ return i;
+ }
+} // namespace mongo
diff --git a/db/cursor.h b/db/cursor.h
new file mode 100644
index 0000000..3868cca
--- /dev/null
+++ b/db/cursor.h
@@ -0,0 +1,198 @@
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+#include "../stdafx.h"
+
+#include "jsobj.h"
+#include "storage.h"
+
+namespace mongo {
+
+ class Record;
+
+ /* Query cursors, base class. This is for our internal cursors. "ClientCursor" is a separate
+ concept and is for the user's cursor.
+
+ WARNING concurrency: the vfunctions below are called back from within a
+ ClientCursor::ccmutex. Don't cause a deadlock, you've been warned.
+ */
+ class Cursor {
+ public:
+ virtual ~Cursor() {}
+ virtual bool ok() = 0;
+ bool eof() {
+ return !ok();
+ }
+ virtual Record* _current() = 0;
+ virtual BSONObj current() = 0;
+ virtual DiskLoc currLoc() = 0;
+ virtual bool advance() = 0; /*true=ok*/
+ virtual BSONObj currKey() const { return BSONObj(); }
+
+ // DiskLoc the cursor requires for continued operation. Before this
+ // DiskLoc is deleted, the cursor must be incremented or destroyed.
+ virtual DiskLoc refLoc() = 0;
+
+ /* Implement these if you want the cursor to be "tailable" */
+
+ /* Request that the cursor starts tailing after advancing past last record. */
+ /* The implementation may or may not honor this request. */
+ virtual void setTailable() {}
+ /* indicates if tailing is enabled. */
+ virtual bool tailable() {
+ return false;
+ }
+
+ virtual void aboutToDeleteBucket(const DiskLoc& b) { }
+
+ /* optional to implement. if implemented, means 'this' is a prototype */
+ virtual Cursor* clone() {
+ return 0;
+ }
+
+ virtual BSONObj indexKeyPattern() {
+ return BSONObj();
+ }
+
+ /* called after every query block is iterated -- i.e. between getMore() blocks
+ so you can note where we are, if necessary.
+ */
+ virtual void noteLocation() { }
+
+ /* called before query getmore block is iterated */
+ virtual void checkLocation() { }
+
+ virtual string toString() {
+ return "abstract?";
+ }
+
+ /* used for multikey index traversal to avoid sending back dups. see Matcher::matches().
+ if a multikey index traversal:
+ if loc has already been sent, returns true.
+ otherwise, marks loc as sent.
+ @param deep - match was against an array, so we know it is multikey. this is legacy and kept
+ for backwards datafile compatibility. 'deep' can be eliminated next time we
+ force a data file conversion. 7Jul09
+ */
+ virtual bool getsetdup(DiskLoc loc) = 0;
+
+ virtual BSONObj prettyStartKey() const { return BSONObj(); }
+ virtual BSONObj prettyEndKey() const { return BSONObj(); }
+
+ virtual bool capped() const { return false; }
+ };
+
+ // strategy object implementing direction of traversal.
+ class AdvanceStrategy {
+ public:
+ virtual ~AdvanceStrategy() { }
+ virtual DiskLoc next( const DiskLoc &prev ) const = 0;
+ };
+
+ const AdvanceStrategy *forward();
+ const AdvanceStrategy *reverse();
+
+ /* table-scan style cursor */
+ class BasicCursor : public Cursor {
+ protected:
+ DiskLoc curr, last;
+ const AdvanceStrategy *s;
+
+ private:
+ bool tailable_;
+ void init() {
+ tailable_ = false;
+ }
+ public:
+ bool ok() {
+ return !curr.isNull();
+ }
+ Record* _current() {
+ assert( ok() );
+ return curr.rec();
+ }
+ BSONObj current() {
+ Record *r = _current();
+ BSONObj j(r);
+ return j;
+ }
+ virtual DiskLoc currLoc() {
+ return curr;
+ }
+ virtual DiskLoc refLoc() {
+ return curr.isNull() ? last : curr;
+ }
+
+ bool advance();
+
+ BasicCursor(DiskLoc dl, const AdvanceStrategy *_s = forward()) : curr(dl), s( _s ) {
+ init();
+ }
+ BasicCursor(const AdvanceStrategy *_s = forward()) : s( _s ) {
+ init();
+ }
+ virtual string toString() {
+ return "BasicCursor";
+ }
+ virtual void setTailable() {
+ if ( !curr.isNull() || !last.isNull() )
+ tailable_ = true;
+ }
+ virtual bool tailable() {
+ return tailable_;
+ }
+ virtual bool getsetdup(DiskLoc loc) { return false; }
+ };
+
+ /* used for order { $natural: -1 } */
+ class ReverseCursor : public BasicCursor {
+ public:
+ ReverseCursor(DiskLoc dl) : BasicCursor( dl, reverse() ) { }
+ ReverseCursor() : BasicCursor( reverse() ) { }
+ virtual string toString() {
+ return "ReverseCursor";
+ }
+ };
+
+ class NamespaceDetails;
+
+ class ForwardCappedCursor : public BasicCursor, public AdvanceStrategy {
+ public:
+ ForwardCappedCursor( NamespaceDetails *nsd = 0, const DiskLoc &startLoc = DiskLoc() );
+ virtual string toString() {
+ return "ForwardCappedCursor";
+ }
+ virtual DiskLoc next( const DiskLoc &prev ) const;
+ virtual bool capped() const { return true; }
+ private:
+ NamespaceDetails *nsd;
+ };
+
+ class ReverseCappedCursor : public BasicCursor, public AdvanceStrategy {
+ public:
+ ReverseCappedCursor( NamespaceDetails *nsd = 0, const DiskLoc &startLoc = DiskLoc() );
+ virtual string toString() {
+ return "ReverseCappedCursor";
+ }
+ virtual DiskLoc next( const DiskLoc &prev ) const;
+ virtual bool capped() const { return true; }
+ private:
+ NamespaceDetails *nsd;
+ };
+
+} // namespace mongo
diff --git a/db/database.cpp b/db/database.cpp
new file mode 100644
index 0000000..6361e86
--- /dev/null
+++ b/db/database.cpp
@@ -0,0 +1,64 @@
+// database.cpp
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "stdafx.h"
+#include "pdfile.h"
+#include "database.h"
+
+namespace mongo {
+
+ bool Database::_openAllFiles = false;
+
+ bool Database::setProfilingLevel( int newLevel , string& errmsg ){
+ if ( profile == newLevel )
+ return true;
+
+ if ( newLevel < 0 || newLevel > 2 ){
+ errmsg = "profiling level has to be >=0 and <= 2";
+ return false;
+ }
+
+ if ( newLevel == 0 ){
+ profile = 0;
+ return true;
+ }
+
+ assert( cc().database() == this );
+
+ if ( ! namespaceIndex.details( profileName.c_str() ) ){
+ log(1) << "creating profile ns: " << profileName << endl;
+ BSONObjBuilder spec;
+ spec.appendBool( "capped", true );
+ spec.append( "size", 131072.0 );
+ if ( ! userCreateNS( profileName.c_str(), spec.done(), errmsg , true ) ){
+ return false;
+ }
+ }
+ profile = newLevel;
+ return true;
+ }
+
+ void Database::finishInit(){
+ if ( cmdLine.defaultProfile == profile )
+ return;
+
+ string errmsg;
+ massert( 12506 , errmsg , setProfilingLevel( cmdLine.defaultProfile , errmsg ) );
+ }
+
+} // namespace mongo
diff --git a/db/database.h b/db/database.h
new file mode 100644
index 0000000..0fcf386
--- /dev/null
+++ b/db/database.h
@@ -0,0 +1,207 @@
+// database.h
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+#include "cmdline.h"
+
+namespace mongo {
+
+
+ /**
+     * Database represents a single database.
+     * Each database has its own set of files -- dbname.ns, dbname.0, dbname.1, ...
+ * NOT memory mapped
+ */
+ class Database {
+ public:
+ static bool _openAllFiles;
+
+ Database(const char *nm, bool& newDb, const string& _path = dbpath)
+ : name(nm), path(_path), namespaceIndex( path, name ) {
+
+ { // check db name is valid
+ int L = strlen(nm);
+ uassert( 10028 , "db name is empty", L > 0 );
+ uassert( 10029 , "bad db name [1]", *nm != '.' );
+ uassert( 10030 , "bad db name [2]", nm[L-1] != '.' );
+ uassert( 10031 , "bad char(s) in db name", strchr(nm, ' ') == 0 );
+ uassert( 10032 , "db name too long", L < 64 );
+ }
+
+ newDb = namespaceIndex.exists();
+ profile = 0;
+ profileName = name + ".system.profile";
+
+ // If already exists, open. Otherwise behave as if empty until
+ // there's a write, then open.
+ if ( ! newDb || cmdLine.defaultProfile ) {
+ namespaceIndex.init();
+ if( _openAllFiles )
+ openAllFiles();
+
+ }
+
+ magic = 781231;
+ }
+
+ ~Database() {
+ magic = 0;
+ btreeStore->closeFiles(name, path);
+ int n = files.size();
+ for ( int i = 0; i < n; i++ )
+ delete files[i];
+ }
+
+ /**
+ * tries to make sure that this hasn't been deleted
+ */
+ bool isOk(){
+ return magic == 781231;
+ }
+
+ bool isEmpty(){
+ return ! namespaceIndex.allocated();
+ }
+
+ bool exists(int n) {
+ stringstream ss;
+ ss << name << '.' << n;
+ boost::filesystem::path fullName;
+ fullName = boost::filesystem::path(path) / ss.str();
+ return boost::filesystem::exists(fullName);
+ }
+
+ void openAllFiles() {
+ int n = 0;
+ while( exists(n) ) {
+ getFile(n);
+ n++;
+ }
+ // If last file is empty, consider it preallocated and make sure it's not mapped
+ // until a write is requested
+ if ( n > 1 && getFile( n - 1 )->getHeader()->isEmpty() ) {
+ delete files[ n - 1 ];
+ files.pop_back();
+ }
+ }
+
+ MongoDataFile* getFile( int n, int sizeNeeded = 0, bool preallocateOnly = false ) {
+ assert(this);
+
+ namespaceIndex.init();
+ if ( n < 0 || n >= DiskLoc::MaxFiles ) {
+ out() << "getFile(): n=" << n << endl;
+#if !defined(_RECSTORE)
+ if( n >= RecCache::Base && n <= RecCache::Base+1000 )
+ massert( 10294 , "getFile(): bad file number - using recstore db w/nonrecstore db build?", false);
+#endif
+ massert( 10295 , "getFile(): bad file number value (corrupt db?): run repair", false);
+ }
+ DEV {
+ if ( n > 100 )
+ out() << "getFile(): n=" << n << "?" << endl;
+ }
+ MongoDataFile* p = 0;
+ if ( !preallocateOnly ) {
+ while ( n >= (int) files.size() )
+ files.push_back(0);
+ p = files[n];
+ }
+ if ( p == 0 ) {
+ stringstream ss;
+ ss << name << '.' << n;
+ boost::filesystem::path fullName;
+ fullName = boost::filesystem::path(path) / ss.str();
+ string fullNameString = fullName.string();
+ p = new MongoDataFile(n);
+ int minSize = 0;
+ if ( n != 0 && files[ n - 1 ] )
+ minSize = files[ n - 1 ]->getHeader()->fileLength;
+ if ( sizeNeeded + MDFHeader::headerSize() > minSize )
+ minSize = sizeNeeded + MDFHeader::headerSize();
+ try {
+ p->open( fullNameString.c_str(), minSize, preallocateOnly );
+ }
+ catch ( AssertionException& ) {
+ delete p;
+ throw;
+ }
+ if ( preallocateOnly )
+ delete p;
+ else
+ files[n] = p;
+ }
+ return preallocateOnly ? 0 : p;
+ }
+
+ MongoDataFile* addAFile( int sizeNeeded = 0, bool preallocateNextFile = false ) {
+ int n = (int) files.size();
+ MongoDataFile *ret = getFile( n, sizeNeeded );
+ if ( preallocateNextFile )
+ preallocateAFile();
+ return ret;
+ }
+
+ // safe to call this multiple times - the implementation will only preallocate one file
+ void preallocateAFile() {
+ int n = (int) files.size();
+ getFile( n, 0, true );
+ }
+
+ MongoDataFile* suitableFile( int sizeNeeded ) {
+ MongoDataFile* f = newestFile();
+ for ( int i = 0; i < 8; i++ ) {
+ if ( f->getHeader()->unusedLength >= sizeNeeded )
+ break;
+ f = addAFile( sizeNeeded );
+ if ( f->getHeader()->fileLength >= MongoDataFile::maxSize() ) // this is as big as they get so might as well stop
+ break;
+ }
+ return f;
+ }
+
+ Extent* allocExtent( const char *ns, int size, bool capped ) {
+ Extent *e = DataFileMgr::allocFromFreeList( ns, size, capped );
+ if( e ) return e;
+ return suitableFile( size )->createExtent( ns, size, capped );
+ }
+
+ MongoDataFile* newestFile() {
+ int n = (int) files.size();
+ if ( n > 0 ) n--;
+ return getFile(n);
+ }
+
+ /**
+ * @return true if success, false otherwise
+ */
+ bool setProfilingLevel( int newLevel , string& errmsg );
+
+ void finishInit();
+
+ vector<MongoDataFile*> files;
+ string name; // "alleyinsider"
+ string path;
+ NamespaceIndex namespaceIndex;
+ int profile; // 0=off.
+ string profileName; // "alleyinsider.system.profile"
+ int magic; // used for making sure the object is still loaded in memory
+ };
+
+} // namespace mongo
diff --git a/db/db.cpp b/db/db.cpp
new file mode 100644
index 0000000..9b1a22a
--- /dev/null
+++ b/db/db.cpp
@@ -0,0 +1,1101 @@
+// db.cpp : Defines the entry point for the console application.
+//
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "stdafx.h"
+#include "db.h"
+#include "query.h"
+#include "introspect.h"
+#include "repl.h"
+#include "../util/unittest.h"
+#include "../util/file_allocator.h"
+#include "../util/background.h"
+#include "dbmessage.h"
+#include "instance.h"
+#include "clientcursor.h"
+#include "pdfile.h"
+#if !defined(_WIN32)
+#include <sys/file.h>
+#endif
+
+#if defined(_WIN32)
+#include "../util/ntservice.h"
+#endif
+
+#include "../scripting/engine.h"
+#include "module.h"
+#include "cmdline.h"
+
+namespace mongo {
+
+ bool useJNI = true;
+
+ /* only off if --nocursors which is for debugging. */
+ extern bool useCursors;
+ /* only off if --nohints */
+ extern bool useHints;
+
+ bool noHttpInterface = false;
+
+ extern string bind_ip;
+ extern char *appsrvPath;
+ extern bool autoresync;
+ extern int diagLogging;
+ extern int lenForNewNsFiles;
+ extern int lockFile;
+
+ void setupSignals();
+ void closeAllSockets();
+ void startReplication();
+ void pairWith(const char *remoteEnd, const char *arb);
+ void setRecCacheSize(unsigned MB);
+
+ const char *ourgetns() {
+ Client *c = currentClient.get();
+ return c ? c->ns() : "";
+ }
+
+ struct MyStartupTests {
+ MyStartupTests() {
+ assert( sizeof(OID) == 12 );
+ }
+ } mystartupdbcpp;
+
+ QueryResult* emptyMoreResult(long long);
+
+ void testTheDb() {
+ OpDebug debug;
+ setClient("sys.unittest.pdfile");
+
+ /* this is not validly formatted, if you query this namespace bad things will happen */
+ theDataFileMgr.insert("sys.unittest.pdfile", (void *) "hello worldx", 13);
+ theDataFileMgr.insert("sys.unittest.pdfile", (void *) "hello worldx", 13);
+
+ BSONObj j1((const char *) &js1);
+ deleteObjects("sys.unittest.delete", j1, false);
+ theDataFileMgr.insert("sys.unittest.delete", &js1, sizeof(js1));
+ deleteObjects("sys.unittest.delete", j1, false);
+ updateObjects("sys.unittest.delete", j1, j1, true,false,true,debug);
+ updateObjects("sys.unittest.delete", j1, j1, false,false,true,debug);
+
+ auto_ptr<Cursor> c = theDataFileMgr.findAll("sys.unittest.pdfile");
+ while ( c->ok() ) {
+ c->_current();
+ c->advance();
+ }
+ out() << endl;
+
+ cc().clearns();
+ }
+
+ MessagingPort *connGrab = 0;
+ void connThread();
+
+ class OurListener : public Listener { // accepts inbound sockets; hands each one to a dedicated connThread
+ public:
+ OurListener(const string &ip, int p) : Listener(ip, p) { }
+ virtual void accepted(MessagingPort *mp) {
+ assert( connGrab == 0 );
+ if ( ! connTicketHolder.tryAcquire() ){
+ log() << "connection refused because too many open connections" << endl;
+ // TODO: would be nice if we notified them...
+ mp->shutdown();
+ return;
+ }
+ connGrab = mp;
+ try {
+ boost::thread thr(connThread);
+ while ( connGrab )
+ sleepmillis(1);
+ }
+ catch ( boost::thread_resource_error& ){
+ log() << "can't create new thread, closing connection" << endl;
+ mp->shutdown();
+ connGrab = 0;
+ }
+ catch ( ... ){
+ log() << "unknown exception starting connThread" << endl;
+ mp->shutdown();
+ connGrab = 0;
+ }
+ }
+ };
+
+ void webServerThread();
+ void pdfileInit();
+
+ void listen(int port) {
+ log() << mongodVersion() << endl;
+ printGitVersion();
+ printSysInfo();
+ pdfileInit();
+ //testTheDb();
+ log() << "waiting for connections on port " << port << endl;
+ OurListener l(bind_ip, port);
+ startReplication();
+ if ( !noHttpInterface )
+ boost::thread thr(webServerThread);
+ if ( l.init() ) {
+ ListeningSockets::get()->add( l.socket() );
+ l.listen();
+ }
+ }
+
+} // namespace mongo
+
+#include "client.h"
+
+namespace mongo {
+
+ void sysRuntimeInfo() {
+ out() << "sysinfo:\n";
+#if defined(_SC_PAGE_SIZE)
+ out() << " page size: " << (int) sysconf(_SC_PAGE_SIZE) << endl;
+#endif
+#if defined(_SC_PHYS_PAGES)
+ out() << " _SC_PHYS_PAGES: " << sysconf(_SC_PHYS_PAGES) << endl;
+#endif
+#if defined(_SC_AVPHYS_PAGES)
+ out() << " _SC_AVPHYS_PAGES: " << sysconf(_SC_AVPHYS_PAGES) << endl;
+#endif
+ }
+
+ /* we create one thread for each connection from an app server database.
+ app server will open a pool of threads.
+ */
+ void connThread()
+ {
+ TicketHolderReleaser connTicketReleaser( &connTicketHolder );
+ Client::initThread("conn");
+
+ /* todo: move to Client object */
+ LastError *le = new LastError();
+ lastError.reset(le);
+
+ MessagingPort& dbMsgPort = *connGrab;
+ connGrab = 0;
+ Client& c = cc();
+
+ try {
+
+ c.ai->isLocalHost = dbMsgPort.farEnd.isLocalHost();
+
+ Message m;
+ while ( 1 ) {
+ m.reset();
+
+ if ( !dbMsgPort.recv(m) ) {
+ if( !cmdLine.quiet )
+ log() << "end connection " << dbMsgPort.farEnd.toString() << endl;
+ dbMsgPort.shutdown();
+ break;
+ }
+
+ lastError.startRequest( m , le );
+
+ DbResponse dbresponse;
+ if ( !assembleResponse( m, dbresponse, dbMsgPort.farEnd.sa ) ) {
+ out() << curTimeMillis() % 10000 << " end msg " << dbMsgPort.farEnd.toString() << endl;
+ /* todo: we may not wish to allow this, even on localhost: very low priv accounts could stop us. */
+ if ( dbMsgPort.farEnd.isLocalHost() ) {
+ dbMsgPort.shutdown();
+ sleepmillis(50);
+ problem() << "exiting end msg" << endl;
+ dbexit(EXIT_CLEAN);
+ }
+ else {
+ out() << " (not from localhost, ignoring end msg)" << endl;
+ }
+ }
+
+ if ( dbresponse.response )
+ dbMsgPort.reply(m, *dbresponse.response, dbresponse.responseTo);
+ }
+
+ }
+ catch ( AssertionException& ) {
+ problem() << "AssertionException in connThread, closing client connection" << endl;
+ dbMsgPort.shutdown();
+ }
+ catch ( SocketException& ) {
+ problem() << "SocketException in connThread, closing client connection" << endl;
+ dbMsgPort.shutdown();
+ }
+ catch ( std::exception &e ) {
+ problem() << "Uncaught std::exception: " << e.what() << ", terminating" << endl;
+ dbexit( EXIT_UNCAUGHT );
+ }
+ catch ( ... ) {
+ problem() << "Uncaught exception, terminating" << endl;
+ dbexit( EXIT_UNCAUGHT );
+ }
+
+ // any thread cleanup can happen here
+
+ if ( currentClient.get() )
+ currentClient->shutdown();
+ globalScriptEngine->threadDone();
+ }
+
+
+ void msg(const char *m, const char *address, int port, int extras = 0) {
+
+ SockAddr db(address, port);
+
+// SockAddr db("127.0.0.1", DBPort);
+// SockAddr db("192.168.37.1", MessagingPort::DBPort);
+// SockAddr db("10.0.21.60", MessagingPort::DBPort);
+// SockAddr db("172.16.0.179", MessagingPort::DBPort);
+
+ MessagingPort p;
+ if ( !p.connect(db) )
+ return;
+
+ const int Loops = 1;
+ for ( int q = 0; q < Loops; q++ ) {
+ Message send;
+ Message response;
+
+ send.setData( dbMsg , m);
+ int len = send.data->dataLen();
+
+ for ( int i = 0; i < extras; i++ )
+ p.say(/*db, */send);
+
+ Timer t;
+ bool ok = p.call(send, response);
+ double tm = ((double) t.micros()) + 1;
+ out() << " ****ok. response.data:" << ok << " time:" << tm / 1000.0 << "ms " <<
+ ((double) len) * 8 / 1000000 / (tm/1000000) << "Mbps" << endl;
+ if ( q+1 < Loops ) {
+ out() << "\t\tSLEEP 8 then sending again as a test" << endl;
+ sleepsecs(8);
+ }
+ }
+ sleepsecs(1);
+
+ p.shutdown();
+ }
+
+ void msg(const char *m, int extras = 0) {
+ msg(m, "127.0.0.1", CmdLine::DefaultDBPort, extras);
+ }
+
+ bool shouldRepairDatabases = 0;
+ bool forceRepair = 0;
+
+ bool doDBUpgrade( const string& dbName , string& errmsg , MDFHeader * h ){ // errmsg is an out-param: failure reason for the caller
+ static DBDirectClient db;
+
+ if ( h->version == 4 && h->versionMinor == 4 ){
+ assert( VERSION == 4 );
+ assert( VERSION_MINOR == 5 );
+
+ list<string> colls = db.getCollectionNames( dbName );
+ for ( list<string>::iterator i=colls.begin(); i!=colls.end(); i++){
+ string c = *i;
+ log() << "\t upgrading collection:" << c << endl;
+ BSONObj out;
+ bool ok = db.runCommand( dbName , BSON( "reIndex" << c.substr( dbName.size() + 1 ) ) , out );
+ if ( ! ok ){
+ errmsg = "reindex failed";
+ log() << "\t\t reindex failed: " << out << endl;
+ return false;
+ }
+ }
+
+ h->versionMinor = 5;
+ return true;
+ }
+
+ // do this in the general case
+ return repairDatabase( dbName.c_str(), errmsg );
+ }
+
+ void repairDatabases() {
+ log(1) << "enter repairDatabases" << endl;
+ dblock lk;
+ vector< string > dbNames;
+ getDatabaseNames( dbNames );
+ for ( vector< string >::iterator i = dbNames.begin(); i != dbNames.end(); ++i ) {
+ string dbName = *i;
+ log(1) << "\t" << dbName << endl;
+ assert( !setClient( dbName.c_str() ) );
+ MongoDataFile *p = cc().database()->getFile( 0 );
+ MDFHeader *h = p->getHeader();
+ if ( !h->currentVersion() || forceRepair ) {
+ log() << "****" << endl;
+ log() << "****" << endl;
+ log() << "need to upgrade database " << dbName << " with pdfile version " << h->version << "." << h->versionMinor << ", "
+ << "new version: " << VERSION << "." << VERSION_MINOR << endl;
+ if ( shouldRepairDatabases ){
+ // QUESTION: Repair even if file format is higher version than code?
+ log() << "\t starting upgrade" << endl;
+ string errmsg;
+ assert( doDBUpgrade( dbName , errmsg , h ) );
+ }
+ else {
+ log() << "\t Not upgrading, exiting!" << endl;
+ log() << "\t run --upgrade to upgrade dbs, then start again" << endl;
+ log() << "****" << endl;
+ dbexit( EXIT_NEED_UPGRADE );
+ shouldRepairDatabases = 1;
+ return;
+ }
+ } else {
+ closeDatabase( dbName.c_str() );
+ }
+ }
+
+ log(1) << "done repairDatabases" << endl;
+
+ if ( shouldRepairDatabases ){
+ log() << "finished checking dbs" << endl;
+ cc().shutdown();
+ dbexit( EXIT_CLEAN );
+ }
+ }
+
+ void clearTmpFiles() {
+ boost::filesystem::path path( dbpath );
+ for ( boost::filesystem::directory_iterator i( path );
+ i != boost::filesystem::directory_iterator(); ++i ) {
+ string fileName = boost::filesystem::path(*i).leaf();
+ if ( boost::filesystem::is_directory( *i ) &&
+ fileName.length() > 2 && fileName.substr( 0, 3 ) == "tmp" )
+ boost::filesystem::remove_all( *i );
+ }
+ }
+
+ void clearTmpCollections() {
+ vector< string > toDelete;
+ DBDirectClient cli;
+ auto_ptr< DBClientCursor > c = cli.query( "local.system.namespaces", Query( fromjson( "{name:/^local.temp./}" ) ) );
+ while( c->more() ) {
+ BSONObj o = c->next();
+ toDelete.push_back( o.getStringField( "name" ) );
+ }
+ for( vector< string >::iterator i = toDelete.begin(); i != toDelete.end(); ++i ) {
+ log() << "Dropping old temporary collection: " << *i << endl;
+ cli.dropCollection( *i );
+ }
+ }
+
+ /**
+ * does background async flushes of mmapped files
+ */
+ class DataFileSync : public BackgroundJob { // periodically flushes dirty mmapped pages to disk
+ public:
+ void run(){
+ log(1) << "will flush memory every: " << _sleepsecs << " seconds" << endl;
+ while ( ! inShutdown() ){
+ if ( _sleepsecs == 0 ){
+ // in case at some point we add an option to change at runtime
+ sleepsecs(5);
+ continue;
+ }
+ sleepmillis( (int)(_sleepsecs * 1000) );
+ MemoryMappedFile::flushAll( false );
+ log(1) << "flushing mmap" << endl;
+ }
+ }
+
+ double _sleepsecs; // default value controlled by program options
+ } dataFileSync;
+
+ void show_32_warning(){
+#if BOOST_VERSION < 103500
+ cout << "\nwarning: built with boost version <= 1.34, limited concurrency" << endl;
+#endif
+
+ if ( sizeof(int*) != 4 )
+ return;
+ cout << endl;
+ cout << "** NOTE: when using MongoDB 32 bit, you are limited to about 2 gigabytes of data" << endl;
+ cout << "** see http://blog.mongodb.org/post/137788967/32-bit-limitations for more" << endl;
+ cout << endl;
+ }
+
+ Timer startupSrandTimer;
+
+ void _initAndListen(int listenPort, const char *appserverLoc = null) {
+
+#if !defined(_WIN32)
+ pid_t pid = 0;
+ pid = getpid();
+#else
+ int pid=0;
+#endif
+
+ bool is32bit = sizeof(int*) == 4;
+
+ log() << "Mongo DB : starting : pid = " << pid << " port = " << cmdLine.port << " dbpath = " << dbpath
+ << " master = " << master << " slave = " << (int) slave << " " << ( is32bit ? "32" : "64" ) << "-bit " << endl;
+
+ show_32_warning();
+
+ stringstream ss;
+ ss << "dbpath (" << dbpath << ") does not exist";
+ massert( 10296 , ss.str().c_str(), boost::filesystem::exists( dbpath ) );
+
+ acquirePathLock();
+ remove_all( dbpath + "/_tmp/" );
+
+ theFileAllocator().start();
+
+ BOOST_CHECK_EXCEPTION( clearTmpFiles() );
+
+ Client::initThread("initandlisten");
+
+ clearTmpCollections();
+
+ _diaglog.init();
+
+ Module::initAll();
+
+#if 0
+ {
+ stringstream indexpath;
+ indexpath << dbpath << "/indexes.dat";
+ RecCache::tempStore.init(indexpath.str().c_str(), BucketSize);
+ }
+#endif
+
+ if ( useJNI ) {
+ ScriptEngine::setup();
+ }
+
+ repairDatabases();
+
+ /* we didn't want to pre-open all files for the repair check above. for regular
+ operation we do for read/write lock concurrency reasons.
+ */
+ Database::_openAllFiles = true;
+
+ if ( shouldRepairDatabases )
+ return;
+
+ /* this is for security on certain platforms (nonce generation) */
+ srand((unsigned) (curTimeMicros() ^ startupSrandTimer.micros()));
+
+ listen(listenPort);
+
+ // listen() will return when exit code closes its socket.
+ while( 1 )
+ sleepsecs( 100 );
+ }
+ void initAndListen(int listenPort, const char *appserverLoc = null) {
+ try { _initAndListen(listenPort, appserverLoc); }
+ catch ( std::exception &e ) {
+ problem() << "exception in initAndListen std::exception: " << e.what() << ", terminating" << endl;
+ dbexit( EXIT_UNCAUGHT );
+ }
+ catch ( int& n ){
+ problem() << "exception in initAndListen int: " << n << ", terminating" << endl;
+ dbexit( EXIT_UNCAUGHT );
+ }
+ catch(...) {
+ log() << " exception in initAndListen, terminating" << endl;
+ dbexit( EXIT_UNCAUGHT );
+ }
+ }
+
+ #if defined(_WIN32)
+ bool initService() {
+ ServiceController::reportStatus( SERVICE_RUNNING );
+ initAndListen( cmdLine.port, appsrvPath );
+ return true;
+ }
+ #endif
+
+} // namespace mongo
+
+
+using namespace mongo;
+
+#include <boost/program_options.hpp>
+
+namespace po = boost::program_options;
+
+
+void show_help_text(po::options_description options) {
+ show_32_warning();
+ cout << options << endl;
+};
+
+/* Return error string or "" if no errors. */
+string arg_error_check(int argc, char* argv[]) {
+ for (int i = 1; i < argc; i++) {
+ string s = argv[i];
+ /* check for inclusion of old-style arbiter setting. */
+ if (s == "--pairwith") {
+ if (argc > i + 2) {
+ string old_arbiter = argv[i + 2];
+ if (old_arbiter == "-" || old_arbiter.substr(0, 1) != "-") {
+ return "Specifying arbiter using --pairwith is no longer supported, please use --arbiter";
+ }
+ }
+ }
+ }
+ return "";
+}
+
+int main(int argc, char* argv[], char *envp[] )
+{
+ getcurns = ourgetns;
+
+ po::options_description general_options("General options");
+ po::options_description replication_options("Replication options");
+ po::options_description sharding_options("Sharding options");
+ po::options_description visible_options("Allowed options");
+ po::options_description hidden_options("Hidden options");
+ po::options_description cmdline_options("Command line options");
+
+ po::positional_options_description positional_options;
+
+ general_options.add_options()
+ ("help,h", "show this usage information")
+ ("version", "show version information")
+ ("config,f", po::value<string>(), "configuration file specifying additional options")
+ ("port", po::value<int>(&cmdLine.port)/*->default_value(CmdLine::DefaultDBPort)*/, "specify port number")
+ ("bind_ip", po::value<string>(&bind_ip),
+ "local ip address to bind listener - all local ips bound by default")
+ ("verbose,v", "be more verbose (include multiple times for more verbosity e.g. -vvvvv)")
+ ("dbpath", po::value<string>()->default_value("/data/db/"), "directory for datafiles")
+ ("quiet", "quieter output")
+ ("logpath", po::value<string>() , "file to send all output to instead of stdout" )
+ ("logappend" , "append to logpath instead of over-writing" )
+#ifndef _WIN32
+ ("fork" , "fork server process" )
+#endif
+ ("cpu", "periodically show cpu and iowait utilization")
+ ("noauth", "run without security")
+ ("auth", "run with security")
+ ("objcheck", "inspect client data for validity on receipt")
+ ("quota", "enable db quota management")
+ ("quotaFiles", po::value<int>(), "number of files allowed per db, requires --quota")
+ ("appsrvpath", po::value<string>(), "root directory for the babble app server")
+ ("nocursors", "diagnostic/debugging option")
+ ("nohints", "ignore query hints")
+ ("nohttpinterface", "disable http interface")
+ ("noscripting", "disable scripting engine")
+ ("noprealloc", "disable data file preallocation")
+ ("smallfiles", "use a smaller default file size")
+ ("nssize", po::value<int>()->default_value(16), ".ns file size (in MB) for new databases")
+ ("diaglog", po::value<int>(), "0=off 1=W 2=R 3=both 7=W+some reads")
+ ("sysinfo", "print some diagnostic system information")
+ ("upgrade", "upgrade db if needed")
+ ("repair", "run repair on all dbs")
+ ("notablescan", "do not allow table scans")
+ ("syncdelay",po::value<double>(&dataFileSync._sleepsecs)->default_value(60), "seconds between disk syncs (0 for never)")
+ ("profile",po::value<int>(), "0=off 1=slow, 2=all")
+ ("slowms",po::value<int>(&cmdLine.slowMS)->default_value(100), "value of slow for profile and console log" )
+ ("maxConns",po::value<int>(), "max number of simultaneous connections")
+#if defined(_WIN32)
+ ("install", "install mongodb service")
+ ("remove", "remove mongodb service")
+ ("service", "start mongodb service")
+#endif
+ ;
+
+ replication_options.add_options()
+ ("master", "master mode")
+ ("slave", "slave mode")
+ ("source", po::value<string>(), "when slave: specify master as <server:port>")
+ ("only", po::value<string>(), "when slave: specify a single database to replicate")
+ ("pairwith", po::value<string>(), "address of server to pair with")
+ ("arbiter", po::value<string>(), "address of arbiter server")
+ ("autoresync", "automatically resync if slave data is stale")
+ ("oplogSize", po::value<long>(), "size limit (in MB) for op log")
+ ("opIdMem", po::value<long>(), "size limit (in bytes) for in memory storage of op ids")
+ ;
+
+ sharding_options.add_options()
+ ("configsvr", "declare this is a config db of a cluster")
+ ("shardsvr", "declare this is a shard db of a cluster")
+ ;
+
+ hidden_options.add_options()
+ ("command", po::value< vector<string> >(), "command")
+ ("cacheSize", po::value<long>(), "cache size (in MB) for rec store")
+ ;
+
+ /* support for -vv -vvvv etc. */
+ for (string s = "vv"; s.length() <= 10; s.append("v")) {
+ hidden_options.add_options()(s.c_str(), "verbose");
+ }
+
+ positional_options.add("command", 3);
+ visible_options.add(general_options);
+ visible_options.add(replication_options);
+ visible_options.add(sharding_options);
+ Module::addOptions( visible_options );
+ cmdline_options.add(visible_options);
+ cmdline_options.add(hidden_options);
+
+ setupSignals();
+
+ dbExecCommand = argv[0];
+
+ srand(curTimeMicros());
+ boost::filesystem::path::default_name_check( boost::filesystem::no_check );
+
+ {
+ unsigned x = 0x12345678;
+ unsigned char& b = (unsigned char&) x;
+ if ( b != 0x78 ) {
+ out() << "big endian cpus not yet supported" << endl;
+ return 33;
+ }
+ }
+
+ DEV out() << "DEV is defined (using _DEBUG), which is slower...\n";
+
+ UnitTest::runTests();
+
+ if (argc == 1) {
+ cout << dbExecCommand << " --help for help and startup options" << endl;
+ }
+
+ {
+ bool installService = false;
+ bool removeService = false;
+ bool startService = false;
+ po::variables_map params;
+
+ string error_message = arg_error_check(argc, argv);
+ if (error_message != "") {
+ cout << error_message << endl << endl;
+ show_help_text(visible_options);
+ return 0;
+ }
+
+ /* don't allow guessing - creates ambiguities when some options are
+ * prefixes of others. allow long disguises and don't allow guessing
+ * to get away with our vvvvvvv trick. */
+ int command_line_style = (((po::command_line_style::unix_style ^
+ po::command_line_style::allow_guessing) |
+ po::command_line_style::allow_long_disguise) ^
+ po::command_line_style::allow_sticky);
+
+ try {
+ po::store(po::command_line_parser(argc, argv).options(cmdline_options).
+ positional(positional_options).
+ style(command_line_style).run(), params);
+
+ if (params.count("config")) {
+ ifstream config_file (params["config"].as<string>().c_str());
+ if (config_file.is_open()) {
+ po::store(po::parse_config_file(config_file, cmdline_options), params);
+ config_file.close();
+ } else {
+ cout << "ERROR: could not read from config file" << endl << endl;
+ cout << visible_options << endl;
+ return 0;
+ }
+ }
+
+ po::notify(params);
+ } catch (po::error &e) {
+ cout << "ERROR: " << e.what() << endl << endl;
+ cout << visible_options << endl;
+ return 0;
+ }
+
+ if (params.count("help")) {
+ show_help_text(visible_options);
+ return 0;
+ }
+ if (params.count("version")) {
+ cout << mongodVersion() << endl;
+ printGitVersion();
+ return 0;
+ }
+ dbpath = params["dbpath"].as<string>();
+ if (params.count("quiet")) {
+ cmdLine.quiet = true;
+ }
+ if (params.count("verbose")) {
+ logLevel = 1;
+ }
+ for (string s = "vv"; s.length() <= 10; s.append("v")) {
+ if (params.count(s)) {
+ logLevel = s.length();
+ }
+ }
+ if (params.count("cpu")) {
+ cmdLine.cpu = true;
+ }
+ if (params.count("noauth")) {
+ noauth = true;
+ }
+ if (params.count("auth")) {
+ noauth = false;
+ }
+ if (params.count("quota")) {
+ cmdLine.quota = true;
+ }
+ if (params.count("quotaFiles")) {
+ cmdLine.quota = true;
+ cmdLine.quotaFiles = params["quotaFiles"].as<int>() - 1;
+ }
+ if (params.count("objcheck")) {
+ objcheck = true;
+ }
+ if (params.count("appsrvpath")) {
+ /* casting away the const-ness here */
+ appsrvPath = (char*)(params["appsrvpath"].as<string>().c_str());
+ }
+#ifndef _WIN32
+ if (params.count("fork")) {
+ if ( ! params.count( "logpath" ) ){
+ cout << "--fork has to be used with --logpath" << endl;
+ return -1;
+ }
+ pid_t c = fork();
+ if ( c ){
+ cout << "forked process: " << c << endl;
+ ::exit(0);
+ }
+ setsid();
+ setupSignals();
+ }
+#endif
+ if (params.count("logpath")) {
+ string lp = params["logpath"].as<string>();
+ uassert( 10033 , "logpath has to be non-zero" , lp.size() );
+ initLogging( lp , params.count( "logappend" ) );
+ }
+ if (params.count("nocursors")) {
+ useCursors = false;
+ }
+ if (params.count("nohints")) {
+ useHints = false;
+ }
+ if (params.count("nohttpinterface")) {
+ noHttpInterface = true;
+ }
+ if (params.count("noscripting")) {
+ useJNI = false;
+ }
+ if (params.count("noprealloc")) {
+ cmdLine.prealloc = false;
+ }
+ if (params.count("smallfiles")) {
+ cmdLine.smallfiles = true;
+ }
+ if (params.count("diaglog")) {
+ int x = params["diaglog"].as<int>();
+ if ( x < 0 || x > 7 ) {
+ out() << "can't interpret --diaglog setting" << endl;
+ dbexit( EXIT_BADOPTIONS );
+ }
+ _diaglog.level = x;
+ }
+ if (params.count("sysinfo")) {
+ sysRuntimeInfo();
+ return 0;
+ }
+ if (params.count("repair")) {
+ shouldRepairDatabases = 1;
+ forceRepair = 1;
+ }
+ if (params.count("upgrade")) {
+ shouldRepairDatabases = 1;
+ }
+ if (params.count("notablescan")) {
+ cmdLine.notablescan = true;
+ }
+ if (params.count("install")) {
+ installService = true;
+ }
+ if (params.count("remove")) {
+ removeService = true;
+ }
+ if (params.count("service")) {
+ startService = true;
+ }
+ if (params.count("master")) {
+ master = true;
+ }
+ if (params.count("slave")) {
+ slave = SimpleSlave;
+ }
+ if (params.count("autoresync")) {
+ autoresync = true;
+ }
+ if (params.count("source")) {
+ /* specifies what the source in local.sources should be */
+ cmdLine.source = params["source"].as<string>().c_str();
+ }
+ if (params.count("only")) {
+ cmdLine.only = params["only"].as<string>().c_str();
+ }
+ if (params.count("pairwith")) {
+ string paired = params["pairwith"].as<string>();
+ if (params.count("arbiter")) {
+ string arbiter = params["arbiter"].as<string>();
+ pairWith(paired.c_str(), arbiter.c_str());
+ } else {
+ pairWith(paired.c_str(), "-");
+ }
+ } else if (params.count("arbiter")) {
+ uasserted(10999,"specifying --arbiter without --pairwith");
+ }
+ if( params.count("nssize") ) {
+ int x = params["nssize"].as<int>();
+ uassert( 10034 , "bad --nssize arg", x > 0 && x <= (0x7fffffff/1024/1024));
+ lenForNewNsFiles = x * 1024 * 1024;
+ assert(lenForNewNsFiles > 0);
+ }
+ if (params.count("oplogSize")) {
+ long x = params["oplogSize"].as<long>();
+ uassert( 10035 , "bad --oplogSize arg", x > 0);
+ cmdLine.oplogSize = x * 1024 * 1024;
+ assert(cmdLine.oplogSize > 0);
+ }
+ if (params.count("opIdMem")) {
+ long x = params["opIdMem"].as<long>();
+ uassert( 10036 , "bad --opIdMem arg", x > 0);
+ opIdMem = x;
+ assert(opIdMem > 0);
+ }
+ if (params.count("cacheSize")) {
+ long x = params["cacheSize"].as<long>();
+ uassert( 10037 , "bad --cacheSize arg", x > 0);
+ setRecCacheSize(x);
+ }
+ if (params.count("port") == 0 ) {
+ if( params.count("configsvr") ) {
+ cmdLine.port = CmdLine::ConfigServerPort;
+ }
+ if( params.count("shardsvr") )
+ cmdLine.port = CmdLine::ShardServerPort;
+ }
+ if ( params.count("configsvr" ) && params.count( "diaglog" ) == 0 ){
+ _diaglog.level = 1;
+ }
+ if ( params.count( "profile" ) ){
+ cmdLine.defaultProfile = params["profile"].as<int>();
+ }
+ if ( params.count( "maxConns" ) ){
+ int newSize = params["maxConns"].as<int>();
+ uassert( 12507 , "maxConns has to be at least 5" , newSize >= 5 );
+ uassert( 12508 , "maxConns can't be greater than 10000000" , newSize < 10000000 );
+ connTicketHolder.resize( newSize );
+ }
+
+ Module::configAll( params );
+ dataFileSync.go();
+
+ if (params.count("command")) {
+ vector<string> command = params["command"].as< vector<string> >();
+
+ if (command[0].compare("msg") == 0) {
+ const char *m;
+
+ if (command.size() < 3) {
+ cout << "Too few parameters to 'msg' command" << endl;
+ cout << visible_options << endl;
+ return 0;
+ }
+
+ m = command[1].c_str();
+
+ msg(m, "127.0.0.1", atoi(command[2].c_str()));
+ return 0;
+ }
+ if (command[0].compare("run") == 0) {
+ if (command.size() > 1) {
+ cout << "Too many parameters to 'run' command" << endl;
+ cout << visible_options << endl;
+ return 0;
+ }
+
+ initAndListen(cmdLine.port);
+ return 0;
+ }
+
+ if (command[0].compare("dbpath") == 0) {
+ cout << dbpath << endl;
+ return 0;
+ }
+
+ cout << "Invalid command: " << command[0] << endl;
+ cout << visible_options << endl;
+ return 0;
+ }
+
+#if defined(_WIN32)
+ if ( installService ) {
+ if ( !ServiceController::installService( L"MongoDB", L"Mongo DB", L"Mongo DB Server", argc, argv ) )
+ dbexit( EXIT_NTSERVICE_ERROR );
+ dbexit( EXIT_CLEAN );
+ }
+ else if ( removeService ) {
+ if ( !ServiceController::removeService( L"MongoDB" ) )
+ dbexit( EXIT_NTSERVICE_ERROR );
+ dbexit( EXIT_CLEAN );
+ }
+ else if ( startService ) {
+ if ( !ServiceController::startService( L"MongoDB", mongo::initService ) )
+ dbexit( EXIT_NTSERVICE_ERROR );
+ dbexit( EXIT_CLEAN );
+ }
+#endif
+ }
+
+ initAndListen(cmdLine.port, appsrvPath);
+ dbexit(EXIT_CLEAN);
+ return 0;
+}
+
+namespace mongo {
+
+ /* we do not use log() below as it uses a mutex and that could cause deadlocks.
+ */
+
+ string getDbContext();
+
+#undef out
+
+ void exitCleanly() {
+ goingAway = true;
+ killCurrentOp.killAll();
+ {
+ dblock lk;
+ log() << "now exiting" << endl;
+ dbexit( EXIT_KILL );
+ }
+ }
+
+#if !defined(_WIN32)
+
+} // namespace mongo
+
+#include <signal.h>
+#include <string.h>
+
+namespace mongo {
+
+ void pipeSigHandler( int signal ) {
+#ifdef psignal
+ psignal( signal, "Signal Received : ");
+#else
+ cout << "got pipe signal:" << signal << endl;
+#endif
+ }
+
+ void abruptQuit(int x) {
+ ostringstream ossSig;
+ ossSig << "Got signal: " << x << " (" << strsignal( x ) << ")." << endl;
+ rawOut( ossSig.str() );
+
+ /*
+ ostringstream ossOp;
+ ossOp << "Last op: " << currentOp.infoNoauth() << endl;
+ rawOut( ossOp.str() );
+ */
+
+ ostringstream oss;
+ oss << "Backtrace:" << endl;
+ printStackTrace( oss );
+ rawOut( oss.str() );
+ dbexit( EXIT_ABRUBT );
+ }
+
+ sigset_t asyncSignals;
+ // The above signals will be processed by this thread only, in order to
+ // ensure the db and log mutexes aren't held.
+ void interruptThread() {
+ int x;
+ sigwait( &asyncSignals, &x );
+ log() << "got kill or ctrl c signal " << x << " (" << strsignal( x ) << "), will terminate after current cmd ends" << endl;
+ exitCleanly();
+ }
+
+ void setupSignals() {
+ assert( signal(SIGSEGV, abruptQuit) != SIG_ERR );
+ assert( signal(SIGFPE, abruptQuit) != SIG_ERR );
+ assert( signal(SIGABRT, abruptQuit) != SIG_ERR );
+ assert( signal(SIGBUS, abruptQuit) != SIG_ERR );
+ assert( signal(SIGPIPE, pipeSigHandler) != SIG_ERR );
+ assert( signal(SIGUSR1 , rotateLogs ) != SIG_ERR );
+
+ setupSIGTRAPforGDB();
+
+ sigemptyset( &asyncSignals );
+ sigaddset( &asyncSignals, SIGINT );
+ sigaddset( &asyncSignals, SIGTERM );
+ assert( pthread_sigmask( SIG_SETMASK, &asyncSignals, 0 ) == 0 );
+ boost::thread it( interruptThread );
+ }
+
+#else
+void ctrlCTerminate() {
+ log() << "got kill or ctrl c signal, will terminate after current cmd ends" << endl;
+ exitCleanly();
+}
+BOOL CtrlHandler( DWORD fdwCtrlType )
+{
+ switch( fdwCtrlType )
+ {
+ case CTRL_C_EVENT:
+ rawOut("Ctrl-C signal\n");
+ ctrlCTerminate();
+ return( TRUE );
+ case CTRL_CLOSE_EVENT:
+ rawOut("CTRL_CLOSE_EVENT signal\n");
+ ctrlCTerminate();
+ return( TRUE );
+ case CTRL_BREAK_EVENT:
+ rawOut("CTRL_BREAK_EVENT signal\n");
+ ctrlCTerminate();
+ return TRUE;
+ case CTRL_LOGOFF_EVENT:
+ rawOut("CTRL_LOGOFF_EVENT signal (ignored)\n");
+ return FALSE;
+ case CTRL_SHUTDOWN_EVENT:
+ rawOut("CTRL_SHUTDOWN_EVENT signal (ignored)\n");
+ return FALSE;
+ default:
+ return FALSE;
+ }
+}
+
+ void setupSignals() {
+ if( SetConsoleCtrlHandler( (PHANDLER_ROUTINE) CtrlHandler, TRUE ) )
+ ;
+ else
+ massert( 10297 , "Couldn't register Windows Ctrl-C handler", false);
+ }
+#endif
+
+void temptestfoo() {
+ MongoMutex m;
+ m.lock();
+// m.lock_upgrade();
+ m.lock_shared();
+}
+
+
+} // namespace mongo
+
+#include "recstore.h"
+#include "reccache.h"
+
diff --git a/db/db.h b/db/db.h
new file mode 100644
index 0000000..3475f34
--- /dev/null
+++ b/db/db.h
@@ -0,0 +1,219 @@
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+#include "../stdafx.h"
+#include "../util/message.h"
+#include "../util/top.h"
+#include "boost/version.hpp"
+#include "concurrency.h"
+#include "pdfile.h"
+#include "client.h"
+
+namespace mongo {
+
+// void jniCallback(Message& m, Message& out);
+
+ /* Note the limit here is rather arbitrary and is simply a standard. generally the code works
+ with any object that fits in ram.
+
+ Also note that the server has some basic checks to enforce this limit but those checks are not exhaustive
+ for example need to check for size too big after
+ update $push (append) operation
+ various db.eval() type operations
+
+ Note also we sometimes do work with objects slightly larger - an object in the replication local.oplog
+ could be slightly larger.
+ */
+ const int MaxBSONObjectSize = 4 * 1024 * 1024;
+
+ /**
+ * class to hold path + dbname -> Database
+ * might be able to optimizer further
+ */
+ class DatabaseHolder {
+ public:
+ DatabaseHolder() : _size(0){
+ }
+
+ Database * get( const string& ns , const string& path ){
+ dbMutex.assertAtLeastReadLocked();
+ map<string,Database*>& m = _paths[path];
+
+ string db = _todb( ns );
+
+ map<string,Database*>::iterator it = m.find(db);
+ if ( it != m.end() )
+ return it->second;
+ return 0;
+ }
+
+ void put( const string& ns , const string& path , Database * db ){
+ dbMutex.assertWriteLocked();
+ map<string,Database*>& m = _paths[path];
+ Database*& d = m[_todb(ns)];
+ if ( ! d )
+ _size++;
+ d = db;
+ }
+
+ void erase( const string& ns , const string& path ){
+ dbMutex.assertWriteLocked();
+ map<string,Database*>& m = _paths[path];
+ _size -= m.erase( _todb( ns ) );
+ }
+
+ bool closeAll( const string& path , BSONObjBuilder& result );
+
+ int size(){
+ return _size;
+ }
+
+ /**
+ * gets all unique db names, ignoring paths
+ */
+ void getAllShortNames( set<string>& all ) const{
+ dbMutex.assertAtLeastReadLocked();
+ for ( map<string, map<string,Database*> >::const_iterator i=_paths.begin(); i!=_paths.end(); i++ ){
+ map<string,Database*> m = i->second;
+ for( map<string,Database*>::const_iterator j=m.begin(); j!=m.end(); j++ ){
+ all.insert( j->first );
+ }
+ }
+ }
+
+ private:
+
+ string _todb( const string& ns ){
+ size_t i = ns.find( '.' );
+ if ( i == string::npos )
+ return ns;
+ return ns.substr( 0 , i );
+ }
+
+ map<string, map<string,Database*> > _paths;
+ int _size;
+
+ };
+
+ extern DatabaseHolder dbHolder;
+
+ /* returns true if the database ("database") did not exist, and it was created on this call
+ path - datafiles directory, if not the default, so we can differentiate between db's of the same
+ name in different places (for example temp ones on repair).
+ */
+ inline bool setClient(const char *ns, const string& path , mongolock *lock ) {
+ if( logLevel > 5 )
+ log() << "setClient: " << ns << endl;
+
+ dbMutex.assertAtLeastReadLocked();
+
+ Client& c = cc();
+ c.top.clientStart( ns );
+
+ Database * db = dbHolder.get( ns , path );
+ if ( db ){
+ c.setns(ns, db );
+ return false;
+ }
+
+ if( lock )
+ lock->releaseAndWriteLock();
+
+ assertInWriteLock();
+
+ char cl[256];
+ nsToDatabase(ns, cl);
+ bool justCreated;
+ Database *newdb = new Database(cl, justCreated, path);
+ dbHolder.put(ns,path,newdb);
+ c.setns(ns, newdb);
+
+ newdb->finishInit();
+
+ return justCreated;
+ }
+
+ // shared functionality for removing references to a database from this program instance
+ // does not delete the files on disk
+ void closeDatabase( const char *cl, const string& path = dbpath );
+
+ struct dbtemprelease {
+ string clientname;
+ string clientpath;
+ int locktype;
+ dbtemprelease() {
+ Client& client = cc();
+ Database *database = client.database();
+ if ( database ) {
+ clientname = database->name;
+ clientpath = database->path;
+ }
+ client.top.clientStop();
+ locktype = dbMutex.getState();
+ assert( locktype );
+ if ( locktype > 0 ) {
+ massert( 10298 , "can't temprelease nested write lock", locktype == 1);
+ dbMutex.unlock();
+ }
+ else {
+ massert( 10299 , "can't temprelease nested read lock", locktype == -1);
+ dbMutex.unlock_shared();
+ }
+ }
+ ~dbtemprelease() {
+ if ( locktype > 0 )
+ dbMutex.lock();
+ else
+ dbMutex.lock_shared();
+ if ( clientname.empty() )
+ cc().setns("", 0);
+ else
+ setClient(clientname.c_str(), clientpath.c_str());
+ }
+ };
+
+ /**
+ only does a temp release if we're not nested and have a lock
+ */
+ struct dbtempreleasecond {
+ dbtemprelease * real;
+ int locktype;
+
+ dbtempreleasecond(){
+ real = 0;
+ locktype = dbMutex.getState();
+ if ( locktype == 1 || locktype == -1 )
+ real = new dbtemprelease();
+ }
+
+ ~dbtempreleasecond(){
+ if ( real ){
+ delete real;
+ real = 0;
+ }
+ }
+
+ };
+
+ extern TicketHolder connTicketHolder;
+
+
+} // namespace mongo
+
+//#include "dbinfo.h"
+#include "concurrency.h"
diff --git a/db/db.rc b/db/db.rc
new file mode 100644
index 0000000..fbfd379
--- /dev/null
+++ b/db/db.rc
@@ -0,0 +1,61 @@
+// Microsoft Visual C++ generated resource script.
+//
+#include "resource.h"
+
+#define APSTUDIO_READONLY_SYMBOLS
+/////////////////////////////////////////////////////////////////////////////
+//
+// Generated from the TEXTINCLUDE 2 resource.
+//
+// #include "afxres.h"
+
+/////////////////////////////////////////////////////////////////////////////
+#undef APSTUDIO_READONLY_SYMBOLS
+
+/////////////////////////////////////////////////////////////////////////////
+// English (U.S.) resources
+
+#if !defined(AFX_RESOURCE_DLL) || defined(AFX_TARG_ENU)
+LANGUAGE 9, 1
+#pragma code_page(1252)
+
+#ifdef APSTUDIO_INVOKED
+/////////////////////////////////////////////////////////////////////////////
+//
+// TEXTINCLUDE
+//
+
+1 TEXTINCLUDE
+BEGIN
+ "resource.h\0"
+END
+
+2 TEXTINCLUDE
+BEGIN
+ "#include ""afxres.h""\r\n"
+ "\0"
+END
+
+3 TEXTINCLUDE
+BEGIN
+ "\r\n"
+ "\0"
+END
+
+#endif // APSTUDIO_INVOKED
+
+#endif // English (U.S.) resources
+/////////////////////////////////////////////////////////////////////////////
+
+
+
+#ifndef APSTUDIO_INVOKED
+/////////////////////////////////////////////////////////////////////////////
+//
+// Generated from the TEXTINCLUDE 3 resource.
+//
+
+
+/////////////////////////////////////////////////////////////////////////////
+#endif // not APSTUDIO_INVOKED
+
diff --git a/db/db.sln b/db/db.sln
new file mode 100644
index 0000000..35fd85f
--- /dev/null
+++ b/db/db.sln
@@ -0,0 +1,57 @@
+
+Microsoft Visual Studio Solution File, Format Version 10.00
+# Visual Studio 2008
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "mongod", "db.vcproj", "{215B2D68-0A70-4D10-8E75-B31010C62A91}"
+EndProject
+Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "examples", "examples", "{4082881B-EB00-486F-906C-843B8EC06E18}"
+EndProject
+Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "dbtests", "dbtests", "{C72EBEDD-342D-4371-8B0D-D7505902FA69}"
+ ProjectSection(SolutionItems) = preProject
+ ..\dbtests\btreetests.cpp = ..\dbtests\btreetests.cpp
+ EndProjectSection
+EndProject
+Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "shell", "shell", "{2CABB3B8-C9A6-478D-9463-0B37799ED708}"
+EndProject
+Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "tools", "tools", "{2B262D59-9DC7-4BF1-A431-1BD4966899A5}"
+ ProjectSection(SolutionItems) = preProject
+ ..\tools\bridge.cpp = ..\tools\bridge.cpp
+ ..\tools\export.cpp = ..\tools\export.cpp
+ ..\tools\files.cpp = ..\tools\files.cpp
+ ..\tools\sniffer.cpp = ..\tools\sniffer.cpp
+ ..\tools\tool.cpp = ..\tools\tool.cpp
+ EndProjectSection
+EndProject
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "mongos", "..\s\dbgrid.vcproj", "{E03717ED-69B4-4D21-BC55-DF6690B585C6}"
+EndProject
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "test", "..\dbtests\test.vcproj", "{215B2D68-0A70-4D10-8E75-B33010C62A91}"
+EndProject
+Global
+ GlobalSection(SolutionConfigurationPlatforms) = preSolution
+ Debug Recstore|Win32 = Debug Recstore|Win32
+ Debug|Win32 = Debug|Win32
+ Release|Win32 = Release|Win32
+ EndGlobalSection
+ GlobalSection(ProjectConfigurationPlatforms) = postSolution
+ {215B2D68-0A70-4D10-8E75-B31010C62A91}.Debug Recstore|Win32.ActiveCfg = Debug Recstore|Win32
+ {215B2D68-0A70-4D10-8E75-B31010C62A91}.Debug Recstore|Win32.Build.0 = Debug Recstore|Win32
+ {215B2D68-0A70-4D10-8E75-B31010C62A91}.Debug|Win32.ActiveCfg = Debug|Win32
+ {215B2D68-0A70-4D10-8E75-B31010C62A91}.Debug|Win32.Build.0 = Debug|Win32
+ {215B2D68-0A70-4D10-8E75-B31010C62A91}.Release|Win32.ActiveCfg = Release|Win32
+ {215B2D68-0A70-4D10-8E75-B31010C62A91}.Release|Win32.Build.0 = Release|Win32
+ {E03717ED-69B4-4D21-BC55-DF6690B585C6}.Debug Recstore|Win32.ActiveCfg = Debug Recstore|Win32
+ {E03717ED-69B4-4D21-BC55-DF6690B585C6}.Debug Recstore|Win32.Build.0 = Debug Recstore|Win32
+ {E03717ED-69B4-4D21-BC55-DF6690B585C6}.Debug|Win32.ActiveCfg = Debug|Win32
+ {E03717ED-69B4-4D21-BC55-DF6690B585C6}.Debug|Win32.Build.0 = Debug|Win32
+ {E03717ED-69B4-4D21-BC55-DF6690B585C6}.Release|Win32.ActiveCfg = Release|Win32
+ {E03717ED-69B4-4D21-BC55-DF6690B585C6}.Release|Win32.Build.0 = Release|Win32
+ {215B2D68-0A70-4D10-8E75-B33010C62A91}.Debug Recstore|Win32.ActiveCfg = Debug Recstore|Win32
+ {215B2D68-0A70-4D10-8E75-B33010C62A91}.Debug Recstore|Win32.Build.0 = Debug Recstore|Win32
+ {215B2D68-0A70-4D10-8E75-B33010C62A91}.Debug|Win32.ActiveCfg = Debug|Win32
+ {215B2D68-0A70-4D10-8E75-B33010C62A91}.Debug|Win32.Build.0 = Debug|Win32
+ {215B2D68-0A70-4D10-8E75-B33010C62A91}.Release|Win32.ActiveCfg = Release|Win32
+ {215B2D68-0A70-4D10-8E75-B33010C62A91}.Release|Win32.Build.0 = Release|Win32
+ EndGlobalSection
+ GlobalSection(SolutionProperties) = preSolution
+ HideSolutionNode = FALSE
+ EndGlobalSection
+EndGlobal
diff --git a/db/db.vcproj b/db/db.vcproj
new file mode 100644
index 0000000..6dc0aae
--- /dev/null
+++ b/db/db.vcproj
@@ -0,0 +1,1891 @@
+<?xml version="1.0" encoding="Windows-1252"?>
+<VisualStudioProject
+ ProjectType="Visual C++"
+ Version="9.00"
+ Name="mongod"
+ ProjectGUID="{215B2D68-0A70-4D10-8E75-B31010C62A91}"
+ RootNamespace="db"
+ Keyword="Win32Proj"
+ TargetFrameworkVersion="196613"
+ >
+ <Platforms>
+ <Platform
+ Name="Win32"
+ />
+ </Platforms>
+ <ToolFiles>
+ </ToolFiles>
+ <Configurations>
+ <Configuration
+ Name="Debug|Win32"
+ OutputDirectory="$(SolutionDir)$(ConfigurationName)"
+ IntermediateDirectory="$(ConfigurationName)"
+ ConfigurationType="1"
+ UseOfMFC="0"
+ UseOfATL="0"
+ CharacterSet="1"
+ >
+ <Tool
+ Name="VCPreBuildEventTool"
+ />
+ <Tool
+ Name="VCCustomBuildTool"
+ />
+ <Tool
+ Name="VCXMLDataGeneratorTool"
+ />
+ <Tool
+ Name="VCWebServiceProxyGeneratorTool"
+ />
+ <Tool
+ Name="VCMIDLTool"
+ />
+ <Tool
+ Name="VCCLCompilerTool"
+ Optimization="0"
+ AdditionalIncludeDirectories="..\..\js\src;&quot;..\pcre-7.4&quot;;&quot;c:\Program Files\boost\boost_1_35_0&quot;"
+ PreprocessorDefinitions="OLDJS;STATIC_JS_API;XP_WIN;WIN32;_DEBUG;_CONSOLE;_CRT_SECURE_NO_WARNINGS;HAVE_CONFIG_H;PCRE_STATIC"
+ MinimalRebuild="true"
+ BasicRuntimeChecks="3"
+ RuntimeLibrary="3"
+ UsePrecompiledHeader="2"
+ WarningLevel="3"
+ Detect64BitPortabilityProblems="false"
+ DebugInformationFormat="4"
+ DisableSpecificWarnings="4355;4800"
+ />
+ <Tool
+ Name="VCManagedResourceCompilerTool"
+ />
+ <Tool
+ Name="VCResourceCompilerTool"
+ />
+ <Tool
+ Name="VCPreLinkEventTool"
+ />
+ <Tool
+ Name="VCLinkerTool"
+ AdditionalDependencies="ws2_32.lib Psapi.lib"
+ LinkIncremental="2"
+ AdditionalLibraryDirectories="&quot;c:\Program Files\boost\boost_1_35_0\lib&quot;"
+ IgnoreAllDefaultLibraries="false"
+ IgnoreDefaultLibraryNames=""
+ GenerateDebugInformation="true"
+ SubSystem="1"
+ TargetMachine="1"
+ />
+ <Tool
+ Name="VCALinkTool"
+ />
+ <Tool
+ Name="VCManifestTool"
+ />
+ <Tool
+ Name="VCXDCMakeTool"
+ />
+ <Tool
+ Name="VCBscMakeTool"
+ />
+ <Tool
+ Name="VCFxCopTool"
+ />
+ <Tool
+ Name="VCAppVerifierTool"
+ />
+ <Tool
+ Name="VCPostBuildEventTool"
+ />
+ </Configuration>
+ <Configuration
+ Name="Release|Win32"
+ OutputDirectory="$(SolutionDir)$(ConfigurationName)"
+ IntermediateDirectory="$(ConfigurationName)"
+ ConfigurationType="1"
+ CharacterSet="1"
+ WholeProgramOptimization="1"
+ >
+ <Tool
+ Name="VCPreBuildEventTool"
+ />
+ <Tool
+ Name="VCCustomBuildTool"
+ />
+ <Tool
+ Name="VCXMLDataGeneratorTool"
+ />
+ <Tool
+ Name="VCWebServiceProxyGeneratorTool"
+ />
+ <Tool
+ Name="VCMIDLTool"
+ />
+ <Tool
+ Name="VCCLCompilerTool"
+ Optimization="2"
+ EnableIntrinsicFunctions="true"
+ AdditionalIncludeDirectories="..\..\js\src;&quot;..\pcre-7.4&quot;;&quot;c:\Program Files\boost\boost_1_35_0&quot;"
+ PreprocessorDefinitions="OLDJS;STATIC_JS_API;XP_WIN;WIN32;NDEBUG;_CONSOLE;_CRT_SECURE_NO_WARNINGS;HAVE_CONFIG_H;PCRE_STATIC"
+ RuntimeLibrary="0"
+ EnableFunctionLevelLinking="true"
+ UsePrecompiledHeader="2"
+ PrecompiledHeaderThrough="stdafx.h"
+ WarningLevel="3"
+ DebugInformationFormat="3"
+ DisableSpecificWarnings="4355;4800"
+ />
+ <Tool
+ Name="VCManagedResourceCompilerTool"
+ />
+ <Tool
+ Name="VCResourceCompilerTool"
+ />
+ <Tool
+ Name="VCPreLinkEventTool"
+ />
+ <Tool
+ Name="VCLinkerTool"
+ AdditionalDependencies="ws2_32.lib"
+ LinkIncremental="1"
+ AdditionalLibraryDirectories="&quot;c:\program files\boost\boost_1_35_0\lib&quot;"
+ GenerateDebugInformation="true"
+ SubSystem="1"
+ OptimizeReferences="2"
+ EnableCOMDATFolding="2"
+ TargetMachine="1"
+ />
+ <Tool
+ Name="VCALinkTool"
+ />
+ <Tool
+ Name="VCManifestTool"
+ />
+ <Tool
+ Name="VCXDCMakeTool"
+ />
+ <Tool
+ Name="VCBscMakeTool"
+ />
+ <Tool
+ Name="VCFxCopTool"
+ />
+ <Tool
+ Name="VCAppVerifierTool"
+ />
+ <Tool
+ Name="VCPostBuildEventTool"
+ />
+ </Configuration>
+ <Configuration
+ Name="release_nojni|Win32"
+ OutputDirectory="$(SolutionDir)$(ConfigurationName)"
+ IntermediateDirectory="$(ConfigurationName)"
+ ConfigurationType="1"
+ CharacterSet="1"
+ WholeProgramOptimization="1"
+ >
+ <Tool
+ Name="VCPreBuildEventTool"
+ />
+ <Tool
+ Name="VCCustomBuildTool"
+ />
+ <Tool
+ Name="VCXMLDataGeneratorTool"
+ />
+ <Tool
+ Name="VCWebServiceProxyGeneratorTool"
+ />
+ <Tool
+ Name="VCMIDLTool"
+ />
+ <Tool
+ Name="VCCLCompilerTool"
+ Optimization="2"
+ EnableIntrinsicFunctions="true"
+ AdditionalIncludeDirectories="&quot;..\pcre-7.4&quot;;&quot;c:\Program Files\boost\boost_1_35_0&quot;;&quot;c:\program files\java\jdk\include&quot;;&quot;c:\program files\java\jdk\include\win32&quot;"
+ PreprocessorDefinitions="NOJNI;WIN32;NDEBUG;_CONSOLE;_CRT_SECURE_NO_WARNINGS;HAVE_CONFIG_H;PCRE_STATIC"
+ RuntimeLibrary="2"
+ EnableFunctionLevelLinking="true"
+ UsePrecompiledHeader="2"
+ PrecompiledHeaderThrough="stdafx.h"
+ WarningLevel="3"
+ DebugInformationFormat="3"
+ DisableSpecificWarnings="4355;4800"
+ />
+ <Tool
+ Name="VCManagedResourceCompilerTool"
+ />
+ <Tool
+ Name="VCResourceCompilerTool"
+ />
+ <Tool
+ Name="VCPreLinkEventTool"
+ />
+ <Tool
+ Name="VCLinkerTool"
+ AdditionalDependencies="ws2_32.lib"
+ LinkIncremental="1"
+ AdditionalLibraryDirectories="&quot;c:\program files\boost\boost_1_35_0\lib&quot;"
+ GenerateDebugInformation="true"
+ SubSystem="1"
+ OptimizeReferences="2"
+ EnableCOMDATFolding="2"
+ TargetMachine="1"
+ />
+ <Tool
+ Name="VCALinkTool"
+ />
+ <Tool
+ Name="VCManifestTool"
+ />
+ <Tool
+ Name="VCXDCMakeTool"
+ />
+ <Tool
+ Name="VCBscMakeTool"
+ />
+ <Tool
+ Name="VCFxCopTool"
+ />
+ <Tool
+ Name="VCAppVerifierTool"
+ />
+ <Tool
+ Name="VCPostBuildEventTool"
+ />
+ </Configuration>
+ <Configuration
+ Name="Debug Recstore|Win32"
+ OutputDirectory="$(SolutionDir)$(ConfigurationName)"
+ IntermediateDirectory="$(ConfigurationName)"
+ ConfigurationType="1"
+ UseOfMFC="0"
+ UseOfATL="0"
+ CharacterSet="1"
+ >
+ <Tool
+ Name="VCPreBuildEventTool"
+ />
+ <Tool
+ Name="VCCustomBuildTool"
+ />
+ <Tool
+ Name="VCXMLDataGeneratorTool"
+ />
+ <Tool
+ Name="VCWebServiceProxyGeneratorTool"
+ />
+ <Tool
+ Name="VCMIDLTool"
+ />
+ <Tool
+ Name="VCCLCompilerTool"
+ Optimization="0"
+ AdditionalIncludeDirectories="..\..\js\src;&quot;..\pcre-7.4&quot;;&quot;c:\Program Files\boost\boost_1_35_0&quot;"
+ PreprocessorDefinitions="_RECSTORE;OLDJS;STATIC_JS_API;XP_WIN;WIN32;_DEBUG;_CONSOLE;_CRT_SECURE_NO_WARNINGS;HAVE_CONFIG_H;PCRE_STATIC"
+ MinimalRebuild="true"
+ BasicRuntimeChecks="3"
+ RuntimeLibrary="3"
+ UsePrecompiledHeader="2"
+ WarningLevel="3"
+ Detect64BitPortabilityProblems="false"
+ DebugInformationFormat="4"
+ DisableSpecificWarnings="4355;4800"
+ />
+ <Tool
+ Name="VCManagedResourceCompilerTool"
+ />
+ <Tool
+ Name="VCResourceCompilerTool"
+ />
+ <Tool
+ Name="VCPreLinkEventTool"
+ />
+ <Tool
+ Name="VCLinkerTool"
+ AdditionalDependencies="ws2_32.lib"
+ LinkIncremental="2"
+ AdditionalLibraryDirectories="&quot;c:\Program Files\boost\boost_1_35_0\lib&quot;"
+ IgnoreAllDefaultLibraries="false"
+ IgnoreDefaultLibraryNames=""
+ GenerateDebugInformation="true"
+ SubSystem="1"
+ TargetMachine="1"
+ />
+ <Tool
+ Name="VCALinkTool"
+ />
+ <Tool
+ Name="VCManifestTool"
+ />
+ <Tool
+ Name="VCXDCMakeTool"
+ />
+ <Tool
+ Name="VCBscMakeTool"
+ />
+ <Tool
+ Name="VCFxCopTool"
+ />
+ <Tool
+ Name="VCAppVerifierTool"
+ />
+ <Tool
+ Name="VCPostBuildEventTool"
+ />
+ </Configuration>
+ </Configurations>
+ <References>
+ </References>
+ <Files>
+ <Filter
+ Name="misc and third party"
+ >
+ <File
+ RelativePath="..\..\boostw\boost_1_34_1\boost\config\auto_link.hpp"
+ >
+ </File>
+ <File
+ RelativePath=".\db.rc"
+ >
+ </File>
+ <File
+ RelativePath="..\..\js\js\Debug\js.lib"
+ >
+ <FileConfiguration
+ Name="Release|Win32"
+ ExcludedFromBuild="true"
+ >
+ <Tool
+ Name="VCCustomBuildTool"
+ />
+ </FileConfiguration>
+ </File>
+ <File
+ RelativePath="..\..\js\js\Release\js.lib"
+ >
+ <FileConfiguration
+ Name="Debug|Win32"
+ ExcludedFromBuild="true"
+ >
+ <Tool
+ Name="VCCustomBuildTool"
+ />
+ </FileConfiguration>
+ <FileConfiguration
+ Name="Debug Recstore|Win32"
+ ExcludedFromBuild="true"
+ >
+ <Tool
+ Name="VCCustomBuildTool"
+ />
+ </FileConfiguration>
+ </File>
+ <File
+ RelativePath="C:\Program Files\Java\jdk\lib\jvm.lib"
+ >
+ <FileConfiguration
+ Name="release_nojni|Win32"
+ ExcludedFromBuild="true"
+ >
+ <Tool
+ Name="VCCustomBuildTool"
+ />
+ </FileConfiguration>
+ </File>
+ <File
+ RelativePath="..\pcre-7.4\pcrecpp.cc"
+ >
+ <FileConfiguration
+ Name="Debug|Win32"
+ >
+ <Tool
+ Name="VCCLCompilerTool"
+ UsePrecompiledHeader="0"
+ />
+ </FileConfiguration>
+ <FileConfiguration
+ Name="Release|Win32"
+ >
+ <Tool
+ Name="VCCLCompilerTool"
+ UsePrecompiledHeader="0"
+ />
+ </FileConfiguration>
+ <FileConfiguration
+ Name="release_nojni|Win32"
+ >
+ <Tool
+ Name="VCCLCompilerTool"
+ UsePrecompiledHeader="0"
+ />
+ </FileConfiguration>
+ <FileConfiguration
+ Name="Debug Recstore|Win32"
+ >
+ <Tool
+ Name="VCCLCompilerTool"
+ UsePrecompiledHeader="0"
+ />
+ </FileConfiguration>
+ </File>
+ <File
+ RelativePath="..\pcre-7.4\pcrecpp.h"
+ >
+ </File>
+ <File
+ RelativePath="..\SConstruct"
+ >
+ </File>
+ <File
+ RelativePath="..\targetver.h"
+ >
+ </File>
+ <File
+ RelativePath="..\..\boostw\boost_1_34_1\boost\version.hpp"
+ >
+ </File>
+ <Filter
+ Name="pcre"
+ >
+ <File
+ RelativePath="..\pcre-7.4\config.h"
+ >
+ </File>
+ <File
+ RelativePath="..\pcre-7.4\pcre.h"
+ >
+ </File>
+ <File
+ RelativePath="..\pcre-7.4\pcre_chartables.c"
+ >
+ <FileConfiguration
+ Name="Debug|Win32"
+ >
+ <Tool
+ Name="VCCLCompilerTool"
+ UsePrecompiledHeader="0"
+ />
+ </FileConfiguration>
+ <FileConfiguration
+ Name="Release|Win32"
+ >
+ <Tool
+ Name="VCCLCompilerTool"
+ UsePrecompiledHeader="0"
+ />
+ </FileConfiguration>
+ <FileConfiguration
+ Name="release_nojni|Win32"
+ >
+ <Tool
+ Name="VCCLCompilerTool"
+ UsePrecompiledHeader="0"
+ />
+ </FileConfiguration>
+ <FileConfiguration
+ Name="Debug Recstore|Win32"
+ >
+ <Tool
+ Name="VCCLCompilerTool"
+ UsePrecompiledHeader="0"
+ />
+ </FileConfiguration>
+ </File>
+ <File
+ RelativePath="..\pcre-7.4\pcre_compile.c"
+ >
+ <FileConfiguration
+ Name="Debug|Win32"
+ >
+ <Tool
+ Name="VCCLCompilerTool"
+ UsePrecompiledHeader="0"
+ />
+ </FileConfiguration>
+ <FileConfiguration
+ Name="Release|Win32"
+ >
+ <Tool
+ Name="VCCLCompilerTool"
+ UsePrecompiledHeader="0"
+ />
+ </FileConfiguration>
+ <FileConfiguration
+ Name="release_nojni|Win32"
+ >
+ <Tool
+ Name="VCCLCompilerTool"
+ UsePrecompiledHeader="0"
+ />
+ </FileConfiguration>
+ <FileConfiguration
+ Name="Debug Recstore|Win32"
+ >
+ <Tool
+ Name="VCCLCompilerTool"
+ UsePrecompiledHeader="0"
+ />
+ </FileConfiguration>
+ </File>
+ <File
+ RelativePath="..\pcre-7.4\pcre_config.c"
+ >
+ <FileConfiguration
+ Name="Debug|Win32"
+ >
+ <Tool
+ Name="VCCLCompilerTool"
+ UsePrecompiledHeader="0"
+ />
+ </FileConfiguration>
+ <FileConfiguration
+ Name="Release|Win32"
+ >
+ <Tool
+ Name="VCCLCompilerTool"
+ UsePrecompiledHeader="0"
+ />
+ </FileConfiguration>
+ <FileConfiguration
+ Name="release_nojni|Win32"
+ >
+ <Tool
+ Name="VCCLCompilerTool"
+ UsePrecompiledHeader="0"
+ />
+ </FileConfiguration>
+ <FileConfiguration
+ Name="Debug Recstore|Win32"
+ >
+ <Tool
+ Name="VCCLCompilerTool"
+ UsePrecompiledHeader="0"
+ />
+ </FileConfiguration>
+ </File>
+ <File
+ RelativePath="..\pcre-7.4\pcre_dfa_exec.c"
+ >
+ <FileConfiguration
+ Name="Debug|Win32"
+ >
+ <Tool
+ Name="VCCLCompilerTool"
+ UsePrecompiledHeader="0"
+ />
+ </FileConfiguration>
+ <FileConfiguration
+ Name="Release|Win32"
+ >
+ <Tool
+ Name="VCCLCompilerTool"
+ UsePrecompiledHeader="0"
+ />
+ </FileConfiguration>
+ <FileConfiguration
+ Name="release_nojni|Win32"
+ >
+ <Tool
+ Name="VCCLCompilerTool"
+ UsePrecompiledHeader="0"
+ />
+ </FileConfiguration>
+ <FileConfiguration
+ Name="Debug Recstore|Win32"
+ >
+ <Tool
+ Name="VCCLCompilerTool"
+ UsePrecompiledHeader="0"
+ />
+ </FileConfiguration>
+ </File>
+ <File
+ RelativePath="..\pcre-7.4\pcre_exec.c"
+ >
+ <FileConfiguration
+ Name="Debug|Win32"
+ >
+ <Tool
+ Name="VCCLCompilerTool"
+ UsePrecompiledHeader="0"
+ />
+ </FileConfiguration>
+ <FileConfiguration
+ Name="Release|Win32"
+ >
+ <Tool
+ Name="VCCLCompilerTool"
+ UsePrecompiledHeader="0"
+ />
+ </FileConfiguration>
+ <FileConfiguration
+ Name="release_nojni|Win32"
+ >
+ <Tool
+ Name="VCCLCompilerTool"
+ UsePrecompiledHeader="0"
+ />
+ </FileConfiguration>
+ <FileConfiguration
+ Name="Debug Recstore|Win32"
+ >
+ <Tool
+ Name="VCCLCompilerTool"
+ UsePrecompiledHeader="0"
+ />
+ </FileConfiguration>
+ </File>
+ <File
+ RelativePath="..\pcre-7.4\pcre_fullinfo.c"
+ >
+ <FileConfiguration
+ Name="Debug|Win32"
+ >
+ <Tool
+ Name="VCCLCompilerTool"
+ UsePrecompiledHeader="0"
+ />
+ </FileConfiguration>
+ <FileConfiguration
+ Name="Release|Win32"
+ >
+ <Tool
+ Name="VCCLCompilerTool"
+ UsePrecompiledHeader="0"
+ />
+ </FileConfiguration>
+ <FileConfiguration
+ Name="release_nojni|Win32"
+ >
+ <Tool
+ Name="VCCLCompilerTool"
+ UsePrecompiledHeader="0"
+ />
+ </FileConfiguration>
+ <FileConfiguration
+ Name="Debug Recstore|Win32"
+ >
+ <Tool
+ Name="VCCLCompilerTool"
+ UsePrecompiledHeader="0"
+ />
+ </FileConfiguration>
+ </File>
+ <File
+ RelativePath="..\pcre-7.4\pcre_get.c"
+ >
+ <FileConfiguration
+ Name="Debug|Win32"
+ >
+ <Tool
+ Name="VCCLCompilerTool"
+ UsePrecompiledHeader="0"
+ />
+ </FileConfiguration>
+ <FileConfiguration
+ Name="Release|Win32"
+ >
+ <Tool
+ Name="VCCLCompilerTool"
+ UsePrecompiledHeader="0"
+ />
+ </FileConfiguration>
+ <FileConfiguration
+ Name="release_nojni|Win32"
+ >
+ <Tool
+ Name="VCCLCompilerTool"
+ UsePrecompiledHeader="0"
+ />
+ </FileConfiguration>
+ <FileConfiguration
+ Name="Debug Recstore|Win32"
+ >
+ <Tool
+ Name="VCCLCompilerTool"
+ UsePrecompiledHeader="0"
+ />
+ </FileConfiguration>
+ </File>
+ <File
+ RelativePath="..\pcre-7.4\pcre_globals.c"
+ >
+ <FileConfiguration
+ Name="Debug|Win32"
+ >
+ <Tool
+ Name="VCCLCompilerTool"
+ UsePrecompiledHeader="0"
+ />
+ </FileConfiguration>
+ <FileConfiguration
+ Name="Release|Win32"
+ >
+ <Tool
+ Name="VCCLCompilerTool"
+ UsePrecompiledHeader="0"
+ />
+ </FileConfiguration>
+ <FileConfiguration
+ Name="release_nojni|Win32"
+ >
+ <Tool
+ Name="VCCLCompilerTool"
+ UsePrecompiledHeader="0"
+ />
+ </FileConfiguration>
+ <FileConfiguration
+ Name="Debug Recstore|Win32"
+ >
+ <Tool
+ Name="VCCLCompilerTool"
+ UsePrecompiledHeader="0"
+ />
+ </FileConfiguration>
+ </File>
+ <File
+ RelativePath="..\pcre-7.4\pcre_info.c"
+ >
+ <FileConfiguration
+ Name="Debug|Win32"
+ >
+ <Tool
+ Name="VCCLCompilerTool"
+ UsePrecompiledHeader="0"
+ />
+ </FileConfiguration>
+ <FileConfiguration
+ Name="Release|Win32"
+ >
+ <Tool
+ Name="VCCLCompilerTool"
+ UsePrecompiledHeader="0"
+ />
+ </FileConfiguration>
+ <FileConfiguration
+ Name="release_nojni|Win32"
+ >
+ <Tool
+ Name="VCCLCompilerTool"
+ UsePrecompiledHeader="0"
+ />
+ </FileConfiguration>
+ <FileConfiguration
+ Name="Debug Recstore|Win32"
+ >
+ <Tool
+ Name="VCCLCompilerTool"
+ UsePrecompiledHeader="0"
+ />
+ </FileConfiguration>
+ </File>
+ <File
+ RelativePath="..\pcre-7.4\pcre_maketables.c"
+ >
+ <FileConfiguration
+ Name="Debug|Win32"
+ >
+ <Tool
+ Name="VCCLCompilerTool"
+ UsePrecompiledHeader="0"
+ />
+ </FileConfiguration>
+ <FileConfiguration
+ Name="Release|Win32"
+ >
+ <Tool
+ Name="VCCLCompilerTool"
+ UsePrecompiledHeader="0"
+ />
+ </FileConfiguration>
+ <FileConfiguration
+ Name="release_nojni|Win32"
+ >
+ <Tool
+ Name="VCCLCompilerTool"
+ UsePrecompiledHeader="0"
+ />
+ </FileConfiguration>
+ <FileConfiguration
+ Name="Debug Recstore|Win32"
+ >
+ <Tool
+ Name="VCCLCompilerTool"
+ UsePrecompiledHeader="0"
+ />
+ </FileConfiguration>
+ </File>
+ <File
+ RelativePath="..\pcre-7.4\pcre_newline.c"
+ >
+ <FileConfiguration
+ Name="Debug|Win32"
+ >
+ <Tool
+ Name="VCCLCompilerTool"
+ UsePrecompiledHeader="0"
+ />
+ </FileConfiguration>
+ <FileConfiguration
+ Name="Release|Win32"
+ >
+ <Tool
+ Name="VCCLCompilerTool"
+ UsePrecompiledHeader="0"
+ />
+ </FileConfiguration>
+ <FileConfiguration
+ Name="release_nojni|Win32"
+ >
+ <Tool
+ Name="VCCLCompilerTool"
+ UsePrecompiledHeader="0"
+ />
+ </FileConfiguration>
+ <FileConfiguration
+ Name="Debug Recstore|Win32"
+ >
+ <Tool
+ Name="VCCLCompilerTool"
+ UsePrecompiledHeader="0"
+ />
+ </FileConfiguration>
+ </File>
+ <File
+ RelativePath="..\pcre-7.4\pcre_ord2utf8.c"
+ >
+ <FileConfiguration
+ Name="Debug|Win32"
+ >
+ <Tool
+ Name="VCCLCompilerTool"
+ UsePrecompiledHeader="0"
+ />
+ </FileConfiguration>
+ <FileConfiguration
+ Name="Release|Win32"
+ >
+ <Tool
+ Name="VCCLCompilerTool"
+ UsePrecompiledHeader="0"
+ />
+ </FileConfiguration>
+ <FileConfiguration
+ Name="release_nojni|Win32"
+ >
+ <Tool
+ Name="VCCLCompilerTool"
+ UsePrecompiledHeader="0"
+ />
+ </FileConfiguration>
+ <FileConfiguration
+ Name="Debug Recstore|Win32"
+ >
+ <Tool
+ Name="VCCLCompilerTool"
+ UsePrecompiledHeader="0"
+ />
+ </FileConfiguration>
+ </File>
+ <File
+ RelativePath="..\pcre-7.4\pcre_refcount.c"
+ >
+ <FileConfiguration
+ Name="Debug|Win32"
+ >
+ <Tool
+ Name="VCCLCompilerTool"
+ UsePrecompiledHeader="0"
+ />
+ </FileConfiguration>
+ <FileConfiguration
+ Name="Release|Win32"
+ >
+ <Tool
+ Name="VCCLCompilerTool"
+ UsePrecompiledHeader="0"
+ />
+ </FileConfiguration>
+ <FileConfiguration
+ Name="release_nojni|Win32"
+ >
+ <Tool
+ Name="VCCLCompilerTool"
+ UsePrecompiledHeader="0"
+ />
+ </FileConfiguration>
+ <FileConfiguration
+ Name="Debug Recstore|Win32"
+ >
+ <Tool
+ Name="VCCLCompilerTool"
+ UsePrecompiledHeader="0"
+ />
+ </FileConfiguration>
+ </File>
+ <File
+ RelativePath="..\pcre-7.4\pcre_scanner.cc"
+ >
+ <FileConfiguration
+ Name="Debug|Win32"
+ >
+ <Tool
+ Name="VCCLCompilerTool"
+ UsePrecompiledHeader="0"
+ />
+ </FileConfiguration>
+ <FileConfiguration
+ Name="Release|Win32"
+ >
+ <Tool
+ Name="VCCLCompilerTool"
+ UsePrecompiledHeader="0"
+ />
+ </FileConfiguration>
+ <FileConfiguration
+ Name="release_nojni|Win32"
+ >
+ <Tool
+ Name="VCCLCompilerTool"
+ UsePrecompiledHeader="0"
+ />
+ </FileConfiguration>
+ <FileConfiguration
+ Name="Debug Recstore|Win32"
+ >
+ <Tool
+ Name="VCCLCompilerTool"
+ UsePrecompiledHeader="0"
+ />
+ </FileConfiguration>
+ </File>
+ <File
+ RelativePath="..\pcre-7.4\pcre_stringpiece.cc"
+ >
+ <FileConfiguration
+ Name="Debug|Win32"
+ >
+ <Tool
+ Name="VCCLCompilerTool"
+ UsePrecompiledHeader="0"
+ />
+ </FileConfiguration>
+ <FileConfiguration
+ Name="Release|Win32"
+ >
+ <Tool
+ Name="VCCLCompilerTool"
+ UsePrecompiledHeader="0"
+ />
+ </FileConfiguration>
+ <FileConfiguration
+ Name="release_nojni|Win32"
+ >
+ <Tool
+ Name="VCCLCompilerTool"
+ UsePrecompiledHeader="0"
+ />
+ </FileConfiguration>
+ <FileConfiguration
+ Name="Debug Recstore|Win32"
+ >
+ <Tool
+ Name="VCCLCompilerTool"
+ UsePrecompiledHeader="0"
+ />
+ </FileConfiguration>
+ </File>
+ <File
+ RelativePath="..\pcre-7.4\pcre_study.c"
+ >
+ <FileConfiguration
+ Name="Debug|Win32"
+ >
+ <Tool
+ Name="VCCLCompilerTool"
+ UsePrecompiledHeader="0"
+ />
+ </FileConfiguration>
+ <FileConfiguration
+ Name="Release|Win32"
+ >
+ <Tool
+ Name="VCCLCompilerTool"
+ UsePrecompiledHeader="0"
+ />
+ </FileConfiguration>
+ <FileConfiguration
+ Name="release_nojni|Win32"
+ >
+ <Tool
+ Name="VCCLCompilerTool"
+ UsePrecompiledHeader="0"
+ />
+ </FileConfiguration>
+ <FileConfiguration
+ Name="Debug Recstore|Win32"
+ >
+ <Tool
+ Name="VCCLCompilerTool"
+ UsePrecompiledHeader="0"
+ />
+ </FileConfiguration>
+ </File>
+ <File
+ RelativePath="..\pcre-7.4\pcre_tables.c"
+ >
+ <FileConfiguration
+ Name="Debug|Win32"
+ >
+ <Tool
+ Name="VCCLCompilerTool"
+ UsePrecompiledHeader="0"
+ />
+ </FileConfiguration>
+ <FileConfiguration
+ Name="Release|Win32"
+ >
+ <Tool
+ Name="VCCLCompilerTool"
+ UsePrecompiledHeader="0"
+ />
+ </FileConfiguration>
+ <FileConfiguration
+ Name="release_nojni|Win32"
+ >
+ <Tool
+ Name="VCCLCompilerTool"
+ UsePrecompiledHeader="0"
+ />
+ </FileConfiguration>
+ <FileConfiguration
+ Name="Debug Recstore|Win32"
+ >
+ <Tool
+ Name="VCCLCompilerTool"
+ UsePrecompiledHeader="0"
+ />
+ </FileConfiguration>
+ </File>
+ <File
+ RelativePath="..\pcre-7.4\pcre_try_flipped.c"
+ >
+ <FileConfiguration
+ Name="Debug|Win32"
+ >
+ <Tool
+ Name="VCCLCompilerTool"
+ UsePrecompiledHeader="0"
+ />
+ </FileConfiguration>
+ <FileConfiguration
+ Name="Release|Win32"
+ >
+ <Tool
+ Name="VCCLCompilerTool"
+ UsePrecompiledHeader="0"
+ />
+ </FileConfiguration>
+ <FileConfiguration
+ Name="release_nojni|Win32"
+ >
+ <Tool
+ Name="VCCLCompilerTool"
+ UsePrecompiledHeader="0"
+ />
+ </FileConfiguration>
+ <FileConfiguration
+ Name="Debug Recstore|Win32"
+ >
+ <Tool
+ Name="VCCLCompilerTool"
+ UsePrecompiledHeader="0"
+ />
+ </FileConfiguration>
+ </File>
+ <File
+ RelativePath="..\pcre-7.4\pcre_ucp_searchfuncs.c"
+ >
+ <FileConfiguration
+ Name="Debug|Win32"
+ >
+ <Tool
+ Name="VCCLCompilerTool"
+ UsePrecompiledHeader="0"
+ />
+ </FileConfiguration>
+ <FileConfiguration
+ Name="Release|Win32"
+ >
+ <Tool
+ Name="VCCLCompilerTool"
+ UsePrecompiledHeader="0"
+ />
+ </FileConfiguration>
+ <FileConfiguration
+ Name="release_nojni|Win32"
+ >
+ <Tool
+ Name="VCCLCompilerTool"
+ UsePrecompiledHeader="0"
+ />
+ </FileConfiguration>
+ <FileConfiguration
+ Name="Debug Recstore|Win32"
+ >
+ <Tool
+ Name="VCCLCompilerTool"
+ UsePrecompiledHeader="0"
+ />
+ </FileConfiguration>
+ </File>
+ <File
+ RelativePath="..\pcre-7.4\pcre_valid_utf8.c"
+ >
+ <FileConfiguration
+ Name="Debug|Win32"
+ >
+ <Tool
+ Name="VCCLCompilerTool"
+ UsePrecompiledHeader="0"
+ />
+ </FileConfiguration>
+ <FileConfiguration
+ Name="Release|Win32"
+ >
+ <Tool
+ Name="VCCLCompilerTool"
+ UsePrecompiledHeader="0"
+ />
+ </FileConfiguration>
+ <FileConfiguration
+ Name="release_nojni|Win32"
+ >
+ <Tool
+ Name="VCCLCompilerTool"
+ UsePrecompiledHeader="0"
+ />
+ </FileConfiguration>
+ <FileConfiguration
+ Name="Debug Recstore|Win32"
+ >
+ <Tool
+ Name="VCCLCompilerTool"
+ UsePrecompiledHeader="0"
+ />
+ </FileConfiguration>
+ </File>
+ <File
+ RelativePath="..\pcre-7.4\pcre_version.c"
+ >
+ <FileConfiguration
+ Name="Debug|Win32"
+ >
+ <Tool
+ Name="VCCLCompilerTool"
+ UsePrecompiledHeader="0"
+ />
+ </FileConfiguration>
+ <FileConfiguration
+ Name="Release|Win32"
+ >
+ <Tool
+ Name="VCCLCompilerTool"
+ UsePrecompiledHeader="0"
+ />
+ </FileConfiguration>
+ <FileConfiguration
+ Name="release_nojni|Win32"
+ >
+ <Tool
+ Name="VCCLCompilerTool"
+ UsePrecompiledHeader="0"
+ />
+ </FileConfiguration>
+ <FileConfiguration
+ Name="Debug Recstore|Win32"
+ >
+ <Tool
+ Name="VCCLCompilerTool"
+ UsePrecompiledHeader="0"
+ />
+ </FileConfiguration>
+ </File>
+ <File
+ RelativePath="..\pcre-7.4\pcre_xclass.c"
+ >
+ <FileConfiguration
+ Name="Debug|Win32"
+ >
+ <Tool
+ Name="VCCLCompilerTool"
+ UsePrecompiledHeader="0"
+ />
+ </FileConfiguration>
+ <FileConfiguration
+ Name="Release|Win32"
+ >
+ <Tool
+ Name="VCCLCompilerTool"
+ UsePrecompiledHeader="0"
+ />
+ </FileConfiguration>
+ <FileConfiguration
+ Name="release_nojni|Win32"
+ >
+ <Tool
+ Name="VCCLCompilerTool"
+ UsePrecompiledHeader="0"
+ />
+ </FileConfiguration>
+ <FileConfiguration
+ Name="Debug Recstore|Win32"
+ >
+ <Tool
+ Name="VCCLCompilerTool"
+ UsePrecompiledHeader="0"
+ />
+ </FileConfiguration>
+ </File>
+ <File
+ RelativePath="..\pcre-7.4\pcreposix.c"
+ >
+ <FileConfiguration
+ Name="Debug|Win32"
+ >
+ <Tool
+ Name="VCCLCompilerTool"
+ UsePrecompiledHeader="0"
+ />
+ </FileConfiguration>
+ <FileConfiguration
+ Name="Release|Win32"
+ >
+ <Tool
+ Name="VCCLCompilerTool"
+ UsePrecompiledHeader="0"
+ />
+ </FileConfiguration>
+ <FileConfiguration
+ Name="release_nojni|Win32"
+ >
+ <Tool
+ Name="VCCLCompilerTool"
+ UsePrecompiledHeader="0"
+ />
+ </FileConfiguration>
+ <FileConfiguration
+ Name="Debug Recstore|Win32"
+ >
+ <Tool
+ Name="VCCLCompilerTool"
+ UsePrecompiledHeader="0"
+ />
+ </FileConfiguration>
+ </File>
+ </Filter>
+ </Filter>
+ <Filter
+ Name="storage related"
+ >
+ <File
+ RelativePath=".\rec.h"
+ >
+ </File>
+ <File
+ RelativePath=".\reccache.cpp"
+ >
+ </File>
+ <File
+ RelativePath=".\reccache.h"
+ >
+ </File>
+ <File
+ RelativePath=".\reci.h"
+ >
+ </File>
+ <File
+ RelativePath=".\recstore.h"
+ >
+ </File>
+ <File
+ RelativePath=".\storage.cpp"
+ >
+ </File>
+ <File
+ RelativePath=".\storage.h"
+ >
+ </File>
+ </Filter>
+ <Filter
+ Name="client"
+ >
+ <File
+ RelativePath="..\client\connpool.cpp"
+ >
+ </File>
+ <File
+ RelativePath="..\client\connpool.h"
+ >
+ </File>
+ <File
+ RelativePath="..\client\dbclient.cpp"
+ >
+ </File>
+ <File
+ RelativePath="..\client\dbclient.h"
+ >
+ </File>
+ <File
+ RelativePath="..\client\model.h"
+ >
+ </File>
+ <File
+ RelativePath="..\client\quorum.cpp"
+ >
+ </File>
+ <Filter
+ Name="btree related"
+ >
+ <File
+ RelativePath=".\btree.cpp"
+ >
+ </File>
+ <File
+ RelativePath=".\btree.h"
+ >
+ </File>
+ <File
+ RelativePath=".\btreecursor.cpp"
+ >
+ </File>
+ </Filter>
+ </Filter>
+ <Filter
+ Name="db"
+ >
+ <File
+ RelativePath=".\client.h"
+ >
+ </File>
+ <File
+ RelativePath=".\clientcursor.h"
+ >
+ </File>
+ <File
+ RelativePath=".\cmdline.h"
+ >
+ </File>
+ <File
+ RelativePath=".\commands.h"
+ >
+ </File>
+ <File
+ RelativePath=".\concurrency.h"
+ >
+ </File>
+ <File
+ RelativePath=".\curop.h"
+ >
+ </File>
+ <File
+ RelativePath=".\cursor.h"
+ >
+ </File>
+ <File
+ RelativePath=".\database.h"
+ >
+ </File>
+ <File
+ RelativePath=".\db.h"
+ >
+ </File>
+ <File
+ RelativePath=".\dbhelpers.h"
+ >
+ </File>
+ <File
+ RelativePath=".\dbinfo.h"
+ >
+ </File>
+ <File
+ RelativePath=".\dbmessage.h"
+ >
+ </File>
+ <File
+ RelativePath=".\introspect.h"
+ >
+ </File>
+ <File
+ RelativePath=".\jsobj.h"
+ >
+ </File>
+ <File
+ RelativePath=".\json.h"
+ >
+ </File>
+ <File
+ RelativePath=".\matcher.h"
+ >
+ </File>
+ <File
+ RelativePath="..\grid\message.h"
+ >
+ </File>
+ <File
+ RelativePath=".\minilex.h"
+ >
+ </File>
+ <File
+ RelativePath=".\namespace.h"
+ >
+ </File>
+ <File
+ RelativePath=".\pdfile.h"
+ >
+ </File>
+ <File
+ RelativePath="..\grid\protocol.h"
+ >
+ </File>
+ <File
+ RelativePath=".\query.h"
+ >
+ </File>
+ <File
+ RelativePath=".\queryoptimizer.h"
+ >
+ </File>
+ <File
+ RelativePath=".\queryutil.cpp"
+ >
+ </File>
+ <File
+ RelativePath=".\repl.h"
+ >
+ </File>
+ <File
+ RelativePath=".\replset.h"
+ >
+ </File>
+ <File
+ RelativePath=".\resource.h"
+ >
+ </File>
+ <File
+ RelativePath=".\scanandorder.h"
+ >
+ </File>
+ <File
+ RelativePath=".\security.h"
+ >
+ </File>
+ <File
+ RelativePath="..\stdafx.h"
+ >
+ </File>
+ <Filter
+ Name="cpp"
+ Filter="cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx"
+ UniqueIdentifier="{4FC737F1-C7A5-4376-A066-2A32D752A2FF}"
+ >
+ <File
+ RelativePath=".\client.cpp"
+ >
+ </File>
+ <File
+ RelativePath=".\clientcursor.cpp"
+ >
+ </File>
+ <File
+ RelativePath=".\cloner.cpp"
+ >
+ </File>
+ <File
+ RelativePath=".\commands.cpp"
+ >
+ </File>
+ <File
+ RelativePath=".\cursor.cpp"
+ >
+ </File>
+ <File
+ RelativePath="..\s\d_util.cpp"
+ >
+ </File>
+ <File
+ RelativePath=".\database.cpp"
+ >
+ </File>
+ <File
+ RelativePath=".\db.cpp"
+ >
+ </File>
+ <File
+ RelativePath=".\dbcommands.cpp"
+ >
+ </File>
+ <File
+ RelativePath=".\dbcommands_admin.cpp"
+ >
+ </File>
+ <File
+ RelativePath=".\dbeval.cpp"
+ >
+ </File>
+ <File
+ RelativePath=".\dbhelpers.cpp"
+ >
+ </File>
+ <File
+ RelativePath=".\dbstats.cpp"
+ >
+ </File>
+ <File
+ RelativePath=".\dbwebserver.cpp"
+ >
+ </File>
+ <File
+ RelativePath=".\extsort.cpp"
+ >
+ </File>
+ <File
+ RelativePath=".\index.cpp"
+ >
+ </File>
+ <File
+ RelativePath=".\instance.cpp"
+ >
+ </File>
+ <File
+ RelativePath=".\introspect.cpp"
+ >
+ </File>
+ <File
+ RelativePath=".\jsobj.cpp"
+ >
+ </File>
+ <File
+ RelativePath=".\json.cpp"
+ >
+ </File>
+ <File
+ RelativePath=".\lasterror.cpp"
+ >
+ </File>
+ <File
+ RelativePath=".\matcher.cpp"
+ >
+ </File>
+ <File
+ RelativePath="..\util\mmap_win.cpp"
+ >
+ </File>
+ <File
+ RelativePath=".\modules\mms.cpp"
+ >
+ </File>
+ <File
+ RelativePath=".\module.cpp"
+ >
+ </File>
+ <File
+ RelativePath=".\mr.cpp"
+ >
+ </File>
+ <File
+ RelativePath=".\namespace.cpp"
+ >
+ </File>
+ <File
+ RelativePath=".\nonce.cpp"
+ >
+ </File>
+ <File
+ RelativePath="..\client\parallel.cpp"
+ >
+ </File>
+ <File
+ RelativePath=".\pdfile.cpp"
+ >
+ </File>
+ <File
+ RelativePath=".\query.cpp"
+ >
+ </File>
+ <File
+ RelativePath=".\queryoptimizer.cpp"
+ >
+ </File>
+ <File
+ RelativePath=".\repl.cpp"
+ >
+ </File>
+ <File
+ RelativePath=".\security.cpp"
+ >
+ </File>
+ <File
+ RelativePath=".\security_commands.cpp"
+ >
+ </File>
+ <File
+ RelativePath="..\stdafx.cpp"
+ >
+ <FileConfiguration
+ Name="Debug|Win32"
+ >
+ <Tool
+ Name="VCCLCompilerTool"
+ UsePrecompiledHeader="1"
+ />
+ </FileConfiguration>
+ <FileConfiguration
+ Name="Release|Win32"
+ >
+ <Tool
+ Name="VCCLCompilerTool"
+ UsePrecompiledHeader="1"
+ />
+ </FileConfiguration>
+ <FileConfiguration
+ Name="release_nojni|Win32"
+ >
+ <Tool
+ Name="VCCLCompilerTool"
+ UsePrecompiledHeader="1"
+ />
+ </FileConfiguration>
+ <FileConfiguration
+ Name="Debug Recstore|Win32"
+ >
+ <Tool
+ Name="VCCLCompilerTool"
+ UsePrecompiledHeader="1"
+ />
+ </FileConfiguration>
+ </File>
+ <File
+ RelativePath=".\tests.cpp"
+ >
+ </File>
+ <File
+ RelativePath="..\util\top.cpp"
+ >
+ </File>
+ <File
+ RelativePath=".\update.cpp"
+ >
+ </File>
+ </Filter>
+ </Filter>
+ <Filter
+ Name="util"
+ >
+ <File
+ RelativePath="..\util\assert_util.h"
+ >
+ </File>
+ <File
+ RelativePath="..\util\builder.h"
+ >
+ </File>
+ <File
+ RelativePath="..\util\file.h"
+ >
+ </File>
+ <File
+ RelativePath="..\util\goodies.h"
+ >
+ </File>
+ <File
+ RelativePath="..\util\hashtab.h"
+ >
+ </File>
+ <File
+ RelativePath=".\lasterror.h"
+ >
+ </File>
+ <File
+ RelativePath="..\util\log.h"
+ >
+ </File>
+ <File
+ RelativePath="..\util\lruishmap.h"
+ >
+ </File>
+ <File
+ RelativePath="..\util\md5.h"
+ >
+ </File>
+ <File
+ RelativePath="..\util\md5.hpp"
+ >
+ </File>
+ <File
+ RelativePath="..\util\miniwebserver.h"
+ >
+ </File>
+ <File
+ RelativePath="..\util\mmap.h"
+ >
+ </File>
+ <File
+ RelativePath="..\util\sock.h"
+ >
+ </File>
+ <File
+ RelativePath="..\util\unittest.h"
+ >
+ </File>
+ <Filter
+ Name="cpp"
+ >
+ <File
+ RelativePath="..\util\assert_util.cpp"
+ >
+ </File>
+ <File
+ RelativePath="..\util\background.cpp"
+ >
+ </File>
+ <File
+ RelativePath="..\util\base64.cpp"
+ >
+ </File>
+ <File
+ RelativePath="..\util\httpclient.cpp"
+ >
+ </File>
+ <File
+ RelativePath="..\util\md5.c"
+ >
+ <FileConfiguration
+ Name="Debug|Win32"
+ >
+ <Tool
+ Name="VCCLCompilerTool"
+ UsePrecompiledHeader="0"
+ />
+ </FileConfiguration>
+ <FileConfiguration
+ Name="Release|Win32"
+ >
+ <Tool
+ Name="VCCLCompilerTool"
+ UsePrecompiledHeader="0"
+ PrecompiledHeaderThrough=""
+ />
+ </FileConfiguration>
+ <FileConfiguration
+ Name="release_nojni|Win32"
+ >
+ <Tool
+ Name="VCCLCompilerTool"
+ UsePrecompiledHeader="0"
+ />
+ </FileConfiguration>
+ <FileConfiguration
+ Name="Debug Recstore|Win32"
+ >
+ <Tool
+ Name="VCCLCompilerTool"
+ UsePrecompiledHeader="0"
+ />
+ </FileConfiguration>
+ </File>
+ <File
+ RelativePath="..\util\md5main.cpp"
+ >
+ <FileConfiguration
+ Name="Debug|Win32"
+ >
+ <Tool
+ Name="VCCLCompilerTool"
+ UsePrecompiledHeader="2"
+ />
+ </FileConfiguration>
+ </File>
+ <File
+ RelativePath="..\util\message.cpp"
+ >
+ </File>
+ <File
+ RelativePath="..\util\miniwebserver.cpp"
+ >
+ </File>
+ <File
+ RelativePath="..\util\mmap.cpp"
+ >
+ </File>
+ <File
+ RelativePath="..\util\ntservice.cpp"
+ >
+ </File>
+ <File
+ RelativePath="..\util\processinfo_win32.cpp"
+ >
+ </File>
+ <File
+ RelativePath="..\util\sock.cpp"
+ >
+ </File>
+ <File
+ RelativePath="..\util\util.cpp"
+ >
+ </File>
+ </Filter>
+ </Filter>
+ <Filter
+ Name="shard"
+ >
+ <File
+ RelativePath="..\s\d_logic.cpp"
+ >
+ </File>
+ </Filter>
+ <Filter
+ Name="scripting"
+ >
+ <File
+ RelativePath="..\scripting\engine.cpp"
+ >
+ </File>
+ <File
+ RelativePath="..\scripting\engine_spidermonkey.cpp"
+ >
+ </File>
+ <File
+ RelativePath="..\shell\mongo_vstudio.cpp"
+ >
+ <FileConfiguration
+ Name="Debug|Win32"
+ >
+ <Tool
+ Name="VCCLCompilerTool"
+ UsePrecompiledHeader="0"
+ />
+ </FileConfiguration>
+ <FileConfiguration
+ Name="Release|Win32"
+ >
+ <Tool
+ Name="VCCLCompilerTool"
+ UsePrecompiledHeader="0"
+ />
+ </FileConfiguration>
+ <FileConfiguration
+ Name="Debug Recstore|Win32"
+ >
+ <Tool
+ Name="VCCLCompilerTool"
+ UsePrecompiledHeader="0"
+ />
+ </FileConfiguration>
+ </File>
+ </Filter>
+ </Files>
+ <Globals>
+ </Globals>
+</VisualStudioProject>
diff --git a/db/db.vcxproj b/db/db.vcxproj
new file mode 100644
index 0000000..878b52c
--- /dev/null
+++ b/db/db.vcxproj
@@ -0,0 +1,489 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+ <ItemGroup Label="ProjectConfigurations">
+ <ProjectConfiguration Include="Debug Recstore|Win32">
+ <Configuration>Debug Recstore</Configuration>
+ <Platform>Win32</Platform>
+ </ProjectConfiguration>
+ <ProjectConfiguration Include="Debug|Win32">
+ <Configuration>Debug</Configuration>
+ <Platform>Win32</Platform>
+ </ProjectConfiguration>
+ <ProjectConfiguration Include="Release|Win32">
+ <Configuration>Release</Configuration>
+ <Platform>Win32</Platform>
+ </ProjectConfiguration>
+ </ItemGroup>
+ <PropertyGroup Label="Globals">
+ <ProjectName>mongod</ProjectName>
+ <ProjectGuid>{215B2D68-0A70-4D10-8E75-B31010C62A91}</ProjectGuid>
+ <RootNamespace>db</RootNamespace>
+ <Keyword>Win32Proj</Keyword>
+ </PropertyGroup>
+ <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug Recstore|Win32'" Label="Configuration">
+ <ConfigurationType>Application</ConfigurationType>
+ <UseOfMfc>false</UseOfMfc>
+ <UseOfAtl>false</UseOfAtl>
+ <CharacterSet>Unicode</CharacterSet>
+ </PropertyGroup>
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
+ <ConfigurationType>Application</ConfigurationType>
+ <CharacterSet>Unicode</CharacterSet>
+ <WholeProgramOptimization>true</WholeProgramOptimization>
+ </PropertyGroup>
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
+ <ConfigurationType>Application</ConfigurationType>
+ <UseOfMfc>false</UseOfMfc>
+ <UseOfAtl>false</UseOfAtl>
+ <CharacterSet>Unicode</CharacterSet>
+ </PropertyGroup>
+ <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
+ <ImportGroup Label="ExtensionSettings">
+ </ImportGroup>
+ <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug Recstore|Win32'" Label="PropertySheets">
+ <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" />
+ </ImportGroup>
+ <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="PropertySheets">
+ <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" />
+ </ImportGroup>
+ <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="PropertySheets">
+ <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" />
+ </ImportGroup>
+ <PropertyGroup Label="UserMacros" />
+ <PropertyGroup>
+ <_ProjectFileVersion>10.0.21006.1</_ProjectFileVersion>
+ <OutDir Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(SolutionDir)$(Configuration)\</OutDir>
+ <IntDir Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(Configuration)\</IntDir>
+ <LinkIncremental Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">true</LinkIncremental>
+ <OutDir Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(SolutionDir)$(Configuration)\</OutDir>
+ <IntDir Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(Configuration)\</IntDir>
+ <LinkIncremental Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">false</LinkIncremental>
+ <OutDir Condition="'$(Configuration)|$(Platform)'=='Debug Recstore|Win32'">$(SolutionDir)$(Configuration)\</OutDir>
+ <IntDir Condition="'$(Configuration)|$(Platform)'=='Debug Recstore|Win32'">$(Configuration)\</IntDir>
+ <LinkIncremental Condition="'$(Configuration)|$(Platform)'=='Debug Recstore|Win32'">true</LinkIncremental>
+ </PropertyGroup>
+ <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ <ClCompile>
+ <Optimization>Disabled</Optimization>
+ <AdditionalIncludeDirectories>..\..\js\src;..\pcre-7.4;c:\Program Files\boost\boost_1_41_0;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+ <PreprocessorDefinitions>OLDJS;STATIC_JS_API;XP_WIN;WIN32;_DEBUG;_CONSOLE;_CRT_SECURE_NO_WARNINGS;HAVE_CONFIG_H;PCRE_STATIC;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+ <MinimalRebuild>true</MinimalRebuild>
+ <BasicRuntimeChecks>EnableFastChecks</BasicRuntimeChecks>
+ <RuntimeLibrary>MultiThreadedDebugDLL</RuntimeLibrary>
+ <PrecompiledHeader>Use</PrecompiledHeader>
+ <WarningLevel>Level3</WarningLevel>
+ <DebugInformationFormat>EditAndContinue</DebugInformationFormat>
+ <DisableSpecificWarnings>4355;4800;%(DisableSpecificWarnings)</DisableSpecificWarnings>
+ </ClCompile>
+ <Link>
+ <AdditionalDependencies>ws2_32.lib;%(AdditionalDependencies)</AdditionalDependencies>
+ <AdditionalLibraryDirectories>c:\Program Files\boost\boost_1_41_0\lib;%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories>
+ <IgnoreAllDefaultLibraries>false</IgnoreAllDefaultLibraries>
+ <IgnoreSpecificDefaultLibraries>%(IgnoreSpecificDefaultLibraries)</IgnoreSpecificDefaultLibraries>
+ <GenerateDebugInformation>true</GenerateDebugInformation>
+ <SubSystem>Console</SubSystem>
+ <TargetMachine>MachineX86</TargetMachine>
+ </Link>
+ </ItemDefinitionGroup>
+ <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ <ClCompile>
+ <Optimization>MaxSpeed</Optimization>
+ <IntrinsicFunctions>true</IntrinsicFunctions>
+ <AdditionalIncludeDirectories>..\..\js\src;..\pcre-7.4;c:\Program Files\boost\boost_1_41_0;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+ <PreprocessorDefinitions>OLDJS;STATIC_JS_API;XP_WIN;WIN32;NDEBUG;_CONSOLE;_CRT_SECURE_NO_WARNINGS;HAVE_CONFIG_H;PCRE_STATIC;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+ <RuntimeLibrary>MultiThreaded</RuntimeLibrary>
+ <FunctionLevelLinking>true</FunctionLevelLinking>
+ <PrecompiledHeader>Use</PrecompiledHeader>
+ <PrecompiledHeaderFile>stdafx.h</PrecompiledHeaderFile>
+ <WarningLevel>Level3</WarningLevel>
+ <DebugInformationFormat>ProgramDatabase</DebugInformationFormat>
+ <DisableSpecificWarnings>4355;4800;%(DisableSpecificWarnings)</DisableSpecificWarnings>
+ </ClCompile>
+ <Link>
+ <AdditionalDependencies>ws2_32.lib;%(AdditionalDependencies)</AdditionalDependencies>
+ <AdditionalLibraryDirectories>c:\Program Files\boost\boost_1_41_0\lib;%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories>
+ <GenerateDebugInformation>true</GenerateDebugInformation>
+ <SubSystem>Console</SubSystem>
+ <OptimizeReferences>true</OptimizeReferences>
+ <EnableCOMDATFolding>true</EnableCOMDATFolding>
+ <TargetMachine>MachineX86</TargetMachine>
+ </Link>
+ </ItemDefinitionGroup>
+ <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug Recstore|Win32'">
+ <ClCompile>
+ <Optimization>Disabled</Optimization>
+ <AdditionalIncludeDirectories>..\..\js\src;..\pcre-7.4;c:\Program Files\boost\boost_1_41_0;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+ <PreprocessorDefinitions>_RECSTORE;OLDJS;STATIC_JS_API;XP_WIN;WIN32;_DEBUG;_CONSOLE;_CRT_SECURE_NO_WARNINGS;HAVE_CONFIG_H;PCRE_STATIC;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+ <MinimalRebuild>true</MinimalRebuild>
+ <BasicRuntimeChecks>EnableFastChecks</BasicRuntimeChecks>
+ <RuntimeLibrary>MultiThreadedDebugDLL</RuntimeLibrary>
+ <PrecompiledHeader>Use</PrecompiledHeader>
+ <WarningLevel>Level3</WarningLevel>
+ <DebugInformationFormat>EditAndContinue</DebugInformationFormat>
+ <DisableSpecificWarnings>4355;4800;%(DisableSpecificWarnings)</DisableSpecificWarnings>
+ </ClCompile>
+ <Link>
+ <AdditionalDependencies>ws2_32.lib;%(AdditionalDependencies)</AdditionalDependencies>
+ <AdditionalLibraryDirectories>c:\Program Files\boost\boost_1_41_0\lib;%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories>
+ <IgnoreAllDefaultLibraries>false</IgnoreAllDefaultLibraries>
+ <IgnoreSpecificDefaultLibraries>%(IgnoreSpecificDefaultLibraries)</IgnoreSpecificDefaultLibraries>
+ <GenerateDebugInformation>true</GenerateDebugInformation>
+ <SubSystem>Console</SubSystem>
+ <TargetMachine>MachineX86</TargetMachine>
+ </Link>
+ </ItemDefinitionGroup>
+ <ItemGroup>
+ <ClInclude Include="..\pcre-7.4\pcrecpp.h" />
+ <ClInclude Include="..\targetver.h" />
+ <ClInclude Include="..\pcre-7.4\config.h" />
+ <ClInclude Include="..\pcre-7.4\pcre.h" />
+ <ClInclude Include="rec.h" />
+ <ClInclude Include="reccache.h" />
+ <ClInclude Include="reci.h" />
+ <ClInclude Include="recstore.h" />
+ <ClInclude Include="storage.h" />
+ <ClInclude Include="..\client\connpool.h" />
+ <ClInclude Include="..\client\dbclient.h" />
+ <ClInclude Include="..\client\model.h" />
+ <ClInclude Include="btree.h" />
+ <ClInclude Include="client.h" />
+ <ClInclude Include="clientcursor.h" />
+ <ClInclude Include="cmdline.h" />
+ <ClInclude Include="commands.h" />
+ <ClInclude Include="concurrency.h" />
+ <ClInclude Include="curop.h" />
+ <ClInclude Include="cursor.h" />
+ <ClInclude Include="database.h" />
+ <ClInclude Include="db.h" />
+ <ClInclude Include="dbhelpers.h" />
+ <ClInclude Include="dbinfo.h" />
+ <ClInclude Include="dbmessage.h" />
+ <ClInclude Include="introspect.h" />
+ <ClInclude Include="jsobj.h" />
+ <ClInclude Include="json.h" />
+ <ClInclude Include="matcher.h" />
+ <ClInclude Include="..\grid\message.h" />
+ <ClInclude Include="minilex.h" />
+ <ClInclude Include="namespace.h" />
+ <ClInclude Include="pdfile.h" />
+ <ClInclude Include="..\grid\protocol.h" />
+ <ClInclude Include="query.h" />
+ <ClInclude Include="queryoptimizer.h" />
+ <ClInclude Include="repl.h" />
+ <ClInclude Include="replset.h" />
+ <ClInclude Include="resource.h" />
+ <ClInclude Include="scanandorder.h" />
+ <ClInclude Include="security.h" />
+ <ClInclude Include="..\stdafx.h" />
+ <ClInclude Include="..\util\builder.h" />
+ <ClInclude Include="..\util\file.h" />
+ <ClInclude Include="..\util\goodies.h" />
+ <ClInclude Include="..\util\hashtab.h" />
+ <ClInclude Include="lasterror.h" />
+ <ClInclude Include="..\util\log.h" />
+ <ClInclude Include="..\util\lruishmap.h" />
+ <ClInclude Include="..\util\md5.h" />
+ <ClInclude Include="..\util\md5.hpp" />
+ <ClInclude Include="..\util\miniwebserver.h" />
+ <ClInclude Include="..\util\mmap.h" />
+ <ClInclude Include="..\util\sock.h" />
+ <ClInclude Include="..\util\unittest.h" />
+ </ItemGroup>
+ <ItemGroup>
+ <ResourceCompile Include="db.rc" />
+ </ItemGroup>
+ <ItemGroup>
+ <CustomBuild Include="..\..\js\js\Release\js.lib">
+ <FileType>Document</FileType>
+ <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug Recstore|Win32'">true</ExcludedFromBuild>
+ <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">true</ExcludedFromBuild>
+ </CustomBuild>
+ <CustomBuild Include="..\..\js\js\Debug\js.lib">
+ <FileType>Document</FileType>
+ <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">true</ExcludedFromBuild>
+ </CustomBuild>
+ </ItemGroup>
+ <ItemGroup>
+ <ClCompile Include="..\pcre-7.4\pcrecpp.cc">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug Recstore|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\pcre-7.4\pcre_chartables.c">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug Recstore|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\pcre-7.4\pcre_compile.c">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug Recstore|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\pcre-7.4\pcre_config.c">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug Recstore|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\pcre-7.4\pcre_dfa_exec.c">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug Recstore|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\pcre-7.4\pcre_exec.c">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug Recstore|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\pcre-7.4\pcre_fullinfo.c">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug Recstore|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\pcre-7.4\pcre_get.c">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug Recstore|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\pcre-7.4\pcre_globals.c">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug Recstore|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\pcre-7.4\pcre_info.c">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug Recstore|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\pcre-7.4\pcre_maketables.c">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug Recstore|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\pcre-7.4\pcre_newline.c">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug Recstore|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\pcre-7.4\pcre_ord2utf8.c">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug Recstore|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\pcre-7.4\pcre_refcount.c">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug Recstore|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\pcre-7.4\pcre_scanner.cc">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug Recstore|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\pcre-7.4\pcre_stringpiece.cc">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug Recstore|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\pcre-7.4\pcre_study.c">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug Recstore|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\pcre-7.4\pcre_tables.c">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug Recstore|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\pcre-7.4\pcre_try_flipped.c">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug Recstore|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\pcre-7.4\pcre_ucp_searchfuncs.c">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug Recstore|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\pcre-7.4\pcre_valid_utf8.c">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug Recstore|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\pcre-7.4\pcre_version.c">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug Recstore|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\pcre-7.4\pcre_xclass.c">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug Recstore|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\pcre-7.4\pcreposix.c">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug Recstore|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="reccache.cpp" />
+ <ClCompile Include="storage.cpp" />
+ <ClCompile Include="..\client\connpool.cpp" />
+ <ClCompile Include="..\client\dbclient.cpp" />
+ <ClCompile Include="btree.cpp" />
+ <ClCompile Include="btreecursor.cpp" />
+ <ClCompile Include="queryutil.cpp" />
+ <ClCompile Include="client.cpp" />
+ <ClCompile Include="clientcursor.cpp" />
+ <ClCompile Include="cloner.cpp" />
+ <ClCompile Include="commands.cpp" />
+ <ClCompile Include="cursor.cpp" />
+ <ClCompile Include="..\s\d_util.cpp" />
+ <ClCompile Include="db.cpp" />
+ <ClCompile Include="dbcommands.cpp" />
+ <ClCompile Include="dbeval.cpp" />
+ <ClCompile Include="dbhelpers.cpp" />
+ <ClCompile Include="dbinfo.cpp" />
+ <ClCompile Include="dbwebserver.cpp" />
+ <ClCompile Include="extsort.cpp" />
+ <ClCompile Include="instance.cpp" />
+ <ClCompile Include="introspect.cpp" />
+ <ClCompile Include="jsobj.cpp" />
+ <ClCompile Include="json.cpp" />
+ <ClCompile Include="lasterror.cpp" />
+ <ClCompile Include="matcher.cpp" />
+ <ClCompile Include="..\util\mmap_win.cpp" />
+ <ClCompile Include="modules\mms.cpp" />
+ <ClCompile Include="module.cpp" />
+ <ClCompile Include="mr.cpp" />
+ <ClCompile Include="namespace.cpp" />
+ <ClCompile Include="nonce.cpp" />
+ <ClCompile Include="..\client\parallel.cpp" />
+ <ClCompile Include="pdfile.cpp" />
+ <ClCompile Include="query.cpp" />
+ <ClCompile Include="queryoptimizer.cpp" />
+ <ClCompile Include="repl.cpp" />
+ <ClCompile Include="security.cpp" />
+ <ClCompile Include="security_commands.cpp" />
+ <ClCompile Include="..\stdafx.cpp">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug Recstore|Win32'">Create</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Create</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Create</PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="tests.cpp" />
+ <ClCompile Include="update.cpp" />
+ <ClCompile Include="..\util\assert_util.cpp" />
+ <ClCompile Include="..\util\background.cpp" />
+ <ClCompile Include="..\util\base64.cpp" />
+ <ClCompile Include="..\util\httpclient.cpp" />
+ <ClCompile Include="..\util\md5.c">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug Recstore|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeaderFile Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeaderFile>
+ </ClCompile>
+ <ClCompile Include="..\util\md5main.cpp">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Use</PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\util\message.cpp" />
+ <ClCompile Include="..\util\miniwebserver.cpp" />
+ <ClCompile Include="..\util\mmap.cpp" />
+ <ClCompile Include="..\util\ntservice.cpp" />
+ <ClCompile Include="..\util\processinfo_none.cpp" />
+ <ClCompile Include="..\util\sock.cpp" />
+ <ClCompile Include="..\util\util.cpp" />
+ <ClCompile Include="..\s\d_logic.cpp" />
+ <ClCompile Include="..\scripting\engine.cpp" />
+ <ClCompile Include="..\scripting\engine_spidermonkey.cpp" />
+ <ClCompile Include="..\shell\mongo_vstudio.cpp">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug Recstore|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeader>
+ </ClCompile>
+ </ItemGroup>
+ <ItemGroup>
+ <None Include="..\SConstruct" />
+ <None Include="ClassDiagram1.cd" />
+ </ItemGroup>
+ <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
+ <ImportGroup Label="ExtensionTargets">
+ </ImportGroup>
+</Project> \ No newline at end of file
diff --git a/db/db_10.sln b/db/db_10.sln
new file mode 100644
index 0000000..76e8fe9
--- /dev/null
+++ b/db/db_10.sln
@@ -0,0 +1,45 @@
+
+Microsoft Visual Studio Solution File, Format Version 11.00
+# Visual Studio 2010
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "mongod", "db.vcxproj", "{215B2D68-0A70-4D10-8E75-B31010C62A91}"
+EndProject
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "mongos", "..\s\dbgrid.vcxproj", "{E03717ED-69B4-4D21-BC55-DF6690B585C6}"
+EndProject
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "test", "..\dbtests\test.vcxproj", "{215B2D68-0A70-4D10-8E75-B33010C62A91}"
+EndProject
+Global
+ GlobalSection(SolutionConfigurationPlatforms) = preSolution
+ Debug Recstore|Win32 = Debug Recstore|Win32
+ Debug|Win32 = Debug|Win32
+ release_nojni|Win32 = release_nojni|Win32
+ Release|Win32 = Release|Win32
+ EndGlobalSection
+ GlobalSection(ProjectConfigurationPlatforms) = postSolution
+ {215B2D68-0A70-4D10-8E75-B31010C62A91}.Debug Recstore|Win32.ActiveCfg = Debug Recstore|Win32
+ {215B2D68-0A70-4D10-8E75-B31010C62A91}.Debug Recstore|Win32.Build.0 = Debug Recstore|Win32
+ {215B2D68-0A70-4D10-8E75-B31010C62A91}.Debug|Win32.ActiveCfg = Debug|Win32
+ {215B2D68-0A70-4D10-8E75-B31010C62A91}.Debug|Win32.Build.0 = Debug|Win32
+ {215B2D68-0A70-4D10-8E75-B31010C62A91}.release_nojni|Win32.ActiveCfg = Release|Win32
+ {215B2D68-0A70-4D10-8E75-B31010C62A91}.Release|Win32.ActiveCfg = Release|Win32
+ {215B2D68-0A70-4D10-8E75-B31010C62A91}.Release|Win32.Build.0 = Release|Win32
+ {E03717ED-69B4-4D21-BC55-DF6690B585C6}.Debug Recstore|Win32.ActiveCfg = Debug Recstore|Win32
+ {E03717ED-69B4-4D21-BC55-DF6690B585C6}.Debug Recstore|Win32.Build.0 = Debug Recstore|Win32
+ {E03717ED-69B4-4D21-BC55-DF6690B585C6}.Debug|Win32.ActiveCfg = Debug|Win32
+ {E03717ED-69B4-4D21-BC55-DF6690B585C6}.Debug|Win32.Build.0 = Debug|Win32
+ {E03717ED-69B4-4D21-BC55-DF6690B585C6}.release_nojni|Win32.ActiveCfg = Release|Win32
+ {E03717ED-69B4-4D21-BC55-DF6690B585C6}.release_nojni|Win32.Build.0 = Release|Win32
+ {E03717ED-69B4-4D21-BC55-DF6690B585C6}.Release|Win32.ActiveCfg = Release|Win32
+ {E03717ED-69B4-4D21-BC55-DF6690B585C6}.Release|Win32.Build.0 = Release|Win32
+ {215B2D68-0A70-4D10-8E75-B33010C62A91}.Debug Recstore|Win32.ActiveCfg = Debug Recstore|Win32
+ {215B2D68-0A70-4D10-8E75-B33010C62A91}.Debug Recstore|Win32.Build.0 = Debug Recstore|Win32
+ {215B2D68-0A70-4D10-8E75-B33010C62A91}.Debug|Win32.ActiveCfg = Debug|Win32
+ {215B2D68-0A70-4D10-8E75-B33010C62A91}.Debug|Win32.Build.0 = Debug|Win32
+ {215B2D68-0A70-4D10-8E75-B33010C62A91}.release_nojni|Win32.ActiveCfg = release_nojni|Win32
+ {215B2D68-0A70-4D10-8E75-B33010C62A91}.release_nojni|Win32.Build.0 = release_nojni|Win32
+ {215B2D68-0A70-4D10-8E75-B33010C62A91}.Release|Win32.ActiveCfg = Release|Win32
+ {215B2D68-0A70-4D10-8E75-B33010C62A91}.Release|Win32.Build.0 = Release|Win32
+ EndGlobalSection
+ GlobalSection(SolutionProperties) = preSolution
+ HideSolutionNode = FALSE
+ EndGlobalSection
+EndGlobal
diff --git a/db/dbcommands.cpp b/db/dbcommands.cpp
new file mode 100644
index 0000000..ff072a1
--- /dev/null
+++ b/db/dbcommands.cpp
@@ -0,0 +1,1465 @@
+// dbcommands.cpp
+
+/**
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "stdafx.h"
+#include "query.h"
+#include "pdfile.h"
+#include "jsobj.h"
+#include "../util/builder.h"
+#include <time.h>
+#include "introspect.h"
+#include "btree.h"
+#include "../util/lruishmap.h"
+#include "../util/md5.hpp"
+#include "../util/processinfo.h"
+#include "json.h"
+#include "repl.h"
+#include "replset.h"
+#include "commands.h"
+#include "db.h"
+#include "instance.h"
+#include "lasterror.h"
+#include "security.h"
+#include "queryoptimizer.h"
+#include "../scripting/engine.h"
+#include "dbstats.h"
+
+namespace mongo {
+
+ TicketHolder connTicketHolder( 20000 );
+
+ extern int otherTraceLevel;
+ void flushOpLog( stringstream &ss );
+
+ class CmdShutdown : public Command {
+ public:
+ virtual bool requiresAuth() { return true; }
+ virtual bool adminOnly() { return true; }
+ virtual bool localHostOnlyIfNoAuth(const BSONObj& cmdObj) { return true; }
+ virtual bool logTheOp() {
+ return false;
+ }
+ virtual bool slaveOk() {
+ return true;
+ }
+ virtual void help( stringstream& help ) const {
+ help << "shutdown the database. must be ran against admin db and either (1) ran from localhost or (2) authenticated.\n";
+ }
+ CmdShutdown() : Command("shutdown") {}
+ bool run(const char *ns, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ log() << "terminating, shutdown command received" << endl;
+ dbexit( EXIT_CLEAN );
+ return true;
+ }
+ } cmdShutdown;
+
+ /* reset any errors so that getlasterror comes back clean.
+
+ useful before performing a long series of operations where we want to
+ see if any of the operations triggered an error, but don't want to check
+       after each op as that would be a client/server turnaround.
+ */
+ class CmdResetError : public Command {
+ public:
+ virtual bool readOnly() { return true; }
+ virtual bool requiresAuth() { return false; }
+ virtual bool logTheOp() {
+ return false;
+ }
+ virtual bool slaveOk() {
+ return true;
+ }
+ virtual void help( stringstream& help ) const {
+ help << "reset error state (used with getpreverror)";
+ }
+ CmdResetError() : Command("reseterror") {}
+ bool run(const char *ns, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ LastError *le = lastError.get();
+ assert( le );
+ le->reset();
+ return true;
+ }
+ } cmdResetError;
+
+ /* for diagnostic / testing purposes. */
+ class CmdSleep : public Command {
+ public:
+ virtual bool readOnly() { return true; }
+ virtual bool adminOnly() { return true; }
+ virtual bool logTheOp() {
+ return false;
+ }
+ virtual bool slaveOk() {
+ return true;
+ }
+ virtual void help( stringstream& help ) const {
+ help << "internal / make db block for 100 seconds";
+ }
+ CmdSleep() : Command("sleep") {}
+ bool run(const char *ns, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ sleepsecs(100);
+ return true;
+ }
+ } cmdSleep;
+
+ class CmdGetLastError : public Command {
+ public:
+ virtual bool readOnly() { return true; }
+ virtual bool requiresAuth() { return false; }
+ virtual bool logTheOp() {
+ return false;
+ }
+ virtual bool slaveOk() {
+ return true;
+ }
+ virtual void help( stringstream& help ) const {
+ help << "return error status of the last operation";
+ }
+ CmdGetLastError() : Command("getlasterror") {}
+ bool run(const char *ns, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ LastError *le = lastError.disableForCommand();
+ if ( le->nPrev != 1 )
+ LastError::noError.appendSelf( result );
+ else
+ le->appendSelf( result );
+
+ if ( cmdObj["fsync"].trueValue() ){
+ log() << "fsync from getlasterror" << endl;
+ result.append( "fsyncFiles" , MemoryMappedFile::flushAll( true ) );
+ }
+
+ return true;
+ }
+ } cmdGetLastError;
+
+ /* for testing purposes only */
+ class CmdForceError : public Command {
+ public:
+ virtual bool logTheOp() {
+ return false;
+ }
+ virtual bool slaveOk() {
+ return true;
+ }
+ CmdForceError() : Command("forceerror") {}
+ bool run(const char *ns, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ uassert( 10038 , "forced error", false);
+ return true;
+ }
+ } cmdForceError;
+
+ class CmdGetPrevError : public Command {
+ public:
+ virtual bool readOnly() { return true; }
+ virtual bool requiresAuth() { return false; }
+ virtual bool logTheOp() {
+ return false;
+ }
+ virtual void help( stringstream& help ) const {
+ help << "check for errors since last reseterror commandcal";
+ }
+ virtual bool slaveOk() {
+ return true;
+ }
+ CmdGetPrevError() : Command("getpreverror") {}
+ bool run(const char *ns, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ LastError *le = lastError.disableForCommand();
+ le->appendSelf( result );
+ if ( le->valid )
+ result.append( "nPrev", le->nPrev );
+ else
+ result.append( "nPrev", -1 );
+ return true;
+ }
+ } cmdGetPrevError;
+
+ class CmdSwitchToClientErrors : public Command {
+ public:
+ virtual bool requiresAuth() { return false; }
+ virtual bool logTheOp() {
+ return false;
+ }
+ virtual void help( stringstream& help ) const {
+ help << "convert to id based errors rather than connection based";
+ }
+ virtual bool slaveOk() {
+ return true;
+ }
+ CmdSwitchToClientErrors() : Command("switchtoclienterrors") {}
+ bool run(const char *ns, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ if ( lastError.getID() ){
+ errmsg = "already in client id mode";
+ return false;
+ }
+ LastError *le = lastError.disableForCommand();
+ le->overridenById = true;
+ result << "ok" << 1;
+ return true;
+ }
+ } cmdSwitchToClientErrors;
+
+ class CmdDropDatabase : public Command {
+ public:
+ virtual bool logTheOp() {
+ return true;
+ }
+ virtual void help( stringstream& help ) const {
+ help << "drop (delete) this database";
+ }
+ virtual bool slaveOk() {
+ return false;
+ }
+ CmdDropDatabase() : Command("dropDatabase") {}
+ bool run(const char *ns, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ BSONElement e = cmdObj.findElement(name);
+ log() << "dropDatabase " << ns << endl;
+ int p = (int) e.number();
+ if ( p != 1 )
+ return false;
+ dropDatabase(ns);
+ result.append( "dropped" , ns );
+ return true;
+ }
+ } cmdDropDatabase;
+
+ class CmdRepairDatabase : public Command {
+ public:
+ virtual bool logTheOp() {
+ return false;
+ }
+ virtual bool slaveOk() {
+ return true;
+ }
+ virtual void help( stringstream& help ) const {
+ help << "repair database. also compacts. note: slow.";
+ }
+ CmdRepairDatabase() : Command("repairDatabase") {}
+ bool run(const char *ns, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ BSONElement e = cmdObj.findElement(name);
+ log() << "repairDatabase " << ns << endl;
+ int p = (int) e.number();
+ if ( p != 1 )
+ return false;
+ e = cmdObj.findElement( "preserveClonedFilesOnFailure" );
+ bool preserveClonedFilesOnFailure = e.isBoolean() && e.boolean();
+ e = cmdObj.findElement( "backupOriginalFiles" );
+ bool backupOriginalFiles = e.isBoolean() && e.boolean();
+ return repairDatabase( ns, errmsg, preserveClonedFilesOnFailure, backupOriginalFiles );
+ }
+ } cmdRepairDatabase;
+
+ /* set db profiling level
+ todo: how do we handle profiling information put in the db with replication?
+ sensibly or not?
+ */
+ class CmdProfile : public Command {
+ public:
+ virtual bool slaveOk() {
+ return true;
+ }
+ virtual void help( stringstream& help ) const {
+ help << "enable or disable performance profiling";
+ }
+ CmdProfile() : Command("profile") {}
+ bool run(const char *ns, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ BSONElement e = cmdObj.findElement(name);
+ result.append("was", (double) cc().database()->profile);
+ int p = (int) e.number();
+ bool ok = false;
+ if ( p == -1 )
+ ok = true;
+ else if ( p >= 0 && p <= 2 ) {
+ ok = cc().database()->setProfilingLevel( p , errmsg );
+ }
+
+ BSONElement slow = cmdObj["slowms"];
+ if ( slow.isNumber() )
+ cmdLine.slowMS = slow.numberInt();
+
+ return ok;
+ }
+ } cmdProfile;
+
+ class CmdServerStatus : public Command {
+ public:
+ virtual bool slaveOk() {
+ return true;
+ }
+ CmdServerStatus() : Command("serverStatus") {
+ started = time(0);
+ }
+ bool run(const char *ns, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+
+ result.append("uptime",(double) (time(0)-started));
+
+ {
+ BSONObjBuilder t;
+
+ unsigned long long last, start, timeLocked;
+ dbMutex.info().getTimingInfo(start, timeLocked);
+ last = curTimeMicros64();
+ double tt = (double) last-start;
+ double tl = (double) timeLocked;
+ t.append("totalTime", tt);
+ t.append("lockTime", tl);
+ t.append("ratio", tl/tt);
+
+ result.append( "globalLock" , t.obj() );
+ }
+
+ {
+
+ BSONObjBuilder t( result.subobjStart( "mem" ) );
+
+ ProcessInfo p;
+ if ( p.supported() ){
+ t.append( "resident" , p.getResidentSize() );
+ t.append( "virtual" , p.getVirtualMemorySize() );
+ t.appendBool( "supported" , true );
+ }
+ else {
+ result.append( "note" , "not all mem info support on this platform" );
+ t.appendBool( "supported" , false );
+ }
+
+ t.append( "mapped" , MemoryMappedFile::totalMappedLength() / ( 1024 * 1024 ) );
+
+ t.done();
+
+ }
+
+ {
+ BSONObjBuilder bb( result.subobjStart( "connections" ) );
+ bb.append( "current" , connTicketHolder.used() );
+ bb.append( "available" , connTicketHolder.available() );
+ bb.done();
+ }
+ {
+ BSONObjBuilder bb( result.subobjStart( "extra_info" ) );
+ bb.append("note", "fields vary by platform");
+ ProcessInfo p;
+ p.getExtraInfo(bb);
+ bb.done();
+ }
+
+ result.append( "opcounters" , globalOpCounters.getObj() );
+
+ return true;
+ }
+ time_t started;
+ } cmdServerStatus;
+
+ /* just to check if the db has asserted */
+ class CmdAssertInfo : public Command {
+ public:
+ virtual bool slaveOk() {
+ return true;
+ }
+ virtual void help( stringstream& help ) const {
+ help << "check if any asserts have occurred on the server";
+ }
+ CmdAssertInfo() : Command("assertinfo") {}
+ bool run(const char *ns, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ result.appendBool("dbasserted", lastAssert[0].isSet() || lastAssert[1].isSet() || lastAssert[2].isSet());
+ result.appendBool("asserted", lastAssert[0].isSet() || lastAssert[1].isSet() || lastAssert[2].isSet() || lastAssert[3].isSet());
+ result.append("assert", lastAssert[AssertRegular].toString());
+ result.append("assertw", lastAssert[AssertW].toString());
+ result.append("assertmsg", lastAssert[AssertMsg].toString());
+ result.append("assertuser", lastAssert[AssertUser].toString());
+ return true;
+ }
+ } cmdAsserts;
+
+ class CmdGetOpTime : public Command {
+ public:
+ virtual bool slaveOk() {
+ return true;
+ }
+ CmdGetOpTime() : Command("getoptime") { }
+ bool run(const char *ns, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ result.appendDate("optime", OpTime::now().asDate());
+ return true;
+ }
+ } cmdgetoptime;
+
+ /*
+ class Cmd : public Command {
+ public:
+ Cmd() : Command("") { }
+ bool adminOnly() { return true; }
+ bool run(const char *ns, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result) {
+ return true;
+ }
+ } cmd;
+ */
+
+ class CmdDiagLogging : public Command {
+ public:
+ virtual bool slaveOk() {
+ return true;
+ }
+ CmdDiagLogging() : Command("diagLogging") { }
+ bool adminOnly() {
+ return true;
+ }
+ bool run(const char *ns, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool) {
+ int was = _diaglog.setLevel( cmdObj.firstElement().numberInt() );
+ stringstream ss;
+ flushOpLog( ss );
+ out() << ss.str() << endl;
+ if ( !cmdLine.quiet )
+ log() << "CMD: diagLogging set to " << _diaglog.level << " from: " << was << endl;
+ result.append( "was" , was );
+ return true;
+ }
+ } cmddiaglogging;
+
+ /* remove bit from a bit array - actually remove its slot, not a clear
+ note: this function does not work with x == 63 -- that is ok
+ but keep in mind in the future if max indexes were extended to
+ exactly 64 it would be a problem
+ */
+ unsigned long long removeBit(unsigned long long b, int x) {
+ unsigned long long tmp = b;
+ return
+ (tmp & ((((unsigned long long) 1) << x)-1)) |
+ ((tmp >> (x+1)) << x);
+ }
+
+ struct DBCommandsUnitTest {
+ DBCommandsUnitTest() {
+ assert( removeBit(1, 0) == 0 );
+ assert( removeBit(2, 0) == 1 );
+ assert( removeBit(2, 1) == 0 );
+ assert( removeBit(255, 1) == 127 );
+ assert( removeBit(21, 2) == 9 );
+ assert( removeBit(0x4000000000000001ULL, 62) == 1 );
+ }
+ } dbc_unittest;
+
+ bool deleteIndexes( NamespaceDetails *d, const char *ns, const char *name, string &errmsg, BSONObjBuilder &anObjBuilder, bool mayDeleteIdIndex ) {
+
+ d->aboutToDeleteAnIndex();
+
+ /* there may be pointers pointing at keys in the btree(s). kill them. */
+ ClientCursor::invalidate(ns);
+
+ // delete a specific index or all?
+ if ( *name == '*' && name[1] == 0 ) {
+ log(4) << " d->nIndexes was " << d->nIndexes << '\n';
+ anObjBuilder.append("nIndexesWas", (double)d->nIndexes);
+ IndexDetails *idIndex = 0;
+ if( d->nIndexes ) {
+ for ( int i = 0; i < d->nIndexes; i++ ) {
+ if ( !mayDeleteIdIndex && d->idx(i).isIdIndex() ) {
+ idIndex = &d->idx(i);
+ } else {
+ d->idx(i).kill_idx();
+ }
+ }
+ d->nIndexes = 0;
+ }
+ if ( idIndex ) {
+ d->addIndex(ns) = *idIndex;
+ wassert( d->nIndexes == 1 );
+ }
+ /* assuming here that id index is not multikey: */
+ d->multiKeyIndexBits = 0;
+ anObjBuilder.append("msg", "all indexes deleted for collection");
+ }
+ else {
+ // delete just one index
+ int x = d->findIndexByName(name);
+ if ( x >= 0 ) {
+ log(4) << " d->nIndexes was " << d->nIndexes << endl;
+ anObjBuilder.append("nIndexesWas", (double)d->nIndexes);
+
+ /* note it is important we remove the IndexDetails with this
+ call, otherwise, on recreate, the old one would be reused, and its
+ IndexDetails::info ptr would be bad info.
+ */
+ IndexDetails *id = &d->idx(x);
+ if ( !mayDeleteIdIndex && id->isIdIndex() ) {
+ errmsg = "may not delete _id index";
+ return false;
+ }
+ id->kill_idx();
+ d->multiKeyIndexBits = removeBit(d->multiKeyIndexBits, x);
+ d->nIndexes--;
+ for ( int i = x; i < d->nIndexes; i++ )
+ d->idx(i) = d->idx(i+1);
+ } else {
+ log() << "deleteIndexes: " << name << " not found" << endl;
+ errmsg = "index not found";
+ return false;
+ }
+ }
+ return true;
+ }
+
+ /* drop collection */
+ class CmdDrop : public Command {
+ public:
+ CmdDrop() : Command("drop") { }
+ virtual bool logTheOp() {
+ return true;
+ }
+ virtual bool slaveOk() {
+ return false;
+ }
+ virtual bool adminOnly() {
+ return false;
+ }
+ virtual bool run(const char *ns, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool) {
+ string nsToDrop = cc().database()->name + '.' + cmdObj.findElement(name).valuestr();
+ NamespaceDetails *d = nsdetails(nsToDrop.c_str());
+ if ( !cmdLine.quiet )
+ log() << "CMD: drop " << nsToDrop << endl;
+ if ( d == 0 ) {
+ errmsg = "ns not found";
+ return false;
+ }
+ uassert( 10039 , "can't drop collection with reserved $ character in name", strchr(nsToDrop.c_str(), '$') == 0 );
+ dropCollection( nsToDrop, errmsg, result );
+ return true;
+ }
+ } cmdDrop;
+
+ /* select count(*) */
+ class CmdCount : public Command {
+ public:
+ virtual bool readOnly() { return true; }
+ CmdCount() : Command("count") { }
+ virtual bool logTheOp() {
+ return false;
+ }
+ virtual bool slaveOk() {
+ // ok on --slave setups, not ok for nonmaster of a repl pair (unless override)
+ return slave == SimpleSlave;
+ }
+ virtual bool slaveOverrideOk() {
+ return true;
+ }
+ virtual bool adminOnly() {
+ return false;
+ }
+ virtual bool run(const char *_ns, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool) {
+ string ns = cc().database()->name + '.' + cmdObj.findElement(name).valuestr();
+ string err;
+ long long n = runCount(ns.c_str(), cmdObj, err);
+ long long nn = n;
+ bool ok = true;
+ if ( n == -1 ){
+ nn = 0;
+ result.appendBool( "missing" , true );
+ }
+ else if ( n < 0 ) {
+ nn = 0;
+ ok = false;
+ if ( !err.empty() )
+ errmsg = err;
+ }
+ result.append("n", (double) nn);
+ return ok;
+ }
+ } cmdCount;
+
+ /* create collection */
+ class CmdCreate : public Command {
+ public:
+ CmdCreate() : Command("create") { }
+ virtual bool logTheOp() {
+ return false;
+ }
+ virtual bool slaveOk() {
+ return false;
+ }
+ virtual bool adminOnly() {
+ return false;
+ }
+ virtual void help( stringstream& help ) const {
+ help << "create a collection";
+ }
+ virtual bool run(const char *_ns, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool) {
+ string ns = cc().database()->name + '.' + cmdObj.findElement(name).valuestr();
+ string err;
+ bool ok = userCreateNS(ns.c_str(), cmdObj, err, true);
+ if ( !ok && !err.empty() )
+ errmsg = err;
+ return ok;
+ }
+ } cmdCreate;
+
+ class CmdDeleteIndexes : public Command {
+ public:
+ virtual bool logTheOp() {
+ return true;
+ }
+ virtual bool slaveOk() {
+ return false;
+ }
+ virtual void help( stringstream& help ) const {
+ help << "delete indexes for a collection";
+ }
+ CmdDeleteIndexes() : Command("deleteIndexes") { }
+ bool run(const char *ns, BSONObj& jsobj, string& errmsg, BSONObjBuilder& anObjBuilder, bool /*fromRepl*/) {
+ /* note: temp implementation. space not reclaimed! */
+ BSONElement e = jsobj.findElement(name.c_str());
+ string toDeleteNs = cc().database()->name + '.' + e.valuestr();
+ NamespaceDetails *d = nsdetails(toDeleteNs.c_str());
+ if ( !cmdLine.quiet )
+ log() << "CMD: deleteIndexes " << toDeleteNs << endl;
+ if ( d ) {
+ BSONElement f = jsobj.findElement("index");
+ if ( f.type() == String ) {
+ return deleteIndexes( d, toDeleteNs.c_str(), f.valuestr(), errmsg, anObjBuilder, false );
+ }
+ else {
+ errmsg = "invalid index name spec";
+ return false;
+ }
+ }
+ else {
+ errmsg = "ns not found";
+ return false;
+ }
+ }
+ } cmdDeleteIndexes;
+
+ class CmdReIndex : public Command {
+ public:
+ virtual bool logTheOp() {
+ return true;
+ }
+ virtual bool slaveOk() {
+ return false;
+ }
+ virtual void help( stringstream& help ) const {
+ help << "re-index a collection";
+ }
+ CmdReIndex() : Command("reIndex") { }
+ bool run(const char *ns, BSONObj& jsobj, string& errmsg, BSONObjBuilder& result, bool /*fromRepl*/) {
+ static DBDirectClient db;
+
+ BSONElement e = jsobj.findElement(name.c_str());
+ string toDeleteNs = cc().database()->name + '.' + e.valuestr();
+ NamespaceDetails *d = nsdetails(toDeleteNs.c_str());
+ log() << "CMD: reIndex " << toDeleteNs << endl;
+
+ if ( ! d ){
+ errmsg = "ns not found";
+ return false;
+ }
+
+ list<BSONObj> all;
+ auto_ptr<DBClientCursor> i = db.getIndexes( toDeleteNs );
+ BSONObjBuilder b;
+ while ( i->more() ){
+ BSONObj o = i->next().getOwned();
+ b.append( BSONObjBuilder::numStr( all.size() ) , o );
+ all.push_back( o );
+ }
+
+
+ bool ok = deleteIndexes( d, toDeleteNs.c_str(), "*" , errmsg, result, true );
+ if ( ! ok ){
+ errmsg = "deleteIndexes failed";
+ return false;
+ }
+
+ for ( list<BSONObj>::iterator i=all.begin(); i!=all.end(); i++ ){
+ BSONObj o = *i;
+ db.insert( Namespace( toDeleteNs.c_str() ).getSisterNS( "system.indexes" ).c_str() , o );
+ }
+
+ result.append( "ok" , 1 );
+ result.append( "nIndexes" , (int)all.size() );
+ result.appendArray( "indexes" , b.obj() );
+ return true;
+ }
+ } cmdReIndex;
+
+
+
+ class CmdListDatabases : public Command {
+ public:
+ virtual bool logTheOp() {
+ return false;
+ }
+ virtual bool slaveOk() {
+ return true;
+ }
+ virtual bool slaveOverrideOk() {
+ return true;
+ }
+ virtual bool adminOnly() {
+ return true;
+ }
+ CmdListDatabases() : Command("listDatabases") {}
+ bool run(const char *ns, BSONObj& jsobj, string& errmsg, BSONObjBuilder& result, bool /*fromRepl*/) {
+ vector< string > dbNames;
+ getDatabaseNames( dbNames );
+ vector< BSONObj > dbInfos;
+
+ set<string> seen;
+ boost::intmax_t totalSize = 0;
+ for ( vector< string >::iterator i = dbNames.begin(); i != dbNames.end(); ++i ) {
+ BSONObjBuilder b;
+ b.append( "name", i->c_str() );
+ boost::intmax_t size = dbSize( i->c_str() );
+ b.append( "sizeOnDisk", (double) size );
+ setClient( i->c_str() );
+ b.appendBool( "empty", cc().database()->isEmpty() );
+ totalSize += size;
+ dbInfos.push_back( b.obj() );
+
+ seen.insert( i->c_str() );
+ }
+
+ // TODO: erh 1/1/2010 I think this is broken where path != dbpath ??
+ set<string> allShortNames;
+ dbHolder.getAllShortNames( allShortNames );
+ for ( set<string>::iterator i = allShortNames.begin(); i != allShortNames.end(); i++ ){
+ string name = *i;
+
+ if ( seen.count( name ) )
+ continue;
+
+ BSONObjBuilder b;
+ b << "name" << name << "sizeOnDisk" << double( 1 );
+ setClient( name.c_str() );
+ b.appendBool( "empty", cc().database()->isEmpty() );
+
+ dbInfos.push_back( b.obj() );
+ }
+
+ result.append( "databases", dbInfos );
+ result.append( "totalSize", double( totalSize ) );
+ return true;
+ }
+ } cmdListDatabases;
+
+ class CmdCloseAllDatabases : public Command {
+ public:
+ virtual bool adminOnly() { return true; }
+ virtual bool slaveOk() { return false; }
+ CmdCloseAllDatabases() : Command( "closeAllDatabases" ) {}
+ bool run(const char *ns, BSONObj& jsobj, string& errmsg, BSONObjBuilder& result, bool /*fromRepl*/) {
+ return dbHolder.closeAll( dbpath , result );
+ }
+ } cmdCloseAllDatabases;
+
+ class CmdFileMD5 : public Command {
+ public:
+ CmdFileMD5() : Command( "filemd5" ){}
+ virtual bool slaveOk() {
+ return true;
+ }
+ virtual void help( stringstream& help ) const {
+ help << " example: { filemd5 : ObjectId(aaaaaaa) , key : { ts : 1 } }";
+ }
+ bool run(const char *dbname, BSONObj& jsobj, string& errmsg, BSONObjBuilder& result, bool fromRepl ){
+ static DBDirectClient db;
+
+ string ns = nsToDatabase( dbname );
+ ns += ".";
+ {
+ string root = jsobj.getStringField( "root" );
+ if ( root.size() == 0 )
+ root = "fs";
+ ns += root;
+ }
+ ns += ".chunks"; // make this an option in jsobj
+
+ BSONObjBuilder query;
+ query.appendAs( jsobj["filemd5"] , "files_id" );
+ Query q( query.obj() );
+ q.sort( BSON( "files_id" << 1 << "n" << 1 ) );
+
+ md5digest d;
+ md5_state_t st;
+ md5_init(&st);
+
+ dbtemprelease temp;
+
+ auto_ptr<DBClientCursor> cursor = db.query( ns.c_str() , q );
+ int n = 0;
+ while ( cursor->more() ){
+ BSONObj c = cursor->next();
+ int myn = c.getIntField( "n" );
+ if ( n != myn ){
+ log() << "should have chunk: " << n << " have:" << myn << endl;
+ uassert( 10040 , "chunks out of order" , n == myn );
+ }
+
+ int len;
+ const char * data = c["data"].binData( len );
+ md5_append( &st , (const md5_byte_t*)(data + 4) , len - 4 );
+
+ n++;
+ }
+ md5_finish(&st, d);
+
+ result.append( "md5" , digestToString( d ) );
+ return true;
+ }
+ } cmdFileMD5;
+
+ IndexDetails *cmdIndexDetailsForRange( const char *ns, string &errmsg, BSONObj &min, BSONObj &max, BSONObj &keyPattern ) {
+ if ( ns[ 0 ] == '\0' || min.isEmpty() || max.isEmpty() ) {
+ errmsg = "invalid command syntax (note: min and max are required)";
+ return 0;
+ }
+ return indexDetailsForRange( ns, errmsg, min, max, keyPattern );
+ }
+
+ class CmdMedianKey : public Command {
+ public:
+ CmdMedianKey() : Command( "medianKey" ) {}
+ virtual bool slaveOk() { return true; }
+ virtual void help( stringstream &help ) const {
+ help << " example: { medianKey:\"blog.posts\", keyPattern:{x:1}, min:{x:10}, max:{x:55} }\n"
+ "NOTE: This command may take awhile to run";
+ }
+ bool run(const char *dbname, BSONObj& jsobj, string& errmsg, BSONObjBuilder& result, bool fromRepl ){
+ const char *ns = jsobj.getStringField( "medianKey" );
+ BSONObj min = jsobj.getObjectField( "min" );
+ BSONObj max = jsobj.getObjectField( "max" );
+ BSONObj keyPattern = jsobj.getObjectField( "keyPattern" );
+
+ IndexDetails *id = cmdIndexDetailsForRange( ns, errmsg, min, max, keyPattern );
+ if ( id == 0 )
+ return false;
+
+ Timer t;
+ int num = 0;
+ NamespaceDetails *d = nsdetails(ns);
+ int idxNo = d->idxNo(*id);
+ for( BtreeCursor c( d, idxNo, *id, min, max, false, 1 ); c.ok(); c.advance(), ++num );
+ num /= 2;
+ BtreeCursor c( d, idxNo, *id, min, max, false, 1 );
+ for( ; num; c.advance(), --num );
+ int ms = t.millis();
+ if ( ms > cmdLine.slowMS ) {
+ out() << "Finding median for index: " << keyPattern << " between " << min << " and " << max << " took " << ms << "ms." << endl;
+ }
+
+ if ( !c.ok() ) {
+ errmsg = "no index entries in the specified range";
+ return false;
+ }
+
+ result.append( "median", c.prettyKey( c.currKey() ) );
+ return true;
+ }
+ } cmdMedianKey;
+
+ class CmdDatasize : public Command {
+ public:
+ CmdDatasize() : Command( "datasize" ) {}
+ virtual bool slaveOk() { return true; }
+ virtual void help( stringstream &help ) const {
+ help <<
+ "\ndetermine data size for a set of data in a certain range"
+ "\nexample: { datasize:\"blog.posts\", keyPattern:{x:1}, min:{x:10}, max:{x:55} }"
+ "\nkeyPattern, min, and max parameters are optional."
+ "\nnot: This command may take a while to run";
+ }
+ bool run(const char *dbname, BSONObj& jsobj, string& errmsg, BSONObjBuilder& result, bool fromRepl ){
+ const char *ns = jsobj.getStringField( "datasize" );
+ BSONObj min = jsobj.getObjectField( "min" );
+ BSONObj max = jsobj.getObjectField( "max" );
+ BSONObj keyPattern = jsobj.getObjectField( "keyPattern" );
+
+ auto_ptr< Cursor > c;
+ if ( min.isEmpty() && max.isEmpty() ) {
+ setClient( ns );
+ c = theDataFileMgr.findAll( ns );
+ } else if ( min.isEmpty() || max.isEmpty() ) {
+ errmsg = "only one of min or max specified";
+ return false;
+ } else {
+ IndexDetails *idx = cmdIndexDetailsForRange( ns, errmsg, min, max, keyPattern );
+ if ( idx == 0 )
+ return false;
+ NamespaceDetails *d = nsdetails(ns);
+ c.reset( new BtreeCursor( d, d->idxNo(*idx), *idx, min, max, false, 1 ) );
+ }
+
+ Timer t;
+ long long size = 0;
+ long long numObjects = 0;
+ while( c->ok() ) {
+ size += c->current().objsize();
+ c->advance();
+ numObjects++;
+ }
+ int ms = t.millis();
+ if ( ms > cmdLine.slowMS ) {
+ if ( min.isEmpty() ) {
+ out() << "Finding size for ns: " << ns << " took " << ms << "ms." << endl;
+ } else {
+ out() << "Finding size for ns: " << ns << " between " << min << " and " << max << " took " << ms << "ms." << endl;
+ }
+ }
+
+ result.append( "size", (double)size );
+ result.append( "numObjects" , (double)numObjects );
+ return true;
+ }
+ } cmdDatasize;
+
+ class CollectionStats : public Command {
+ public:
+ CollectionStats() : Command( "collstats" ) {}
+ virtual bool slaveOk() { return true; }
+ virtual void help( stringstream &help ) const {
+ help << " example: { collstats:\"blog.posts\" } ";
+ }
+ bool run(const char *dbname, BSONObj& jsobj, string& errmsg, BSONObjBuilder& result, bool fromRepl ){
+ string ns = dbname;
+ if ( ns.find( "." ) != string::npos )
+ ns = ns.substr( 0 , ns.find( "." ) );
+ ns += ".";
+ ns += jsobj.firstElement().valuestr();
+
+ NamespaceDetails * nsd = nsdetails( ns.c_str() );
+ if ( ! nsd ){
+ errmsg = "ns not found";
+ return false;
+ }
+
+ result.append( "ns" , ns.c_str() );
+
+ result.append( "count" , nsd->nrecords );
+ result.append( "size" , nsd->datasize );
+ result.append( "storageSize" , nsd->storageSize() );
+ result.append( "nindexes" , nsd->nIndexes );
+
+ if ( nsd->capped ){
+ result.append( "capped" , nsd->capped );
+ result.append( "max" , nsd->max );
+ }
+
+ return true;
+ }
+ } cmdCollectionStatis;
+
+ class CmdBuildInfo : public Command {
+ public:
+ CmdBuildInfo() : Command( "buildinfo" ) {}
+ virtual bool slaveOk() { return true; }
+ virtual bool adminOnly() { return true; }
+ virtual void help( stringstream &help ) const {
+ help << "example: { buildinfo:1 }";
+ }
+ bool run(const char *dbname, BSONObj& jsobj, string& errmsg, BSONObjBuilder& result, bool fromRepl ){
+ result << "version" << versionString << "gitVersion" << gitVersion() << "sysInfo" << sysInfo();
+ result << "bits" << ( sizeof( int* ) == 4 ? 32 : 64 );
+ return true;
+ }
+ } cmdBuildInfo;
+
+ class CmdCloneCollectionAsCapped : public Command {
+ public:
+ CmdCloneCollectionAsCapped() : Command( "cloneCollectionAsCapped" ) {}
+ virtual bool slaveOk() { return false; }
+ virtual void help( stringstream &help ) const {
+ help << "example: { cloneCollectionAsCapped:<fromName>, toCollection:<toName>, size:<sizeInBytes> }";
+ }
+ bool run(const char *dbname, BSONObj& jsobj, string& errmsg, BSONObjBuilder& result, bool fromRepl ){
+ string from = jsobj.getStringField( "cloneCollectionAsCapped" );
+ string to = jsobj.getStringField( "toCollection" );
+ long long size = (long long)jsobj.getField( "size" ).number();
+
+ if ( from.empty() || to.empty() || size == 0 ) {
+ errmsg = "invalid command spec";
+ return false;
+ }
+
+ char realDbName[256];
+ nsToDatabase( dbname, realDbName );
+
+ string fromNs = string( realDbName ) + "." + from;
+ string toNs = string( realDbName ) + "." + to;
+ massert( 10300 , "source collection " + fromNs + " does not exist", !setClient( fromNs.c_str() ) );
+ NamespaceDetails *nsd = nsdetails( fromNs.c_str() );
+ massert( 10301 , "source collection " + fromNs + " does not exist", nsd );
+ long long excessSize = nsd->datasize - size * 2;
+ DiskLoc extent = nsd->firstExtent;
+ for( ; excessSize > 0 && extent != nsd->lastExtent; extent = extent.ext()->xnext ) {
+ excessSize -= extent.ext()->length;
+ if ( excessSize > 0 )
+ log( 2 ) << "cloneCollectionAsCapped skipping extent of size " << extent.ext()->length << endl;
+ log( 6 ) << "excessSize: " << excessSize << endl;
+ }
+ DiskLoc startLoc = extent.ext()->firstRecord;
+
+ CursorId id;
+ {
+ auto_ptr< Cursor > c = theDataFileMgr.findAll( fromNs.c_str(), startLoc );
+ ClientCursor *cc = new ClientCursor();
+ cc->c = c;
+ cc->ns = fromNs;
+ cc->matcher.reset( new CoveredIndexMatcher( BSONObj(), fromjson( "{$natural:1}" ) ) );
+ id = cc->cursorid;
+ }
+
+ DBDirectClient client;
+ setClient( toNs.c_str() );
+ BSONObjBuilder spec;
+ spec.appendBool( "capped", true );
+ spec.append( "size", double( size ) );
+ if ( !userCreateNS( toNs.c_str(), spec.done(), errmsg, true ) )
+ return false;
+
+ auto_ptr< DBClientCursor > c = client.getMore( fromNs, id );
+ while( c->more() ) {
+ BSONObj obj = c->next();
+ theDataFileMgr.insertAndLog( toNs.c_str(), obj, true );
+ }
+
+ return true;
+ }
+ } cmdCloneCollectionAsCapped;
+
+ class CmdConvertToCapped : public Command {
+ public:
+ CmdConvertToCapped() : Command( "convertToCapped" ) {}
+ virtual bool slaveOk() { return false; }
+ virtual void help( stringstream &help ) const {
+ help << "example: { convertToCapped:<fromCollectionName>, size:<sizeInBytes> }";
+ }
+ bool run(const char *dbname, BSONObj& jsobj, string& errmsg, BSONObjBuilder& result, bool fromRepl ){
+ string from = jsobj.getStringField( "convertToCapped" );
+ long long size = (long long)jsobj.getField( "size" ).number();
+
+ if ( from.empty() || size == 0 ) {
+ errmsg = "invalid command spec";
+ return false;
+ }
+
+ char realDbName[256];
+ nsToDatabase( dbname, realDbName );
+
+ DBDirectClient client;
+ client.dropCollection( string( realDbName ) + "." + from + ".$temp_convertToCapped" );
+
+ BSONObj info;
+ if ( !client.runCommand( realDbName,
+ BSON( "cloneCollectionAsCapped" << from << "toCollection" << ( from + ".$temp_convertToCapped" ) << "size" << double( size ) ),
+ info ) ) {
+ errmsg = "cloneCollectionAsCapped failed: " + string(info);
+ return false;
+ }
+
+ if ( !client.dropCollection( string( realDbName ) + "." + from ) ) {
+ errmsg = "failed to drop original collection";
+ return false;
+ }
+
+ if ( !client.runCommand( "admin",
+ BSON( "renameCollection" << ( string( realDbName ) + "." + from + ".$temp_convertToCapped" ) << "to" << ( string( realDbName ) + "." + from ) ),
+ info ) ) {
+ errmsg = "renameCollection failed: " + string(info);
+ return false;
+ }
+
+ return true;
+ }
+ } cmdConvertToCapped;
+
+ class GroupCommand : public Command {
+ public:
+ GroupCommand() : Command("group"){}
+ virtual bool slaveOk() { return true; }
+ virtual void help( stringstream &help ) const {
+ help << "see http://www.mongodb.org/display/DOCS/Aggregation";
+ }
+
+ BSONObj getKey( const BSONObj& obj , const BSONObj& keyPattern , ScriptingFunction func , double avgSize , Scope * s ){
+ if ( func ){
+ BSONObjBuilder b( obj.objsize() + 32 );
+ b.append( "0" , obj );
+ int res = s->invoke( func , b.obj() );
+ uassert( 10041 , (string)"invoke failed in $keyf: " + s->getError() , res == 0 );
+ int type = s->type("return");
+ uassert( 10042 , "return of $key has to be an object" , type == Object );
+ return s->getObject( "return" );
+ }
+ return obj.extractFields( keyPattern , true );
+ }
+
+ bool group( string realdbname , auto_ptr<DBClientCursor> cursor ,
+ BSONObj keyPattern , string keyFunctionCode , string reduceCode , const char * reduceScope ,
+ BSONObj initial , string finalize ,
+ string& errmsg , BSONObjBuilder& result ){
+
+
+ auto_ptr<Scope> s = globalScriptEngine->getPooledScope( realdbname );
+ s->localConnect( realdbname.c_str() );
+
+ if ( reduceScope )
+ s->init( reduceScope );
+
+ s->setObject( "$initial" , initial , true );
+
+ s->exec( "$reduce = " + reduceCode , "reduce setup" , false , true , true , 100 );
+ s->exec( "$arr = [];" , "reduce setup 2" , false , true , true , 100 );
+ ScriptingFunction f = s->createFunction(
+ "function(){ "
+ " if ( $arr[n] == null ){ "
+ " next = {}; "
+ " Object.extend( next , $key ); "
+ " Object.extend( next , $initial , true ); "
+ " $arr[n] = next; "
+ " next = null; "
+ " } "
+ " $reduce( obj , $arr[n] ); "
+ "}" );
+
+ ScriptingFunction keyFunction = 0;
+ if ( keyFunctionCode.size() ){
+ keyFunction = s->createFunction( keyFunctionCode.c_str() );
+ }
+
+
+ double keysize = keyPattern.objsize() * 3;
+ double keynum = 1;
+
+ map<BSONObj,int,BSONObjCmp> map;
+ list<BSONObj> blah;
+
+ while ( cursor->more() ){
+ BSONObj obj = cursor->next();
+ BSONObj key = getKey( obj , keyPattern , keyFunction , keysize / keynum , s.get() );
+ keysize += key.objsize();
+ keynum++;
+
+ int& n = map[key];
+ if ( n == 0 ){
+ n = map.size();
+ s->setObject( "$key" , key , true );
+
+ uassert( 10043 , "group() can't handle more than 10000 unique keys" , n <= 10000 );
+ }
+
+ s->setObject( "obj" , obj , true );
+ s->setNumber( "n" , n - 1 );
+ if ( s->invoke( f , BSONObj() , 0 , true ) ){
+ throw UserException( 9010 , (string)"reduce invoke failed: " + s->getError() );
+ }
+ }
+
+ if (!finalize.empty()){
+ s->exec( "$finalize = " + finalize , "finalize define" , false , true , true , 100 );
+ ScriptingFunction g = s->createFunction(
+ "function(){ "
+ " for(var i=0; i < $arr.length; i++){ "
+ " var ret = $finalize($arr[i]); "
+ " if (ret !== undefined) "
+ " $arr[i] = ret; "
+ " } "
+ "}" );
+ s->invoke( g , BSONObj() , 0 , true );
+ }
+
+ result.appendArray( "retval" , s->getObject( "$arr" ) );
+ result.append( "count" , keynum - 1 );
+ result.append( "keys" , (int)(map.size()) );
+ s->exec( "$arr = [];" , "reduce setup 2" , false , true , true , 100 );
+ s->gc();
+
+ return true;
+ }
+
+ bool run(const char *dbname, BSONObj& jsobj, string& errmsg, BSONObjBuilder& result, bool fromRepl ){
+ static DBDirectClient db;
+
+ /* db.$cmd.findOne( { group : <p> } ) */
+ const BSONObj& p = jsobj.firstElement().embeddedObjectUserCheck();
+
+ BSONObj q;
+ if ( p["cond"].type() == Object )
+ q = p["cond"].embeddedObject();
+ else if ( p["condition"].type() == Object )
+ q = p["condition"].embeddedObject();
+ else
+ q = getQuery( p );
+
+ string ns = dbname;
+ ns = ns.substr( 0 , ns.size() - 4 );
+ string realdbname = ns.substr( 0 , ns.size() - 1 );
+
+ if ( p["ns"].type() != String ){
+ errmsg = "ns has to be set";
+ return false;
+ }
+
+ ns += p["ns"].valuestr();
+
+ auto_ptr<DBClientCursor> cursor = db.query( ns , q );
+
+ BSONObj key;
+ string keyf;
+ if ( p["key"].type() == Object ){
+ key = p["key"].embeddedObjectUserCheck();
+ if ( ! p["$keyf"].eoo() ){
+ errmsg = "can't have key and $keyf";
+ return false;
+ }
+ }
+ else if ( p["$keyf"].type() ){
+ keyf = p["$keyf"].ascode();
+ }
+ else {
+ // no key specified, will use entire object as key
+ }
+
+ BSONElement reduce = p["$reduce"];
+ if ( reduce.eoo() ){
+ errmsg = "$reduce has to be set";
+ return false;
+ }
+
+ BSONElement initial = p["initial"];
+ if ( initial.type() != Object ){
+ errmsg = "initial has to be an object";
+ return false;
+ }
+
+
+ string finalize;
+ if (p["finalize"].type())
+ finalize = p["finalize"].ascode();
+
+ return group( realdbname , cursor ,
+ key , keyf , reduce.ascode() , reduce.type() != CodeWScope ? 0 : reduce.codeWScopeScopeData() ,
+ initial.embeddedObject() , finalize ,
+ errmsg , result );
+ }
+
+ } cmdGroup;
+
+
+ class DistinctCommand : public Command {
+ public:
+ DistinctCommand() : Command("distinct"){}
+ virtual bool slaveOk() { return true; }
+
+ virtual void help( stringstream &help ) const {
+ help << "{ distinct : 'collection name' , key : 'a.b' }";
+ }
+
+ bool run(const char *dbname, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl ){
+ static DBDirectClient db;
+
+ string ns = cc().database()->name + '.' + cmdObj.findElement(name).valuestr();
+ string key = cmdObj["key"].valuestrsafe();
+
+ BSONObj keyPattern = BSON( key << 1 );
+
+ set<BSONObj,BSONObjCmp> map;
+
+ long long size = 0;
+
+ auto_ptr<DBClientCursor> cursor = db.query( ns , getQuery( cmdObj ) , 0 , 0 , &keyPattern );
+ while ( cursor->more() ){
+ BSONObj o = cursor->next();
+ BSONObj value = o.extractFields( keyPattern );
+ if ( value.isEmpty() )
+ continue;
+ if ( map.insert( value ).second ){
+ size += o.objsize() + 20;
+ uassert( 10044 , "distinct too big, 4mb cap" , size < 4 * 1024 * 1024 );
+ }
+ }
+
+ assert( size <= 0x7fffffff );
+ BSONObjBuilder b( (int) size );
+ int n=0;
+ for ( set<BSONObj,BSONObjCmp>::iterator i = map.begin() ; i != map.end(); i++ ){
+ b.appendAs( i->firstElement() , b.numStr( n++ ).c_str() );
+ }
+
+ result.appendArray( "values" , b.obj() );
+
+ return true;
+ }
+
+ } distinctCmd;
+
+ /* Find and Modify an object returning either the old (default) or new value*/
+ class CmdFindAndModify : public Command {
+ public:
+ /* {findandmodify: "collection", query: {processed:false}, update: {$set: {processed:true}}, new: true}
+ * {findandmodify: "collection", query: {processed:false}, remove: true, sort: {priority:-1}}
+ *
+ * either update or remove is required, all other fields have default values
+ * output is in the "value" field
+ */
+ CmdFindAndModify() : Command("findandmodify") { }
+ virtual bool logTheOp() {
+ return false; // the modification will be logged directly
+ }
+ virtual bool slaveOk() {
+ return false;
+ }
+ virtual bool run(const char *dbname, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool) {
+ static DBDirectClient db;
+
+ string ns = nsToDatabase(dbname) + '.' + cmdObj.firstElement().valuestr();
+
+ Query q (cmdObj.getObjectField("query")); // defaults to {}
+ BSONElement sort = cmdObj["sort"];
+ if (!sort.eoo())
+ q.sort(sort.embeddedObjectUserCheck());
+
+ BSONObj out = db.findOne(ns, q);
+ if (out.firstElement().eoo()){
+ errmsg = "No matching object found";
+ return false;
+ }
+
+ q = QUERY( "_id" << out["_id"]);
+
+ if (cmdObj["remove"].trueValue()){
+ uassert(12515, "can't remove and update", cmdObj["update"].eoo());
+ db.remove(ns, q, 1);
+ } else {
+ BSONElement update = cmdObj["update"];
+ uassert(12516, "must specify remove or update", !update.eoo());
+ db.update(ns, q, update.embeddedObjectUserCheck());
+
+ if (cmdObj["new"].trueValue())
+ out = db.findOne(ns, q);
+ }
+
+ result.append("value", out);
+
+ return true;
+ }
+ } cmdFindAndModify;
+
+ bool commandIsReadOnly(BSONObj& _cmdobj) {
+ BSONObj jsobj;
+ {
+ BSONElement e = _cmdobj.firstElement();
+ if ( e.type() == Object && string("query") == e.fieldName() ) {
+ jsobj = e.embeddedObject();
+ }
+ else {
+ jsobj = _cmdobj;
+ }
+ }
+ BSONElement e = jsobj.firstElement();
+ if ( ! e.type() )
+ return false;
+ return Command::readOnly( e.fieldName() );
+ }
+
+ /* TODO make these all command objects -- legacy stuff here
+
+ usage:
+ abc.$cmd.findOne( { ismaster:1 } );
+
+ returns true if ran a cmd
+ */
+ bool _runCommands(const char *ns, BSONObj& _cmdobj, BufBuilder &b, BSONObjBuilder& anObjBuilder, bool fromRepl, int queryOptions) {
+ if( logLevel >= 1 )
+ log() << "run command " << ns << ' ' << _cmdobj << endl;
+
+ const char *p = strchr(ns, '.');
+ if ( !p ) return false;
+ if ( strcmp(p, ".$cmd") != 0 ) return false;
+
+ BSONObj jsobj;
+ {
+ BSONElement e = _cmdobj.firstElement();
+ if ( e.type() == Object && string("query") == e.fieldName() ) {
+ jsobj = e.embeddedObject();
+ }
+ else {
+ jsobj = _cmdobj;
+ }
+ }
+
+ bool ok = false;
+
+ BSONElement e = jsobj.firstElement();
+
+ Command * c = e.type() ? Command::findCommand( e.fieldName() ) : 0;
+ if ( c ){
+ string errmsg;
+ AuthenticationInfo *ai = currentClient.get()->ai;
+ uassert( 10045 , "unauthorized", ai->isAuthorized(cc().database()->name.c_str()) || !c->requiresAuth());
+
+ bool admin = c->adminOnly();
+
+ if( admin && c->localHostOnlyIfNoAuth(jsobj) && noauth && !ai->isLocalHost ) {
+ ok = false;
+ errmsg = "unauthorized: this command must run from localhost when running db without auth";
+ log() << "command denied: " << jsobj.toString() << endl;
+ }
+ else if ( admin && !fromRepl && strncmp(ns, "admin", 5) != 0 ) {
+ ok = false;
+ errmsg = "access denied";
+ log() << "command denied: " << jsobj.toString() << endl;
+ }
+ else if ( isMaster() ||
+ c->slaveOk() ||
+ ( c->slaveOverrideOk() && ( queryOptions & QueryOption_SlaveOk ) ) ||
+ fromRepl ){
+ if ( jsobj.getBoolField( "help" ) ) {
+ stringstream help;
+ help << "help for: " << e.fieldName() << " ";
+ c->help( help );
+ anObjBuilder.append( "help" , help.str() );
+ }
+ else {
+ if( admin )
+ log( 2 ) << "command: " << jsobj << endl;
+ try {
+ ok = c->run(ns, jsobj, errmsg, anObjBuilder, fromRepl);
+ }
+ catch ( AssertionException& e ){
+ ok = false;
+ errmsg = "assertion: ";
+ errmsg += e.what();
+ }
+ if ( ok && c->logTheOp() && !fromRepl )
+ logOp("c", ns, jsobj);
+ }
+ }
+ else {
+ ok = false;
+ errmsg = "not master";
+ }
+ if ( !ok )
+ anObjBuilder.append("errmsg", errmsg);
+ }
+ else {
+ anObjBuilder.append("errmsg", "no such cmd");
+ anObjBuilder.append("bad cmd" , _cmdobj );
+ }
+ anObjBuilder.append("ok", ok?1.0:0.0);
+ BSONObj x = anObjBuilder.done();
+ b.append((void*) x.objdata(), x.objsize());
+ return true;
+ }
+
+} // namespace mongo
diff --git a/db/dbcommands_admin.cpp b/db/dbcommands_admin.cpp
new file mode 100644
index 0000000..91052bf
--- /dev/null
+++ b/db/dbcommands_admin.cpp
@@ -0,0 +1,356 @@
+// dbcommands_admin.cpp
+
+/**
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+/**
+ this file has dbcommands that are for dba type administration
+ mostly around dbs and collections
+ NOT system stuff
+*/
+
+
+#include "stdafx.h"
+#include "jsobj.h"
+#include "pdfile.h"
+#include "namespace.h"
+#include "commands.h"
+#include "cmdline.h"
+#include "btree.h"
+#include "curop.h"
+#include "../util/background.h"
+
+namespace mongo {
+
+ class CleanCmd : public Command {
+ public:
+ CleanCmd() : Command( "clean" ){}
+
+ virtual bool slaveOk(){ return true; }
+
+ bool run(const char *nsRaw, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl ){
+ string dropns = cc().database()->name + "." + cmdObj.firstElement().valuestrsafe();
+
+ if ( !cmdLine.quiet )
+ log() << "CMD: clean " << dropns << endl;
+
+ NamespaceDetails *d = nsdetails(dropns.c_str());
+
+ if ( ! d ){
+ errmsg = "ns not found";
+ return 0;
+ }
+
+ for ( int i = 0; i < Buckets; i++ )
+ d->deletedList[i].Null();
+
+ result.append("ns", dropns.c_str());
+ return 1;
+ }
+
+ } cleanCmd;
+
+ class ValidateCmd : public Command {
+ public:
+ ValidateCmd() : Command( "validate" ){}
+
+ virtual bool slaveOk(){
+ return true;
+ }
+
+ //{ validate: "collectionnamewithoutthedbpart" [, scandata: <bool>] } */
+
+ bool run(const char *nsRaw, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl ){
+ string ns = cc().database()->name + "." + cmdObj.firstElement().valuestrsafe();
+ NamespaceDetails * d = nsdetails( ns.c_str() );
+ if ( !cmdLine.quiet )
+ log() << "CMD: validate " << ns << endl;
+
+ if ( ! d ){
+ errmsg = "ns not found";
+ return 0;
+ }
+
+ result.append( "ns", ns );
+ result.append( "result" , validateNS( ns.c_str() , d, &cmdObj ) );
+ return 1;
+ }
+
+
+ string validateNS(const char *ns, NamespaceDetails *d, BSONObj *cmdObj) {
+ bool scanData = true;
+ if( cmdObj && cmdObj->hasElement("scandata") && !cmdObj->getBoolField("scandata") )
+ scanData = false;
+ bool valid = true;
+ stringstream ss;
+ ss << "\nvalidate\n";
+ ss << " details: " << hex << d << " ofs:" << nsindex(ns)->detailsOffset(d) << dec << endl;
+ if ( d->capped )
+ ss << " capped:" << d->capped << " max:" << d->max << '\n';
+
+ ss << " firstExtent:" << d->firstExtent.toString() << " ns:" << d->firstExtent.ext()->nsDiagnostic.buf << '\n';
+ ss << " lastExtent:" << d->lastExtent.toString() << " ns:" << d->lastExtent.ext()->nsDiagnostic.buf << '\n';
+ try {
+ d->firstExtent.ext()->assertOk();
+ d->lastExtent.ext()->assertOk();
+
+ DiskLoc el = d->firstExtent;
+ int ne = 0;
+ while( !el.isNull() ) {
+ Extent *e = el.ext();
+ e->assertOk();
+ el = e->xnext;
+ ne++;
+ killCurrentOp.checkForInterrupt();
+ }
+ ss << " # extents:" << ne << '\n';
+ } catch (...) {
+ valid=false;
+ ss << " extent asserted ";
+ }
+
+ ss << " datasize?:" << d->datasize << " nrecords?:" << d->nrecords << " lastExtentSize:" << d->lastExtentSize << '\n';
+ ss << " padding:" << d->paddingFactor << '\n';
+ try {
+
+ try {
+ ss << " first extent:\n";
+ d->firstExtent.ext()->dump(ss);
+ valid = valid && d->firstExtent.ext()->validates();
+ }
+ catch (...) {
+ ss << "\n exception firstextent\n" << endl;
+ }
+
+ set<DiskLoc> recs;
+ if( scanData ) {
+ auto_ptr<Cursor> c = theDataFileMgr.findAll(ns);
+ int n = 0;
+ long long len = 0;
+ long long nlen = 0;
+ int outOfOrder = 0;
+ DiskLoc cl_last;
+ while ( c->ok() ) {
+ n++;
+
+ DiskLoc cl = c->currLoc();
+ if ( n < 1000000 )
+ recs.insert(cl);
+ if ( d->capped ) {
+ if ( cl < cl_last )
+ outOfOrder++;
+ cl_last = cl;
+ }
+
+ Record *r = c->_current();
+ len += r->lengthWithHeaders;
+ nlen += r->netLength();
+ c->advance();
+ }
+ if ( d->capped ) {
+ ss << " capped outOfOrder:" << outOfOrder;
+ if ( outOfOrder > 1 ) {
+ valid = false;
+ ss << " ???";
+ }
+ else ss << " (OK)";
+ ss << '\n';
+ }
+ ss << " " << n << " objects found, nobj:" << d->nrecords << "\n";
+ ss << " " << len << " bytes data w/headers\n";
+ ss << " " << nlen << " bytes data wout/headers\n";
+ }
+
+ ss << " deletedList: ";
+ for ( int i = 0; i < Buckets; i++ ) {
+ ss << (d->deletedList[i].isNull() ? '0' : '1');
+ }
+ ss << endl;
+ int ndel = 0;
+ long long delSize = 0;
+ int incorrect = 0;
+ for ( int i = 0; i < Buckets; i++ ) {
+ DiskLoc loc = d->deletedList[i];
+ try {
+ int k = 0;
+ while ( !loc.isNull() ) {
+ if ( recs.count(loc) )
+ incorrect++;
+ ndel++;
+
+ if ( loc.questionable() ) {
+ if( d->capped && !loc.isValid() && i == 1 ) {
+ /* the constructor for NamespaceDetails intentionally sets deletedList[1] to invalid
+ see comments in namespace.h
+ */
+ break;
+ }
+
+ if ( loc.a() <= 0 || strstr(ns, "hudsonSmall") == 0 ) {
+ ss << " ?bad deleted loc: " << loc.toString() << " bucket:" << i << " k:" << k << endl;
+ valid = false;
+ break;
+ }
+ }
+
+ DeletedRecord *d = loc.drec();
+ delSize += d->lengthWithHeaders;
+ loc = d->nextDeleted;
+ k++;
+ killCurrentOp.checkForInterrupt();
+ }
+ } catch (...) {
+ ss <<" ?exception in deleted chain for bucket " << i << endl;
+ valid = false;
+ }
+ }
+ ss << " deleted: n: " << ndel << " size: " << delSize << endl;
+ if ( incorrect ) {
+ ss << " ?corrupt: " << incorrect << " records from datafile are in deleted list\n";
+ valid = false;
+ }
+
+ int idxn = 0;
+ try {
+ ss << " nIndexes:" << d->nIndexes << endl;
+ NamespaceDetails::IndexIterator i = d->ii();
+ while( i.more() ) {
+ IndexDetails& id = i.next();
+ ss << " " << id.indexNamespace() << " keys:" <<
+ id.head.btree()->fullValidate(id.head, id.keyPattern()) << endl;
+ }
+ }
+ catch (...) {
+ ss << "\n exception during index validate idxn:" << idxn << endl;
+ valid=false;
+ }
+
+ }
+ catch (AssertionException) {
+ ss << "\n exception during validate\n" << endl;
+ valid = false;
+ }
+
+ if ( !valid )
+ ss << " ns corrupt, requires dbchk\n";
+
+ return ss.str();
+ }
+ } validateCmd;
+
+ extern bool unlockRequested;
+ extern unsigned lockedForWriting;
+ extern boost::mutex lockedForWritingMutex;
+
+/*
+ class UnlockCommand : public Command {
+ public:
+ UnlockCommand() : Command( "unlock" ) { }
+ virtual bool readOnly() { return true; }
+ virtual bool slaveOk(){ return true; }
+ virtual bool adminOnly(){ return true; }
+ virtual bool run(const char *ns, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ if( lockedForWriting ) {
+ log() << "command: unlock requested" << endl;
+ errmsg = "unlock requested";
+ unlockRequested = true;
+ }
+ else {
+ errmsg = "not locked, so cannot unlock";
+ return 0;
+ }
+ return 1;
+ }
+
+ } unlockCommand;
+*/
+ /* see unlockFsync() for unlocking:
+ db.$cmd.sys.unlock.findOne()
+ */
+ class FSyncCommand : public Command {
+ class LockDBJob : public BackgroundJob {
+ protected:
+ void run() {
+ {
+ boostlock lk(lockedForWritingMutex);
+ lockedForWriting++;
+ }
+ readlock lk("");
+ MemoryMappedFile::flushAll(true);
+ log() << "db is now locked for snapshotting, no writes allowed. use db.$cmd.sys.unlock.findOne() to unlock" << endl;
+ _ready = true;
+ while( 1 ) {
+ if( unlockRequested ) {
+ unlockRequested = false;
+ break;
+ }
+ sleepmillis(20);
+ }
+ {
+ boostlock lk(lockedForWritingMutex);
+ lockedForWriting--;
+ }
+ }
+ public:
+ bool& _ready;
+ LockDBJob(bool& ready) : _ready(ready) {
+ deleteSelf = true;
+ _ready = false;
+ }
+ };
+ public:
+ FSyncCommand() : Command( "fsync" ){}
+
+ virtual bool slaveOk(){ return true; }
+ virtual bool adminOnly(){ return true; }
+ /*virtual bool localHostOnlyIfNoAuth(const BSONObj& cmdObj) {
+ string x = cmdObj["exec"].valuestrsafe();
+ return !x.empty();
+ }*/
+ virtual bool run(const char *ns, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ /* async means do an fsync, but return immediately */
+ bool sync = ! cmdObj["async"].trueValue();
+ bool lock = cmdObj["lock"].trueValue();
+ log() << "CMD fsync: sync:" << sync << " lock:" << lock << endl;
+
+ if( lock ) {
+ uassert(12034, "fsync: can't lock while an unlock is pending", !unlockRequested);
+ uassert(12032, "fsync: sync option must be true when using lock", sync);
+ /* With releaseEarly(), we must be extremely careful we don't do anything
+ where we would have assumed we were locked. profiling is one of those things.
+ Perhaps at profile time we could check if we released early -- however,
+ we need to be careful to keep that code very fast it's a very common code path when on.
+ */
+ uassert(12033, "fsync: profiling must be off to enter locked mode", cc().database()->profile == 0);
+ bool ready = false;
+ LockDBJob *l = new LockDBJob(ready);
+ dbMutex.releaseEarly();
+ l->go();
+ // don't return until background thread has acquired the write lock
+ while( !ready ) {
+ sleepmillis(10);
+ }
+ result.append("info", "now locked against writes, use db.$cmd.sys.unlock.findOne() to unlock");
+ }
+ else {
+ result.append( "numFiles" , MemoryMappedFile::flushAll( sync ) );
+ }
+ return 1;
+ }
+
+ } fsyncCmd;
+
+}
+
diff --git a/db/dbeval.cpp b/db/dbeval.cpp
new file mode 100644
index 0000000..e729135
--- /dev/null
+++ b/db/dbeval.cpp
@@ -0,0 +1,120 @@
+/* commands.cpp
+ db "commands" (sent via db.$cmd.findOne(...))
+ */
+
+/**
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "stdafx.h"
+#include "query.h"
+#include "pdfile.h"
+#include "jsobj.h"
+#include "../util/builder.h"
+#include <time.h>
+#include "introspect.h"
+#include "btree.h"
+#include "../util/lruishmap.h"
+#include "json.h"
+#include "repl.h"
+#include "commands.h"
+#include "cmdline.h"
+
+#include "../scripting/engine.h"
+
+namespace mongo {
+
+ const int edebug=0;
+
+ bool dbEval(const char *ns, BSONObj& cmd, BSONObjBuilder& result, string& errmsg) {
+ BSONElement e = cmd.firstElement();
+ uassert( 10046 , "eval needs Code" , e.type() == Code || e.type() == CodeWScope || e.type() == String );
+
+ const char *code = 0;
+ switch ( e.type() ) {
+ case String:
+ case Code:
+ code = e.valuestr();
+ break;
+ case CodeWScope:
+ code = e.codeWScopeCode();
+ break;
+ default:
+ assert(0);
+ }
+ assert( code );
+
+ if ( ! globalScriptEngine ) {
+ errmsg = "db side execution is disabled";
+ return false;
+ }
+
+ auto_ptr<Scope> s = globalScriptEngine->getPooledScope( ns );
+ ScriptingFunction f = s->createFunction(code);
+ if ( f == 0 ) {
+ errmsg = (string)"compile failed: " + s->getError();
+ return false;
+ }
+
+ if ( e.type() == CodeWScope )
+ s->init( e.codeWScopeScopeData() );
+ s->localConnect( cc().database()->name.c_str() );
+
+ BSONObj args;
+ {
+ BSONElement argsElement = cmd.findElement("args");
+ if ( argsElement.type() == Array ) {
+ args = argsElement.embeddedObject();
+ if ( edebug ) {
+ out() << "args:" << args.toString() << endl;
+ out() << "code:\n" << code << endl;
+ }
+ }
+ }
+
+ int res;
+ {
+ Timer t;
+ res = s->invoke(f,args, cmdLine.quota ? 10 * 60 * 1000 : 0 );
+ int m = t.millis();
+ if ( m > cmdLine.slowMS ) {
+ out() << "dbeval slow, time: " << dec << m << "ms " << ns << endl;
+ if ( m >= 1000 ) log() << code << endl;
+ else OCCASIONALLY log() << code << endl;
+ }
+ }
+ if ( res ) {
+ result.append("errno", (double) res);
+ errmsg = "invoke failed: ";
+ errmsg += s->getError();
+ return false;
+ }
+
+ s->append( result , "retval" , "return" );
+
+ return true;
+ }
+
+ class CmdEval : public Command {
+ public:
+ virtual bool slaveOk() {
+ return false;
+ }
+ CmdEval() : Command("$eval") { }
+ bool run(const char *ns, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ return dbEval(ns, cmdObj, result, errmsg);
+ }
+ } cmdeval;
+
+} // namespace mongo
diff --git a/db/dbhelpers.cpp b/db/dbhelpers.cpp
new file mode 100644
index 0000000..ee221ab
--- /dev/null
+++ b/db/dbhelpers.cpp
@@ -0,0 +1,253 @@
+// dbhelpers.cpp
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "stdafx.h"
+#include "db.h"
+#include "dbhelpers.h"
+#include "query.h"
+#include "json.h"
+#include "queryoptimizer.h"
+#include "btree.h"
+#include "pdfile.h"
+
+namespace mongo {
+
+ CursorIterator::CursorIterator( auto_ptr<Cursor> c , BSONObj filter )
+ : _cursor( c ){
+ if ( ! filter.isEmpty() )
+ _matcher.reset( new CoveredIndexMatcher( filter , BSONObj() ) );
+ _advance();
+ }
+
+ BSONObj CursorIterator::next(){
+ BSONObj o = _o;
+ _advance();
+ return o;
+ }
+
+ bool CursorIterator::hasNext(){
+ return ! _o.isEmpty();
+ }
+
+ void CursorIterator::_advance(){
+ if ( ! _cursor->ok() ){
+ _o = BSONObj();
+ return;
+ }
+
+ while ( _cursor->ok() ){
+ _o = _cursor->current();
+ _cursor->advance();
+ if ( _matcher.get() == 0 || _matcher->matches( _o ) )
+ return;
+ }
+
+ _o = BSONObj();
+ }
+
+ void Helpers::ensureIndex(const char *ns, BSONObj keyPattern, bool unique, const char *name) {
+ NamespaceDetails *d = nsdetails(ns);
+ if( d == 0 )
+ return;
+
+ {
+ NamespaceDetails::IndexIterator i = d->ii();
+ while( i.more() ) {
+ if( i.next().keyPattern().woCompare(keyPattern) == 0 )
+ return;
+ }
+ }
+
+ if( d->nIndexes >= NamespaceDetails::NIndexesMax ) {
+ problem() << "Helper::ensureIndex fails, MaxIndexes exceeded " << ns << '\n';
+ return;
+ }
+
+ string system_indexes = cc().database()->name + ".system.indexes";
+
+ BSONObjBuilder b;
+ b.append("name", name);
+ b.append("ns", ns);
+ b.append("key", keyPattern);
+ b.appendBool("unique", unique);
+ BSONObj o = b.done();
+
+ theDataFileMgr.insert(system_indexes.c_str(), o.objdata(), o.objsize());
+ }
+
+ class FindOne : public QueryOp {
+ public:
+ FindOne( bool requireIndex ) : requireIndex_( requireIndex ) {}
+ virtual void init() {
+ if ( requireIndex_ && strcmp( qp().indexKey().firstElement().fieldName(), "$natural" ) == 0 )
+ throw MsgAssertionException( 9011 , "Not an index cursor" );
+ c_ = qp().newCursor();
+ if ( !c_->ok() )
+ setComplete();
+ else
+ matcher_.reset( new CoveredIndexMatcher( qp().query(), qp().indexKey() ) );
+ }
+ virtual void next() {
+ if ( !c_->ok() ) {
+ setComplete();
+ return;
+ }
+ if ( matcher_->matches( c_->currKey(), c_->currLoc() ) ) {
+ one_ = c_->current();
+ setComplete();
+ } else {
+ c_->advance();
+ }
+ }
+ virtual bool mayRecordPlan() const { return false; }
+ virtual QueryOp *clone() const { return new FindOne( requireIndex_ ); }
+ BSONObj one() const { return one_; }
+ private:
+ bool requireIndex_;
+ auto_ptr< Cursor > c_;
+ auto_ptr< CoveredIndexMatcher > matcher_;
+ BSONObj one_;
+ };
+
+ /* fetch a single object from collection ns that matches query
+ set your db SavedContext first
+ */
+ bool Helpers::findOne(const char *ns, BSONObj query, BSONObj& result, bool requireIndex) {
+ QueryPlanSet s( ns, query, BSONObj(), 0, !requireIndex );
+ FindOne original( requireIndex );
+ shared_ptr< FindOne > res = s.runOp( original );
+ massert( 10302 , res->exceptionMessage(), res->complete() );
+ if ( res->one().isEmpty() )
+ return false;
+ result = res->one();
+ return true;
+ }
+
+ auto_ptr<CursorIterator> Helpers::find( const char *ns , BSONObj query , bool requireIndex ){
+ uassert( 10047 , "requireIndex not supported in Helpers::find yet" , ! requireIndex );
+ auto_ptr<CursorIterator> i;
+ i.reset( new CursorIterator( DataFileMgr::findAll( ns ) , query ) );
+ return i;
+ }
+
+
+ bool Helpers::findById(Client& c, const char *ns, BSONObj query, BSONObj& result ,
+ bool * nsFound , bool * indexFound ){
+ Database *database = c.database();
+ assert( database );
+ NamespaceDetails *d = database->namespaceIndex.details(ns);
+ if ( ! d )
+ return false;
+ if ( nsFound )
+ *nsFound = 1;
+
+ int idxNo = d->findIdIndex();
+ if ( idxNo < 0 )
+ return false;
+ if ( indexFound )
+ *indexFound = 1;
+
+ IndexDetails& i = d->idx( idxNo );
+
+ BSONObj key = i.getKeyFromQuery( query );
+
+ DiskLoc loc = i.head.btree()->findSingle( i , i.head , key );
+ if ( loc.isNull() )
+ return false;
+ result = loc.obj();
+ return true;
+ }
+
+ /* Get the first object from a collection. Generally only useful if the collection
+ only ever has a single object -- which is a "singleton collection.
+
+ Returns: true if object exists.
+ */
+ bool Helpers::getSingleton(const char *ns, BSONObj& result) {
+ Client::Context context(ns);
+
+ auto_ptr<Cursor> c = DataFileMgr::findAll(ns);
+ if ( !c->ok() )
+ return false;
+
+ result = c->current();
+ return true;
+ }
+
+ void Helpers::putSingleton(const char *ns, BSONObj obj) {
+ OpDebug debug;
+ Client::Context context(ns);
+ updateObjects(ns, obj, /*pattern=*/BSONObj(), /*upsert=*/true, /*multi=*/false , true , debug );
+ }
+
+ void Helpers::emptyCollection(const char *ns) {
+ Client::Context context(ns);
+ deleteObjects(ns, BSONObj(), false);
+ }
+
+ DbSet::~DbSet() {
+ if ( name_.empty() )
+ return;
+ try {
+ Client::Context c( name_.c_str() );
+ if ( nsdetails( name_.c_str() ) ) {
+ string errmsg;
+ BSONObjBuilder result;
+ dropCollection( name_, errmsg, result );
+ }
+ } catch ( ... ) {
+ problem() << "exception cleaning up DbSet" << endl;
+ }
+ }
+
+ void DbSet::reset( const string &name, const BSONObj &key ) {
+ if ( !name.empty() )
+ name_ = name;
+ if ( !key.isEmpty() )
+ key_ = key.getOwned();
+ Client::Context c( name_.c_str() );
+ if ( nsdetails( name_.c_str() ) ) {
+ Helpers::emptyCollection( name_.c_str() );
+ } else {
+ string err;
+ massert( 10303 , err, userCreateNS( name_.c_str(), fromjson( "{autoIndexId:false}" ), err, false ) );
+ }
+ Helpers::ensureIndex( name_.c_str(), key_, true, "setIdx" );
+ }
+
+ bool DbSet::get( const BSONObj &obj ) const {
+ Client::Context c( name_.c_str() );
+ BSONObj temp;
+ return Helpers::findOne( name_.c_str(), obj, temp, true );
+ }
+
+ void DbSet::set( const BSONObj &obj, bool val ) {
+ Client::Context c( name_.c_str() );
+ if ( val ) {
+ try {
+ BSONObj k = obj;
+ theDataFileMgr.insert( name_.c_str(), k, false );
+ } catch ( DBException& ) {
+ // dup key - already in set
+ }
+ } else {
+ deleteObjects( name_.c_str(), obj, true, false, false );
+ }
+ }
+
+} // namespace mongo
diff --git a/db/dbhelpers.h b/db/dbhelpers.h
new file mode 100644
index 0000000..3c223d8
--- /dev/null
+++ b/db/dbhelpers.h
@@ -0,0 +1,122 @@
+// dbhelpers.h
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/* db helpers are helper functions and classes that let us easily manipulate the local
+ database instance.
+*/
+
+#pragma once
+
+#include "../stdafx.h"
+#include "client.h"
+#include "db.h"
+
+namespace mongo {
+
+ class Cursor;
+ class CoveredIndexMatcher;
+
+ /* Iterator over a Cursor, optionally filtering results with a matcher
+ built from 'filter'. Follows the usual hasNext()/next() contract. */
+ class CursorIterator {
+ public:
+ CursorIterator( auto_ptr<Cursor> c , BSONObj filter = BSONObj() );
+ BSONObj next();
+ bool hasNext();
+
+ private:
+ void _advance(); // position _o on the next matching object
+
+ auto_ptr<Cursor> _cursor;
+ auto_ptr<CoveredIndexMatcher> _matcher;
+ BSONObj _o; // lookahead: the next object next() will return
+ };
+
+ /**
+ all helpers assume locking is handled above them
+ */
+ struct Helpers {
+
+ /* ensure the specified index exists.
+
+ @param keyPattern key pattern, e.g., { ts : 1 }
+ @param name index name, e.g., "name_1"
+
+ This method can be a little (not much) cpu-slow, so you may wish to use
+ OCCASIONALLY ensureIndex(...);
+
+ Note: use ensureHaveIdIndex() for the _id index: it is faster.
+ Note: does nothing if collection does not yet exist.
+ */
+ static void ensureIndex(const char *ns, BSONObj keyPattern, bool unique, const char *name);
+
+ /* fetch a single object from collection ns that matches query.
+ set your db SavedContext first.
+
+ @param requireIndex if true, complain if no index for the query. a way to guard against
+ writing a slow query.
+
+ @return true if object found
+ */
+ static bool findOne(const char *ns, BSONObj query, BSONObj& result, bool requireIndex = false);
+
+
+ /**
+ * @param nsFound if non-null, set when the namespace is found
+ * @param indexFound if non-null, set when the _id index is found
+ * @return true if object found
+ */
+ static bool findById(Client&, const char *ns, BSONObj query, BSONObj& result ,
+ bool * nsFound = 0 , bool * indexFound = 0 );
+
+ // iterate ns, optionally restricted by query
+ static auto_ptr<CursorIterator> find( const char *ns , BSONObj query = BSONObj() , bool requireIndex = false );
+
+ /* Get/put the first object from a collection. Generally only useful if the collection
+ only ever has a single object -- which is a "singleton collection".
+
+ You do not need to set the database before calling.
+
+ Returns: true if object exists.
+ */
+ static bool getSingleton(const char *ns, BSONObj& result);
+ static void putSingleton(const char *ns, BSONObj obj);
+
+
+ /* Remove all objects from a collection.
+ You do not need to set the database before calling.
+ */
+ static void emptyCollection(const char *ns);
+
+ };
+
+ class Database;
+
+ // manage a set using collection backed storage
+ class DbSet {
+ public:
+ DbSet( const string &name = "", const BSONObj &key = BSONObj() ) :
+ name_( name ),
+ key_( key.getOwned() ) { // own a copy; caller's buffer may go away
+ }
+ ~DbSet(); // drops the backing collection (best effort)
+ // re-target / reinitialize the backing collection and its unique key index
+ void reset( const string &name = "", const BSONObj &key = BSONObj() );
+ bool get( const BSONObj &obj ) const; // membership test
+ void set( const BSONObj &obj, bool val ); // add (val) or remove (!val)
+ private:
+ string name_; // backing collection namespace
+ BSONObj key_; // unique key pattern for set members
+ };
+
+} // namespace mongo
diff --git a/db/dbmessage.h b/db/dbmessage.h
new file mode 100644
index 0000000..54a2ac3
--- /dev/null
+++ b/db/dbmessage.h
@@ -0,0 +1,267 @@
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+#include "storage.h"
+#include "jsobj.h"
+#include "namespace.h"
+#include "../util/message.h"
+
+namespace mongo {
+
+ /* db response format
+
+ Query or GetMore: // see struct QueryResult
+ int resultFlags;
+ int64 cursorID;
+ int startingFrom;
+ int nReturned;
+ list of marshalled JSObjects;
+ */
+
+ extern bool objcheck;
+
+#pragma pack(1)
+ struct QueryResult : public MsgData {
+ enum ResultFlagType {
+ /* returned, with zero results, when getMore is called but the cursor id
+ is not valid at the server. */
+ ResultFlag_CursorNotFound = 1,
+
+ /* { $err : ... } is being returned */
+ ResultFlag_ErrSet = 2,
+
+ /* Have to update config from the server, usually $err is also set */
+ ResultFlag_ShardConfigStale = 4,
+
+ /* for backward compatibility: this lets us know the server supports
+ the QueryOption_AwaitData option. if it doesn't, a repl slave client should sleep
+ a little between getMore's.
+ */
+ ResultFlag_AwaitCapable = 8
+ };
+
+ long long cursorId;
+ int startingFrom;
+ int nReturned;
+ // start of the marshalled BSON objects, immediately after nReturned
+ const char *data() {
+ return (char *) (((int *)&nReturned)+1);
+ }
+ // the first int of the message body carries the result flags
+ int resultFlags() {
+ return dataAsInt();
+ }
+ int& _resultFlags() {
+ return dataAsInt();
+ }
+ void setResultFlagsToOk() {
+ _resultFlags() = 0; // ResultFlag_AwaitCapable
+ }
+ };
+#pragma pack()
+
+ /* For the database/server protocol, these objects and functions encapsulate
+ the various messages transmitted over the connection.
+ */
+
+ /* Walks a wire-protocol Message payload:
+ int reserved, cstring namespace, then op-specific ints / int64s /
+ BSON objects. Holds raw pointers into the Message's buffer, so the
+ Message must outlive this object. Not thread safe. */
+ class DbMessage {
+ public:
+ DbMessage(const Message& _m) : m(_m) {
+ theEnd = _m.data->_data + _m.data->dataLen();
+ int *r = (int *) _m.data->_data;
+ reserved = *r;
+ r++;
+ data = (const char *) r;
+ nextjsobj = data;
+ mark = 0; // so markReset() before markSet() doesn't restore garbage
+ }
+
+ // namespace the message refers to (cstring right after 'reserved')
+ const char * getns() {
+ return data;
+ }
+ void getns(Namespace& ns) {
+ ns = data;
+ }
+
+
+ // rewind the read position to just after the namespace
+ void resetPull(){
+ nextjsobj = data;
+ }
+ int pullInt() {
+ if ( nextjsobj == data )
+ nextjsobj += strlen(data) + 1; // skip namespace
+ int i = *((int *)nextjsobj);
+ nextjsobj += 4;
+ return i;
+ }
+ long long pullInt64() const {
+ // fix: previously this returned pullInt64() directly, which on a
+ // const object resolved back to this const overload and recursed
+ // forever; delegate explicitly to the non-const implementation
+ return const_cast<DbMessage*>(this)->pullInt64();
+ }
+ long long &pullInt64() {
+ if ( nextjsobj == data )
+ nextjsobj += strlen(data) + 1; // skip namespace
+ long long &i = *((long long *)nextjsobj);
+ nextjsobj += 8;
+ return i;
+ }
+
+ OID* getOID() {
+ return (OID *) (data + strlen(data) + 1); // skip namespace
+ }
+
+ void getQueryStuff(const char *&query, int& ntoreturn) {
+ int *i = (int *) (data + strlen(data) + 1);
+ ntoreturn = *i;
+ i++;
+ query = (const char *) i;
+ }
+
+ /* for insert and update msgs */
+ bool moreJSObjs() {
+ return nextjsobj != 0;
+ }
+ BSONObj nextJsObj() {
+ if ( nextjsobj == data )
+ nextjsobj += strlen(data) + 1; // skip namespace
+ massert( 10304 , "Remaining data too small for BSON object", theEnd - nextjsobj > 3 );
+ BSONObj js(nextjsobj);
+ massert( 10305 , "Invalid object size", js.objsize() > 3 );
+ // NOTE(review): bound is measured from 'data', not 'nextjsobj' --
+ // looks lenient for objects after the first; confirm intended
+ massert( 10306 , "Next object larger than available space",
+ js.objsize() < ( theEnd - data ) );
+ if ( objcheck && !js.valid() ) {
+ massert( 10307 , "bad object in message", false);
+ }
+ nextjsobj += js.objsize();
+ if ( nextjsobj >= theEnd )
+ nextjsobj = 0; // exhausted: moreJSObjs() now returns false
+ return js;
+ }
+
+ const Message& msg() {
+ return m;
+ }
+
+ // save / restore the current read position
+ void markSet(){
+ mark = nextjsobj;
+ }
+
+ void markReset(){
+ nextjsobj = mark;
+ }
+
+ private:
+ const Message& m;
+ int reserved;
+ const char *data; // start of payload (right after 'reserved')
+ const char *nextjsobj; // next unread byte; 0 once exhausted
+ const char *theEnd; // one past the end of the message data
+
+ const char * mark;
+ };
+
+
+ /* a request to run a query, received from the database */
+ class QueryMessage {
+ public:
+ const char *ns;
+ int ntoskip;
+ int ntoreturn;
+ int queryOptions;
+ BSONObj query;
+ auto_ptr< FieldMatcher > fields;
+
+ /* parses the message into the above fields */
+ QueryMessage(DbMessage& d) {
+ ns = d.getns();
+ ntoskip = d.pullInt();
+ ntoreturn = d.pullInt();
+ query = d.nextJsObj();
+ if ( d.moreJSObjs() ) {
+ // optional second object: the field selector (projection)
+ BSONObj o = d.nextJsObj();
+ if (!o.isEmpty()){
+ fields = auto_ptr< FieldMatcher >(new FieldMatcher() );
+ fields->add( o );
+ }
+ }
+ // query options are carried in the message's first int
+ queryOptions = d.msg().data->dataAsInt();
+ }
+ };
+
+} // namespace mongo
+
+#include "../client/dbclient.h"
+
+namespace mongo {
+
+ // Build a QueryResult message around 'data' and send it as the reply
+ // to requestMsg. The buffer's ownership passes to the transport:
+ // decouple() releases it from the builder, setData(..., true) has the
+ // Message free it after sending.
+ inline void replyToQuery(int queryResultFlags,
+ AbstractMessagingPort* p, Message& requestMsg,
+ void *data, int size,
+ int nReturned, int startingFrom = 0,
+ long long cursorId = 0
+ ) {
+ BufBuilder b(32768);
+ b.skip(sizeof(QueryResult));
+ b.append(data, size);
+ QueryResult *qr = (QueryResult *) b.buf();
+ qr->_resultFlags() = queryResultFlags;
+ qr->len = b.len();
+ qr->setOperation(opReply);
+ qr->cursorId = cursorId;
+ qr->startingFrom = startingFrom;
+ qr->nReturned = nReturned;
+ b.decouple();
+ Message *resp = new Message();
+ resp->setData(qr, true); // transport will free
+ p->reply(requestMsg, *resp, requestMsg.data->id);
+ }
+
+} // namespace mongo
+
+//#include "bsonobj.h"
+#include "instance.h"
+
+namespace mongo {
+
+ /* object reply helper. */
+ // reply with a single BSON object as the entire result set
+ inline void replyToQuery(int queryResultFlags,
+ AbstractMessagingPort* p, Message& requestMsg,
+ BSONObj& responseObj)
+ {
+ replyToQuery(queryResultFlags,
+ p, requestMsg,
+ (void *) responseObj.objdata(), responseObj.objsize(), 1);
+ }
+
+ /* helper to do a reply using a DbResponse object */
+ // Variant that stores the reply in a DbResponse instead of sending it
+ // directly; the Message takes ownership of the buffer (freeOnDelete).
+ inline void replyToQuery(int queryResultFlags, Message &m, DbResponse &dbresponse, BSONObj obj) {
+ BufBuilder b;
+ b.skip(sizeof(QueryResult));
+ b.append((void*) obj.objdata(), obj.objsize());
+ QueryResult* msgdata = (QueryResult *) b.buf();
+ b.decouple();
+ QueryResult *qr = msgdata;
+ qr->_resultFlags() = queryResultFlags;
+ qr->len = b.len();
+ qr->setOperation(opReply);
+ qr->cursorId = 0;
+ qr->startingFrom = 0;
+ qr->nReturned = 1;
+ Message *resp = new Message();
+ resp->setData(msgdata, true); // transport will free
+ dbresponse.response = resp;
+ dbresponse.responseTo = m.data->id;
+ }
+
+} // namespace mongo
diff --git a/db/dbstats.cpp b/db/dbstats.cpp
new file mode 100644
index 0000000..902b57b
--- /dev/null
+++ b/db/dbstats.cpp
@@ -0,0 +1,43 @@
+// dbstats.cpp
+
+#include "stdafx.h"
+#include "dbstats.h"
+
+namespace mongo {
+
+ // Counters live inside a BSONObj so getObj() can expose live values
+ // without copying: each _xxx pointer aims at the int payload of the
+ // matching element within _obj's buffer.
+ OpCounters::OpCounters(){
+ int zero = 0;
+
+ BSONObjBuilder b;
+ b.append( "insert" , zero );
+ b.append( "query" , zero );
+ b.append( "update" , zero );
+ b.append( "delete" , zero );
+ b.append( "getmore" , zero );
+ _obj = b.obj();
+
+ // casts away constness of the element payloads; relies on _obj's
+ // buffer staying at a fixed address for this object's lifetime
+ _insert = (int*)_obj["insert"].value();
+ _query = (int*)_obj["query"].value();
+ _update = (int*)_obj["update"].value();
+ _delete = (int*)_obj["delete"].value();
+ _getmore = (int*)_obj["getmore"].value();
+ }
+
+ // Bump the counter matching a wire-protocol opcode. killCursors,
+ // reply and msg are deliberately not counted; anything else is logged.
+ void OpCounters::gotOp( int op ){
+ switch ( op ){
+ case dbInsert: gotInsert(); break;
+ case dbQuery: gotQuery(); break;
+ case dbUpdate: gotUpdate(); break;
+ case dbDelete: gotDelete(); break;
+ case dbGetMore: gotGetMore(); break;
+ case dbKillCursors:
+ case opReply:
+ case dbMsg:
+ break;
+ default: log() << "OpCounters::gotOp unknown op: " << op << endl;
+ }
+ }
+
+
+ OpCounters globalOpCounters;
+}
diff --git a/db/dbstats.h b/db/dbstats.h
new file mode 100644
index 0000000..c7d6340
--- /dev/null
+++ b/db/dbstats.h
@@ -0,0 +1,44 @@
+// dbstats.h
+
+#include "../stdafx.h"
+#include "jsobj.h"
+#include "../util/message.h"
+
+namespace mongo {
+
+ /**
+ * for storing operation counters
+ * note: not thread safe. ok with that for speed
+ */
+ class OpCounters {
+ public:
+
+ OpCounters();
+
+ // accessors return pointers at the live counters inside _obj
+ int * getInsert(){ return _insert; }
+ int * getQuery(){ return _query; }
+ int * getUpdate(){ return _update; }
+ int * getDelete(){ return _delete; }
+ int * getGetMore(){ return _getmore; }
+ // deprecated misspelling kept so existing callers keep compiling
+ int * getGetGore(){ return getGetMore(); }
+
+ void gotInsert(){ _insert[0]++; }
+ void gotQuery(){ _query[0]++; }
+ void gotUpdate(){ _update[0]++; }
+ void gotDelete(){ _delete[0]++; }
+ void gotGetMore(){ _getmore[0]++; }
+
+ // bump the counter matching a wire-protocol opcode
+ void gotOp( int op );
+
+ // live view of all counters (values update in place)
+ BSONObj& getObj(){ return _obj; }
+ private:
+ BSONObj _obj; // backing storage for the counters
+ int * _insert;
+ int * _query;
+ int * _update;
+ int * _delete;
+ int * _getmore;
+ };
+
+ extern OpCounters globalOpCounters;
+
+}
diff --git a/db/dbwebserver.cpp b/db/dbwebserver.cpp
new file mode 100644
index 0000000..0e1483c
--- /dev/null
+++ b/db/dbwebserver.cpp
@@ -0,0 +1,499 @@
+/* dbwebserver.cpp
+
+ This is the administrative web page displayed on port 28017.
+*/
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "stdafx.h"
+#include "../util/miniwebserver.h"
+#include "../util/md5.hpp"
+#include "db.h"
+#include "repl.h"
+#include "replset.h"
+#include "instance.h"
+#include "security.h"
+
+#include <pcrecpp.h>
+#include <boost/date_time/posix_time/posix_time.hpp>
+#undef assert
+#define assert xassert
+
+namespace mongo {
+
+ extern string bind_ip;
+ extern const char *replInfo;
+
+ bool getInitialSyncCompleted();
+
+ time_t started = time(0);
+
+ /*
+ string toString() {
+ stringstream ss;
+ unsigned long long dt = last - start;
+ ss << dt/1000;
+ ss << '\t';
+ ss << timeLocked/1000 << '\t';
+ if( dt )
+ ss << (timeLocked*100)/dt << '%';
+ return ss.str();
+ }
+ */
+
+ // Snapshot of dbMutex timing counters, plus a ring buffer of the last
+ // NStats formatted "dt / time locked" samples for the web status page.
+ struct Timing {
+ Timing() {
+ start = timeLocked = 0;
+ }
+ unsigned long long start, timeLocked;
+ };
+ Timing tlast;
+ const int NStats = 32;
+ string lockStats[NStats];
+ unsigned q = 0; // index of the most recently written lockStats slot
+
+ // Background thread: every 4 seconds snapshot dbMutex timing info,
+ // compute the % of wall time spent locked since the previous pass,
+ // and store the formatted sample in the lockStats ring buffer.
+ void statsThread() {
+ /*cout << "TEMP disabled statsthread" << endl;
+ if( 1 )
+ return;*/
+ Client::initThread("stats");
+ unsigned long long timeLastPass = 0;
+ while ( 1 ) {
+ {
+ /* todo: do we even need readlock here? if so for what? */
+ readlock lk("");
+ Top::completeSnapshot();
+ q = (q+1)%NStats;
+ Timing timing;
+ dbMutex.info().getTimingInfo(timing.start, timing.timeLocked);
+ unsigned long long now = curTimeMicros64();
+ if ( timeLastPass ) {
+ unsigned long long dt = now - timeLastPass;
+ unsigned long long dlocked = timing.timeLocked - tlast.timeLocked;
+ {
+ stringstream ss;
+ ss << dt / 1000 << '\t';
+ ss << dlocked / 1000 << '\t';
+ if ( dt )
+ ss << (dlocked*100)/dt << '%';
+ string s = ss.str();
+ if ( cmdLine.cpu )
+ log() << "cpu: " << s << endl;
+ lockStats[q] = s;
+ // credit the unlocked portion (in ms) to idle cursor accounting
+ ClientCursor::idleTimeReport( (unsigned) ((dt - dlocked)/1000) );
+ }
+ }
+ timeLastPass = now;
+ tlast = timing;
+ }
+ sleepsecs(4);
+ }
+ }
+
+ // Emit "<b>" when x is true and remember the state so the matching
+ // bold() call closes the tag. Global flag: not thread safe --
+ // presumably only the single web-server thread uses it (confirm).
+ bool _bold;
+ string bold(bool x) {
+ _bold = x;
+ return x ? "<b>" : "";
+ }
+ string bold() {
+ return _bold ? "</b>" : "";
+ }
+
+ /* Serves the admin status page and a minimal REST interface on the
+ web admin port. Uses a process-wide DBDirectClient for queries. */
+ class DbWebServer : public MiniWebServer {
+ public:
+ // caller locks
+ void doLockedStuff(stringstream& ss) {
+ ss << "# databases: " << dbHolder.size() << '\n';
+ if ( cc().database() ) {
+ ss << "curclient: " << cc().database()->name; // TODO: isn't this useless?
+ ss << '\n';
+ }
+ ss << bold(ClientCursor::byLocSize()>10000) << "Cursors byLoc.size(): " << ClientCursor::byLocSize() << bold() << '\n';
+ ss << "\n<b>replication</b>\n";
+ ss << "master: " << master << '\n';
+ ss << "slave: " << slave << '\n';
+ if ( replPair ) {
+ ss << "replpair:\n";
+ ss << replPair->getInfo();
+ }
+ bool seemCaughtUp = getInitialSyncCompleted();
+ if ( !seemCaughtUp ) ss << "<b>";
+ ss << "initialSyncCompleted: " << seemCaughtUp;
+ if ( !seemCaughtUp ) ss << "</b>";
+ ss << '\n';
+
+ ss << "\n<b>DBTOP</b>\n";
+ ss << "<table border=1><tr align='left'><th>Namespace</th><th>%</th><th>Reads</th><th>Writes</th><th>Calls</th><th>Time</th>";
+ vector< Top::Usage > usage;
+ Top::usage( usage );
+ for( vector< Top::Usage >::iterator i = usage.begin(); i != usage.end(); ++i )
+ ss << setprecision( 2 ) << fixed << "<tr><td>" << i->ns << "</td><td>" << i->pct << "</td><td>"
+ << i->reads << "</td><td>" << i->writes << "</td><td>" << i->calls << "</td><td>" << i->time << "</td></tr>\n";
+ ss << "</table>";
+
+ ss << "\n<b>dt\ttlocked</b>\n";
+ // walk the ring buffer backwards from the newest slot; unsigned
+ // wraparound of (i-1) is benign because NStats divides 2^32
+ unsigned i = q;
+ while ( 1 ) {
+ ss << lockStats[i] << '\n';
+ i = (i-1)%NStats;
+ if ( i == q )
+ break;
+ }
+ }
+
+ // stats that are safe to render without holding the db lock
+ void doUnlockedStuff(stringstream& ss) {
+ /* this is in the header already ss << "port: " << port << '\n'; */
+ ss << mongodVersion() << "\n";
+ ss << "git hash: " << gitVersion() << "\n";
+ ss << "sys info: " << sysInfo() << "\n";
+ ss << "\n";
+ ss << "dbwritelocked: " << dbMutex.info().isLocked() << " (initial)\n";
+ ss << "uptime: " << time(0)-started << " seconds\n";
+ if ( replAllDead )
+ ss << "<b>replication replAllDead=" << replAllDead << "</b>\n";
+ ss << "\nassertions:\n";
+ for ( int i = 0; i < 4; i++ ) {
+ if ( lastAssert[i].isSet() ) {
+ ss << "<b>";
+ if ( i == 3 ) ss << "usererr";
+ else ss << i;
+ ss << "</b>" << ' ' << lastAssert[i].toString();
+ }
+ }
+
+ ss << "\nreplInfo: " << replInfo << "\n\n";
+
+ ss << "Clients:\n";
+ ss << "<table border=1><tr align='left'><th>Thread</th><th>Current op</th>\n";
+ {
+ boostlock bl(Client::clientsMutex);
+ for( set<Client*>::iterator i = Client::clients.begin(); i != Client::clients.end(); i++ ) {
+ Client *c = *i;
+ CurOp& co = *(c->curop());
+ ss << "<tr><td>" << c->desc() << "</td><td";
+ BSONObj info = co.infoNoauth();
+ /*
+ if( info.getIntField("inLock") > 0 )
+ ss << "style='color:red'";
+ else if( info.getIntField("inLock") < 0 )
+ ss << "style='color:green'";
+ */
+ ss << ">" << info << "</td></tr>\n";
+ }
+ }
+ ss << "</table>\n";
+ }
+
+ // HTTP digest authentication against admin.system.users.
+ // localhost and no-users-configured both bypass the check; on
+ // failure a WWW-Authenticate challenge header is appended.
+ bool allowed( const char * rq , vector<string>& headers, const SockAddr &from ){
+
+ if ( from.localhost() )
+ return true;
+
+ if ( db.findOne( "admin.system.users" , BSONObj() ).isEmpty() )
+ return true;
+
+ string auth = getHeader( rq , "Authorization" );
+
+ if ( auth.size() > 0 && auth.find( "Digest " ) == 0 ){
+ auth = auth.substr( 7 ) + ", ";
+
+ map<string,string> parms;
+ pcrecpp::StringPiece input( auth );
+
+ string name, val;
+ pcrecpp::RE re("(\\w+)=\"?(.*?)\"?, ");
+ while ( re.Consume( &input, &name, &val) ){
+ parms[name] = val;
+ }
+
+ BSONObj user = db.findOne( "admin.system.users" , BSON( "user" << parms["username"] ) );
+ if ( ! user.isEmpty() ){
+ string ha1 = user["pwd"].str();
+ string ha2 = md5simpledigest( (string)"GET" + ":" + parms["uri"] );
+
+ string r = ha1 + ":" + parms["nonce"];
+ if ( parms["nc"].size() && parms["cnonce"].size() && parms["qop"].size() ){
+ r += ":";
+ r += parms["nc"];
+ r += ":";
+ r += parms["cnonce"];
+ r += ":";
+ r += parms["qop"];
+ }
+ r += ":";
+ r += ha2;
+ r = md5simpledigest( r );
+
+ if ( r == parms["response"] )
+ return true;
+ }
+
+
+ }
+
+ stringstream authHeader;
+ authHeader
+ << "WWW-Authenticate: "
+ << "Digest realm=\"mongo\", "
+ << "nonce=\"abc\", "
+ << "algorithm=MD5, qop=\"auth\" "
+ ;
+
+ headers.push_back( authHeader.str() );
+ return false; // fix: was 'return 0' from a bool function
+ }
+
+ virtual void doRequest(
+ const char *rq, // the full request
+ string url,
+ // set these and return them:
+ string& responseMsg,
+ int& responseCode,
+ vector<string>& headers, // if completely empty, content-type: text/html will be added
+ const SockAddr &from
+ )
+ {
+ //out() << "url [" << url << "]" << endl;
+
+ if ( url.size() > 1 ) {
+ if ( ! allowed( rq , headers, from ) ){
+ responseCode = 401;
+ responseMsg = "not allowed\n";
+ return;
+ }
+ handleRESTRequest( rq , url , responseMsg , responseCode , headers );
+ return;
+ }
+
+
+ responseCode = 200;
+ stringstream ss;
+ ss << "<html><head><title>";
+
+ string dbname;
+ {
+ stringstream z;
+ z << "mongodb " << getHostName() << ':' << mongo::cmdLine.port << ' ';
+ dbname = z.str();
+ }
+ ss << dbname << "</title></head><body><h2>" << dbname << "</h2><p>\n<pre>";
+
+ doUnlockedStuff(ss);
+
+ // poll up to ~2s for the db lock rather than blocking the web
+ // thread indefinitely behind a long-running operation
+ int n = 2000;
+ Timer t;
+ while ( 1 ) {
+ if ( !dbMutex.info().isLocked() ) {
+ {
+ readlock lk("");
+ ss << "time to get dblock: " << t.millis() << "ms\n";
+ doLockedStuff(ss);
+ }
+ break;
+ }
+ sleepmillis(1);
+ if ( --n < 0 ) {
+ ss << "\n<b>timed out getting dblock</b>\n";
+ break;
+ }
+ }
+
+ ss << "</pre></body></html>";
+ responseMsg = ss.str();
+
+ // we want to return SavedContext from before the authentication was performed
+ // NOTE(review): auth is checked after the page is built; confirm
+ // this ordering is intentional (comment above suggests it is)
+ if ( ! allowed( rq , headers, from ) ){
+ responseCode = 401;
+ responseMsg = "not allowed\n";
+ return;
+ }
+ }
+
+ // route /<db>/<collection...>/<action>[?params] to the REST handlers
+ void handleRESTRequest( const char *rq, // the full request
+ string url,
+ string& responseMsg,
+ int& responseCode,
+ vector<string>& headers // if completely empty, content-type: text/html will be added
+ ) {
+
+ string::size_type first = url.find( "/" , 1 );
+ if ( first == string::npos ) {
+ responseCode = 400;
+ return;
+ }
+
+ string method = parseMethod( rq );
+ string dbname = url.substr( 1 , first - 1 );
+ string coll = url.substr( first + 1 );
+ string action = "";
+
+ map<string,string> params;
+ if ( coll.find( "?" ) != string::npos ) {
+ parseParams( params , coll.substr( coll.find( "?" ) + 1 ) );
+ coll = coll.substr( 0 , coll.find( "?" ) );
+ }
+
+ string::size_type last = coll.find_last_of( "/" );
+ if ( last == string::npos ) {
+ action = coll;
+ coll = "_defaultCollection";
+ }
+ else {
+ action = coll.substr( last + 1 );
+ coll = coll.substr( 0 , last );
+ }
+
+ // path separators inside the collection part become dots
+ for ( string::size_type i=0; i<coll.size(); i++ )
+ if ( coll[i] == '/' )
+ coll[i] = '.';
+
+ string fullns = dbname + "." + coll;
+
+ headers.push_back( (string)"x-action: " + action );
+ headers.push_back( (string)"x-ns: " + fullns );
+ headers.push_back( "Content-Type: text/plain;charset=utf-8" );
+
+ stringstream ss;
+
+ if ( method == "GET" ) {
+ responseCode = 200;
+ handleRESTQuery( fullns , action , params , responseCode , ss );
+ }
+ else if ( method == "POST" ) {
+ responseCode = 201;
+ handlePost( fullns , body( rq ) , params , responseCode , ss );
+ }
+ else {
+ responseCode = 400;
+ headers.push_back( "X_err: bad request" );
+ ss << "don't know how to handle a [" << method << "]";
+ out() << "don't know how to handle a [" << method << "]" << endl;
+ }
+
+ responseMsg = ss.str();
+ }
+
+ // GET handler: run a query built from filter_* params and stream
+ // the results as JSON (couch-style envelope unless one=true)
+ void handleRESTQuery( string ns , string action , map<string,string> & params , int & responseCode , stringstream & out ) {
+ Timer t;
+
+ int skip = _getOption( params["skip"] , 0 );
+ int num = _getOption( params["limit"] , _getOption( params["count" ] , 1000 ) ); // count is old, limit is new
+
+ int one = 0;
+ if ( params["one"].size() > 0 && tolower( params["one"][0] ) == 't' ) {
+ num = 1;
+ one = 1;
+ }
+
+ BSONObjBuilder queryBuilder;
+
+ for ( map<string,string>::iterator i = params.begin(); i != params.end(); i++ ) {
+ // fix: was '! i->first.find("filter_") == 0' which only worked
+ // by accident of operator precedence; state the intent directly
+ if ( i->first.find( "filter_" ) != 0 )
+ continue;
+
+ // fix: keep the substring alive in a local -- taking c_str() of
+ // the temporary returned by substr() left a dangling pointer
+ string field = i->first.substr( 7 );
+ const char * val = i->second.c_str();
+
+ char * temp;
+
+ // TODO: this is how i guess if something is a number. pretty lame right now
+ double number = strtod( val , &temp );
+ if ( temp != val )
+ queryBuilder.append( field.c_str() , number );
+ else
+ queryBuilder.append( field.c_str() , val );
+ }
+
+ BSONObj query = queryBuilder.obj();
+
+ auto_ptr<DBClientCursor> cursor = db.query( ns.c_str() , query, num , skip );
+
+ if ( one ) {
+ if ( cursor->more() ) {
+ BSONObj obj = cursor->next();
+ out << obj.jsonString() << "\n";
+ }
+ else {
+ responseCode = 404;
+ }
+ return;
+ }
+
+ out << "{\n";
+ out << " \"offset\" : " << skip << ",\n";
+ out << " \"rows\": [\n";
+
+ int howMany = 0;
+ while ( cursor->more() ) {
+ if ( howMany++ )
+ out << " ,\n";
+ BSONObj obj = cursor->next();
+ out << " " << obj.jsonString();
+
+ }
+ out << "\n ],\n\n";
+
+ out << " \"total_rows\" : " << howMany << " ,\n";
+ out << " \"query\" : " << query.jsonString() << " ,\n";
+ out << " \"millis\" : " << t.millis() << "\n";
+ out << "}\n";
+ }
+
+ // TODO Generate id and revision per couch POST spec
+ void handlePost( string ns, const char *body, map<string,string> & params, int & responseCode, stringstream & out ) {
+ try {
+ BSONObj obj = fromjson( body );
+ db.insert( ns.c_str(), obj );
+ } catch ( ... ) {
+ responseCode = 400; // Bad Request. Seems reasonable for now.
+ out << "{ \"ok\" : false }";
+ return;
+ }
+
+ responseCode = 201;
+ out << "{ \"ok\" : true }";
+ }
+
+ // parse an integer query parameter, falling back to 'def'
+ int _getOption( string val , int def ) {
+ if ( val.size() == 0 )
+ return def;
+ return atoi( val.c_str() );
+ }
+
+ private:
+ static DBDirectClient db;
+ };
+
+ DBDirectClient DbWebServer::db;
+
+ // Entry point for the admin web server thread; also spawns the stats
+ // sampling thread. Listens on (db port + 1000).
+ void webServerThread() {
+ boost::thread thr(statsThread);
+ Client::initThread("websvr");
+ DbWebServer mini;
+ int p = cmdLine.port + 1000;
+ if ( mini.init(bind_ip, p) ) {
+ ListeningSockets::get()->add( mini.socket() );
+ log() << "web admin interface listening on port " << p << endl;
+ mini.run();
+ }
+ else {
+ log() << "warning: web admin interface failed to initialize on port " << p << endl;
+ }
+ cc().shutdown();
+ }
+
+} // namespace mongo
diff --git a/db/extsort.cpp b/db/extsort.cpp
new file mode 100644
index 0000000..08b343a
--- /dev/null
+++ b/db/extsort.cpp
@@ -0,0 +1,227 @@
+// extsort.cpp
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "stdafx.h"
+
+#include "extsort.h"
+#include "namespace.h"
+#include "../util/file.h"
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+
+namespace mongo {
+
+ unsigned long long BSONObjExternalSorter::_compares = 0;
+
+ // order: key pattern used for comparisons. maxFileSize: bytes of
+ // in-memory data to accumulate before spilling a sorted run to disk.
+ BSONObjExternalSorter::BSONObjExternalSorter( const BSONObj & order , long maxFileSize )
+ : _order( order.getOwned() ) , _maxFilesize( maxFileSize ) ,
+ _cur(0), _curSizeSoFar(0), _sorted(0){
+
+ stringstream rootpath;
+ rootpath << dbpath;
+ if ( dbpath[dbpath.size()-1] != '/' )
+ rootpath << "/";
+ // per-sorter scratch directory; time+rand to avoid collisions
+ rootpath << "_tmp/esort." << time(0) << "." << rand() << "/";
+ _root = rootpath.str();
+
+ log(1) << "external sort root: " << _root.string() << endl;
+
+ create_directories( _root );
+ _compares = 0;
+ }
+
+ // Free the in-memory run and delete the scratch directory; expects
+ // to have removed the directory itself plus one entry per spill file.
+ BSONObjExternalSorter::~BSONObjExternalSorter(){
+ if ( _cur ){
+ delete _cur;
+ _cur = 0;
+ }
+
+ unsigned long removed = remove_all( _root );
+ wassert( removed == 1 + _files.size() );
+ }
+
+ // Finalize the sort. Fast path: everything still fits in memory and
+ // nothing was spilled -- sort the in-memory list in place. Otherwise
+ // flush the remaining in-memory data as a final sorted run; iteration
+ // then merges the runs.
+ void BSONObjExternalSorter::sort(){
+ uassert( 10048 , "already sorted" , ! _sorted );
+
+ _sorted = true;
+
+ if ( _cur && _files.size() == 0 ){
+ _cur->sort( MyCmp( _order ) );
+ log(1) << "\t\t not using file. size:" << _curSizeSoFar << " _compares:" << _compares << endl;
+ return;
+ }
+
+ if ( _cur ){
+ finishMap();
+ }
+
+ if ( _cur ){
+ delete _cur;
+ _cur = 0;
+ }
+
+ if ( _files.size() == 0 )
+ return;
+
+ }
+
+ // Append one (object, location) pair to the current in-memory run,
+ // copying the object (getOwned). Spills to disk once the accumulated
+ // size passes _maxFilesize.
+ void BSONObjExternalSorter::add( const BSONObj& o , const DiskLoc & loc ){
+ uassert( 10049 , "sorted already" , ! _sorted );
+
+ if ( ! _cur ){
+ _cur = new InMemory();
+ }
+
+ _cur->push_back( pair<BSONObj,DiskLoc>( o.getOwned() , loc ) );
+
+ long size = o.objsize();
+ _curSizeSoFar += size + sizeof( DiskLoc );
+
+ if ( _curSizeSoFar > _maxFilesize )
+ finishMap();
+
+ }
+
+ // Sort the current in-memory run and write it to a new spill file as
+ // a sequence of [bson bytes][DiskLoc] records, then clear the run.
+ void BSONObjExternalSorter::finishMap(){
+ uassert( 10050 , "bad" , _cur );
+
+ _curSizeSoFar = 0;
+ if ( _cur->size() == 0 )
+ return;
+
+ _cur->sort( MyCmp( _order ) );
+
+ stringstream ss;
+ ss << _root.string() << "/file." << _files.size();
+ string file = ss.str();
+
+ ofstream out;
+ out.open( file.c_str() , ios_base::out | ios_base::binary );
+ uassert( 10051 , (string)"couldn't open file: " + file , out.good() );
+
+ int num = 0;
+ for ( InMemory::iterator i=_cur->begin(); i != _cur->end(); i++ ){
+ Data p = *i;
+ out.write( p.first.objdata() , p.first.objsize() );
+ out.write( (char*)(&p.second) , sizeof( DiskLoc ) );
+ num++;
+ }
+
+ _cur->clear();
+
+ _files.push_back( file );
+ out.close();
+
+ log(2) << "Added file: " << file << " with " << num << "objects for external sort" << endl;
+ }
+
+ // ---------------------------------
+
+ // Either iterate the sorter's in-memory run directly (no spills) or
+ // set up a k-way merge: one FileIterator plus one stash slot per run.
+ BSONObjExternalSorter::Iterator::Iterator( BSONObjExternalSorter * sorter ) :
+ _cmp( sorter->_order ) , _in( 0 ){
+
+ for ( list<string>::iterator i=sorter->_files.begin(); i!=sorter->_files.end(); i++ ){
+ _files.push_back( new FileIterator( *i ) );
+ _stash.push_back( pair<Data,bool>( Data( BSONObj() , DiskLoc() ) , false ) );
+ }
+
+ if ( _files.size() == 0 && sorter->_cur ){
+ _in = sorter->_cur;
+ _it = sorter->_cur->begin();
+ }
+
+
+ }
+
+ // release the per-run file iterators (and their mappings)
+ BSONObjExternalSorter::Iterator::~Iterator(){
+ for ( vector<FileIterator*>::iterator i=_files.begin(); i!=_files.end(); i++ )
+ delete *i;
+ _files.clear();
+ }
+
+ // In-memory mode: more while the list iterator isn't at end.
+ // Merge mode: more while any run has unread data or a stashed element.
+ bool BSONObjExternalSorter::Iterator::more(){
+
+ if ( _in )
+ return _it != _in->end();
+
+ for ( vector<FileIterator*>::iterator i=_files.begin(); i!=_files.end(); i++ )
+ if ( (*i)->more() )
+ return true;
+ for ( vector< pair<Data,bool> >::iterator i=_stash.begin(); i!=_stash.end(); i++ )
+ if ( i->second )
+ return true;
+ return false;
+ }
+
+ // Return the next element in sort order. Merge mode refills each
+ // empty stash slot from its run, then takes the minimum across slots.
+ pair<BSONObj,DiskLoc> BSONObjExternalSorter::Iterator::next(){
+
+ if ( _in ){
+ return *(_it++);
+ }
+
+ Data best;
+ int slot = -1;
+
+ for ( unsigned i=0; i<_stash.size(); i++ ){
+
+ if ( ! _stash[i].second ){
+ if ( _files[i]->more() )
+ _stash[i] = pair<Data,bool>( _files[i]->next() , true );
+ else
+ continue;
+ }
+
+ // _cmp(best, candidate)==0 means best is NOT less than the
+ // candidate, so the candidate becomes the new minimum
+ if ( slot == -1 || _cmp( best , _stash[i].first ) == 0 ){
+ best = _stash[i].first;
+ slot = i;
+ }
+
+ }
+
+ assert( slot >= 0 );
+ _stash[slot].second = false; // consumed; refill on the next call
+
+ return best;
+ }
+
+ // -----------------------------------
+
+ // Memory-map one spill file; _buf.._end brackets the mapped records.
+ BSONObjExternalSorter::FileIterator::FileIterator( string file ){
+ long length;
+ _buf = (char*)_file.map( file.c_str() , length );
+ massert( 10308 , "mmap failed" , _buf );
+ assert( (unsigned long)length == file_size( file ) );
+ _end = _buf + length;
+ }
+ // empty: the mapping is presumably released by _file's destructor -- confirm
+ BSONObjExternalSorter::FileIterator::~FileIterator(){
+ }
+
+ // true while unread records remain in the mapped file
+ bool BSONObjExternalSorter::FileIterator::more(){
+ return _buf < _end;
+ }
+
+ // Read one [bson][DiskLoc] record. The returned BSONObj aliases the
+ // mapped file. NOTE(review): advances by a literal 8 while the writer
+ // used sizeof(DiskLoc) -- assumes sizeof(DiskLoc)==8; confirm.
+ pair<BSONObj,DiskLoc> BSONObjExternalSorter::FileIterator::next(){
+ BSONObj o( _buf );
+ _buf += o.objsize();
+ DiskLoc * l = (DiskLoc*)_buf;
+ _buf += 8;
+ return Data( o , *l );
+ }
+
+}
diff --git a/db/extsort.h b/db/extsort.h
new file mode 100644
index 0000000..5bfa86f
--- /dev/null
+++ b/db/extsort.h
@@ -0,0 +1,123 @@
+// extsort.h
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+#include "../stdafx.h"
+#include "jsobj.h"
+#include "namespace.h"
+#include "curop.h"
+
+namespace mongo {
+
    /**
       External (disk-backed) sorter for (BSONObj key, DiskLoc value) pairs:
       sorts by BSONObj under a caller-supplied ordering, accumulating pairs
       in memory and (presumably, via finishMap) spilling sorted runs to temp
       files, then merge-iterating the runs. Not copyable.
    */
    class BSONObjExternalSorter : boost::noncopyable {
    public:

        // one sorted record: the index key and the location of its document
        typedef pair<BSONObj,DiskLoc> Data;

    private:
        /** streams one spilled run back from a memory-mapped temp file */
        class FileIterator : boost::noncopyable {
        public:
            FileIterator( string file );
            ~FileIterator();
            bool more();
            Data next();
        private:
            MemoryMappedFile _file;
            char * _buf; // read cursor within the mapping
            char * _end; // one past the last mapped byte
        };

        /** strict weak ordering: by key via woCompare under _order, ties broken by DiskLoc */
        class MyCmp {
        public:
            MyCmp( const BSONObj & order = BSONObj() ) : _order( order ){}
            bool operator()( const Data &l, const Data &r ) const {
                RARELY killCurrentOp.checkForInterrupt(); // big sorts must stay killable
                _compares++;
                int x = l.first.woCompare( r.first , _order );
                if ( x )
                    return x < 0;
                return l.second.compare( r.second ) < 0;
            };
        private:
            BSONObj _order;
        };

    public:

        // the in-memory accumulation of not-yet-spilled records
        typedef list<Data> InMemory;

        /** yields all records in sorted order, merging spilled runs if any */
        class Iterator : boost::noncopyable {
        public:

            Iterator( BSONObjExternalSorter * sorter );
            ~Iterator();
            bool more();
            Data next();

        private:
            MyCmp _cmp;
            vector<FileIterator*> _files; // one per spilled run; owned (deleted in dtor)
            vector< pair<Data,bool> > _stash; // per-file lookahead; bool = slot holds a value

            // non-null only when nothing was spilled: iterate sorter->_cur directly
            InMemory * _in;
            InMemory::iterator _it;

        };

        BSONObjExternalSorter( const BSONObj & order = BSONObj() , long maxFileSize = 1024 * 1024 * 100 );
        ~BSONObjExternalSorter();

        void add( const BSONObj& o , const DiskLoc & loc );
        // convenience overload building the DiskLoc from its two components
        void add( const BSONObj& o , int a , int b ){
            add( o , DiskLoc( a , b ) );
        }

        /* call after adding values, and before fetching the iterator */
        void sort();

        // requires sort() to have been called first (uassert 10052 otherwise)
        auto_ptr<Iterator> iterator(){
            uassert( 10052 , "not sorted" , _sorted );
            return auto_ptr<Iterator>( new Iterator( this ) );
        }

        // number of runs spilled to disk so far
        int numFiles(){
            return _files.size();
        }

    private:

        void sort( string file ); // presumably sorts/rewrites one spill file -- see .cpp
        void finishMap(); // presumably flushes _cur to a new temp file -- see .cpp

        BSONObj _order; // key ordering spec passed to woCompare
        long _maxFilesize; // presumably the spill threshold, bytes
        path _root; // presumably the directory holding temp run files

        InMemory * _cur; // batch currently being accumulated
        long _curSizeSoFar; // running size of _cur

        list<string> _files; // names of spilled run files
        bool _sorted; // set by sort(); guards iterator()

        static unsigned long long _compares; // comparison counter (shared across instances)
    };
+}
diff --git a/db/filever.h b/db/filever.h
new file mode 100644
index 0000000..4aa18d4
--- /dev/null
+++ b/db/filever.h
@@ -0,0 +1,30 @@
+/* filever.h */
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+namespace mongo {
+
/* placeholder hook: data-file version validation is not implemented; this is
   deliberately a no-op called where a version check would belong. */
inline void checkDataFileVersion(NamespaceDetails& d) {
}

/* placeholder hook: index-file version validation is not implemented (no-op). */
inline void checkIndexFileVersion(NamespaceDetails& d) {
}
+
+}
+
diff --git a/db/flushtest.cpp b/db/flushtest.cpp
new file mode 100644
index 0000000..a301e0e
--- /dev/null
+++ b/db/flushtest.cpp
@@ -0,0 +1,134 @@
+#include "stdafx.h"
+#include <stdio.h>
+#include "../util/goodies.h"
+#include <fcntl.h>
+
+namespace mongo {
+
#if defined(F_FULLFSYNC)
    /* platforms with F_FULLFSYNC (e.g. OS X): ask the drive itself to flush
       its cache, which plain fsync does not guarantee there. */
    void fullsync(int f) {
        fcntl( f, F_FULLFSYNC );
    }
#else
    /* elsewhere: fdatasync flushes file data (not necessarily all metadata). */
    void fullsync(int f) {
        fdatasync(f);
    }
#endif
+
+ int main(int argc, char* argv[], char *envp[] ) {
+ cout << "hello" << endl;
+
+ FILE *f = fopen("/data/db/temptest", "a");
+
+ if ( f == 0 ) {
+ cout << "can't open file\n";
+ return 1;
+ }
+
+ {
+ Timer t;
+ for ( int i = 0; i < 50000; i++ )
+ fwrite("abc", 3, 1, f);
+ cout << "small writes: " << t.millis() << "ms" << endl;
+ }
+
+ {
+ Timer t;
+ for ( int i = 0; i < 10000; i++ ) {
+ fwrite("abc", 3, 1, f);
+ fflush(f);
+ fsync( fileno( f ) );
+ }
+ int ms = t.millis();
+ cout << "flush: " << ms << "ms, " << ms / 10000.0 << "ms/request" << endl;
+ }
+
+ {
+ Timer t;
+ for ( int i = 0; i < 500; i++ ) {
+ fwrite("abc", 3, 1, f);
+ fflush(f);
+ fsync( fileno( f ) );
+ sleepmillis(2);
+ }
+ int ms = t.millis() - 500 * 2;
+ cout << "flush with sleeps: " << ms << "ms, " << ms / 500.0 << "ms/request" << endl;
+ }
+
+ char buf[8192];
+ for ( int pass = 0; pass < 2; pass++ ) {
+ cout << "pass " << pass << endl;
+ {
+ Timer t;
+ int n = 500;
+ for ( int i = 0; i < n; i++ ) {
+ if ( pass == 0 )
+ fwrite("abc", 3, 1, f);
+ else
+ fwrite(buf, 8192, 1, f);
+ buf[0]++;
+ fflush(f);
+ fullsync(fileno(f));
+ }
+ int ms = t.millis();
+ cout << "fullsync: " << ms << "ms, " << ms / ((double) n) << "ms/request" << endl;
+ }
+
+ {
+ Timer t;
+ for ( int i = 0; i < 500; i++ ) {
+ if ( pass == 0 )
+ fwrite("abc", 3, 1, f);
+ else
+ fwrite(buf, 8192, 1, f);
+ buf[0]++;
+ fflush(f);
+ fullsync(fileno(f));
+ sleepmillis(2);
+ }
+ int ms = t.millis() - 2 * 500;
+ cout << "fullsync with sleeps: " << ms << "ms, " << ms / 500.0 << "ms/request" << endl;
+ }
+ }
+
+ // without growing
+ {
+ fclose(f);
+ /* try from beginning of the file, where we aren't appending and changing the file length,
+ to see if this is faster as the directory entry then doesn't have to be flushed (if noatime in effect).
+ */
+ f = fopen("/data/db/temptest", "r+");
+ Timer t;
+ int n = 500;
+ for ( int i = 0; i < n; i++ ) {
+ fwrite("xyz", 3, 1, f);
+ fflush(f);
+ fullsync(fileno(f));
+ }
+ int ms = t.millis();
+ cout << "fullsync without growing: " << ms << "ms, " << ms / ((double) n) << "ms/request" << endl;
+ }
+
+ // without growing, with delay
+ {
+ fclose(f);
+ /* try from beginning of the file, where we aren't appending and changing the file length,
+ to see if this is faster as the directory entry then doesn't have to be flushed (if noatime in effect).
+ */
+ f = fopen("/data/db/temptest", "r+");
+ Timer t;
+ int n = 500;
+ for ( int i = 0; i < n; i++ ) {
+ fwrite("xyz", 3, 1, f);
+ fflush(f);
+ fullsync(fileno(f));
+ sleepmillis(2);
+ }
+ int ms = t.millis() - 2 * 500;
+ cout << "fullsync without growing with sleeps: " << ms << "ms, " << ms / ((double) n) << "ms/request" << endl;
+ }
+
+ return 0;
+ }
+
+} // namespace mongo
diff --git a/db/index.cpp b/db/index.cpp
new file mode 100644
index 0000000..fab6918
--- /dev/null
+++ b/db/index.cpp
@@ -0,0 +1,306 @@
+// index.cpp
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "stdafx.h"
+#include "namespace.h"
+#include "index.h"
+#include "btree.h"
+#include "query.h"
+
+namespace mongo {
+
    /* delete this index. does NOT clean up the system catalog
       (system.indexes or system.namespaces) -- only NamespaceIndex.
       Order matters: the query condition must be built from this object's
       info BEFORE the btree is dropped and info is invalidated.
    */
    void IndexDetails::kill_idx() {
        string ns = indexNamespace(); // e.g. foo.coll.$ts_1

        // clean up parent namespace index cache
        NamespaceDetailsTransient::get_w( parentNS().c_str() ).deletedIndex();

        BSONObjBuilder b;
        b.append("name", indexName().c_str());
        b.append("ns", parentNS().c_str());
        BSONObj cond = b.done(); // e.g.: { name: "ts_1", ns: "foo.coll" }

        /* important to catch exception here so we can finish cleanup below. */
        try {
            btreeStore->drop(ns.c_str());
        }
        catch(DBException& ) {
            log(2) << "IndexDetails::kill(): couldn't drop ns " << ns << endl;
        }
        head.setInvalid(); // btree head pointer no longer valid
        info.setInvalid(); // catalog-object pointer no longer valid

        // clean up in system.indexes. we do this last on purpose. note we have
        // to make the cond object before the drop() above though.
        string system_indexes = cc().database()->name + ".system.indexes";
        int n = deleteObjects(system_indexes.c_str(), cond, false, false, true);
        wassert( n == 1 ); // warn (not abort) if the catalog entry was missing or duplicated
    }
+
+ void IndexSpec::_init(){
+ assert( keys.objsize() );
+
+ BSONObjIterator i( keys );
+ BSONObjBuilder nullKeyB;
+ while( i.more() ) {
+ _fieldNames.push_back( i.next().fieldName() );
+ _fixed.push_back( BSONElement() );
+ nullKeyB.appendNull( "" );
+ }
+
+ _nullKey = nullKeyB.obj();
+
+ BSONObjBuilder b;
+ b.appendNull( "" );
+ _nullObj = b.obj();
+ _nullElt = _nullObj.firstElement();
+ }
+
+
+ void IndexSpec::getKeys( const BSONObj &obj, BSONObjSetDefaultOrder &keys ) const {
+ vector<const char*> fieldNames( _fieldNames );
+ vector<BSONElement> fixed( _fixed );
+ _getKeys( fieldNames , fixed , obj, keys );
+ if ( keys.empty() )
+ keys.insert( _nullKey );
+ }
+
    /* recursive worker for getKeys().
       fieldNames[i] holds the not-yet-resolved suffix of key field i's dotted
       path ("" once resolved); fixed[i] holds the resolved element for it.
       At most one array path per document may be expanded ("multikey");
       arrays on two different paths are rejected (uassert 10088). */
    void IndexSpec::_getKeys( vector<const char*> fieldNames , vector<BSONElement> fixed , const BSONObj &obj, BSONObjSetDefaultOrder &keys ) const {
        BSONElement arrElt;
        unsigned arrIdx = ~0;
        for( unsigned i = 0; i < fieldNames.size(); ++i ) {
            if ( *fieldNames[ i ] == '\0' )
                continue; // field already resolved on a previous recursion level
            BSONElement e = obj.getFieldDottedOrArray( fieldNames[ i ] );
            if ( e.eoo() )
                e = _nullElt; // no matching field
            if ( e.type() != Array )
                fieldNames[ i ] = ""; // no matching field or non-array match
            if ( *fieldNames[ i ] == '\0' )
                fixed[ i ] = e; // no need for further object expansion (though array expansion still possible)
            if ( e.type() == Array && arrElt.eoo() ) { // we only expand arrays on a single path -- track the path here
                arrIdx = i;
                arrElt = e;
            }
            // enforce single array path here
            uassert( 10088 , "cannot index parallel arrays", e.type() != Array || e.rawdata() == arrElt.rawdata() );
        }

        bool allFound = true; // have we found elements for all field names in the key spec?
        for( vector<const char*>::const_iterator i = fieldNames.begin(); i != fieldNames.end(); ++i ){
            if ( **i != '\0' ){
                allFound = false;
                break;
            }
        }

        if ( allFound ) {
            if ( arrElt.eoo() ) {
                // no terminal array element to expand
                BSONObjBuilder b;
                for( vector< BSONElement >::iterator i = fixed.begin(); i != fixed.end(); ++i )
                    b.appendAs( *i, "" );
                keys.insert( b.obj() );
            }
            else {
                // terminal array element to expand, so generate all keys
                BSONObjIterator i( arrElt.embeddedObject() );
                if ( i.more() ){
                    while( i.more() ) {
                        BSONObjBuilder b;
                        for( unsigned j = 0; j < fixed.size(); ++j ) {
                            if ( j == arrIdx )
                                b.appendAs( i.next(), "" );
                            else
                                b.appendAs( fixed[ j ], "" );
                        }
                        keys.insert( b.obj() );
                    }
                }
                else if ( fixed.size() > 1 ){
                    // x : [] - need to insert undefined
                    BSONObjBuilder b;
                    for( unsigned j = 0; j < fixed.size(); ++j ) {
                        if ( j == arrIdx )
                            b.appendUndefined( "" );
                        else
                            b.appendAs( fixed[ j ], "" );
                    }
                    keys.insert( b.obj() );
                }
            }
        } else {
            // nonterminal array element to expand, so recurse
            assert( !arrElt.eoo() );
            BSONObjIterator i( arrElt.embeddedObject() );
            while( i.more() ) {
                BSONElement e = i.next();
                if ( e.type() == Object )
                    _getKeys( fieldNames, fixed, e.embeddedObject(), keys );
            }
        }
    }
+
    /* Pull out the relevant key objects from obj, so we
       can index them. Note that the set is multiple elements
       only when it's a "multikey" array.
       Keys will be left empty if key not found in the object.
       Delegates to the cached IndexSpec for this index's namespace.
    */
    void IndexDetails::getKeysFromObject( const BSONObj& obj, BSONObjSetDefaultOrder& keys) const {
        NamespaceDetailsTransient::get_w( info.obj()["ns"].valuestr() ).getIndexSpec( this ).getKeys( obj, keys );
    }
+
+ void setDifference(BSONObjSetDefaultOrder &l, BSONObjSetDefaultOrder &r, vector<BSONObj*> &diff) {
+ BSONObjSetDefaultOrder::iterator i = l.begin();
+ BSONObjSetDefaultOrder::iterator j = r.begin();
+ while ( 1 ) {
+ if ( i == l.end() )
+ break;
+ while ( j != r.end() && j->woCompare( *i ) < 0 )
+ j++;
+ if ( j == r.end() || i->woCompare(*j) != 0 ) {
+ const BSONObj *jo = &*i;
+ diff.push_back( (BSONObj *) jo );
+ }
+ i++;
+ }
+ }
+
    /* for each index of d, compute which keys an update from oldObj to newObj
       removes and which it adds (filled into v, one IndexChanges per index);
       also flags an index as multikey when newObj yields more than one key. */
    void getIndexChanges(vector<IndexChanges>& v, NamespaceDetails& d, BSONObj newObj, BSONObj oldObj) {
        v.resize(d.nIndexes);
        NamespaceDetails::IndexIterator i = d.ii();
        while( i.more() ) {
            int j = i.pos(); // position before next() advances
            IndexDetails& idx = i.next();
            BSONObj idxKey = idx.info.obj().getObjectField("key"); // eg { ts : 1 }
            IndexChanges& ch = v[j];
            idx.getKeysFromObject(oldObj, ch.oldkeys);
            idx.getKeysFromObject(newObj, ch.newkeys);
            if( ch.newkeys.size() > 1 )
                d.setIndexIsMultikey(j);
            // removed = oldkeys \ newkeys ; added = newkeys \ oldkeys
            setDifference(ch.oldkeys, ch.newkeys, ch.removed);
            setDifference(ch.newkeys, ch.oldkeys, ch.added);
        }
    }
+
+ void dupCheck(vector<IndexChanges>& v, NamespaceDetails& d) {
+ NamespaceDetails::IndexIterator i = d.ii();
+ while( i.more() ) {
+ int j = i.pos();
+ v[j].dupCheck(i.next());
+ }
+ }
+
+ // should be { <something> : <simpletype[1|-1]>, .keyp.. }
+ static bool validKeyPattern(BSONObj kp) {
+ BSONObjIterator i(kp);
+ while( i.moreWithEOO() ) {
+ BSONElement e = i.next();
+ if( e.type() == Object || e.type() == Array )
+ return false;
+ }
+ return true;
+ }
+
    /* Prepare to build an index. Does not actually build it (except for a special _id case).
        - We validate that the params are good
        - That the index does not already exist
        - Creates the source collection if it DNE

       example of 'io':
         { ns : 'test.foo', name : 'z', key : { z : 1 } }

       throws DBException

       @return
         true if ok to continue. when false we stop/fail silently (index already exists)
         sourceNS - source NS we are indexing
         sourceCollection - its details ptr
    */
    bool prepareToBuildIndex(const BSONObj& io, bool god, string& sourceNS, NamespaceDetails *&sourceCollection) {
        sourceCollection = 0;

        // logical name of the index. todo: get rid of the name, we don't need it!
        const char *name = io.getStringField("name");
        uassert(12523, "no index name specified", *name);

        // the collection for which we are building an index
        sourceNS = io.getStringField("ns");
        uassert(10096, "invalid ns to index", sourceNS.find( '.' ) != string::npos);
        // must target the currently-selected database
        uassert(10097, "bad table to index name on add index attempt",
            cc().database()->name == nsToDatabase(sourceNS.c_str()));

        BSONObj key = io.getObjectField("key");
        uassert(12524, "index key pattern too large", key.objsize() <= 2048);
        if( !validKeyPattern(key) ) {
            string s = string("bad index key pattern ") + key.toString();
            uasserted(10098 , s.c_str());
        }

        if ( sourceNS.empty() || key.isEmpty() ) {
            log(2) << "bad add index attempt name:" << (name?name:"") << "\n  ns:" <<
            sourceNS << "\n  idxobj:" << io.toString() << endl;
            string s = "bad add index attempt " + sourceNS + " key:" + key.toString();
            uasserted(12504, s);
        }

        sourceCollection = nsdetails(sourceNS.c_str());
        if( sourceCollection == 0 ) {
            // try to create it
            string err;
            if ( !userCreateNS(sourceNS.c_str(), BSONObj(), err, false) ) {
                problem() << "ERROR: failed to create collection while adding its index. " << sourceNS << endl;
                return false;
            }
            sourceCollection = nsdetails(sourceNS.c_str());
            log() << "info: creating collection " << sourceNS << " on add index\n";
            assert( sourceCollection );
        }

        if ( sourceCollection->findIndexByName(name) >= 0 ) {
            // index already exists.
            return false;
        }
        // also treat a same-pattern index under a different name as existing
        if( sourceCollection->findIndexByKeyPattern(key) >= 0 ) {
            log(2) << "index already exists with diff name " << name << ' ' << key.toString() << endl;
            return false;
        }

        if ( sourceCollection->nIndexes >= NamespaceDetails::NIndexesMax ) {
            stringstream ss;
            ss << "add index fails, too many indexes for " << sourceNS << " key:" << key.toString();
            string s = ss.str();
            log() << s << '\n';
            uasserted(12505,s);
        }

        /* this is because we want key patterns like { _id : 1 } and { _id : <someobjid> } to
           all be treated as the same pattern.
        */
        if ( !god && IndexDetails::isIdIndexPattern(key) ) {
            ensureHaveIdIndex( sourceNS.c_str() );
            return false;
        }

        return true;
    }
+
+}
diff --git a/db/index.h b/db/index.h
new file mode 100644
index 0000000..696e84d
--- /dev/null
+++ b/db/index.h
@@ -0,0 +1,198 @@
+// index.h
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+#include "../stdafx.h"
+
+namespace mongo {
+
    /* Parsed form of an index definition: the key pattern plus the full info
       object, with precomputed helpers for key extraction (see getKeys). */
    class IndexSpec {
    public:
        BSONObj keys; // the key pattern, e.g. { a : 1, b : -1 }
        BSONObj meta; // the full index info object (may be empty)

        IndexSpec(){
        }

        IndexSpec( const BSONObj& k , const BSONObj& m = BSONObj() )
            : keys(k) , meta(m){
            _init();
        }

        /**
           this is a DiskLoc of an IndexDetails info object
           should have a key field
        */
        IndexSpec( const DiskLoc& loc ){
            reset( loc );
        }

        // re-initialize from the on-disk index info object at loc
        void reset( const DiskLoc& loc ){
            meta = loc.obj();
            keys = meta["key"].embeddedObjectUserCheck();
            if ( keys.objsize() == 0 ) { // malformed info object: dump it and abort
                out() << meta.toString() << endl;
                assert(false);

            }
            _init();
        }

        // expand obj into its set of index keys per this spec
        void getKeys( const BSONObj &obj, BSONObjSetDefaultOrder &keys ) const;

    private:

        void _getKeys( vector<const char*> fieldNames , vector<BSONElement> fixed , const BSONObj &obj, BSONObjSetDefaultOrder &keys ) const;

        vector<const char*> _fieldNames; // one dotted path per key field
        vector<BSONElement> _fixed; // placeholder elements, same arity as _fieldNames

        BSONObj _nullKey; // used when a document yields no key

        BSONObj _nullObj; // owner of _nullElt's storage
        BSONElement _nullElt; // reusable null element for missing fields

        void _init();
    };
+
    /* Details about a particular index. There is one of these effectively for each object in
       system.namespaces (although this also includes the head pointer, which is not in that
       collection).

       ** MemoryMapped Record ** -- this struct is stored in (and read back from) the
       data files, so its layout must not change.
    */
    class IndexDetails {
    public:
        DiskLoc head; /* btree head disk location */

        /* Location of index info object. Format:

             { name:"nameofindex", ns:"parentnsname", key: {keypattobject}
               [, unique: <bool>, background: <bool>]
             }

           This object is in the system.indexes collection.  Note that since we
           have a pointer to the object here, the object in system.indexes MUST NEVER MOVE.
        */
        DiskLoc info;

        /* extract key value from the query object
           e.g., if key() == { x : 1 },
                 { x : 70, y : 3 } -> { x : 70 }
        */
        BSONObj getKeyFromQuery(const BSONObj& query) const {
            BSONObj k = keyPattern();
            BSONObj res = query.extractFieldsUnDotted(k);
            return res;
        }

        /* pull out the relevant key objects from obj, so we
           can index them.  Note that the set is multiple elements
           only when it's a "multikey" array.
           keys will be left empty if key not found in the object.
        */
        void getKeysFromObject( const BSONObj& obj, BSONObjSetDefaultOrder& keys) const;

        /* get the key pattern for this object.
           e.g., { lastname:1, firstname:1 }
        */
        BSONObj keyPattern() const {
            return info.obj().getObjectField("key");
        }

        /* true if the specified key is in the index */
        bool hasKey(const BSONObj& key);

        // returns name of this index's storage area
        // database.table.$index
        string indexNamespace() const {
            BSONObj io = info.obj();
            string s;
            s.reserve(Namespace::MaxNsLen);
            s = io.getStringField("ns");
            assert( !s.empty() );
            s += ".$";
            s += io.getStringField("name");
            return s;
        }

        string indexName() const { // e.g. "ts_1"
            BSONObj io = info.obj();
            return io.getStringField("name");
        }

        /* true iff pattern is exactly { _id : <anything> } -- a single field named _id */
        static bool isIdIndexPattern( const BSONObj &pattern ) {
            BSONObjIterator i(pattern);
            BSONElement e = i.next();
            if( strcmp(e.fieldName(), "_id") != 0 ) return false;
            return i.next().eoo();
        }

        /* returns true if this is the _id index. */
        bool isIdIndex() const {
            return isIdIndexPattern( keyPattern() );
        }

        /* gets not our namespace name (indexNamespace for that),
           but the collection we index, its name.
        */
        string parentNS() const {
            BSONObj io = info.obj();
            return io.getStringField("ns");
        }

        // unique if declared so in the info object, or implicitly for the _id index
        bool unique() const {
            BSONObj io = info.obj();
            return io["unique"].trueValue() ||
                /* temp: can we juse make unique:true always be there for _id and get rid of this? */
                isIdIndex();
        }

        /* if set, when building index, if any duplicates, drop the duplicating object */
        bool dropDups() const {
            return info.obj().getBoolField( "dropDups" );
        }

        /* delete this index.  does NOT clean up the system catalog
           (system.indexes or system.namespaces) -- only NamespaceIndex.
        */
        void kill_idx();

        // debugging/logging: render the index info object as a string
        operator string() const {
            return info.obj().toString();
        }
    };
+
    /* Key-set delta for one index when a document is updated; filled in by
       getIndexChanges(). */
    struct IndexChanges/*on an update*/ {
        BSONObjSetDefaultOrder oldkeys;
        BSONObjSetDefaultOrder newkeys;
        vector<BSONObj*> removed; // these keys were removed as part of the change
        vector<BSONObj*> added; // these keys were added as part of the change

        // throws (uassert 11001) if any added key already exists in a unique index
        void dupCheck(IndexDetails& idx) {
            if( added.empty() || !idx.unique() )
                return;
            for( vector<BSONObj*>::iterator i = added.begin(); i != added.end(); i++ )
                uassert( 11001 , "E11001 duplicate key on update", !idx.hasKey(**i));
        }
    };
+
+ class NamespaceDetails;
+ void getIndexChanges(vector<IndexChanges>& v, NamespaceDetails& d, BSONObj newObj, BSONObj oldObj);
+ void dupCheck(vector<IndexChanges>& v, NamespaceDetails& d);
+} // namespace mongo
diff --git a/db/instance.cpp b/db/instance.cpp
new file mode 100644
index 0000000..e8515c4
--- /dev/null
+++ b/db/instance.cpp
@@ -0,0 +1,767 @@
+// instance.cpp : Global state variables and functions.
+//
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "stdafx.h"
+#include "db.h"
+#include "query.h"
+#include "introspect.h"
+#include "repl.h"
+#include "dbmessage.h"
+#include "instance.h"
+#include "lasterror.h"
+#include "security.h"
+#include "json.h"
+#include "reccache.h"
+#include "replset.h"
+#include "../s/d_logic.h"
+#include "../util/file_allocator.h"
+#include "cmdline.h"
+#if !defined(_WIN32)
+#include <sys/file.h>
+#endif
+#include "dbstats.h"
+
+namespace mongo {
+
    // forward declarations for the per-opcode handlers defined later in this file
    void receivedKillCursors(Message& m);
    void receivedUpdate(Message& m, CurOp& op);
    void receivedDelete(Message& m, CurOp& op);
    void receivedInsert(Message& m, CurOp& op);
    bool receivedGetMore(DbResponse& dbresponse, Message& m, CurOp& curop );

    CmdLine cmdLine; // process-wide parsed command-line options

    // rate-limits noisy log sites: log the first 1000 events, then every 100th
    int nloggedsome = 0;
#define LOGSOME if( ++nloggedsome < 1000 || nloggedsome % 100 == 0 )

    SlaveTypes slave = NotSlave;
    bool master = false; // true means keep an op log
    bool autoresync = false;

    /* we use new here so we don't have to worry about destructor orders at program shutdown */
    MongoMutex &dbMutex( *(new MongoMutex) );
// MutexInfo dbMutexInfo;

    string dbExecCommand;

    string bind_ip = "";

    char *appsrvPath = null;

    DiagLog _diaglog; // diagnostic op log (see OPREAD/OPWRITE usage below)

    int opIdMem = 100000000;

    bool useCursors = true;
    bool useHints = true;

    void closeAllSockets();
    // append a status line to ss and flush the diagnostic op log, if it is open
    void flushOpLog( stringstream &ss ) {
        if( _diaglog.f && _diaglog.f->is_open() ) {
            ss << "flushing op log and files\n";
            _diaglog.flush();
        }
    }

    int ctr = 0; // operation counter used for periodic logging in assembleResponse

    KillCurrentOp killCurrentOp;

    int lockFile = 0;

    // see FSyncCommand:
    unsigned lockedForWriting;
    boost::mutex lockedForWritingMutex;
    bool unlockRequested = false;
+
+ void inProgCmd( Message &m, DbResponse &dbresponse ) {
+ BSONObjBuilder b;
+
+ AuthenticationInfo *ai = cc().ai;
+ if( !ai->isAuthorized("admin") ) {
+ BSONObjBuilder b;
+ b.append("err", "unauthorized");
+ }
+ else {
+ vector<BSONObj> vals;
+ {
+ boostlock bl(Client::clientsMutex);
+ for( set<Client*>::iterator i = Client::clients.begin(); i != Client::clients.end(); i++ ) {
+ Client *c = *i;
+ CurOp& co = *(c->curop());
+ if( co.active() )
+ vals.push_back( co.infoNoauth() );
+ }
+ }
+ b.append("inprog", vals);
+ unsigned x = lockedForWriting;
+ if( x ) {
+ b.append("fsyncLock", x);
+ b.append("info", "use command {unlock:0} to terminate the fsync write/snapshot lock");
+ }
+ }
+
+ replyToQuery(0, m, dbresponse, b.obj());
+ }
+
    /* handle the pseudo-query "$cmd.sys.killop": request interruption of the
       operation whose opid is given in the query's "op" field. admin-only. */
    void killOp( Message &m, DbResponse &dbresponse ) {
        BSONObj obj;
        AuthenticationInfo *ai = currentClient.get()->ai;
        if( !ai->isAuthorized("admin") ) {
            obj = fromjson("{\"err\":\"unauthorized\"}");
        }
        /*else if( !dbMutexInfo.isLocked() )
            obj = fromjson("{\"info\":\"no op in progress/not locked\"}");
        */
        else {
            DbMessage d(m);
            QueryMessage q(d);
            BSONElement e = q.query.getField("op");
            if( !e.isNumber() ) {
                obj = fromjson("{\"err\":\"no op number field specified?\"}");
            }
            else {
                // best-effort: flags the op for interruption; reply doesn't wait for it
                obj = fromjson("{\"info\":\"attempting to kill op\"}");
                killCurrentOp.kill( (unsigned) e.number() );
            }
        }
        replyToQuery(0, m, dbresponse, obj);
    }
+
    /* handle the pseudo-query "$cmd.sys.unlock": request release of the fsync
       write/snapshot lock. requires admin auth AND the admin namespace. */
    void unlockFsync(const char *ns, Message& m, DbResponse &dbresponse) {
        BSONObj obj;
        AuthenticationInfo *ai = currentClient.get()->ai;
        if( !ai->isAuthorized("admin") || strncmp(ns, "admin.", 6) != 0 ) {
            obj = fromjson("{\"err\":\"unauthorized\"}");
        }
        else {
            if( lockedForWriting ) {
                log() << "command: unlock requested" << endl;
                obj = fromjson("{ok:1,\"info\":\"unlock requested\"}");
                unlockRequested = true; // the fsync holder polls this flag
            }
            else {
                obj = fromjson("{ok:0,\"errmsg\":\"not locked\"}");
            }
        }
        replyToQuery(0, m, dbresponse, obj);
    }
+
    /* handle an OP_QUERY message: run the query and build the wire reply into
       dbresponse. On an AssertionException, a one-document $err reply is
       constructed by hand instead. Returns false when an exception occurred
       (so the caller can force logging). Does its own auth inside runQuery. */
    static bool receivedQuery(DbResponse& dbresponse, Message& m,
                              CurOp& op, bool logit,
                              mongolock& lock
                              ) {
        bool ok = true;
        MSGID responseTo = m.data->id;

        DbMessage d(m);
        QueryMessage q(d);
        QueryResult* msgdata;

        Client& c = cc();

        try {
            // a parse error in the field-selector is deferred to here
            if (q.fields.get() && q.fields->errmsg)
                uassert( 10053 , q.fields->errmsg, false);

            /* note these are logged BEFORE authentication -- which is sort of ok */
            if ( _diaglog.level && logit ) {
                if ( strstr(q.ns, ".$cmd") ) {
                    /* $cmd queries are "commands" and usually best treated as write operations */
                    OPWRITE;
                }
                else {
                    OPREAD;
                }
            }

            setClient( q.ns, dbpath, &lock );
            c.top.setRead();
            c.curop()->setNS(q.ns);
            msgdata = runQuery(m, q, op ).release();
        }
        catch ( AssertionException& e ) {
            ok = false;
            op.debug().str << " exception ";
            LOGSOME problem() << " Caught Assertion in runQuery ns:" << q.ns << ' ' << e.toString() << '\n';
            log() << "  ntoskip:" << q.ntoskip << " ntoreturn:" << q.ntoreturn << '\n';
            if ( q.query.valid() )
                log() << "  query:" << q.query.toString() << endl;
            else
                log() << "  query object is not valid!" << endl;

            // build a single-document { $err : msg } reply by hand
            BSONObjBuilder err;
            err.append("$err", e.msg.empty() ? "assertion during query" : e.msg);
            BSONObj errObj = err.done();

            BufBuilder b;
            b.skip(sizeof(QueryResult)); // leave room for the header, filled below
            b.append((void*) errObj.objdata(), errObj.objsize());

            // todo: call replyToQuery() from here instead of this!!! see dbmessage.h
            msgdata = (QueryResult *) b.buf();
            b.decouple(); // buffer ownership passes to the Message below
            QueryResult *qr = msgdata;
            qr->_resultFlags() = QueryResult::ResultFlag_ErrSet;
            qr->len = b.len();
            qr->setOperation(opReply);
            qr->cursorId = 0;
            qr->startingFrom = 0;
            qr->nReturned = 1;

        }
        Message *resp = new Message();
        resp->setData(msgdata, true); // transport will free
        dbresponse.response = resp;
        dbresponse.responseTo = responseTo;
        Database *database = c.database();
        if ( database ) {
            if ( database->profile )
                op.debug().str << " bytes:" << resp->data->dataLen();
        }
        else {
            if ( strstr(q.ns, "$cmd") == 0 ) // (this condition is normal for $cmd dropDatabase)
                log() << "ERROR: receiveQuery: database is null; ns=" << q.ns << endl;
        }

        return ok;
    }
+
    bool commandIsReadOnly(BSONObj& _cmdobj);

    // Returns false when request includes 'end'
    /* top-level dispatcher for one wire-protocol message: decides read vs
       write lock, handles the sys.* pseudo-commands and sharding forwarding
       before locking, dispatches by opcode, then does slow-op logging and
       optional profiling. */
    bool assembleResponse( Message &m, DbResponse &dbresponse, const sockaddr_in &client ) {

        bool writeLock = true;

        // before we lock...
        int op = m.data->operation();
        globalOpCounters.gotOp( op );
        const char *ns = m.data->_data + 4;
        if ( op == dbQuery ) {
            if( strstr(ns, ".$cmd") ) {
                // the sys.* pseudo-commands reply without taking the db lock at all
                if( strstr(ns, ".$cmd.sys.") ) {
                    if( strstr(ns, "$cmd.sys.inprog") ) {
                        inProgCmd(m, dbresponse);
                        return true;
                    }
                    if( strstr(ns, "$cmd.sys.killop") ) {
                        killOp(m, dbresponse);
                        return true;
                    }
                    if( strstr(ns, "$cmd.sys.unlock") ) {
                        unlockFsync(ns, m, dbresponse);
                        return true;
                    }
                }
                DbMessage d( m );
                QueryMessage q( d );
                writeLock = !commandIsReadOnly(q.query);
            }
            else
                writeLock = false;
        }
        else if( op == dbGetMore ) {
            writeLock = false;
        }

        if ( handlePossibleShardedMessage( m , dbresponse ) ){
            /* important to do this before we lock
               so if a message has to be forwarded, doesn't block for that
            */
            return true;
        }

        Client& c = cc();
        c.clearns();

        // if an op is already active on this client, track this one in a nested CurOp
        auto_ptr<CurOp> nestedOp;
        CurOp* currentOpP = c.curop();
        if ( currentOpP->active() ){
            nestedOp.reset( new CurOp() );
            currentOpP = nestedOp.get();
        }
        CurOp& currentOp = *currentOpP;
        currentOp.reset(client);
        currentOp.setOp(op);

        OpDebug& debug = currentOp.debug();
        StringBuilder& ss = debug.str;

        int logThreshold = cmdLine.slowMS;
        bool log = logLevel >= 1;

        Timer t( currentOp.startTime() );

        mongolock lk(writeLock);

#if 0
        /* use this if you only want to process operations for a particular namespace.
           maybe add to cmd line parms or something fancier.
        */
        DbMessage ddd(m);
        if ( strncmp(ddd.getns(), "clusterstock", 12) != 0 ) {
            static int q;
            if ( ++q < 20 )
                out() << "TEMP skip " << ddd.getns() << endl;
            goto skip;
        }
#endif

        if ( op == dbQuery ) {
            // receivedQuery() does its own authorization processing.
            if ( ! receivedQuery(dbresponse, m, currentOp, true, lk) )
                log = true;
        }
        else if ( op == dbGetMore ) {
            // does its own authorization processing.
            OPREAD;
            DEV log = true;
            ss << "getmore ";
            if ( ! receivedGetMore(dbresponse, m, currentOp) )
                log = true;
        }
        else if ( op == dbMsg ) {
            /* deprecated / rarely used. intended for connection diagnostics. */
            ss << "msg ";
            char *p = m.data->_data;
            int len = strlen(p);
            if ( len > 400 )
                out() << curTimeMillis() % 10000 <<
                      " long msg received, len:" << len <<
                      " ends with: " << p + len - 10 << endl;
            bool end = false; //strcmp("end", p) == 0;
            Message *resp = new Message();
            resp->setData(opReply, "i am fine");
            dbresponse.response = resp;
            dbresponse.responseTo = m.data->id;
            //dbMsgPort.reply(m, resp);
            if ( end )
                return false;
        }
        else {
            // fire-and-forget write ops: auth is checked here, per database
            const char *ns = m.data->_data + 4;
            char cl[256];
            nsToDatabase(ns, cl);
            currentOp.setNS(ns);
            AuthenticationInfo *ai = currentClient.get()->ai;
            if( !ai->isAuthorized(cl) ) {
                uassert_nothrow("unauthorized");
            }
            else if ( op == dbInsert ) {
                OPWRITE;
                try {
                    ss << "insert ";
                    receivedInsert(m, currentOp);
                }
                catch ( AssertionException& e ) {
                    LOGSOME problem() << " Caught Assertion insert, continuing\n";
                    ss << " exception " << e.toString();
                    log = true;
                }
            }
            else if ( op == dbUpdate ) {
                OPWRITE;
                try {
                    ss << "update ";
                    receivedUpdate(m, currentOp);
                }
                catch ( AssertionException& e ) {
                    LOGSOME problem() << " Caught Assertion update, continuing" << endl;
                    ss << " exception " << e.toString();
                    log = true;
                }
            }
            else if ( op == dbDelete ) {
                OPWRITE;
                try {
                    ss << "remove ";
                    receivedDelete(m, currentOp);
                }
                catch ( AssertionException& e ) {
                    LOGSOME problem() << " Caught Assertion receivedDelete, continuing" << endl;
                    ss << " exception " << e.toString();
                    log = true;
                }
            }
            else if ( op == dbKillCursors ) {
                OPREAD;
                try {
                    logThreshold = 10; // killcursors should be fast; log if it isn't
                    ss << "killcursors ";
                    receivedKillCursors(m);
                }
                catch ( AssertionException& e ) {
                    problem() << " Caught Assertion in kill cursors, continuing" << endl;
                    ss << " exception " + e.toString();
                    log = true;
                }
            }
            else {
                out() << "    operation isn't supported: " << op << endl;
                currentOp.setActive(false);
                assert(false);
            }
        }
        int ms = t.millis();
        log = log || (logLevel >= 2 && ++ctr % 512 == 0);
        DEV log = true;
        if ( log || ms > logThreshold ) {
            ss << ' ' << ms << "ms";
            mongo::log() << ss.str() << endl;
        }
        Database *database = c.database();
        if ( database && database->profile >= 1 ) {
            if ( database->profile >= 2 || ms >= cmdLine.slowMS ) {
                // performance profiling is on
                if ( dbMutex.getState() > 1 || dbMutex.getState() < -1 ){
                    out() << "warning: not profiling because recursive lock" << endl;
                }
                else {
                    // profiling writes, so upgrade to a write lock, restoring context after
                    string old_ns = c.ns();
                    Database * old_db = c.database();
                    lk.releaseAndWriteLock();
                    Client::Context c( old_ns , old_db );
                    profile(ss.str().c_str(), ms);
                }
            }
        }

        currentOp.setActive(false);
        return true;
    } /* assembleResponse() */
+
+    void killCursors(int n, long long *ids);
+    /* Handle an OP_KILL_CURSORS message.
+       Wire layout of the body: [int32 reserved][int32 n][n x int64 cursor ids].
+       Sanity-bounds n before handing the id array to killCursors(). */
+    void receivedKillCursors(Message& m) {
+        int *x = (int *) m.data->_data;
+        x++; // reserved
+        int n = *x++;
+        assert( n >= 1 );
+        if ( n > 2000 ) {
+            // > 2000 is suspicious but tolerated up to 30000; beyond that we abort.
+            problem() << "Assertion failure, receivedKillCursors, n=" << n << endl;
+            assert( n < 30000 );
+        }
+        killCursors(n, (long long *) x);
+    }
+
+    /* cl - database name
+       path - db directory
+       Closes the database that is current for this Client (cc().database()),
+       which must match cl. NOTE(review): appears to assume the caller holds
+       the write lock — confirm at call sites.
+    */
+    void closeDatabase( const char *cl, const string& path ) {
+        Database *database = cc().database();
+        assert( database );
+        assert( database->name == cl );
+        /*
+        if ( string("local") != cl ) {
+            DBInfo i(cl);
+            i.dbDropped();
+        }*/
+
+        /* important: kill all open cursors on the database */
+        string prefix(cl);
+        prefix += '.';
+        ClientCursor::invalidate(prefix.c_str());
+
+        // drop cached per-namespace metadata for every "<db>." namespace
+        NamespaceDetailsTransient::clearForPrefix( prefix.c_str() );
+
+        // remove from the registry of open databases, then free it
+        dbHolder.erase( cl, path );
+        delete database; // closes files
+        cc().clearns(); // current Client no longer points at a database
+    }
+
+    /* Handle an OP_UPDATE message.
+       Wire layout after the namespace: [int32 flags][query BSONObj][update BSONObj].
+       Rejects the op on a non-master node; records the result for getlasterror. */
+    void receivedUpdate(Message& m, CurOp& op) {
+        DbMessage d(m);
+        const char *ns = d.getns();
+        assert(*ns);
+        uassert( 10054 , "not master", isMasterNs( ns ) );
+        setClient(ns);
+        Client& client = cc();
+        client.top.setWrite();
+        op.debug().str << ns << ' ';
+        int flags = d.pullInt();
+        BSONObj query = d.nextJsObj();
+
+        // structural sanity: both objects must fit inside the message body
+        assert( d.moreJSObjs() );
+        assert( query.objsize() < m.data->dataLen() );
+        BSONObj toupdate = d.nextJsObj();
+        uassert( 10055 , "update object too large", toupdate.objsize() <= MaxBSONObjectSize);
+        assert( toupdate.objsize() < m.data->dataLen() );
+        assert( query.objsize() + toupdate.objsize() < m.data->dataLen() );
+        bool upsert = flags & UpdateOption_Upsert;
+        bool multi = flags & UpdateOption_Multi;
+        {
+            string s = query.toString();
+            /* todo: we shouldn't do all this ss stuff when we don't need it, it will slow us down. */
+            op.debug().str << " query: " << s;
+            CurOp& currentOp = *client.curop();
+            currentOp.setQuery(query);
+        }
+        UpdateResult res = updateObjects(ns, toupdate, query, upsert, multi, true, op.debug() );
+        /* TODO FIX: recordUpdate should take a long int for parm #2 */
+        recordUpdate( res.existing , (int) res.num ); // for getlasterror
+    }
+
+    /* Handle an OP_DELETE message.
+       Wire layout after the namespace: [int32 flags][pattern BSONObj];
+       flag bit 0 means "delete at most one matching document". */
+    void receivedDelete(Message& m, CurOp& op) {
+        DbMessage d(m);
+        const char *ns = d.getns();
+        assert(*ns);
+        uassert( 10056 , "not master", isMasterNs( ns ) );
+        setClient(ns);
+        Client& client = cc();
+        client.top.setWrite();
+        int flags = d.pullInt();
+        bool justOne = flags & 1;
+        assert( d.moreJSObjs() );
+        BSONObj pattern = d.nextJsObj();
+        {
+            string s = pattern.toString();
+            op.debug().str << " query: " << s;
+            CurOp& currentOp = *client.curop();
+            currentOp.setQuery(pattern);
+        }
+        int n = deleteObjects(ns, pattern, justOne, true);
+        recordDelete( n ); // for getlasterror
+    }
+
+    QueryResult* emptyMoreResult(long long);
+
+    /* Handle an OP_GET_MORE message: fetch the next batch from an open cursor.
+       Returns false (and replies with an empty result) if getMore throws,
+       e.g. because the cursor id is no longer valid. Does its own auth check. */
+    bool receivedGetMore(DbResponse& dbresponse, Message& m, CurOp& curop ) {
+        bool ok = true;
+        DbMessage d(m);
+        const char *ns = d.getns();
+        StringBuilder& ss = curop.debug().str;
+        ss << ns;
+        setClient(ns);
+        cc().top.setRead();
+        int ntoreturn = d.pullInt();
+        long long cursorid = d.pullInt64();
+        ss << " cid:" << cursorid;
+        ss << " ntoreturn:" << ntoreturn;
+        QueryResult* msgdata;
+        try {
+            AuthenticationInfo *ai = currentClient.get()->ai;
+            uassert( 10057 , "unauthorized", ai->isAuthorized(cc().database()->name.c_str()));
+            msgdata = getMore(ns, ntoreturn, cursorid, curop);
+        }
+        catch ( AssertionException& e ) {
+            // reply with an empty batch rather than dropping the request
+            ss << " exception " + e.toString();
+            msgdata = emptyMoreResult(cursorid);
+            ok = false;
+        }
+        Message *resp = new Message();
+        resp->setData(msgdata, true);
+        ss << " bytes:" << resp->data->dataLen();
+        ss << " nreturned:" << msgdata->nReturned;
+        dbresponse.response = resp; // DbResponse takes ownership
+        dbresponse.responseTo = m.data->id;
+        //dbMsgPort.reply(m, resp);
+        return ok;
+    }
+
+    /* Handle an OP_INSERT message. A single message may carry several
+       documents; each is inserted and logged to the oplog individually. */
+    void receivedInsert(Message& m, CurOp& op) {
+        DbMessage d(m);
+        const char *ns = d.getns();
+        assert(*ns);
+        uassert( 10058 , "not master", isMasterNs( ns ) );
+        setClient(ns);
+        cc().top.setWrite();
+        op.debug().str << ns;
+
+        while ( d.moreJSObjs() ) {
+            BSONObj js = d.nextJsObj();
+            uassert( 10059 , "object to insert too large", js.objsize() <= MaxBSONObjectSize);
+            theDataFileMgr.insert(ns, js, false);
+            logOp("i", ns, js); // replicate the insert
+        }
+    }
+
+    /* In-process messaging port: "replying" just copies the response message
+       into a caller-supplied container instead of writing to a socket. */
+    class JniMessagingPort : public AbstractMessagingPort {
+    public:
+        JniMessagingPort(Message& _container) : container(_container) { }
+        void reply(Message& received, Message& response, MSGID) {
+            container = response;
+        }
+        void reply(Message& received, Message& response) {
+            container = response;
+        }
+        unsigned remotePort(){
+            return 1; // dummy value; there is no real remote peer
+        }
+        Message & container; // not owned; outlives this port
+    };
+
+    /* Append to 'names' the name of every database under dbpath,
+       discovered by scanning the directory for "<name>.ns" files. */
+    void getDatabaseNames( vector< string > &names ) {
+        boost::filesystem::path path( dbpath );
+        for ( boost::filesystem::directory_iterator i( path );
+                i != boost::filesystem::directory_iterator(); ++i ) {
+            string fileName = boost::filesystem::path(*i).leaf();
+            if ( fileName.length() > 3 && fileName.substr( fileName.length() - 3, 3 ) == ".ns" )
+                names.push_back( fileName.substr( 0, fileName.length() - 3 ) );
+        }
+    }
+
+    /* Execute a request in-process (no socket) and copy the reply into
+       'response'. SavedContext saves/restores the caller's auth + db context. */
+    bool DBDirectClient::call( Message &toSend, Message &response, bool assertOk ) {
+        SavedContext c;
+        if ( lastError._get() )
+            lastError.startRequest( toSend, lastError._get() );
+        DbResponse dbResponse;
+        assembleResponse( toSend, dbResponse );
+        assert( dbResponse.response );
+        response = *dbResponse.response; // copy out; dbResponse frees the original
+        return true;
+    }
+
+    /* Fire-and-forget variant of call(): executes the request in-process
+       and discards any reply. */
+    void DBDirectClient::say( Message &toSend ) {
+        SavedContext c;
+        if ( lastError._get() )
+            lastError.startRequest( toSend, lastError._get() );
+        DbResponse dbResponse;
+        assembleResponse( toSend, dbResponse );
+    }
+
+    /* Queries are simply delegated to the generic DBClientBase implementation,
+       which routes through call() above. */
+    auto_ptr<DBClientCursor> DBDirectClient::query(const string &ns, Query query, int nToReturn , int nToSkip ,
+                                                   const BSONObj *fieldsToReturn , int queryOptions ){
+
+        //if ( ! query.obj.isEmpty() || nToReturn != 0 || nToSkip != 0 || fieldsToReturn || queryOptions )
+        return DBClientBase::query( ns , query , nToReturn , nToSkip , fieldsToReturn , queryOptions );
+        //
+        //assert( query.obj.isEmpty() );
+        //throw UserException( (string)"yay:" + ns );
+    }
+
+
+    // storage for SavedContext's shared always-authorized credentials
+    DBDirectClient::AlwaysAuthorized DBDirectClient::SavedContext::always;
+
+    /* Factory used by shared client code to get an in-process connection. */
+    DBClientBase * createDirectClient(){
+        return new DBDirectClient();
+    }
+
+    void recCacheCloseAll();
+
+    // exit bookkeeping: numExitCalls counts dbexit() entries (>0 => shutting down)
+    boost::mutex &exitMutex( *( new boost::mutex ) );
+    int numExitCalls = 0;
+    void shutdown();
+
+    bool inShutdown(){
+        return numExitCalls > 0;
+    }
+
+    /* Best-effort output of a fatal message: try the raw log first, then
+       stderr, swallowing any exception — we may be dying already. */
+    void tryToOutputFatal( const string& s ){
+        try {
+            rawOut( s );
+            return;
+        }
+        catch ( ... ){}
+
+        try {
+            cerr << s << endl;
+            return;
+        }
+        catch ( ... ){}
+
+        // uh - oh, not sure there is anything else we can do...
+    }
+
+    /* not using log() herein in case we are already locked */
+    /* Terminate the process with exit code rc, attempting a graceful
+       shutdown() on the first call. Re-entrant calls exit immediately,
+       and after 5 calls we _exit() without running atexit handlers. */
+    void dbexit( ExitCode rc, const char *why) {
+        {
+            boostlock lk( exitMutex );
+            if ( numExitCalls++ > 0 ) {
+                if ( numExitCalls > 5 ){
+                    // this means something horrible has happened
+                    ::_exit( rc );
+                }
+                stringstream ss;
+                ss << "dbexit: " << why << "; exiting immediately" << endl;
+                tryToOutputFatal( ss.str() );
+                ::exit( rc );
+            }
+        }
+
+        stringstream ss;
+        ss << "dbexit: " << why << endl;
+        tryToOutputFatal( ss.str() );
+
+        try {
+            shutdown(); // gracefully shutdown instance
+        }
+        catch ( ... ){
+            tryToOutputFatal( "shutdown failed with exception" );
+        }
+
+        tryToOutputFatal( "dbexit: really exiting now\n" );
+        ::exit(rc);
+    }
+
+    /* Graceful shutdown sequence: stop accepting connections, flush the
+       oplog, close sockets, wait for file preallocation, unmap data files,
+       close the record cache, and release the dbpath lock file. */
+    void shutdown() {
+
+
+        log() << "\t shutdown: going to close listening sockets..." << endl;
+        ListeningSockets::get()->closeAll();
+
+        log() << "\t shutdown: going to flush oplog..." << endl;
+        stringstream ss2;
+        flushOpLog( ss2 );
+        rawOut( ss2.str() );
+
+        /* must do this before unmapping mem or you may get a seg fault */
+        log() << "\t shutdown: going to close sockets..." << endl;
+        boost::thread close_socket_thread(closeAllSockets);
+
+        // wait until file preallocation finishes
+        // we would only hang here if the file_allocator code generates a
+        // synchronous signal, which we don't expect
+        log() << "\t shutdown: waiting for fs preallocator..." << endl;
+        theFileAllocator().waitUntilFinished();
+
+        log() << "\t shutdown: closing all files..." << endl;
+        stringstream ss3;
+        MemoryMappedFile::closeAllFiles( ss3 );
+        rawOut( ss3.str() );
+
+        // should we be locked here? we aren't. might be ok as-is.
+        recCacheCloseAll();
+
+#if !defined(_WIN32) && !defined(__sunos__)
+        if ( lockFile ){
+            log() << "\t shutdown: removing fs lock..." << endl;
+            // truncate the pid out of mongod.lock, then drop the flock
+            if( ftruncate( lockFile , 0 ) )
+                log() << "\t couldn't remove fs lock " << OUTPUT_ERRNO << endl;
+            flock( lockFile, LOCK_UN );
+        }
+#endif
+    }
+
+    /* Take an exclusive advisory lock on dbpath/mongod.lock and write our
+       pid into it, so two mongod processes cannot share a data directory.
+       No-op on Windows and Solaris (no flock there). */
+    void acquirePathLock() {
+#if !defined(_WIN32) && !defined(__sunos__)
+        string name = ( boost::filesystem::path( dbpath ) / "mongod.lock" ).native_file_string();
+        lockFile = open( name.c_str(), O_RDWR | O_CREAT | O_TRUNC, S_IRWXU | S_IRWXG | S_IRWXO );
+        massert( 10309 ,  "Unable to create / open lock file for dbpath: " + name, lockFile > 0 );
+        massert( 10310 ,  "Unable to acquire lock for dbpath: " + name, flock( lockFile, LOCK_EX | LOCK_NB ) == 0 );
+
+        stringstream ss;
+        ss << getpid() << endl;
+        string s = ss.str();
+        const char * data = s.c_str();
+        assert( write( lockFile , data , strlen( data ) ) );
+        fsync( lockFile ); // make sure the pid hits disk
+#endif
+    }
+
+} // namespace mongo
diff --git a/db/instance.h b/db/instance.h
new file mode 100644
index 0000000..b2b2c94
--- /dev/null
+++ b/db/instance.h
@@ -0,0 +1,179 @@
+// instance.h : Global state functions.
+//
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+#include "../client/dbclient.h"
+#include "curop.h"
+#include "security.h"
+#include "cmdline.h"
+#include "client.h"
+
+namespace mongo {
+
+ extern string dbExecCommand;
+
+#define OPWRITE if( _diaglog.level & 1 ) _diaglog.write((char *) m.data, m.data->len);
+#define OPREAD if( _diaglog.level & 2 ) _diaglog.readop((char *) m.data, m.data->len);
+
+    /* Binary diagnostic log of raw wire messages, written to "diaglog.<hextime>".
+       NOTE(review): write()/flush() dereference f without a null check; they
+       appear to rely on level only ever being enabled via setLevel() (which
+       calls init()) — confirm no other path sets level directly. */
+    struct DiagLog {
+        ofstream *f; // opened lazily by init(); null while logging disabled
+        /* 0 = off; 1 = writes, 2 = reads, 3 = both
+           7 = log a few reads, and all writes.
+        */
+        int level;
+        DiagLog() : f(0) , level(0) { }
+        void init() {
+            if ( ! f && level ){
+                log() << "diagLogging = " << level << endl;
+                stringstream ss;
+                ss << "diaglog." << hex << time(0);
+                string name = ss.str();
+                f = new ofstream(name.c_str(), ios::out | ios::binary);
+                if ( ! f->good() ) {
+                    problem() << "couldn't open log stream" << endl;
+                    throw 1717;
+                }
+            }
+        }
+        /**
+         * @return old
+         */
+        int setLevel( int newLevel ){
+            int old = level;
+            level = newLevel;
+            init();
+            return old;
+        }
+        void flush() {
+            if ( level ) f->flush();
+        }
+        void write(char *data,int len) {
+            if ( level & 1 ) f->write(data,len);
+        }
+        void readop(char *data, int len) {
+            if ( level & 2 ) {
+                // in "level 7" mode most reads are skipped; log one occasionally
+                bool log = (level & 4) == 0;
+                OCCASIONALLY log = true;
+                if ( log )
+                    f->write(data,len);
+            }
+        }
+    };
+
+ extern DiagLog _diaglog;
+
+    /* we defer response until we unlock. don't want a blocked socket to
+       keep things locked.
+       Owns 'response' and deletes it on destruction.
+       NOTE(review): copy ctor/assignment are not suppressed, so copying a
+       DbResponse would double-delete response — confirm instances are never
+       copied. */
+    struct DbResponse {
+        Message *response;   // heap-allocated reply, owned; may be null (no reply)
+        MSGID responseTo;    // id of the request message this answers
+        DbResponse(Message *r, MSGID rt) : response(r), responseTo(rt) {
+        }
+        DbResponse() {
+            response = 0;
+        }
+        ~DbResponse() {
+            delete response;
+        }
+    };
+
+ static SockAddr unknownAddress( "0.0.0.0", 0 );
+
+ bool assembleResponse( Message &m, DbResponse &dbresponse, const sockaddr_in &client = unknownAddress.sa );
+
+ void getDatabaseNames( vector< string > &names );
+
+// --- local client ---
+
+    /* DBClientBase implementation that talks to this mongod in-process:
+       requests go straight to assembleResponse() with no networking. */
+    class DBDirectClient : public DBClientBase {
+
+    public:
+        virtual auto_ptr<DBClientCursor> query(const string &ns, Query query, int nToReturn = 0, int nToSkip = 0,
+                                               const BSONObj *fieldsToReturn = 0, int queryOptions = 0);
+
+        virtual bool isFailed() const {
+            return false; // an in-process "connection" cannot fail
+        }
+        virtual string toString() {
+            return "DBDirectClient";
+        }
+        virtual string getServerAddress() const{
+            return "localhost"; // TODO: should this have the port?
+        }
+        virtual bool call( Message &toSend, Message &response, bool assertOk=true );
+        virtual void say( Message &toSend );
+        virtual void sayPiggyBack( Message &toSend ) {
+            // don't need to piggy back when connected locally
+            return say( toSend );
+        }
+        // credentials object that grants access to every database
+        class AlwaysAuthorized : public AuthenticationInfo {
+            virtual bool isAuthorized( const char *dbname ) {
+                return true;
+            }
+        };
+
+        /* TODO: this looks bad that auth is set to always. is that really always safe? */
+        /* RAII guard: swaps in always-authorized credentials and remembers the
+           current database name, restoring both when the direct request ends. */
+        class SavedContext {
+        public:
+            SavedContext() {
+                _save = dbMutex.atLeastReadLocked();
+
+                Client *c = currentClient.get();
+                oldAuth = c->ai;
+                // careful, don't want to free this:
+                c->ai = &always;
+
+                /* it only makes sense to manipulate a pointer - c->database() - if locked.
+                   thus the _saved flag.
+                */
+                if( _save ) {
+                    if ( c->database() ) {
+                        dbMutex.assertAtLeastReadLocked();
+                        _oldName = c->database()->name;
+                    }
+                }
+            }
+            ~SavedContext() {
+                Client *c = currentClient.get();
+                c->ai = oldAuth;
+                if( _save ) {
+                    if ( !_oldName.empty() ) {
+                        dbMutex.assertAtLeastReadLocked();
+                        setClient( _oldName.c_str() );
+                    }
+                }
+                else {
+                    // defensive
+                    cc().clearns();
+                }
+            }
+        private:
+            bool _save;                   // true if we held at least a read lock at entry
+            static AlwaysAuthorized always;
+            AuthenticationInfo *oldAuth;  // caller's credentials, restored on exit; not owned
+            string _oldName;              // database name to restore via setClient()
+        };
+    };
+
+ extern int lockFile;
+ void acquirePathLock();
+
+} // namespace mongo
diff --git a/db/introspect.cpp b/db/introspect.cpp
new file mode 100644
index 0000000..9cb477d
--- /dev/null
+++ b/db/introspect.cpp
@@ -0,0 +1,41 @@
+// introspect.cpp
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "stdafx.h"
+#include "introspect.h"
+#include "../util/builder.h"
+#include "../util/goodies.h"
+#include "pdfile.h"
+#include "jsobj.h"
+#include "pdfile.h"
+
+namespace mongo {
+
+    /* Record one profiling entry {ts, info, millis} in the current
+       database's system.profile collection (cc().database()->profileName).
+       NOTE(review): presumably called under the db write lock since it
+       inserts directly — confirm at call sites. */
+    void profile(const char *str,
+                 int millis)
+    {
+        BSONObjBuilder b;
+        b.appendDate("ts", jsTime());
+        b.append("info", str);
+        b.append("millis", (double) millis);
+        BSONObj p = b.done();
+        theDataFileMgr.insert(cc().database()->profileName.c_str(),
+                              p.objdata(), p.objsize(), true);
+    }
+
+} // namespace mongo
diff --git a/db/introspect.h b/db/introspect.h
new file mode 100644
index 0000000..1c0fe92
--- /dev/null
+++ b/db/introspect.h
@@ -0,0 +1,35 @@
+// introspect.h
+// system management stuff.
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+#include "../stdafx.h"
+#include "jsobj.h"
+#include "pdfile.h"
+
+namespace mongo {
+
+ /* --- profiling --------------------------------------------
+ do when database->profile is set
+ */
+
+ void profile(const char *str,
+ int millis);
+
+} // namespace mongo
diff --git a/db/javatest.cpp b/db/javatest.cpp
new file mode 100644
index 0000000..22f2bdf
--- /dev/null
+++ b/db/javatest.cpp
@@ -0,0 +1,24 @@
+// javatest.cpp
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "javajs.h"
+
+// standalone smoke test for the Java/JS bridge: constructs the global
+// JavaJS engine then runs its self-test
+int main() {
+    JavaJS = new JavaJSImpl();
+    javajstest();
+}
diff --git a/db/jsobj.cpp b/db/jsobj.cpp
new file mode 100644
index 0000000..1a299a5
--- /dev/null
+++ b/db/jsobj.cpp
@@ -0,0 +1,1636 @@
+/** @file jsobj.cpp - BSON implementation
+ http://www.mongodb.org/display/DOCS/BSON
+*/
+
+/* Copyright 2009 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "stdafx.h"
+#include "jsobj.h"
+#include "nonce.h"
+#include "../util/goodies.h"
+#include "../util/base64.h"
+#include "../util/md5.hpp"
+#include <limits>
+#include "../util/unittest.h"
+#include "../util/embedded_builder.h"
+#include "json.h"
+#include "jsobjmanipulator.h"
+#include "../util/optime.h"
+#include <boost/static_assert.hpp>
+#undef assert
+#define assert xassert
+
+// make sure our assumptions are valid
+BOOST_STATIC_ASSERT( sizeof(int) == 4 );
+BOOST_STATIC_ASSERT( sizeof(long long) == 8 );
+BOOST_STATIC_ASSERT( sizeof(double) == 8 );
+BOOST_STATIC_ASSERT( sizeof(mongo::Date_t) == 8 );
+BOOST_STATIC_ASSERT( sizeof(mongo::OID) == 12 );
+
+namespace mongo {
+
+    // shared EOO element returned when a lookup finds nothing
+    BSONElement nullElement;
+
+    ostream& operator<<( ostream &s, const OID &o ) {
+        s << o.str();
+        return s;
+    }
+
+    // streaming "b << GENOID" appends a freshly generated _id OID
+    IDLabeler GENOID;
+    BSONObjBuilder& operator<<(BSONObjBuilder& b, IDLabeler& id) {
+        OID oid;
+        oid.init();
+        b.appendOID("_id", &oid);
+        return b;
+    }
+
+    DateNowLabeler DATENOW;
+
+    /* Render this element for debugging/logging (not JSON — see jsonString()).
+       Long strings/code are truncated to ~70 chars with "...". */
+    string BSONElement::toString( bool includeFieldName ) const {
+        stringstream s;
+        if ( includeFieldName && type() != EOO )
+            s << fieldName() << ": ";
+        switch ( type() ) {
+        case EOO:
+            return "EOO";
+        case Date:
+            s << "new Date(" << date() << ')';
+            break;
+        case RegEx:
+            {
+                s << "/" << regex() << '/';
+                const char *p = regexFlags();
+                if ( p ) s << p;
+            }
+            break;
+        case NumberDouble:
+            {
+                stringstream tmp;
+                tmp.precision( 16 );
+                tmp << number();
+                string n = tmp.str();
+                s << n;
+                // indicate this is a double:
+                if( strchr(n.c_str(), '.') == 0 && strchr(n.c_str(), 'E') == 0 && strchr(n.c_str(), 'N') == 0 )
+                    s << ".0";
+            }
+            break;
+        case NumberLong:
+            s << _numberLong();
+            break;
+        case NumberInt:
+            s << _numberInt();
+            break;
+        case Bool:
+            s << ( boolean() ? "true" : "false" );
+            break;
+        case Object:
+        case Array:
+            s << embeddedObject().toString();
+            break;
+        case Undefined:
+            s << "undefined";
+            break;
+        case jstNULL:
+            s << "null";
+            break;
+        case MaxKey:
+            s << "MaxKey";
+            break;
+        case MinKey:
+            s << "MinKey";
+            break;
+        case CodeWScope:
+            s << "CodeWScope( "
+            << codeWScopeCode() << ", " << codeWScopeObject().toString() << ")";
+            break;
+        case Code:
+            if ( valuestrsize() > 80 )
+                s << string(valuestr()).substr(0, 70) << "...";
+            else {
+                s << valuestr();
+            }
+            break;
+        case Symbol:
+        case String:
+            if ( valuestrsize() > 80 )
+                s << '"' << string(valuestr()).substr(0, 70) << "...\"";
+            else {
+                s << '"' << valuestr() << '"';
+            }
+            break;
+        case DBRef:
+            s << "DBRef('" << valuestr() << "',";
+            {
+                // the referenced OID is stored immediately after the namespace string
+                OID *x = (OID *) (valuestr() + valuestrsize());
+                s << *x << ')';
+            }
+            break;
+        case jstOID:
+            s << "ObjId(";
+            s << __oid() << ')';
+            break;
+        case BinData:
+            s << "BinData";
+            break;
+        case Timestamp:
+            s << "Timestamp " << timestampTime() << "|" << timestampInc();
+            break;
+        default:
+            s << "?type=" << type();
+            break;
+        }
+        return s.str();
+    }
+
+    /* JSON-escape a string: backslash-escapes quotes, backslash, slash and
+       the standard control shorthands; other chars < 0x20 become \u00XX. */
+    string escape( string s ) {
+        stringstream ret;
+        for ( string::iterator i = s.begin(); i != s.end(); ++i ) {
+            switch ( *i ) {
+            case '"':
+                ret << "\\\"";
+                break;
+            case '\\':
+                ret << "\\\\";
+                break;
+            case '/':
+                ret << "\\/";
+                break;
+            case '\b':
+                ret << "\\b";
+                break;
+            case '\f':
+                ret << "\\f";
+                break;
+            case '\n':
+                ret << "\\n";
+                break;
+            case '\r':
+                ret << "\\r";
+                break;
+            case '\t':
+                ret << "\\t";
+                break;
+            default:
+                if ( *i >= 0 && *i <= 0x1f ) {
+                    // non-printable control character: emit \u00XX
+                    ret << "\\u";
+                    ret << hex;
+                    ret.width( 4 );
+                    ret.fill( '0' );
+                    ret << int( *i );
+                } else {
+                    ret << *i;
+                }
+            }
+        }
+        return ret.str();
+    }
+
+    /* Render this element as JSON in the requested dialect:
+       Strict emits $-prefixed extended-JSON wrappers ($date, $oid, $binary...),
+       TenGen emits shell-style constructors (Date(), ObjectId(), Dbref()...).
+       Throws (via massert) for types/values with no JSON representation. */
+    string BSONElement::jsonString( JsonStringFormat format, bool includeFieldNames ) const {
+        stringstream s;
+        if ( includeFieldNames )
+            s << '"' << escape( fieldName() ) << "\" : ";
+        switch ( type() ) {
+        case String:
+        case Symbol:
+            s << '"' << escape( valuestr() ) << '"';
+            break;
+        case NumberLong:
+            s << _numberLong();
+            break;
+        case NumberInt:
+        case NumberDouble:
+            // only finite doubles are representable in JSON
+            if ( number() >= -numeric_limits< double >::max() &&
+                    number() <= numeric_limits< double >::max() ) {
+                s.precision( 16 );
+                s << number();
+            } else {
+                stringstream ss;
+                ss << "Number " << number() << " cannot be represented in JSON";
+                string message = ss.str();
+                massert( 10311 ,  message.c_str(), false );
+            }
+            break;
+        case Bool:
+            s << ( boolean() ? "true" : "false" );
+            break;
+        case jstNULL:
+            s << "null";
+            break;
+        case Object:
+            s << embeddedObject().jsonString( format );
+            break;
+        case Array: {
+            if ( embeddedObject().isEmpty() ) {
+                s << "[]";
+                break;
+            }
+            s << "[ ";
+            BSONObjIterator i( embeddedObject() );
+            BSONElement e = i.next();
+            if ( !e.eoo() )
+                while ( 1 ) {
+                    s << e.jsonString( format, false );
+                    e = i.next();
+                    if ( e.eoo() )
+                        break;
+                    s << ", ";
+                }
+            s << " ]";
+            break;
+        }
+        case DBRef: {
+            // referenced OID lives just past the namespace string
+            OID *x = (OID *) (valuestr() + valuestrsize());
+            if ( format == TenGen )
+                s << "Dbref( ";
+            else
+                s << "{ \"$ref\" : ";
+            s << '"' << valuestr() << "\", ";
+            if ( format != TenGen )
+                s << "\"$id\" : ";
+            s << '"' << *x << "\" ";
+            if ( format == TenGen )
+                s << ')';
+            else
+                s << '}';
+            break;
+        }
+        case jstOID:
+            if ( format == TenGen ) {
+                s << "ObjectId( ";
+            } else {
+                s << "{ \"$oid\" : ";
+            }
+            s << '"' << __oid() << '"';
+            if ( format == TenGen ) {
+                s << " )";
+            } else {
+                s << " }";
+            }
+            break;
+        case BinData: {
+            // layout: [int32 len][byte subtype][len bytes of data]
+            int len = *(int *)( value() );
+            BinDataType type = BinDataType( *(char *)( (int *)( value() ) + 1 ) );
+            s << "{ \"$binary\" : \"";
+            char *start = ( char * )( value() ) + sizeof( int ) + 1;
+            base64::encode( s , start , len );
+            s << "\", \"$type\" : \"" << hex;
+            s.width( 2 );
+            s.fill( '0' );
+            s << type << dec;
+            s << "\" }";
+            break;
+        }
+        case Date:
+            if ( format == Strict )
+                s << "{ \"$date\" : ";
+            else
+                s << "Date( ";
+            s << date();
+            if ( format == Strict )
+                s << " }";
+            else
+                s << " )";
+            break;
+        case RegEx:
+            if ( format == Strict )
+                s << "{ \"$regex\" : \"";
+            else
+                s << "/";
+            s << escape( regex() );
+            if ( format == Strict )
+                s << "\", \"$options\" : \"" << regexFlags() << "\" }";
+            else {
+                s << "/";
+                // FIXME Worry about alpha order?
+                for ( const char *f = regexFlags(); *f; ++f )
+                    switch ( *f ) {
+                    case 'g':
+                    case 'i':
+                    case 'm':
+                        s << *f;
+                    default:
+                        break;
+                    }
+            }
+            break;
+
+        case Code:
+            s << ascode();
+            break;
+
+        case Timestamp:
+            s << "{ \"t\" : " << timestampTime() << " , \"i\" : " << timestampInc() << " }";
+            break;
+
+        default:
+            stringstream ss;
+            ss << "Cannot create a properly formatted JSON string with "
+            << "element: " << toString() << " of type: " << type();
+            string message = ss.str();
+            massert( 10312 ,  message.c_str(), false );
+        }
+        return s.str();
+    }
+
+    /* Total on-disk size of this element in bytes (type byte + field name +
+       value), memoized in totalSize. maxLen (-1 = unbounded) bounds how far
+       we may read when the element comes from an untrusted/partial buffer. */
+    int BSONElement::size( int maxLen ) const {
+        if ( totalSize >= 0 )
+            return totalSize;
+
+        // bytes remaining after the type byte and field name
+        int remain = maxLen - fieldNameSize() - 1;
+
+        int x = 0;
+        switch ( type() ) {
+        case EOO:
+        case Undefined:
+        case jstNULL:
+        case MaxKey:
+        case MinKey:
+            break;
+        case Bool:
+            x = 1;
+            break;
+        case NumberInt:
+            x = 4;
+            break;
+        case Timestamp:
+        case Date:
+        case NumberDouble:
+        case NumberLong:
+            x = 8;
+            break;
+        case jstOID:
+            x = 12;
+            break;
+        case Symbol:
+        case Code:
+        case String:
+            massert( 10313 ,  "Insufficient bytes to calculate element size", maxLen == -1 || remain > 3 );
+            x = valuestrsize() + 4;
+            break;
+        case CodeWScope:
+            massert( 10314 ,  "Insufficient bytes to calculate element size", maxLen == -1 || remain > 3 );
+            x = objsize();
+            break;
+
+        case DBRef:
+            massert( 10315 ,  "Insufficient bytes to calculate element size", maxLen == -1 || remain > 3 );
+            x = valuestrsize() + 4 + 12;
+            break;
+        case Object:
+        case Array:
+            massert( 10316 ,  "Insufficient bytes to calculate element size", maxLen == -1 || remain > 3 );
+            x = objsize();
+            break;
+        case BinData:
+            massert( 10317 ,  "Insufficient bytes to calculate element size", maxLen == -1 || remain > 3 );
+            x = valuestrsize() + 4 + 1/*subtype*/;
+            break;
+        case RegEx:
+            {
+                // two consecutive NUL-terminated strings: pattern then flags
+                const char *p = value();
+                int len1 = ( maxLen == -1 ) ? strlen( p ) : strnlen( p, remain );
+                massert( 10318 ,  "Invalid regex string", len1 != -1 );
+                p = p + len1 + 1;
+                int len2 = ( maxLen == -1 ) ? strlen( p ) : strnlen( p, remain - len1 - 1 );
+                massert( 10319 ,  "Invalid regex options string", len2 != -1 );
+                x = len1 + 1 + len2 + 1;
+            }
+            break;
+        default: {
+            stringstream ss;
+            ss << "BSONElement: bad type " << (int) type();
+            massert( 10320 , ss.str().c_str(),false);
+        }
+        }
+        totalSize =  x + fieldNameSize() + 1; // BSONType
+
+        return totalSize;
+    }
+
+    /* Map a $-prefixed query-operator field name ($gt, $lte, $in, $regex, ...)
+       to its BSONObj::MatchType code; returns def if the name is not a known
+       operator. Hand-rolled char compares to avoid strcmp in a hot path. */
+    int BSONElement::getGtLtOp( int def ) const {
+        const char *fn = fieldName();
+        if ( fn[0] == '$' && fn[1] ) {
+            if ( fn[2] == 't' ) {
+                if ( fn[1] == 'g' ) {
+                    if ( fn[3] == 0 ) return BSONObj::GT;
+                    else if ( fn[3] == 'e' && fn[4] == 0 ) return BSONObj::GTE;
+                }
+                else if ( fn[1] == 'l' ) {
+                    if ( fn[3] == 0 ) return BSONObj::LT;
+                    else if ( fn[3] == 'e' && fn[4] == 0 ) return BSONObj::LTE;
+                }
+            }
+            else if ( fn[1] == 'n' && fn[2] == 'e' && fn[3] == 0)
+                return BSONObj::NE;
+            else if ( fn[1] == 'm' && fn[2] == 'o' && fn[3] == 'd' && fn[4] == 0 )
+                return BSONObj::opMOD;
+            else if ( fn[1] == 't' && fn[2] == 'y' && fn[3] == 'p' && fn[4] == 'e' && fn[5] == 0 )
+                return BSONObj::opTYPE;
+            else if ( fn[1] == 'i' && fn[2] == 'n' && fn[3] == 0 )
+                return BSONObj::opIN;
+            else if ( fn[1] == 'n' && fn[2] == 'i' && fn[3] == 'n' && fn[4] == 0 )
+                return BSONObj::NIN;
+            else if ( fn[1] == 'a' && fn[2] == 'l' && fn[3] == 'l' && fn[4] == 0 )
+                return BSONObj::opALL;
+            else if ( fn[1] == 's' && fn[2] == 'i' && fn[3] == 'z' && fn[4] == 'e' && fn[5] == 0 )
+                return BSONObj::opSIZE;
+            else if ( fn[1] == 'e' ){
+                if ( fn[2] == 'x' && fn[3] == 'i' && fn[4] == 's' && fn[5] == 't' && fn[6] == 's' && fn[7] == 0 )
+                    return BSONObj::opEXISTS;
+                if ( fn[2] == 'l' && fn[3] == 'e' && fn[4] == 'm' && fn[5] == 'M' && fn[6] == 'a' && fn[7] == 't' && fn[8] == 'c' && fn[9] == 'h' && fn[10] == 0 )
+                    return BSONObj::opELEM_MATCH;
+            }
+            else if ( fn[1] == 'r' && fn[2] == 'e' && fn[3] == 'g' && fn[4] == 'e' && fn[5] == 'x' && fn[6] == 0 )
+                return BSONObj::opREGEX;
+            else if ( fn[1] == 'o' && fn[2] == 'p' && fn[3] == 't' && fn[4] == 'i' && fn[5] == 'o' && fn[6] == 'n' && fn[7] == 's' && fn[8] == 0 )
+                return BSONObj::opOPTIONS;
+        }
+        return def;
+    }
+
+    /* wo = "well ordered" */
+    /* Three-way compare with e: first by canonical type (except numbers,
+       which compare across numeric types), then optionally by field name,
+       then by value via compareElementValues(). */
+    int BSONElement::woCompare( const BSONElement &e,
+                                bool considerFieldName ) const {
+        int lt = (int) canonicalType();
+        int rt = (int) e.canonicalType();
+        int x = lt - rt;
+        if( x != 0 && (!isNumber() || !e.isNumber()) )
+            return x;
+        if ( considerFieldName ) {
+            x = strcmp(fieldName(), e.fieldName());
+            if ( x != 0 )
+                return x;
+        }
+        x = compareElementValues(*this, e);
+        return x;
+    }
+
+    /* must be same type when called, unless both sides are #s
+       Three-way value comparison (-1/0/1). Numeric types compare as doubles
+       (except long long vs long long, compared exactly); NaN sorts below
+       every number and equal to NaN. */
+    int compareElementValues(const BSONElement& l, const BSONElement& r) {
+        int f;
+        double x;
+
+        switch ( l.type() ) {
+        case EOO:
+        case Undefined:
+        case jstNULL:
+        case MaxKey:
+        case MinKey:
+            // valueless types: order purely by canonical type
+            f = l.canonicalType() - r.canonicalType();
+            if ( f<0 ) return -1;
+            return f==0 ? 0 : 1;
+        case Bool:
+            return *l.value() - *r.value();
+        case Timestamp:
+        case Date:
+            if ( l.date() < r.date() )
+                return -1;
+            return l.date() == r.date() ? 0 : 1;
+        case NumberLong:
+            if( r.type() == NumberLong ) {
+                // both are longs: compare exactly, no double rounding
+                long long L = l._numberLong();
+                long long R = r._numberLong();
+                if( L < R ) return -1;
+                if( L == R ) return 0;
+                return 1;
+            }
+            // else fall through
+        case NumberInt:
+        case NumberDouble: {
+            double left = l.number();
+            double right = r.number();
+            // NaN fails all ordered comparisons, so detect it this way
+            bool lNan = !( left <= numeric_limits< double >::max() &&
+                           left >= -numeric_limits< double >::max() );
+            bool rNan = !( right <= numeric_limits< double >::max() &&
+                           right >= -numeric_limits< double >::max() );
+            if ( lNan ) {
+                if ( rNan ) {
+                    return 0;
+                } else {
+                    return -1;
+                }
+            } else if ( rNan ) {
+                return 1;
+            }
+            x = left - right;
+            if ( x < 0 ) return -1;
+            return x == 0 ? 0 : 1;
+        }
+        case jstOID:
+            return memcmp(l.value(), r.value(), 12);
+        case Code:
+        case Symbol:
+        case String:
+            /* todo: utf version */
+            return strcmp(l.valuestr(), r.valuestr());
+        case Object:
+        case Array:
+            return l.embeddedObject().woCompare( r.embeddedObject() );
+        case DBRef:
+        case BinData: {
+            // shorter value sorts first; equal lengths compare bytewise
+            int lsz = l.valuesize();
+            int rsz = r.valuesize();
+            if ( lsz - rsz != 0 ) return lsz - rsz;
+            return memcmp(l.value(), r.value(), lsz);
+        }
+        case RegEx:
+        {
+            int c = strcmp(l.regex(), r.regex());
+            if ( c )
+                return c;
+            return strcmp(l.regexFlags(), r.regexFlags());
+        }
+        case CodeWScope : {
+            f = l.canonicalType() - r.canonicalType();
+            if ( f )
+                return f;
+            f = strcmp( l.codeWScopeCode() , r.codeWScopeCode() );
+            if ( f )
+                return f;
+            f = strcmp( l.codeWScopeScopeData() , r.codeWScopeScopeData() );
+            if ( f )
+                return f;
+            return 0;
+        }
+        default:
+            out() << "compareElementValues: bad type " << (int) l.type() << endl;
+            assert(false);
+        }
+        return -1;
+    }
+
+    /* Sanity-check this element's internal sizes/null-termination.
+       Throws (via massert) on corruption; embedded object contents are
+       validated elsewhere. */
+    void BSONElement::validate() const {
+        switch( type() ) {
+        case DBRef:
+        case Code:
+        case Symbol:
+        case String:
+            // stored size must cover the string plus its null terminator,
+            // and the string must not contain an embedded null
+            massert( 10321 , "Invalid dbref/code/string/symbol size",
+                     valuestrsize() > 0 &&
+                     valuestrsize() - 1 == strnlen( valuestr(), valuestrsize() ) );
+            break;
+        case CodeWScope: {
+            // layout: int totalSize, int strSizeWNull, code string, scope object
+            int totalSize = *( int * )( value() );
+            massert( 10322 , "Invalid CodeWScope size", totalSize >= 8 );
+            int strSizeWNull = *( int * )( value() + 4 );
+            massert( 10323 , "Invalid CodeWScope string size", totalSize >= strSizeWNull + 4 + 4 );
+            massert( 10324 , "Invalid CodeWScope string size",
+                     strSizeWNull > 0 &&
+                     strSizeWNull - 1 == strnlen( codeWScopeCode(), strSizeWNull ) );
+            massert( 10325 , "Invalid CodeWScope size", totalSize >= strSizeWNull + 4 + 4 + 4 );
+            int objSize = *( int * )( value() + 4 + 4 + strSizeWNull );
+            massert( 10326 , "Invalid CodeWScope object size", totalSize == 4 + 4 + strSizeWNull + objSize );
+            // Subobject validation handled elsewhere.
+        }
+        // deliberate fall-through: nothing more to do for CodeWScope/Object
+        case Object:
+            // We expect Object size validation to be handled elsewhere.
+        default:
+            break;
+        }
+    }
+
+ /* Matcher --------------------------------------*/
+
+// If the element is something like:
+//   a : { $gt : 3 }
+// we append
+//   a : 3
+// else we just append the element.
+//
+    void appendElementHandlingGtLt(BSONObjBuilder& b, const BSONObj& e) {
+        if ( e.type() == Object ) {
+            BSONElement fe = e.embeddedObject().firstElement();
+            const char *fn = fe.fieldName();
+            // matches operators of shape "$?t..." -- i.e. $gt/$lt/$gte/$lte
+            // (fn[1] guards against the 2-char string "$t")
+            if ( fn[0] == '$' && fn[1] && fn[2] == 't' ) {
+                // unwrap: append the operand under the outer field name
+                b.appendAs(fe, e.fieldName());
+                return;
+            }
+        }
+        b.append(e);
+    }
+
+    /* Return the comparison operator ($gt/$lt/...) encoded in e's first
+       sub-element, or BSONObj::Equality when e is not an operator object. */
+    int getGtLtOp(const BSONElement& e) {
+        if ( e.type() != Object )
+            return BSONObj::Equality;
+
+        BSONElement fe = e.embeddedObject().firstElement();
+        return fe.getGtLtOp();
+    }
+
+    /* Compare two dotted field paths component by component.
+       Returns SAME when equal, LEFT_SUBFIELD/RIGHT_SUBFIELD when one path
+       is a prefix of the other, LEFT_BEFORE/RIGHT_BEFORE otherwise. */
+    FieldCompareResult compareDottedFieldNames( const string& l , const string& r ){
+        size_t lstart = 0;
+        size_t rstart = 0;
+        while ( 1 ){
+            // a path that ran out of components is a prefix of the other
+            if ( lstart >= l.size() ){
+                if ( rstart >= r.size() )
+                    return SAME;
+                return RIGHT_SUBFIELD;
+            }
+            if ( rstart >= r.size() )
+                return LEFT_SUBFIELD;
+
+            size_t a = l.find( '.' , lstart );
+            size_t b = r.find( '.' , rstart );
+
+            size_t lend = a == string::npos ? l.size() : a;
+            size_t rend = b == string::npos ? r.size() : b;
+
+            // hold the current components by value -- previously these were
+            // const references bound to substr() temporaries
+            const string c = l.substr( lstart , lend - lstart );
+            const string d = r.substr( rstart , rend - rstart );
+
+            int x = c.compare( d );
+
+            if ( x < 0 )
+                return LEFT_BEFORE;
+            if ( x > 0 )
+                return RIGHT_BEFORE;
+
+            // components equal: advance past the dot and compare the next pair
+            lstart = lend + 1;
+            rstart = rend + 1;
+        }
+    }
+
+ /* BSONObj ------------------------------------------------------------*/
+
+ BSONObj::EmptyObject BSONObj::emptyObject;
+
+    /* Render as a JSON-ish string. Doubles as a structural validity check:
+       every massert below fires on a corrupt object (see BSONObj::valid()). */
+    string BSONObj::toString() const {
+        if ( isEmpty() ) return "{}";
+
+        stringstream s;
+        s << "{ ";
+        BSONObjIterator i(*this);
+        bool first = true;
+        while ( 1 ) {
+            massert( 10327 , "Object does not end with EOO", i.moreWithEOO() );
+            BSONElement e = i.next( true );
+            massert( 10328 , "Invalid element size", e.size() > 0 );
+            massert( 10329 , "Element too large", e.size() < ( 1 << 30 ) );
+            // the element must lie entirely inside this object's buffer
+            int offset = e.rawdata() - this->objdata();
+            massert( 10330 , "Element extends past end of object",
+                     e.size() + offset <= this->objsize() );
+            e.validate();
+            bool end = ( e.size() + offset == this->objsize() );
+            if ( e.eoo() ) {
+                massert( 10331 , "EOO Before end of object", end );
+                break;
+            }
+            if ( first )
+                first = false;
+            else
+                s << ", ";
+            s << e.toString();
+        }
+        s << " }";
+        return s.str();
+    }
+
+    /* MD5 digest of the raw BSON bytes, returned as a hex string. */
+    string BSONObj::md5() const {
+        md5digest d;
+        md5_state_t st;
+        md5_init(&st);
+        md5_append( &st , (const md5_byte_t*)_objdata , objsize() );
+        md5_finish(&st, d);
+        return digestToString( d );
+    }
+
+    /* Render as JSON in the requested format (strict / tengen / jsonx).
+       Unlike toString(), performs no validation of the underlying bytes. */
+    string BSONObj::jsonString( JsonStringFormat format ) const {
+
+        if ( isEmpty() ) return "{}";
+
+        stringstream s;
+        s << "{ ";
+        BSONObjIterator i(*this);
+        BSONElement e = i.next();
+        if ( !e.eoo() )
+            while ( 1 ) {
+                s << e.jsonString( format );
+                e = i.next();
+                if ( e.eoo() )
+                    break;
+                s << ", ";
+            }
+        s << " }";
+        return s.str();
+    }
+
+// todo: can be a little faster if we don't use toString() here.
+    /* True if the object's internal structure passes the massert checks
+       performed by toString(); any exception means corruption. */
+    bool BSONObj::valid() const {
+        try {
+            toString();
+        }
+        catch (...) {
+            return false;
+        }
+        return true;
+    }
+
+    /* well ordered compare
+       idxKey (optional) supplies per-field sort directions: a negative key
+       value reverses the ordering of the corresponding field.
+       considerFieldName: when true, differing field names order the objects. */
+    int BSONObj::woCompare(const BSONObj &r, const BSONObj &idxKey,
+                           bool considerFieldName) const {
+        if ( isEmpty() )
+            return r.isEmpty() ? 0 : -1;
+        if ( r.isEmpty() )
+            return 1;
+
+        bool ordered = !idxKey.isEmpty();
+
+        BSONObjIterator i(*this);
+        BSONObjIterator j(r);
+        BSONObjIterator k(idxKey);
+        while ( 1 ) {
+            // so far, equal...
+
+            // renamed locals: the old local `r` shadowed the BSONObj parameter
+            BSONElement lElt = i.next();
+            BSONElement rElt = j.next();
+            BSONElement o;
+            if ( ordered )
+                o = k.next();
+            if ( lElt.eoo() )
+                return rElt.eoo() ? 0 : -1;
+            if ( rElt.eoo() )
+                return 1;
+
+            int x = lElt.woCompare( rElt, considerFieldName );
+            if ( ordered && o.number() < 0 )
+                x = -x; // descending sort direction reverses this field
+            if ( x != 0 )
+                return x;
+        }
+        return -1; // unreachable
+    }
+
+    // shared null element used as the substitute for fields missing below
+    BSONObj staticNull = fromjson( "{'':null}" );
+
+    /* well ordered compare
+       Orders *this against `other` using only the fields named in sortKey;
+       a missing field on either side compares as null. Negative sortKey
+       values reverse that field's direction. */
+    int BSONObj::woSortOrder(const BSONObj& other, const BSONObj& sortKey ) const{
+        if ( isEmpty() )
+            return other.isEmpty() ? 0 : -1;
+        if ( other.isEmpty() )
+            return 1;
+
+        uassert( 10060 , "woSortOrder needs a non-empty sortKey" , ! sortKey.isEmpty() );
+
+        BSONObjIterator i(sortKey);
+        while ( 1 ){
+            BSONElement f = i.next();
+            if ( f.eoo() )
+                return 0;
+
+            BSONElement l = getField( f.fieldName() );
+            if ( l.eoo() )
+                l = staticNull.firstElement();
+            BSONElement r = other.getField( f.fieldName() );
+            if ( r.eoo() )
+                r = staticNull.firstElement();
+
+            // field names deliberately ignored (false): only values matter here
+            int x = l.woCompare( r, false );
+            if ( f.number() < 0 )
+                x = -x;
+            if ( x != 0 )
+                return x;
+        }
+        return -1; // unreachable
+    }
+
+
+    /* Linear scan for the element named `name`; returns the EOO nullElement
+       if absent. O(n) in the number of fields. */
+    BSONElement BSONObj::getField(const char *name) const {
+        BSONObjIterator i(*this);
+        while ( i.moreWithEOO() ) {
+            BSONElement e = i.next();
+            if ( e.eoo() )
+                break;
+            if ( strcmp(e.fieldName(), name) == 0 )
+                return e;
+        }
+        return nullElement;
+    }
+
+    /* return has eoo() true if no match
+       supports "." notation to reach into embedded objects
+       e.g. "a.b" first tries the literal field "a.b", then recurses into
+       subobject "a" looking for "b".
+    */
+    BSONElement BSONObj::getFieldDotted(const char *name) const {
+        BSONElement e = getField( name );
+        if ( e.eoo() ) {
+            const char *p = strchr(name, '.');
+            if ( p ) {
+                string left(name, p-name);
+                BSONObj sub = getObjectField(left.c_str());
+                return sub.isEmpty() ? nullElement : sub.getFieldDotted(p+1);
+            }
+        }
+
+        return e;
+    }
+
+    /* jul09 : 'deep' and this function will be going away in the future - kept only for backward compatibility of datafiles for now. */
+    // sets *deep to true when the out-param was supplied
+    void trueDat( bool *deep ) {
+        if( deep )
+            *deep = true;
+    }
+
+    /* Collect into `ret` every element reachable via dotted path `name`,
+       expanding arrays along the way (so "a.b" with a an array of objects
+       yields each object's "b"). `deep`, if supplied, is set to true when
+       an array was traversed and false when nothing matched. */
+    void BSONObj::getFieldsDotted(const char *name, BSONElementSet &ret, bool *deep ) const {
+        BSONElement e = getField( name );
+        if ( e.eoo() ) {
+            const char *p = strchr(name, '.');
+            if ( p ) {
+                string left(name, p-name);
+                BSONElement e = getField( left );
+                if ( e.type() == Array ) {
+                    trueDat( deep );
+                    // recurse into each object member of the array
+                    BSONObjIterator i( e.embeddedObject() );
+                    while( i.moreWithEOO() ) {
+                        BSONElement f = i.next();
+                        if ( f.eoo() )
+                            break;
+                        if ( f.type() == Object )
+                            f.embeddedObject().getFieldsDotted(p+1, ret);
+                    }
+                } else if ( e.type() == Object ) {
+                    e.embeddedObject().getFieldsDotted(p+1, ret);
+                }
+            }
+        } else {
+            // exact field name matched; an array contributes each member
+            if ( e.type() == Array ) {
+                trueDat( deep );
+                BSONObjIterator i( e.embeddedObject() );
+                while( i.moreWithEOO() ) {
+                    BSONElement f = i.next();
+                    if ( f.eoo() )
+                        break;
+                    ret.insert( f );
+                }
+            } else {
+                ret.insert( e );
+            }
+        }
+        // report "no match" through deep as well
+        if ( ret.empty() && deep )
+            *deep = false;
+    }
+
+    /* Walk dotted path `name`, stopping early at the first array element
+       encountered. On return `name` points at the unconsumed remainder of
+       the path (empty string when fully consumed). Returns nullElement if
+       the path dead-ends in a missing field or a scalar. */
+    BSONElement BSONObj::getFieldDottedOrArray(const char *&name) const {
+        const char *p = strchr(name, '.');
+        string left;
+        if ( p ) {
+            left = string(name, p-name);
+            name = p + 1;
+        } else {
+            left = string(name);
+            name = name + strlen(name);
+        }
+        BSONElement sub = getField(left.c_str());
+        if ( sub.eoo() )
+            return nullElement;
+        else if ( sub.type() == Array || strlen( name ) == 0 )
+            return sub;
+        else if ( sub.type() == Object )
+            return sub.embeddedObject().getFieldDottedOrArray( name );
+        else
+            return nullElement;
+    }
+
+    /* makes a new BSONObj with the fields specified in pattern.
+       fields returned in the order they appear in pattern.
+       if any field missing or undefined in the original object, that field
+       in the output will be null.
+
+       n^2 implementation bad if pattern and object have lots
+       of fields - normally pattern doesn't so should be fine.
+    */
+    BSONObj BSONObj::extractFieldsDotted(BSONObj pattern) const {
+        BSONObjBuilder b;
+        BSONObjIterator i(pattern);
+        while (i.more()) {
+            BSONElement e = i.next();
+            const char *name = e.fieldName();
+
+            BSONElement x = getFieldDotted( name );
+            if ( x.eoo() || x.type() == Undefined ) {
+                b.appendNull(name);
+            } else {
+                b.appendAs(x, name);
+            }
+        }
+        // obj() transfers buffer ownership to the returned object; the old
+        // b.done() returned a view into this local builder's buffer
+        // (sibling extract* methods already use obj())
+        return b.obj();
+    }
+
+    /**
+       sets element field names to empty string
+       If a field in pattern is missing, it is omitted from the returned
+       object.
+    */
+    BSONObj BSONObj::extractFieldsUnDotted(BSONObj pattern) const {
+        BSONObjBuilder b;
+        BSONObjIterator i(pattern);
+        while ( i.moreWithEOO() ) {
+            BSONElement e = i.next();
+            if ( e.eoo() )
+                break;
+            // literal (undotted) lookup only
+            BSONElement x = getField(e.fieldName());
+            if ( !x.eoo() )
+                b.appendAs(x, "");
+        }
+        return b.obj();
+    }
+
+    /* Project the (dotted) fields named in pattern out of *this, keeping
+       pattern order. fillWithNull controls whether missing fields appear
+       as nulls or are omitted. */
+    BSONObj BSONObj::extractFields(const BSONObj& pattern , bool fillWithNull ) const {
+        BSONObjBuilder b(32); // scanandorder.h can make a zillion of these, so we start the allocation very small
+        BSONObjIterator i(pattern);
+        while ( i.moreWithEOO() ) {
+            BSONElement e = i.next();
+            if ( e.eoo() )
+                break;
+            BSONElement x = getFieldDotted(e.fieldName());
+            if ( ! x.eoo() )
+                b.appendAs( x, e.fieldName() );
+            else if ( fillWithNull )
+                b.appendNull( e.fieldName() );
+        }
+        return b.obj();
+    }
+
+    /* Keep (inFilter=true) or drop (inFilter=false) the fields of *this
+       whose names appear in filter. Undotted name comparison. */
+    BSONObj BSONObj::filterFieldsUndotted( const BSONObj &filter, bool inFilter ) const {
+        BSONObjBuilder b;
+        BSONObjIterator i( *this );
+        while( i.moreWithEOO() ) {
+            BSONElement e = i.next();
+            if ( e.eoo() )
+                break;
+            BSONElement x = filter.getField( e.fieldName() );
+            // keep iff presence in filter matches the requested polarity
+            if ( ( x.eoo() && !inFilter ) ||
+                 ( !x.eoo() && inFilter ) )
+                b.append( e );
+        }
+        return b.obj();
+    }
+
+    /* Treat *this as an index key whose elements are positional: find the
+       position of fieldName within indexKey, then return the element of
+       *this at that same position. Returns an EOO element if fieldName is
+       not in indexKey or *this has too few elements. */
+    BSONElement BSONObj::getFieldUsingIndexNames(const char *fieldName, const BSONObj &indexKey) const {
+        BSONObjIterator i( indexKey );
+        int j = 0;
+        // first pass: j = ordinal of fieldName within indexKey
+        while( i.moreWithEOO() ) {
+            BSONElement f = i.next();
+            if ( f.eoo() )
+                return BSONElement();
+            if ( strcmp( f.fieldName(), fieldName ) == 0 )
+                break;
+            ++j;
+        }
+        // second pass: step j elements into *this
+        BSONObjIterator k( *this );
+        while( k.moreWithEOO() ) {
+            BSONElement g = k.next();
+            if ( g.eoo() )
+                return BSONElement();
+            if ( j == 0 ) {
+                return g;
+            }
+            --j;
+        }
+        return BSONElement();
+    }
+
+    // numeric value of field `name` truncated to int; INT_MIN when absent/non-numeric
+    int BSONObj::getIntField(const char *name) const {
+        BSONElement e = getField(name);
+        return e.isNumber() ? (int) e.number() : INT_MIN;
+    }
+
+    // boolean value of field `name`; false when absent or not Bool-typed
+    bool BSONObj::getBoolField(const char *name) const {
+        BSONElement e = getField(name);
+        return e.type() == Bool ? e.boolean() : false;
+    }
+
+    // string value of field `name`; "" when absent or not String-typed.
+    // returned pointer aliases this object's buffer -- do not outlive it
+    const char * BSONObj::getStringField(const char *name) const {
+        BSONElement e = getField(name);
+        return e.type() == String ? e.valuestr() : "";
+    }
+
+    // embedded object/array under `name`; empty BSONObj for other types or absence
+    BSONObj BSONObj::getObjectField(const char *name) const {
+        BSONElement e = getField(name);
+        BSONType t = e.type();
+        return t == Object || t == Array ? e.embeddedObject() : BSONObj();
+    }
+
+    // count of top-level fields (EOO excluded); O(n) scan
+    int BSONObj::nFields() const {
+        int n = 0;
+        BSONObjIterator i(*this);
+        while ( i.moreWithEOO() ) {
+            BSONElement e = i.next();
+            if ( e.eoo() )
+                break;
+            n++;
+        }
+        return n;
+    }
+
+    /* grab names of all the fields in this object
+       returns the number of elements scanned (duplicates counted twice,
+       though the set keeps only one copy) */
+    int BSONObj::getFieldNames(set<string>& fields) const {
+        int n = 0;
+        BSONObjIterator i(*this);
+        while ( i.moreWithEOO() ) {
+            BSONElement e = i.next();
+            if ( e.eoo() )
+                break;
+            fields.insert(e.fieldName());
+            n++;
+        }
+        return n;
+    }
+
+    /* note: addFields always adds _id even if not specified
+       returns n added not counting _id unless requested.
+       Replaces this (empty, unowned) object's contents with the selected
+       fields of `from`, in from's field order.
+    */
+    int BSONObj::addFields(BSONObj& from, set<string>& fields) {
+        assert( isEmpty() && !isOwned() ); /* partial implementation for now... */
+
+        BSONObjBuilder b;
+
+        int N = fields.size();
+        int n = 0;
+        BSONObjIterator i(from);
+        bool gotId = false;
+        while ( i.moreWithEOO() ) {
+            BSONElement e = i.next();
+            const char *fname = e.fieldName();
+            if ( fields.count(fname) ) {
+                b.append(e);
+                ++n;
+                gotId = gotId || strcmp(fname, "_id")==0;
+                // early out once every requested field plus _id is in hand
+                if ( n == N && gotId )
+                    break;
+            } else if ( strcmp(fname, "_id")==0 ) {
+                // _id is always carried over even when not requested
+                b.append(e);
+                gotId = true;
+                if ( n == N && gotId )
+                    break;
+            }
+        }
+
+        if ( n ) {
+            // take ownership of the builder's buffer
+            int len;
+            init( b.decouple(len), true );
+        }
+
+        return n;
+    }
+
+    /* Copy of *this with internal-only MinKey/MaxKey elements rewritten as
+       client-representable objects { $minElement : 1 } / { $maxElement : 1 }. */
+    BSONObj BSONObj::clientReadable() const {
+        BSONObjBuilder b;
+        BSONObjIterator i( *this );
+        while( i.moreWithEOO() ) {
+            BSONElement e = i.next();
+            if ( e.eoo() )
+                break;
+            switch( e.type() ) {
+            case MinKey: {
+                BSONObjBuilder m;
+                m.append( "$minElement", 1 );
+                b.append( e.fieldName(), m.done() );
+                break;
+            }
+            case MaxKey: {
+                BSONObjBuilder m;
+                m.append( "$maxElement", 1 );
+                b.append( e.fieldName(), m.done() );
+                break;
+            }
+            default:
+                b.append( e );
+            }
+        }
+        return b.obj();
+    }
+
+    /* Copy of *this with field names positionally replaced by the names in
+       `names`. Once names runs out, remaining fields keep their own names. */
+    BSONObj BSONObj::replaceFieldNames( const BSONObj &names ) const {
+        BSONObjBuilder b;
+        BSONObjIterator i( *this );
+        BSONObjIterator j( names );
+        BSONElement f = j.moreWithEOO() ? j.next() : BSONObj().firstElement();
+        while( i.moreWithEOO() ) {
+            BSONElement e = i.next();
+            if ( e.eoo() )
+                break;
+            if ( !f.eoo() ) {
+                b.appendAs( e, f.fieldName() );
+                f = j.next();
+            } else {
+                b.append( e );
+            }
+        }
+        return b.obj();
+    }
+
+    /* Recursively verify no field name contains '.' or '$' -- names the
+       server reserves for dotted paths and operators. */
+    bool BSONObj::okForStorage() const {
+        BSONObjIterator i( *this );
+        while ( i.more() ){
+            BSONElement e = i.next();
+            const char * name = e.fieldName();
+
+            if ( strchr( name , '.' ) ||
+                 strchr( name , '$' ) ){
+                return false;
+            }
+
+            // descend into any element that carries a nested document
+            if ( e.mayEncapsulate() ){
+                switch ( e.type() ){
+                case Object:
+                case Array:
+                    if ( ! e.embeddedObject().okForStorage() )
+                        return false;
+                    break;
+                case CodeWScope:
+                    if ( ! e.codeWScopeObject().okForStorage() )
+                        return false;
+                    break;
+                default:
+                    uassert( 12579, "unhandled cases in BSONObj okForStorage" , 0 );
+                }
+
+            }
+        }
+        return true;
+    }
+
+    /* Debug dump: each byte as two hex digits, with printable alphanumeric
+       bytes additionally annotated as 'c'. */
+    string BSONObj::hexDump() const {
+        stringstream ss;
+        const char *d = objdata();
+        int size = objsize();
+        for( int i = 0; i < size; ++i ) {
+            ss.width( 2 );
+            ss.fill( '0' );
+            ss << hex << (unsigned)(unsigned char)( d[ i ] ) << dec;
+            // previous range test ('A' <= c <= 'z') also matched the
+            // punctuation [\]^_` between 'Z' and 'a'; test true alnum ranges
+            if ( ( d[ i ] >= '0' && d[ i ] <= '9' ) ||
+                 ( d[ i ] >= 'A' && d[ i ] <= 'Z' ) ||
+                 ( d[ i ] >= 'a' && d[ i ] <= 'z' ) )
+                ss << '\'' << d[ i ] << '\'';
+            if ( i != size - 1 )
+                ss << ' ';
+        }
+        return ss.str();
+    }
+
+    // stream output delegates to toString() (which also validates -- may throw)
+    ostream& operator<<( ostream &s, const BSONObj &o ) {
+        return s << o.toString();
+    }
+
+    // stream output delegates to the element's toString()
+    ostream& operator<<( ostream &s, const BSONElement &e ) {
+        return s << e.toString();
+    }
+
+    /* Flatten nested objects into dotted field names, e.g.
+       { a : { b : 1 } } -> { "a.b" : 1 }. Arrays are treated as leaf
+       values (only Object elements are recursed into). */
+    void nested2dotted(BSONObjBuilder& b, const BSONObj& obj, const string& base){
+        BSONObjIterator it(obj);
+        while (it.more()){
+            BSONElement e = it.next();
+            if (e.type() == Object){
+                string newbase = base + e.fieldName() + ".";
+                nested2dotted(b, e.embeddedObject(), newbase);
+            }else{
+                string newbase = base + e.fieldName();
+                b.appendAs(e, newbase.c_str());
+            }
+        }
+    }
+
+    /* Inverse of nested2dotted: rebuild nested objects from dotted names.
+       Fields are sorted first so siblings of the same subobject group together. */
+    void dotted2nested(BSONObjBuilder& b, const BSONObj& obj){
+        //use map to sort fields
+        BSONMap sorted = bson2map(obj);
+        EmbeddedBuilder eb(&b);
+        for(BSONMap::const_iterator it=sorted.begin(); it!=sorted.end(); ++it){
+            eb.appendAs(it->second, it->first);
+        }
+        eb.done();
+    }
+
+    /*-- test things ----------------------------------------------------*/
+
+    // Hand-built single-element BSON objects for the global maxKey/minKey
+    // sentinels, and an empty object. pack(1) ensures the structs are laid
+    // out exactly like raw BSON bytes: int32 size, type byte, empty field
+    // name (one null byte), EOO terminator.
+#pragma pack(1)
+    struct MaxKeyData {
+        MaxKeyData() {
+            totsize=7;
+            maxkey=MaxKey;
+            name=0;
+            eoo=EOO;
+        }
+        int totsize;   // total object size: 4 + 1 + 1 + 1 = 7
+        char maxkey;   // type byte
+        char name;     // empty field name terminator
+        char eoo;      // end of object
+    } maxkeydata;
+    BSONObj maxKey((const char *) &maxkeydata);
+
+    struct MinKeyData {
+        MinKeyData() {
+            totsize=7;
+            minkey=MinKey;
+            name=0;
+            eoo=EOO;
+        }
+        int totsize;
+        char minkey;
+        char name;
+        char eoo;
+    } minkeydata;
+    BSONObj minKey((const char *) &minkeydata);
+
+    // the 5-byte empty BSON object { } used by the default BSONElement
+    struct JSObj0 {
+        JSObj0() {
+            totsize = 5;
+            eoo = EOO;
+        }
+        int totsize;
+        char eoo;
+    } js0;
+#pragma pack()
+
+    // default element: points at the EOO byte of the shared empty object,
+    // so eoo() is true and size is a single byte
+    BSONElement::BSONElement() {
+        data = &js0.eoo;
+        fieldNameSize_ = 0;
+        totalSize = 1;
+    }
+
+    /* Startup self-tests for BSON comparison, ObjectId round-trip, and
+       numeric-type ordering. Registered via the UnitTest base class. */
+    struct BsonUnitTest : public UnitTest {
+        void testRegex() {
+
+            BSONObjBuilder b;
+            b.appendRegex("x", "foo");
+            BSONObj o = b.done();
+
+            BSONObjBuilder c;
+            c.appendRegex("x", "goo");
+            BSONObj p = c.done();
+
+            assert( !o.woEqual( p ) );
+            assert( o.woCompare( p ) < 0 );
+
+        }
+        void testoid() {
+            OID id;
+            id.init();
+            //            sleepsecs(3);
+
+            OID b;
+            // goes with sleep above...
+            // b.init();
+            // assert( memcmp(id.getData(), b.getData(), 12) < 0 );
+
+            // hex round-trip must reproduce the same OID
+            b.init( id.str() );
+            assert( b == id );
+        }
+
+        void testbounds(){
+            BSONObj l , r;
+            {
+                BSONObjBuilder b;
+                b.append( "x" , numeric_limits<long long>::max() );
+                l = b.obj();
+            }
+            {
+                BSONObjBuilder b;
+                b.append( "x" , numeric_limits<double>::max() );
+                r = b.obj();
+            }
+            assert( l.woCompare( r ) < 0 );
+            assert( r.woCompare( l ) > 0 );
+            {
+                BSONObjBuilder b;
+                b.append( "x" , numeric_limits<int>::max() );
+                l = b.obj();
+            }
+            assert( l.woCompare( r ) < 0 );
+            assert( r.woCompare( l ) > 0 );
+        }
+
+        void testorder(){
+            {
+                // long long and int values interleave numerically
+                BSONObj x,y,z;
+                { BSONObjBuilder b; b.append( "x" , (long long)2 ); x = b.obj(); }
+                { BSONObjBuilder b; b.append( "x" , (int)3 ); y = b.obj(); }
+                { BSONObjBuilder b; b.append( "x" , (long long)4 ); z = b.obj(); }
+                assert( x.woCompare( y ) < 0 );
+                assert( x.woCompare( z ) < 0 );
+                assert( y.woCompare( x ) > 0 );
+                assert( z.woCompare( x ) > 0 );
+                assert( y.woCompare( z ) < 0 );
+                assert( z.woCompare( y ) > 0 );
+            }
+
+            {
+                // all numeric types must order identically against
+                // missing (u) and null (n) values
+                BSONObj ll,d,i,n,u;
+                { BSONObjBuilder b; b.append( "x" , (long long)2 ); ll = b.obj(); }
+                { BSONObjBuilder b; b.append( "x" , (double)2 ); d = b.obj(); }
+                { BSONObjBuilder b; b.append( "x" , (int)2 ); i = b.obj(); }
+                { BSONObjBuilder b; b.appendNull( "x" ); n = b.obj(); }
+                { BSONObjBuilder b; u = b.obj(); }
+
+                assert( ll.woCompare( u ) == d.woCompare( u ) );
+                assert( ll.woCompare( u ) == i.woCompare( u ) );
+                BSONObj k = BSON( "x" << 1 );
+                assert( ll.woCompare( u , k ) == d.woCompare( u , k ) );
+                assert( ll.woCompare( u , k ) == i.woCompare( u , k ) );
+
+                assert( u.woCompare( ll ) == u.woCompare( d ) );
+                assert( u.woCompare( ll ) == u.woCompare( i ) );
+                assert( u.woCompare( ll , k ) == u.woCompare( d , k ) );
+                // was a copy-paste duplicate of the line above; must check int too
+                assert( u.woCompare( ll , k ) == u.woCompare( i , k ) );
+
+                assert( i.woCompare( n ) == d.woCompare( n ) );
+
+                assert( ll.woCompare( n ) == d.woCompare( n ) );
+                assert( ll.woCompare( n ) == i.woCompare( n ) );
+                assert( ll.woCompare( n , k ) == d.woCompare( n , k ) );
+                assert( ll.woCompare( n , k ) == i.woCompare( n , k ) );
+
+                assert( n.woCompare( ll ) == n.woCompare( d ) );
+                assert( n.woCompare( ll ) == n.woCompare( i ) );
+                assert( n.woCompare( ll , k ) == n.woCompare( d , k ) );
+                // was a copy-paste duplicate of the line above; must check int too
+                assert( n.woCompare( ll , k ) == n.woCompare( i , k ) );
+            }
+
+            {
+                // String and Symbol compare equal to each other
+                BSONObj l,r;
+                { BSONObjBuilder b; b.append( "x" , "eliot" ); l = b.obj(); }
+                { BSONObjBuilder b; b.appendSymbol( "x" , "eliot" ); r = b.obj(); }
+                assert( l.woCompare( r ) == 0 );
+                assert( r.woCompare( l ) == 0 );
+            }
+        }
+
+        void run() {
+            testRegex();
+            BSONObjBuilder A,B,C;
+            A.append("x", 2);
+            B.append("x", 2.0);
+            C.append("x", 2.1);
+            BSONObj a = A.done();
+            BSONObj b = B.done();
+            BSONObj c = C.done();
+            assert( !a.woEqual( b ) ); // comments on operator==
+            int cmp = a.woCompare(b);
+            assert( cmp == 0 );
+            cmp = a.woCompare(c);
+            assert( cmp < 0 );
+            testoid();
+            testbounds();
+            testorder();
+        }
+    } bson_unittest;
+
+/*
+ BSONObjBuilder& BSONObjBuilderValueStream::operator<<( const char * value ) {
+ _builder->append( _fieldName , value );
+ return *_builder;
+ }
+
+ BSONObjBuilder& BSONObjBuilderValueStream::operator<<( const int value ) {
+ _builder->append( _fieldName , value );
+ return *_builder;
+ }
+
+ BSONObjBuilder& BSONObjBuilderValueStream::operator<<( const double value ) {
+ _builder->append( _fieldName , value );
+ return *_builder;
+ }
+*/
+
+    // per-process machine/process identifier embedded in every generated OID
+    unsigned OID::_machine = (unsigned) security.getNonceInitSafe();
+    void OID::newState(){
+        // using fresh Security object to avoid buffered devrandom
+        _machine = (unsigned) Security().getNonce();
+    }
+
+    /* Generate a new 12-byte ObjectId: 4-byte big-endian timestamp,
+       4-byte machine id, 4-byte big-endian incrementing counter
+       (counter seeded from a nonce). */
+    void OID::init() {
+        static WrappingInt inc = (unsigned) security.getNonce();
+        unsigned t = (unsigned) time(0);
+        // store timestamp big-endian so OIDs sort roughly by creation time
+        char *T = (char *) &t;
+        data[0] = T[3];
+        data[1] = T[2];
+        data[2] = T[1];
+        data[3] = T[0];
+
+        (unsigned&) data[4] = _machine;
+
+        int new_inc = inc.atomicIncrement();
+        T = (char *) &new_inc;
+        // NOTE(review): `b` appears to be the member aliasing the trailing
+        // bytes of data -- declared in the header, confirm layout there
+        char * raw = (char*)&b;
+        raw[0] = T[3];
+        raw[1] = T[2];
+        raw[2] = T[1];
+        raw[3] = T[0];
+    }
+
+    /* Parse a 24-character hex string into the 12 OID bytes. */
+    void OID::init( string s ){
+        assert( s.size() == 24 );
+        const char *p = s.c_str();
+        char buf[3];
+        buf[2] = 0;
+        // two hex chars per output byte
+        for( int i = 0; i < 12; i++ ) {
+            buf[0] = p[0];
+            buf[1] = p[1];
+            p += 2;
+            stringstream ss(buf);
+            unsigned z;
+            ss >> hex >> z;
+            data[i] = z;
+        }
+
+/*
+        string as = s.substr( 0 , 16 );
+        string bs = s.substr( 16 );
+
+        stringstream ssa(as);
+        ssa >> hex >> a;
+
+        stringstream ssb(bs);
+        ssb >> hex >> b;
+*/
+    }
+
+    // global label objects backing the stream syntax BSON( "x" << GT << 3 )
+    Labeler::Label GT( "$gt" );
+    Labeler::Label GTE( "$gte" );
+    Labeler::Label LT( "$lt" );
+    Labeler::Label LTE( "$lte" );
+    Labeler::Label NE( "$ne" );
+    Labeler::Label SIZE( "$size" );
+
+    /* In-place fill of a zero Timestamp element with the current OpTime.
+       A non-zero timestamp is left untouched. Mutates the BSON buffer. */
+    void BSONElementManipulator::initTimestamp() {
+        massert( 10332 ,  "Expected CurrentTime type", element_.type() == Timestamp );
+        unsigned long long &timestamp = *( reinterpret_cast< unsigned long long* >( value() ) );
+        if ( timestamp == 0 )
+            timestamp = OpTime::now().asDate();
+    }
+
+
+    /* Append the minimum possible value of BSON type t under field name
+       `field` (used to build index-range bounds). Aborts via uassert for
+       unknown types. */
+    void BSONObjBuilder::appendMinForType( const string& field , int t ){
+        switch ( t ){
+        case MinKey: appendMinKey( field.c_str() ); return;
+        // NOTE(review): MaxKey also appends MinKey here -- confirm intentional
+        case MaxKey: appendMinKey( field.c_str() ); return;
+        case NumberInt:
+        case NumberDouble:
+        case NumberLong:
+            // all numeric types share one ordering; -max double bounds them all
+            append( field.c_str() , - numeric_limits<double>::max() ); return;
+        case jstOID:
+            {
+                OID o;
+                memset(&o, 0, sizeof(o));
+                appendOID( field.c_str() , &o);
+                return;
+            }
+        case Bool: appendBool( field.c_str() , false); return;
+        case Date: appendDate( field.c_str() , 0); return;
+        case jstNULL: appendNull( field.c_str() ); return;
+        case Symbol:
+        case String: append( field.c_str() , "" ); return;
+        case Object: append( field.c_str() , BSONObj() ); return;
+        case Array:
+            appendArray( field.c_str() , BSONObj() ); return;
+        case BinData:
+            appendBinData( field.c_str() , 0 , Function , (const char *) 0 ); return;
+        case Undefined:
+            appendUndefined( field.c_str() ); return;
+        case RegEx: appendRegex( field.c_str() , "" ); return;
+        case DBRef:
+            {
+                OID o;
+                memset(&o, 0, sizeof(o));
+                appendDBRef( field.c_str() , "" , o );
+                return;
+            }
+        case Code: appendCode( field.c_str() , "" ); return;
+        case CodeWScope: appendCodeWScope( field.c_str() , "" , BSONObj() ); return;
+        case Timestamp: appendTimestamp( field.c_str() , 0); return;
+
+        };
+        log() << "type not support for appendMinElementForType: " << t << endl;
+        uassert( 10061 ,  "type not supported for appendMinElementForType" , false );
+    }
+
+    /* Append the maximum possible value of BSON type t under field name
+       `field`. For types with no finite maximum, appends the minimum of
+       the next type in the canonical ordering. */
+    void BSONObjBuilder::appendMaxForType( const string& field , int t ){
+        switch ( t ){
+        case MinKey: appendMaxKey( field.c_str() ); break;
+        case MaxKey: appendMaxKey( field.c_str() ); break;
+        case NumberInt:
+        case NumberDouble:
+        case NumberLong:
+            append( field.c_str() , numeric_limits<double>::max() );
+            break;
+        case BinData:
+            // max BinData == min of the type ordered just above it
+            appendMinForType( field , jstOID );
+            break;
+        case jstOID:
+            {
+                OID o;
+                memset(&o, 0xFF, sizeof(o));
+                appendOID( field.c_str() , &o);
+                break;
+            }
+        case Undefined:
+        case jstNULL:
+            appendMinForType( field , NumberInt );
+            // this break was missing: control fell through into the Bool
+            // case and appended a second element under the same field name
+            break;
+        case Bool: appendBool( field.c_str() , true); break;
+        case Date: appendDate( field.c_str() , 0xFFFFFFFFFFFFFFFFLL ); break;
+        case Symbol:
+        case String: append( field.c_str() , BSONObj() ); break;
+        case Code:
+        case CodeWScope:
+            appendCodeWScope( field.c_str() , "ZZZ" , BSONObj() ); break;
+        case Timestamp:
+            appendTimestamp( field.c_str() , numeric_limits<unsigned long long>::max() ); break;
+        default:
+            // no explicit maximum: use the minimum of the next type
+            appendMinForType( field , t + 1 );
+        }
+    }
+
+    // precomputed decimal strings "0".."99" so array index field names for
+    // small indices need no runtime formatting
+    const string BSONObjBuilder::numStrs[] = {
+        "0",  "1",  "2",  "3",  "4",  "5",  "6",  "7",  "8",  "9",
+        "10", "11", "12", "13", "14", "15", "16", "17", "18", "19",
+        "20", "21", "22", "23", "24", "25", "26", "27", "28", "29",
+        "30", "31", "32", "33", "34", "35", "36", "37", "38", "39",
+        "40", "41", "42", "43", "44", "45", "46", "47", "48", "49",
+        "50", "51", "52", "53", "54", "55", "56", "57", "58", "59",
+        "60", "61", "62", "63", "64", "65", "66", "67", "68", "69",
+        "70", "71", "72", "73", "74", "75", "76", "77", "78", "79",
+        "80", "81", "82", "83", "84", "85", "86", "87", "88", "89",
+        "90", "91", "92", "93", "94", "95", "96", "97", "98", "99",
+    };
+
+    /* If `data` looks like a decimal number, append it under fieldName as
+       the narrowest fitting numeric type (int, long long, or double when a
+       decimal point is present) and return true; otherwise append nothing
+       and return false. */
+    bool BSONObjBuilder::appendAsNumber( const string& fieldName , const string& data ){
+        if ( data.size() == 0 )
+            return false;
+
+        unsigned int pos=0;
+        if ( data[0] == '-' )
+            pos++;
+
+        bool hasDec = false;
+        bool hasDigit = false;
+
+        for ( ; pos<data.size(); pos++ ){
+            if ( isdigit(data[pos]) ){
+                hasDigit = true;
+                continue;
+            }
+
+            if ( data[pos] == '.' ){
+                if ( hasDec )
+                    return false;
+                hasDec = true;
+                continue;
+            }
+
+            return false;
+        }
+
+        // reject digit-less inputs such as "-", "." and "-." which previously
+        // slipped through and were appended as 0
+        if ( ! hasDigit )
+            return false;
+
+        if ( hasDec ){
+            double d = atof( data.c_str() );
+            append( fieldName.c_str() , d );
+            return true;
+        }
+
+        if ( data.size() < 8 ){
+            // at most 7 chars: always fits in a 32-bit int
+            append( fieldName , atoi( data.c_str() ) );
+            return true;
+        }
+
+        try {
+            long long num = boost::lexical_cast<long long>( data );
+            append( fieldName , num );
+            return true;
+        }
+        catch(bad_lexical_cast &){
+            // out of range for long long
+            return false;
+        }
+
+    }
+
+
+    /* qsort comparator over raw element pointers: skip the leading type
+       byte (x++/y++) so strcmp runs on the null-terminated field names. */
+    int BSONElementFieldSorter( const void * a , const void * b ){
+        const char * x = *((const char**)a);
+        const char * y = *((const char**)b);
+        x++; y++;
+        return strcmp( x , y );
+    }
+
+    /* Build an iterator that yields o's elements in field-name order.
+       Collects raw element pointers (into o's buffer -- o must outlive
+       this iterator) and sorts them by field name.
+       NOTE(review): _fields is new[]-allocated; assumes the destructor
+       (declared elsewhere) delete[]s it -- confirm in the header. */
+    BSONObjIteratorSorted::BSONObjIteratorSorted( const BSONObj& o ){
+        _nfields = o.nFields();
+        _fields = new const char*[_nfields];
+        int x = 0;
+        BSONObjIterator i( o );
+        while ( i.more() ){
+            _fields[x++] = i.next().rawdata();
+            assert( _fields[x-1] );
+        }
+        assert( x == _nfields );
+        qsort( _fields , _nfields , sizeof(char*) , BSONElementFieldSorter );
+        _cur = 0;
+    }
+
+
+} // namespace mongo
diff --git a/db/jsobj.h b/db/jsobj.h
new file mode 100644
index 0000000..4030122
--- /dev/null
+++ b/db/jsobj.h
@@ -0,0 +1,1869 @@
+/** @file jsobj.h
+ BSON classes
+*/
+
+/* Copyright 2009 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ BSONObj and its helpers
+
+ "BSON" stands for "binary JSON" -- ie a binary way to represent objects that would be
+ represented in JSON (plus a few extensions useful for databases & other languages).
+
+ http://www.mongodb.org/display/DOCS/BSON
+*/
+
+#pragma once
+
+#include "../stdafx.h"
+#include "../util/builder.h"
+#include "../util/optime.h"
+#include "boost/utility.hpp"
+
+#include <set>
+
+namespace mongo {
+
+ class BSONObj;
+ struct BSONArray; // empty subclass of BSONObj useful for overloading
+ class BSONElement;
+ class Record;
+ class BSONObjBuilder;
+ class BSONArrayBuilder;
+ class BSONObjBuilderValueStream;
+
+#pragma pack(1)
+
+ /**
+ the complete list of valid BSON types
+ */
+ enum BSONType {
+ /** smaller than all other types */
+ MinKey=-1,
+ /** end of object */
+ EOO=0,
+ /** double precision floating point value */
+ NumberDouble=1,
+ /** character string, stored in utf8 */
+ String=2,
+ /** an embedded object */
+ Object=3,
+ /** an embedded array */
+ Array=4,
+ /** binary data */
+ BinData=5,
+ /** Undefined type */
+ Undefined=6,
+ /** ObjectId */
+ jstOID=7,
+ /** boolean type */
+ Bool=8,
+ /** date type */
+ Date=9,
+ /** null type */
+ jstNULL=10,
+ /** regular expression, a pattern with options */
+ RegEx=11,
+ /** deprecated / will be redesigned */
+ DBRef=12,
+ /** deprecated / use CodeWScope */
+ Code=13,
+ /** a programming language (e.g., Python) symbol */
+ Symbol=14,
+ /** javascript code that can execute on the database server, with SavedContext */
+ CodeWScope=15,
+ /** 32 bit signed integer */
+ NumberInt = 16,
+ /** Updated to a Date with value next OpTime on insert */
+ Timestamp = 17,
+ /** 64 bit integer */
+ NumberLong = 18,
+ /** max type that is not MaxKey */
+ JSTypeMax=18,
+ /** larger than all other types */
+ MaxKey=127
+ };
+
+ /* subtypes of BinData.
+ bdtCustom and above are ones that the JS compiler understands, but are
+ opaque to the database.
+ */
+ enum BinDataType { Function=1, ByteArray=2, bdtUUID = 3, MD5Type=5, bdtCustom=128 };
+
+ /** Object ID type.
+ BSON objects typically have an _id field for the object id. This field should be the first
+ member of the object when present. class OID is a special type that is a 12 byte id which
+ is likely to be unique to the system. You may also use other types for _id's.
+ When _id field is missing from a BSON object, on an insert the database may insert one
+ automatically in certain circumstances.
+
+ Warning: You must call OID::newState() after a fork().
+ */
+ class OID {
+ union {
+ struct{
+ long long a;
+ unsigned b;
+ };
+ unsigned char data[12];
+ };
+ static unsigned _machine;
+ public:
+ /** call this after a fork */
+ static void newState();
+
+ /** initialize to 'null' */
+ void clear() { a = 0; b = 0; }
+
+ const unsigned char *getData() const { return data; }
+
+ bool operator==(const OID& r) {
+ return a==r.a&&b==r.b;
+ }
+ bool operator!=(const OID& r) {
+ return a!=r.a||b!=r.b;
+ }
+
+ /** The object ID output as 24 hex digits. */
+ string str() const {
+ stringstream s;
+ s << hex;
+ // s.fill( '0' );
+ // s.width( 2 );
+ // fill wasn't working so doing manually...
+ for( int i = 0; i < 8; i++ ) {
+ unsigned u = data[i];
+ if( u < 16 ) s << '0';
+ s << u;
+ }
+ const unsigned char * raw = (const unsigned char*)&b;
+ for( int i = 0; i < 4; i++ ) {
+ unsigned u = raw[i];
+ if( u < 16 ) s << '0';
+ s << u;
+ }
+ /*
+ s.width( 16 );
+ s << a;
+ s.width( 8 );
+ s << b;
+ s << dec;
+ */
+ return s.str();
+ }
+
+ /**
+ sets the contents to a new oid / randomized value
+ */
+ void init();
+
+ /** Set to the hex string value specified. */
+ void init( string s );
+
+ };
+ ostream& operator<<( ostream &s, const OID &o );
+
+ /** Formatting mode for generating JSON from BSON.
+ See <http://mongodb.onconfluence.com/display/DOCS/Mongo+Extended+JSON>
+ for details.
+ */
+ enum JsonStringFormat {
+ /** strict RFC format */
+ Strict,
+ /** 10gen format, which is close to JS format. This form is understandable by
+ javascript running inside the Mongo server via eval() */
+ TenGen,
+ /** Javascript JSON compatible */
+ JS
+ };
+
+ /* l and r MUST have same type when called: check that first. */
+ int compareElementValues(const BSONElement& l, const BSONElement& r);
+
+#pragma pack()
+
+ /* internals
+ <type><fieldName ><value>
+ -------- size() ------------
+ -fieldNameSize-
+ value()
+ type()
+ */
+ /** BSONElement represents an "element" in a BSONObj. So for the object { a : 3, b : "abc" },
+ 'a : 3' is the first element (key+value).
+
+ The BSONElement object points into the BSONObj's data. Thus the BSONObj must stay in scope
+ for the life of the BSONElement.
+ */
+ class BSONElement {
+ friend class BSONObjIterator;
+ friend class BSONObj;
+ public:
+ string toString( bool includeFieldName = true ) const;
+ operator string() const { return toString(); }
+ string jsonString( JsonStringFormat format, bool includeFieldNames = true ) const;
+
+ /** Returns the type of the element */
+ BSONType type() const {
+ return (BSONType) *data;
+ }
+
+        /** returns the type of the element fixed for the main type
+            the main purpose is numbers. any numeric type will return NumberDouble
+            Note: if the order changes, indexes have to be re-built or there can be corruption
+ */
+ int canonicalType() const {
+ BSONType t = type();
+ switch ( t ){
+ case MinKey:
+ case MaxKey:
+ return t;
+ case EOO:
+ case Undefined:
+ return 0;
+ case jstNULL:
+ return 5;
+ case NumberDouble:
+ case NumberInt:
+ case NumberLong:
+ return 10;
+ case String:
+ case Symbol:
+ return 15;
+ case Object:
+ return 20;
+ case Array:
+ return 25;
+ case BinData:
+ return 30;
+ case jstOID:
+ return 35;
+ case Bool:
+ return 40;
+ case Date:
+ case Timestamp:
+ return 45;
+ case RegEx:
+ return 50;
+ case DBRef:
+ return 55;
+ case Code:
+ return 60;
+ case CodeWScope:
+ return 65;
+ default:
+ assert(0);
+ return -1;
+ }
+ }
+
+ /** Indicates if it is the end-of-object element, which is present at the end of
+ every BSON object.
+ */
+ bool eoo() const {
+ return type() == EOO;
+ }
+
+ /** Size of the element.
+ @param maxLen If maxLen is specified, don't scan more than maxLen bytes to calculate size.
+ */
+ int size( int maxLen = -1 ) const;
+
+ /** Wrap this element up as a singleton object. */
+ BSONObj wrap() const;
+
+ /** Wrap this element up as a singleton object with a new name. */
+ BSONObj wrap( const char* newName) const;
+
+ /** field name of the element. e.g., for
+ name : "Joe"
+ "name" is the fieldname
+ */
+ const char * fieldName() const {
+ if ( eoo() ) return ""; // no fieldname for it.
+ return data + 1;
+ }
+
+ /** raw data of the element's value (so be careful). */
+ const char * value() const {
+ return (data + fieldNameSize() + 1);
+ }
+ /** size in bytes of the element's value (when applicable). */
+ int valuesize() const {
+ return size() - fieldNameSize() - 1;
+ }
+
+ bool isBoolean() const {
+ return type() == Bool;
+ }
+
+ /** @return value of a boolean element.
+ You must assure element is a boolean before
+ calling. */
+ bool boolean() const {
+ return *value() ? true : false;
+ }
+
+ /** Retrieve a java style date value from the element.
+ Ensure element is of type Date before calling.
+ */
+ Date_t date() const {
+ return *reinterpret_cast< const Date_t* >( value() );
+ }
+
+ /** Convert the value to boolean, regardless of its type, in a javascript-like fashion
+ (i.e., treat zero and null as false).
+ */
+ bool trueValue() const {
+ switch( type() ) {
+ case NumberLong:
+ return *reinterpret_cast< const long long* >( value() ) != 0;
+ case NumberDouble:
+ return *reinterpret_cast< const double* >( value() ) != 0;
+ case NumberInt:
+ return *reinterpret_cast< const int* >( value() ) != 0;
+ case Bool:
+ return boolean();
+ case EOO:
+ case jstNULL:
+ case Undefined:
+ return false;
+
+ default:
+ ;
+ }
+ return true;
+ }
+
+ /** True if element is of a numeric type. */
+ bool isNumber() const {
+ switch( type() ) {
+ case NumberLong:
+ case NumberDouble:
+ case NumberInt:
+ return true;
+ default:
+ return false;
+ }
+ }
+
+ bool isSimpleType() const {
+ switch( type() ){
+ case NumberLong:
+ case NumberDouble:
+ case NumberInt:
+ case String:
+ case Bool:
+ case Date:
+ case jstOID:
+ return true;
+ default:
+ return false;
+ }
+ }
+
+ /** Return double value for this field. MUST be NumberDouble type. */
+ double _numberDouble() const {return *reinterpret_cast< const double* >( value() ); }
+        /** Return int value for this field. MUST be NumberInt type. */
+ int _numberInt() const {return *reinterpret_cast< const int* >( value() ); }
+        /** Return long long value for this field. MUST be NumberLong type. */
+ long long _numberLong() const {return *reinterpret_cast< const long long* >( value() ); }
+
+ /** Retrieve int value for the element safely. Zero returned if not a number. */
+ int numberInt() const {
+ switch( type() ) {
+ case NumberDouble:
+ return (int) _numberDouble();
+ case NumberInt:
+ return _numberInt();
+ case NumberLong:
+ return (int) _numberLong();
+ default:
+ return 0;
+ }
+ }
+
+ /** Retrieve long value for the element safely. Zero returned if not a number. */
+ long long numberLong() const {
+ switch( type() ) {
+ case NumberDouble:
+ return (long long) _numberDouble();
+ case NumberInt:
+ return _numberInt();
+ case NumberLong:
+ return _numberLong();
+ default:
+ return 0;
+ }
+ }
+
+ /** Retrieve the numeric value of the element. If not of a numeric type, returns 0.
+ NOTE: casts to double, data loss may occur with large (>52 bit) NumberLong values.
+ */
+ double numberDouble() const {
+ switch( type() ) {
+ case NumberDouble:
+ return _numberDouble();
+ case NumberInt:
+ return *reinterpret_cast< const int* >( value() );
+ case NumberLong:
+ return (double) *reinterpret_cast< const long long* >( value() );
+ default:
+ return 0;
+ }
+ }
+ /** Retrieve the numeric value of the element. If not of a numeric type, returns 0.
+ NOTE: casts to double, data loss may occur with large (>52 bit) NumberLong values.
+ */
+ double number() const { return numberDouble(); }
+
+ /** Retrieve the object ID stored in the object.
+ You must ensure the element is of type jstOID first. */
+ const OID &__oid() const {
+ return *reinterpret_cast< const OID* >( value() );
+ }
+
+ /** True if element is null. */
+ bool isNull() const {
+ return type() == jstNULL;
+ }
+
+ /** Size (length) of a string element.
+ You must assure of type String first. */
+ int valuestrsize() const {
+ return *reinterpret_cast< const int* >( value() );
+ }
+
+ // for objects the size *includes* the size of the size field
+ int objsize() const {
+ return *reinterpret_cast< const int* >( value() );
+ }
+
+ /** Get a string's value. Also gives you start of the real data for an embedded object.
+ You must assure data is of an appropriate type first -- see also valuestrsafe().
+ */
+ const char * valuestr() const {
+ return value() + 4;
+ }
+
+ /** Get the string value of the element. If not a string returns "". */
+ const char *valuestrsafe() const {
+ return type() == String ? valuestr() : "";
+ }
+ /** Get the string value of the element. If not a string returns "". */
+ string str() const { return valuestrsafe(); }
+
+ /** Get javascript code of a CodeWScope data element. */
+ const char * codeWScopeCode() const {
+ return value() + 8;
+ }
+ /** Get the scope SavedContext of a CodeWScope data element. */
+ const char * codeWScopeScopeData() const {
+ // TODO fix
+ return codeWScopeCode() + strlen( codeWScopeCode() ) + 1;
+ }
+
+ /** Get the embedded object this element holds. */
+ BSONObj embeddedObject() const;
+
+ /* uasserts if not an object */
+ BSONObj embeddedObjectUserCheck();
+
+ BSONObj codeWScopeObject() const;
+
+ string ascode() const {
+ switch( type() ){
+ case String:
+ case Code:
+ return valuestr();
+ case CodeWScope:
+ return codeWScopeCode();
+ default:
+ log() << "can't convert type: " << (int)(type()) << " to code" << endl;
+ }
+ uassert( 10062 , "not code" , 0 );
+ return "";
+ }
+
+ /** Get binary data. Element must be of type BinData */
+ const char *binData(int& len) const {
+ // BinData: <int len> <byte subtype> <byte[len] data>
+ assert( type() == BinData );
+ len = valuestrsize();
+ return value() + 5;
+ }
+
+ BinDataType binDataType() const {
+ // BinData: <int len> <byte subtype> <byte[len] data>
+ assert( type() == BinData );
+ char c = (value() + 4)[0];
+ return (BinDataType)c;
+ }
+
+ /** Retrieve the regex string for a Regex element */
+ const char *regex() const {
+ assert(type() == RegEx);
+ return value();
+ }
+
+ /** Retrieve the regex flags (options) for a Regex element */
+ const char *regexFlags() const {
+ const char *p = regex();
+ return p + strlen(p) + 1;
+ }
+
+ /** like operator== but doesn't check the fieldname,
+ just the value.
+ */
+ bool valuesEqual(const BSONElement& r) const {
+ switch( type() ) {
+ case NumberLong:
+ return _numberLong() == r.numberLong() && r.isNumber();
+ case NumberDouble:
+ return _numberDouble() == r.number() && r.isNumber();
+ case NumberInt:
+ return _numberInt() == r.numberInt() && r.isNumber();
+ default:
+ ;
+ }
+ bool match= valuesize() == r.valuesize() &&
+ memcmp(value(),r.value(),valuesize()) == 0;
+ return match && canonicalType() == r.canonicalType();
+ }
+
+ /** Returns true if elements are equal. */
+ bool operator==(const BSONElement& r) const {
+ if ( strcmp(fieldName(), r.fieldName()) != 0 )
+ return false;
+ return valuesEqual(r);
+ }
+
+
+ /** Well ordered comparison.
+ @return <0: l<r. 0:l==r. >0:l>r
+ order by type, field name, and field value.
+ If considerFieldName is true, pay attention to the field name.
+ */
+ int woCompare( const BSONElement &e, bool considerFieldName = true ) const;
+
+ const char * rawdata() const {
+ return data;
+ }
+
+ /** 0 == Equality, just not defined yet */
+ int getGtLtOp( int def = 0 ) const;
+
+ /** Constructs an empty element */
+ BSONElement();
+
+ /** Check that data is internally consistent. */
+ void validate() const;
+
+ /** True if this element may contain subobjects. */
+ bool mayEncapsulate() const {
+ return type() == Object ||
+ type() == Array ||
+ type() == CodeWScope;
+ }
+
+ Date_t timestampTime() const{
+ unsigned long long t = ((unsigned int*)(value() + 4 ))[0];
+ return t * 1000;
+ }
+ unsigned int timestampInc() const{
+ return ((unsigned int*)(value() ))[0];
+ }
+
+ const char * dbrefNS() const {
+ uassert( 10063 , "not a dbref" , type() == DBRef );
+ return value() + 4;
+ }
+
+ const OID& dbrefOID() const {
+ uassert( 10064 , "not a dbref" , type() == DBRef );
+ const char * start = value();
+ start += 4 + *reinterpret_cast< const int* >( start );
+ return *reinterpret_cast< const OID* >( start );
+ }
+
+ bool operator<( const BSONElement& other ) const {
+ int x = (int)canonicalType() - (int)other.canonicalType();
+ if ( x < 0 ) return true;
+ else if ( x > 0 ) return false;
+ return compareElementValues(*this,other) < 0;
+ }
+
+ // If maxLen is specified, don't scan more than maxLen bytes.
+ BSONElement(const char *d, int maxLen = -1) : data(d) {
+ fieldNameSize_ = -1;
+ if ( eoo() )
+ fieldNameSize_ = 0;
+ else {
+ if ( maxLen != -1 ) {
+ int size = strnlen( fieldName(), maxLen - 1 );
+ massert( 10333 , "Invalid field name", size != -1 );
+ fieldNameSize_ = size + 1;
+ }
+ }
+ totalSize = -1;
+ }
+ private:
+ const char *data;
+ mutable int fieldNameSize_; // cached value
+ int fieldNameSize() const {
+ if ( fieldNameSize_ == -1 )
+ fieldNameSize_ = strlen( fieldName() ) + 1;
+ return fieldNameSize_;
+ }
+ mutable int totalSize; /* caches the computed size */
+ };
+
+ int getGtLtOp(const BSONElement& e);
+
+ struct BSONElementCmpWithoutField {
+ bool operator()( const BSONElement &l, const BSONElement &r ) const {
+ return l.woCompare( r, false );
+ }
+ };
+
+ typedef set< BSONElement, BSONElementCmpWithoutField > BSONElementSet;
+
+ /**
+ C++ representation of a "BSON" object -- that is, an extended JSON-style
+ object in a binary representation.
+
+ Note that BSONObj's have a smart pointer capability built in -- so you can
+ pass them around by value. The reference counts used to implement this
+ do not use locking, so copying and destroying BSONObj's are not thread-safe
+ operations.
+
+ BSON object format:
+
+ \code
+ <unsigned totalSize> {<byte BSONType><cstring FieldName><Data>}* EOO
+
+ totalSize includes itself.
+
+ Data:
+ Bool: <byte>
+ EOO: nothing follows
+ Undefined: nothing follows
+ OID: an OID object
+ NumberDouble: <double>
+ NumberInt: <int32>
+ String: <unsigned32 strsizewithnull><cstring>
+ Date: <8bytes>
+ Regex: <cstring regex><cstring options>
+ Object: a nested object, leading with its entire size, which terminates with EOO.
+ Array: same as object
+ DBRef: <strlen> <cstring ns> <oid>
+ DBRef: a database reference: basically a collection name plus an Object ID
+ BinData: <int len> <byte subtype> <byte[len] data>
+ Code: a function (not a closure): same format as String.
+ Symbol: a language symbol (say a python symbol). same format as String.
+ Code With Scope: <total size><String><Object>
+ \endcode
+ */
+ class BSONObj {
+ friend class BSONObjIterator;
+ class Holder {
+ public:
+ Holder( const char *objdata ) :
+ _objdata( objdata ) {
+ }
+ ~Holder() {
+ free((void *)_objdata);
+ _objdata = 0;
+ }
+ private:
+ const char *_objdata;
+ };
+ const char *_objdata;
+ boost::shared_ptr< Holder > _holder;
+ void init(const char *data, bool ifree) {
+ if ( ifree )
+ _holder.reset( new Holder( data ) );
+ _objdata = data;
+ if ( ! isValid() ){
+ stringstream ss;
+ ss << "Invalid BSONObj spec size: " << objsize();
+ string s = ss.str();
+ massert( 10334 , s , 0 );
+ }
+ }
+#pragma pack(1)
+ static struct EmptyObject {
+ EmptyObject() {
+ len = 5;
+ jstype = EOO;
+ }
+ int len;
+ char jstype;
+ } emptyObject;
+#pragma pack()
+ public:
+ /** Construct a BSONObj from data in the proper format.
+ @param ifree true if the BSONObj should free() the msgdata when
+ it destructs.
+ */
+ explicit BSONObj(const char *msgdata, bool ifree = false) {
+ init(msgdata, ifree);
+ }
+ BSONObj(const Record *r);
+ /** Construct an empty BSONObj -- that is, {}. */
+ BSONObj() : _objdata( reinterpret_cast< const char * >( &emptyObject ) ) { }
+ // defensive
+ ~BSONObj() { _objdata = 0; }
+
+ void appendSelfToBufBuilder(BufBuilder& b) const {
+ assert( objsize() );
+ b.append(reinterpret_cast<const void *>( objdata() ), objsize());
+ }
+
+ /** Readable representation of a BSON object in an extended JSON-style notation.
+ This is an abbreviated representation which might be used for logging.
+ */
+ string toString() const;
+ operator string() const { return toString(); }
+
+ /** Properly formatted JSON string. */
+ string jsonString( JsonStringFormat format = Strict ) const;
+
+ /** note: addFields always adds _id even if not specified */
+ int addFields(BSONObj& from, set<string>& fields); /* returns n added */
+
+ /** returns # of top level fields in the object
+ note: iterates to count the fields
+ */
+ int nFields() const;
+
+ /** adds the field names to the fields set. does NOT clear it (appends). */
+ int getFieldNames(set<string>& fields) const;
+
+ /** return has eoo() true if no match
+ supports "." notation to reach into embedded objects
+ */
+ BSONElement getFieldDotted(const char *name) const;
+ /** Like getFieldDotted(), but expands multikey arrays and returns all matching objects
+ */
+ void getFieldsDotted(const char *name, BSONElementSet &ret, bool *deep = 0) const;
+ /** Like getFieldDotted(), but returns first array encountered while traversing the
+ dotted fields of name. The name variable is updated to represent field
+ names with respect to the returned element. */
+ BSONElement getFieldDottedOrArray(const char *&name) const;
+
+ /** Get the field of the specified name. eoo() is true on the returned
+ element if not found.
+ */
+ BSONElement getField(const string name) const {
+ return getField( name.c_str() );
+ };
+
+ /** Get the field of the specified name. eoo() is true on the returned
+ element if not found.
+ */
+ BSONElement getField(const char *name) const; /* return has eoo() true if no match */
+
+ /** Get the field of the specified name. eoo() is true on the returned
+ element if not found.
+ */
+ BSONElement operator[] (const char *field) const {
+ return getField(field);
+ }
+
+ BSONElement operator[] (const string& field) const {
+ return getField(field);
+ }
+
+ BSONElement operator[] (int field) const {
+ stringstream ss;
+ ss << field;
+ string s = ss.str();
+ return getField(s.c_str());
+ }
+
+ /** @return true if field exists */
+ bool hasField( const char * name )const {
+ return ! getField( name ).eoo();
+ }
+
+ /** @return "" if DNE or wrong type */
+ const char * getStringField(const char *name) const;
+
+ /** @return subobject of the given name */
+ BSONObj getObjectField(const char *name) const;
+
+ /** @return INT_MIN if not present - does some type conversions */
+ int getIntField(const char *name) const;
+
+ /** @return false if not present */
+ bool getBoolField(const char *name) const;
+
+ /** makes a new BSONObj with the fields specified in pattern.
+ fields returned in the order they appear in pattern.
+ if any field is missing or undefined in the object, that field in the
+ output will be null.
+
+ sets output field names to match pattern field names.
+ If an array is encountered while scanning the dotted names in pattern,
+ that field is treated as missing.
+ */
+ BSONObj extractFieldsDotted(BSONObj pattern) const;
+
+ /**
+ sets element field names to empty string
+ If a field in pattern is missing, it is omitted from the returned
+ object.
+ */
+ BSONObj extractFieldsUnDotted(BSONObj pattern) const;
+
+ /** extract items from object which match a pattern object.
+ e.g., if pattern is { x : 1, y : 1 }, builds an object with
+ x and y elements of this object, if they are present.
+ returns elements with original field names
+ */
+ BSONObj extractFields(const BSONObj &pattern , bool fillWithNull=false) const;
+
+ BSONObj filterFieldsUndotted(const BSONObj &filter, bool inFilter) const;
+
+ BSONElement getFieldUsingIndexNames(const char *fieldName, const BSONObj &indexKey) const;
+
+ /** @return the raw data of the object */
+ const char *objdata() const {
+ return _objdata;
+ }
+ /** @return total size of the BSON object in bytes */
+ int objsize() const {
+ return *(reinterpret_cast<const int*>(objdata()));
+ }
+
+ bool isValid();
+
+        /** @return true if this is a valid user document
+            criteria: isValid() and no '.' or '$' in field names
+         */
+ bool okForStorage() const;
+
+ /** @return true if object is empty -- i.e., {} */
+ bool isEmpty() const {
+ return objsize() <= 5;
+ }
+
+ void dump() const {
+ out() << hex;
+ const char *p = objdata();
+ for ( int i = 0; i < objsize(); i++ ) {
+ out() << i << '\t' << ( 0xff & ( (unsigned) *p ) );
+ if ( *p >= 'A' && *p <= 'z' )
+ out() << '\t' << *p;
+ out() << endl;
+ p++;
+ }
+ }
+
+ // Alternative output format
+ string hexDump() const;
+
+ /**wo='well ordered'. fields must be in same order in each object.
+ Ordering is with respect to the signs of the elements in idxKey.
+ @return <0 if l<r. 0 if l==r. >0 if l>r
+ */
+ int woCompare(const BSONObj& r, const BSONObj &idxKey = BSONObj(),
+ bool considerFieldName=true) const;
+
+ int woSortOrder( const BSONObj& r , const BSONObj& sortKey ) const;
+
+ /** This is "shallow equality" -- ints and doubles won't match. for a
+ deep equality test use woCompare (which is slower).
+ */
+ bool woEqual(const BSONObj& r) const {
+ int os = objsize();
+ if ( os == r.objsize() ) {
+ return (os == 0 || memcmp(objdata(),r.objdata(),os)==0);
+ }
+ return false;
+ }
+
+ /** @return first field of the object */
+ BSONElement firstElement() const {
+ return BSONElement(objdata() + 4);
+ }
+
+ /** @return element with fieldname "name". returnvalue.eoo() is true if not found */
+ BSONElement findElement(const char *name) const;
+
+ /** @return element with fieldname "name". returnvalue.eoo() is true if not found */
+ BSONElement findElement(string name) const {
+ return findElement(name.c_str());
+ }
+
+ /** @return true if field exists in the object */
+ bool hasElement(const char *name) const;
+
+ /** Get the _id field from the object. For good performance drivers should
+ assure that _id is the first element of the object; however, correct operation
+ is assured regardless.
+ @return true if found
+ */
+ bool getObjectID(BSONElement& e) const;
+
+ /** makes a copy of the object.
+ */
+ BSONObj copy() const;
+
+ /* make sure the data buffer is under the control of BSONObj's and not a remote buffer */
+ BSONObj getOwned() const{
+ if ( !isOwned() )
+ return copy();
+ return *this;
+ }
+ bool isOwned() const { return _holder.get() != 0; }
+
+ /** @return A hash code for the object */
+ int hash() const {
+ unsigned x = 0;
+ const char *p = objdata();
+ for ( int i = 0; i < objsize(); i++ )
+ x = x * 131 + p[i];
+ return (x & 0x7fffffff) | 0x8000000; // must be > 0
+ }
+
+ // Return a version of this object where top level elements of types
+ // that are not part of the bson wire protocol are replaced with
+ // string identifier equivalents.
+ // TODO Support conversion of element types other than min and max.
+ BSONObj clientReadable() const;
+
+ /** Return new object with the field names replaced by those in the
+ passed object. */
+ BSONObj replaceFieldNames( const BSONObj &obj ) const;
+
+ /** true unless corrupt */
+ bool valid() const;
+
+ string md5() const;
+
+ bool operator==( const BSONObj& other ){
+ return woCompare( other ) == 0;
+ }
+
+ enum MatchType {
+ Equality = 0,
+ LT = 0x1,
+ LTE = 0x3,
+ GTE = 0x6,
+ GT = 0x4,
+ opIN = 0x8, // { x : { $in : [1,2,3] } }
+ NE = 0x9,
+ opSIZE = 0x0A,
+ opALL = 0x0B,
+ NIN = 0x0C,
+ opEXISTS = 0x0D,
+ opMOD = 0x0E,
+ opTYPE = 0x0F,
+ opREGEX = 0x10,
+ opOPTIONS = 0x11,
+ opELEM_MATCH = 0x12
+ };
+ };
+ ostream& operator<<( ostream &s, const BSONObj &o );
+ ostream& operator<<( ostream &s, const BSONElement &e );
+
+ struct BSONArray: BSONObj {
+ // Don't add anything other than forwarding constructors!!!
+ BSONArray(): BSONObj() {}
+ explicit BSONArray(const BSONObj& obj): BSONObj(obj) {}
+ };
+
+ class BSONObjCmp {
+ public:
+ BSONObjCmp( const BSONObj &_order = BSONObj() ) : order( _order ) {}
+ bool operator()( const BSONObj &l, const BSONObj &r ) const {
+ return l.woCompare( r, order ) < 0;
+ }
+ private:
+ BSONObj order;
+ };
+
+ class BSONObjCmpDefaultOrder : public BSONObjCmp {
+ public:
+ BSONObjCmpDefaultOrder() : BSONObjCmp( BSONObj() ) {}
+ };
+
+ typedef set< BSONObj, BSONObjCmpDefaultOrder > BSONObjSetDefaultOrder;
+
+ enum FieldCompareResult {
+ LEFT_SUBFIELD = -2,
+ LEFT_BEFORE = -1,
+ SAME = 0,
+ RIGHT_BEFORE = 1 ,
+ RIGHT_SUBFIELD = 2
+ };
+
+ FieldCompareResult compareDottedFieldNames( const string& l , const string& r );
+
+/** Use BSON macro to build a BSONObj from a stream
+
+ e.g.,
+ BSON( "name" << "joe" << "age" << 33 )
+
+ with auto-generated object id:
+ BSON( GENOID << "name" << "joe" << "age" << 33 )
+
+ The labels GT, GTE, LT, LTE, NE can be helpful for stream-oriented construction
+ of a BSONObj, particularly when assembling a Query. For example,
+ BSON( "a" << GT << 23.4 << NE << 30 << "b" << 2 ) produces the object
+ { a: { \$gt: 23.4, \$ne: 30 }, b: 2 }.
+*/
+#define BSON(x) (( mongo::BSONObjBuilder() << x ).obj())
+
+/** Use BSON_ARRAY macro like BSON macro, but without keys
+
+ BSONArray arr = BSON_ARRAY( "hello" << 1 << BSON( "foo" << BSON_ARRAY( "bar" << "baz" << "qux" ) ) );
+
+ */
+#define BSON_ARRAY(x) (( mongo::BSONArrayBuilder() << x ).arr())
+
+ /* Utility class to auto assign object IDs.
+ Example:
+ cout << BSON( GENOID << "z" << 3 ); // { _id : ..., z : 3 }
+ */
+ extern struct IDLabeler { } GENOID;
+ BSONObjBuilder& operator<<(BSONObjBuilder& b, IDLabeler& id);
+
+ /* Utility class to add a Date element with the current time
+ Example:
+ cout << BSON( "created" << DATENOW ); // { created : "2009-10-09 11:41:42" }
+ */
+ extern struct DateNowLabeler { } DATENOW;
+
+ // Utility class to implement GT, GTE, etc as described above.
+ class Labeler {
+ public:
+ struct Label {
+ Label( const char *l ) : l_( l ) {}
+ const char *l_;
+ };
+ Labeler( const Label &l, BSONObjBuilderValueStream *s ) : l_( l ), s_( s ) {}
+ template<class T>
+ BSONObjBuilder& operator<<( T value );
+
+ /* the value of the element e is appended i.e. for
+ "age" << GT << someElement
+ one gets
+ { age : { $gt : someElement's value } }
+ */
+ BSONObjBuilder& operator<<( const BSONElement& e );
+ private:
+ const Label &l_;
+ BSONObjBuilderValueStream *s_;
+ };
+
+ extern Labeler::Label GT;
+ extern Labeler::Label GTE;
+ extern Labeler::Label LT;
+ extern Labeler::Label LTE;
+ extern Labeler::Label NE;
+ extern Labeler::Label SIZE;
+
+ // Utility class to implement BSON( key << val ) as described above.
+ class BSONObjBuilderValueStream : public boost::noncopyable {
+ public:
+ friend class Labeler;
+ BSONObjBuilderValueStream( BSONObjBuilder * builder );
+
+ BSONObjBuilder& operator<<( const BSONElement& e );
+
+ template<class T>
+ BSONObjBuilder& operator<<( T value );
+
+ BSONObjBuilder& operator<<(DateNowLabeler& id);
+
+ Labeler operator<<( const Labeler::Label &l );
+
+ void endField( const char *nextFieldName = 0 );
+ bool subobjStarted() const { return _fieldName != 0; }
+
+ private:
+ const char * _fieldName;
+ BSONObjBuilder * _builder;
+
+ bool haveSubobj() const { return _subobj.get() != 0; }
+ BSONObjBuilder *subobj();
+ auto_ptr< BSONObjBuilder > _subobj;
+ };
+
+ /**
+ utility for creating a BSONObj
+ */
+ class BSONObjBuilder : boost::noncopyable {
+ public:
+ /** @param initsize this is just a hint as to the final size of the object */
+ BSONObjBuilder(int initsize=512) : b(buf_), buf_(initsize), offset_( 0 ), s_( this ) {
+ b.skip(4); /*leave room for size field*/
+ }
+
+ /** @param baseBuilder construct a BSONObjBuilder using an existing BufBuilder */
+ BSONObjBuilder( BufBuilder &baseBuilder ) : b( baseBuilder ), buf_( 0 ), offset_( baseBuilder.len() ), s_( this ) {
+ b.skip( 4 );
+ }
+
+ /** add all the fields from the object specified to this object */
+ BSONObjBuilder& appendElements(BSONObj x);
+
+ /** append element to the object we are building */
+ void append( const BSONElement& e) {
+ assert( !e.eoo() ); // do not append eoo, that would corrupt us. the builder auto appends when done() is called.
+ b.append((void*) e.rawdata(), e.size());
+ }
+
+ /** append an element but with a new name */
+ void appendAs(const BSONElement& e, const char *as) {
+ assert( !e.eoo() ); // do not append eoo, that would corrupt us. the builder auto appends when done() is called.
+ b.append((char) e.type());
+ b.append(as);
+ b.append((void *) e.value(), e.valuesize());
+ }
+
+ void appendAs(const BSONElement& e, const string& as) {
+ appendAs( e , as.c_str() );
+ }
+
+
+ /** add a subobject as a member */
+ void append(const char *fieldName, BSONObj subObj) {
+ b.append((char) Object);
+ b.append(fieldName);
+ b.append((void *) subObj.objdata(), subObj.objsize());
+ }
+
+ void append(const string& fieldName , BSONObj subObj) {
+ append( fieldName.c_str() , subObj );
+ }
+
+ /** add header for a new subobject and return bufbuilder for writing to
+ the subobject's body */
+ BufBuilder &subobjStart(const char *fieldName) {
+ b.append((char) Object);
+ b.append(fieldName);
+ return b;
+ }
+
+ /** add a subobject as a member with type Array. Thus arr object should have "0", "1", ...
+ style fields in it.
+ */
+ void appendArray(const char *fieldName, BSONObj subObj) {
+ b.append((char) Array);
+ b.append(fieldName);
+ b.append((void *) subObj.objdata(), subObj.objsize());
+ }
+ void append(const char *fieldName, BSONArray arr) { appendArray(fieldName, arr); }
+
+
+ /** add header for a new subarray and return bufbuilder for writing to
+ the subarray's body */
+ BufBuilder &subarrayStart(const char *fieldName) {
+ b.append((char) Array);
+ b.append(fieldName);
+ return b;
+ }
+
+ /** Append a boolean element */
+ void appendBool(const char *fieldName, int val) {
+ b.append((char) Bool);
+ b.append(fieldName);
+ b.append((char) (val?1:0));
+ }
+
+ /** Append a 32 bit integer element */
+ void append(const char *fieldName, int n) {
+ b.append((char) NumberInt);
+ b.append(fieldName);
+ b.append(n);
+ }
+ /** Append a 32 bit integer element */
+ void append(const string &fieldName, int n) {
+ append( fieldName.c_str(), n );
+ }
+
+ /** Append a 32 bit unsigned element - cast to a signed int. */
+ void append(const char *fieldName, unsigned n) { append(fieldName, (int) n); }
+
+ /** Append a NumberLong */
+ void append(const char *fieldName, long long n) {
+ b.append((char) NumberLong);
+ b.append(fieldName);
+ b.append(n);
+ }
+
+ /** Append a NumberLong */
+ void append(const string& fieldName, long long n) {
+ append( fieldName.c_str() , n );
+ }
+
+
+ /** Append a double element */
+ BSONObjBuilder& append(const char *fieldName, double n) {
+ b.append((char) NumberDouble);
+ b.append(fieldName);
+ b.append(n);
+ return *this;
+ }
+
+ /** tries to append the data as a number
+ * @return true if the data was able to be converted to a number
+ */
+ bool appendAsNumber( const string& fieldName , const string& data );
+
+ /** Append a BSON Object ID (OID type). */
+ void appendOID(const char *fieldName, OID *oid = 0 , bool generateIfBlank = false ) {
+ b.append((char) jstOID);
+ b.append(fieldName);
+ if ( oid )
+ b.append( (void *) oid, 12 );
+ else {
+ OID tmp;
+ if ( generateIfBlank )
+ tmp.init();
+ else
+ tmp.clear();
+ b.append( (void *) &tmp, 12 );
+ }
+ }
+ void append( const char *fieldName, OID oid ) {
+ appendOID( fieldName, &oid );
+ }
+ /** Append a time_t date.
+ @param dt a C-style 32 bit date value, that is
+ the number of seconds since January 1, 1970, 00:00:00 GMT
+ */
+ void appendTimeT(const char *fieldName, time_t dt) {
+ b.append((char) Date);
+ b.append(fieldName);
+ b.append(static_cast<unsigned long long>(dt) * 1000);
+ }
+ /** Append a date.
+ @param dt a Java-style 64 bit date value, that is
+ the number of milliseconds since January 1, 1970, 00:00:00 GMT
+ */
+ void appendDate(const char *fieldName, Date_t dt) {
+ b.append((char) Date);
+ b.append(fieldName);
+ b.append(dt);
+ }
+ void append(const char *fieldName, Date_t dt) {
+ appendDate(fieldName, dt);
+ }
+
+ /** Append a regular expression value
+ @param regex the regular expression pattern
+ @param regex options such as "i" or "g"
+ */
+ void appendRegex(const char *fieldName, const char *regex, const char *options = "") {
+ b.append((char) RegEx);
+ b.append(fieldName);
+ b.append(regex);
+ b.append(options);
+ }
+ /** Append a regular expression value
+ @param regex the regular expression pattern
+ @param regex options such as "i" or "g"
+ */
+ void appendRegex(string fieldName, string regex, string options = "") {
+ appendRegex(fieldName.c_str(), regex.c_str(), options.c_str());
+ }
+ void appendCode(const char *fieldName, const char *code) {
+ b.append((char) Code);
+ b.append(fieldName);
+ b.append((int) strlen(code)+1);
+ b.append(code);
+ }
+ /** Append a string element */
+ BSONObjBuilder& append(const char *fieldName, const char *str) {
+ b.append((char) String);
+ b.append(fieldName);
+ b.append((int) strlen(str)+1);
+ b.append(str);
+ return *this;
+ }
+ /** Append a string element */
+ void append(const char *fieldName, string str) {
+ append(fieldName, str.c_str());
+ }
+ void appendSymbol(const char *fieldName, const char *symbol) {
+ b.append((char) Symbol);
+ b.append(fieldName);
+ b.append((int) strlen(symbol)+1);
+ b.append(symbol);
+ }
+
+ /** Append a Null element to the object */
+ void appendNull( const char *fieldName ) {
+ b.append( (char) jstNULL );
+ b.append( fieldName );
+ }
+
+ // Append an element that is less than all other keys.
+ void appendMinKey( const char *fieldName ) {
+ b.append( (char) MinKey );
+ b.append( fieldName );
+ }
+ // Append an element that is greater than all other keys.
+ void appendMaxKey( const char *fieldName ) {
+ b.append( (char) MaxKey );
+ b.append( fieldName );
+ }
+
+ // Append a Timestamp field -- will be updated to next OpTime on db insert.
+ void appendTimestamp( const char *fieldName ) {
+ b.append( (char) Timestamp );
+ b.append( fieldName );
+ b.append( (unsigned long long) 0 );
+ }
+
+ void appendTimestamp( const char *fieldName , unsigned long long val ) {
+ b.append( (char) Timestamp );
+ b.append( fieldName );
+ b.append( val );
+ }
+
+ /**
+ * @param time - in millis (but stored in seconds)
+ */
+ void appendTimestamp( const char *fieldName , unsigned long long time , unsigned int inc ){
+ OpTime t( (unsigned) (time / 1000) , inc );
+ appendTimestamp( fieldName , t.asDate() );
+ }
+
+ /* Deprecated (but supported) */
+ void appendDBRef( const char *fieldName, const char *ns, const OID &oid ) {
+ b.append( (char) DBRef );
+ b.append( fieldName );
+ b.append( (int) strlen( ns ) + 1 );
+ b.append( ns );
+ b.append( (void *) &oid, 12 );
+ }
+
+ /** Append a binary data element
+ @param fieldName name of the field
+ @param len length of the binary data in bytes
+ @param type type information for the data. @see BinDataType. Use ByteArray if you
+ don't care about the type.
+ @param data the byte array
+ */
+ void appendBinData( const char *fieldName, int len, BinDataType type, const char *data ) {
+ b.append( (char) BinData );
+ b.append( fieldName );
+ b.append( len );
+ b.append( (char) type );
+ b.append( (void *) data, len );
+ }
+ void appendBinData( const char *fieldName, int len, BinDataType type, const unsigned char *data ) {
+ appendBinData(fieldName, len, type, (const char *) data);
+ }
+
+ /**
+ @param len the length of data
+ */
+ void appendBinDataArray( const char * fieldName , const char * data , int len ){
+ b.append( (char) BinData );
+ b.append( fieldName );
+ b.append( len + 4 );
+ b.append( (char)0x2 );
+ b.append( len );
+ b.append( (void *) data, len );
+ }
+
+ /** Append to the BSON object a field of type CodeWScope. This is a javascript code
+ fragment accompanied by some scope that goes with it.
+ */
+ void appendCodeWScope( const char *fieldName, const char *code, const BSONObj &scope ) {
+ b.append( (char) CodeWScope );
+ b.append( fieldName );
+ b.append( ( int )( 4 + 4 + strlen( code ) + 1 + scope.objsize() ) );
+ b.append( ( int ) strlen( code ) + 1 );
+ b.append( code );
+ b.append( ( void * )scope.objdata(), scope.objsize() );
+ }
+
+ void appendUndefined( const char *fieldName ) {
+ b.append( (char) Undefined );
+ b.append( fieldName );
+ }
+
+ /* helper function -- see Query::where() for primary way to do this. */
+ void appendWhere( const char *code, const BSONObj &scope ){
+ appendCodeWScope( "$where" , code , scope );
+ }
+ void appendWhere( const string &code, const BSONObj &scope ){
+ appendWhere( code.c_str(), scope );
+ }
+
+ /**
+ these are the min/max when comparing, not strict min/max elements for a given type
+ */
+ void appendMinForType( const string& field , int type );
+ void appendMaxForType( const string& field , int type );
+
+ /** Append an array of values. */
+ template < class T >
+ void append( const char *fieldName, const vector< T >& vals ) {
+ BSONObjBuilder arrBuilder;
+ for ( unsigned int i = 0; i < vals.size(); ++i )
+ arrBuilder.append( numStr( i ).c_str(), vals[ i ] );
+ marshalArray( fieldName, arrBuilder.done() );
+ }
+
+ /* Append an array of ints
+ void appendArray( const char *fieldName, const vector< int >& vals ) {
+ BSONObjBuilder arrBuilder;
+ for ( unsigned i = 0; i < vals.size(); ++i )
+ arrBuilder.append( numStr( i ).c_str(), vals[ i ] );
+ marshalArray( fieldName, arrBuilder.done() );
+ }*/
+
+ /** The returned BSONObj will free the buffer when it is finished. */
+ BSONObj obj() {
+ massert( 10335 , "builder does not own memory", owned() );
+ int l;
+ return BSONObj(decouple(l), true);
+ }
+
+ /** Fetch the object we have built.
+ BSONObjBuilder still frees the object when the builder goes out of
+ scope -- very important to keep in mind. Use obj() if you
+ would like the BSONObj to last longer than the builder.
+ */
+ BSONObj done() {
+ return BSONObj(_done());
+ }
+
+ /* assume ownership of the buffer - you must then free it (with free()) */
+ char* decouple(int& l) {
+ char *x = _done();
+ assert( x );
+ l = b.len();
+ b.decouple();
+ return x;
+ }
+ void decouple() {
+ b.decouple(); // post done() call version. be sure jsobj frees...
+ }
+
+
+ private:
+ static const string numStrs[100]; // cache of 0 to 99 inclusive
+ public:
+ static string numStr( int i ) {
+ if (i>=0 && i<100)
+ return numStrs[i];
+
+ stringstream o;
+ o << i;
+ return o.str();
+ }
+
+ /** Stream oriented way to add field names and values. */
+ BSONObjBuilderValueStream &operator<<(const char * name ) {
+ s_.endField( name );
+ return s_;
+ }
+
+ // prevent implicit string conversions which would allow bad things like BSON( BSON( "foo" << 1 ) << 2 )
+ struct ForceExplicitString {
+ ForceExplicitString( const string &str ) : str_( str ) {}
+ string str_;
+ };
+
+ /** Stream oriented way to add field names and values. */
+ BSONObjBuilderValueStream &operator<<( const ForceExplicitString& name ) {
+ return operator<<( name.str_.c_str() );
+ }
+
+ Labeler operator<<( const Labeler::Label &l ) {
+ massert( 10336 , "No subobject started", s_.subobjStarted() );
+ return s_ << l;
+ }
+
+ bool owned() const {
+ return &b == &buf_;
+ }
+
+ private:
+ // Append the provided arr object as an array.
+ void marshalArray( const char *fieldName, const BSONObj &arr ) {
+ b.append( (char) Array );
+ b.append( fieldName );
+ b.append( (void *) arr.objdata(), arr.objsize() );
+ }
+
+ char* _done() {
+ s_.endField();
+ b.append((char) EOO);
+ char *data = b.buf() + offset_;
+ *((int*)data) = b.len() - offset_;
+ return data;
+ }
+
+ BufBuilder &b;
+ BufBuilder buf_;
+ int offset_;
+ BSONObjBuilderValueStream s_;
+ };
+
+ class BSONArrayBuilder : boost::noncopyable{
+ public:
+ BSONArrayBuilder() :i(0), b() {}
+
+ template <typename T>
+ BSONArrayBuilder& append(const T& x){
+ b.append(num().c_str(), x);
+ return *this;
+ }
+
+ BSONArrayBuilder& append(const BSONElement& e){
+ b.appendAs(e, num().c_str());
+ return *this;
+ }
+
+ template <typename T>
+ BSONArrayBuilder& operator<<(const T& x){
+ return append(x);
+ }
+
+ BSONArray arr(){ return BSONArray(b.obj()); }
+
+ private:
+ string num(){ return b.numStr(i++); }
+ int i;
+ BSONObjBuilder b;
+ };
+
+
+ /** iterator for a BSONObj
+
+ Note each BSONObj ends with an EOO element: so you will get more() on an empty
+ object, although next().eoo() will be true.
+
+ todo: we may want to make a more stl-like iterator interface for this
+ with things like begin() and end()
+ */
+ class BSONObjIterator {
+ public:
+ /** Create an iterator for a BSON object.
+ */
+ BSONObjIterator(const BSONObj& jso) {
+ int sz = jso.objsize();
+ if ( sz == 0 ) {
+ pos = theend = 0;
+ return;
+ }
+ pos = jso.objdata() + 4;
+ theend = jso.objdata() + sz;
+ }
+ /** @return true if more elements exist to be enumerated. */
+ bool moreWithEOO() {
+ return pos < theend;
+ }
+ bool more(){
+ return pos < theend && pos[0];
+ }
+ /** @return the next element in the object. For the final element, element.eoo() will be true. */
+ BSONElement next( bool checkEnd = false ) {
+ assert( pos < theend );
+ BSONElement e( pos, checkEnd ? theend - pos : -1 );
+ pos += e.size( checkEnd ? theend - pos : -1 );
+ return e;
+ }
+ private:
+ const char *pos;
+ const char *theend;
+ };
+
+ /* iterator a BSONObj which is an array, in array order.
+ class JSArrayIter {
+ public:
+ BSONObjIterator(const BSONObj& jso) {
+ ...
+ }
+ bool more() { return ... }
+ BSONElement next() {
+ ...
+ }
+ };
+ */
+
+ extern BSONObj maxKey;
+ extern BSONObj minKey;
+
+ // a BoundList contains intervals specified by inclusive start
+ // and end bounds. The intervals should be nonoverlapping and occur in
+ // the specified direction of traversal. For example, given a simple index {i:1}
+ // and direction +1, one valid BoundList is: (1, 2); (4, 6). The same BoundList
+ // would be valid for index {i:-1} with direction -1.
+ typedef vector< pair< BSONObj, BSONObj > > BoundList;
+
+ /*- just for testing -- */
+
#pragma pack(1)
    // Hand-laid-out BSON document used only by tests; equivalent to
    // { abcd: 3.1, abcdef: "123456789" }. The byte layout must match the BSON
    // wire format exactly, hence #pragma pack(1).
    struct JSObj1 {
        JSObj1() {
            totsize=sizeof(JSObj1);
            n = NumberDouble;
            strcpy_s(nname, 5, "abcd");
            N = 3.1;
            s = String;
            strcpy_s(sname, 7, "abcdef");
            slen = 10;
            strcpy_s(sval, 10, "123456789");
            eoo = EOO;
        }
        unsigned totsize;   // total document size in bytes, including this field

        char n;             // type byte: NumberDouble
        char nname[5];      // field name "abcd" + NUL
        double N;           // numeric value 3.1

        char s;             // type byte: String
        char sname[7];      // field name "abcdef" + NUL
        unsigned slen;      // string length including NUL (10)
        char sval[10];      // "123456789" + NUL

        char eoo;           // terminating EOO byte
    };
#pragma pack()
    extern JSObj1 js1;
+
// Debug-only sanity check: massert()s that o.isValid() (see BSONObj::isValid),
// including msg in the assertion text. Expands to nothing in release builds.
#ifdef _DEBUG
#define CHECK_OBJECT( o , msg ) massert( 10337 ,  (string)"object not valid" + (msg) , (o).isValid() )
#else
#define CHECK_OBJECT( o , msg )
#endif
+
    /** @return the embedded object; uasserts (user-level error) if this element
        is not an Object or Array. */
    inline BSONObj BSONElement::embeddedObjectUserCheck() {
        uassert( 10065 ,  "invalid parameter: expected an object", type()==Object || type()==Array );
        return BSONObj(value());
    }

    /** @return the embedded object; asserts (programmer error) on wrong type. */
    inline BSONObj BSONElement::embeddedObject() const {
        assert( type()==Object || type()==Array );
        return BSONObj(value());
    }

    /** @return the scope object of a CodeWScope element. The value layout is:
        int32 total size, int32 code-string size (incl. NUL), code string, scope
        object (see BSONObjBuilder::appendCodeWScope). */
    inline BSONObj BSONElement::codeWScopeObject() const {
        assert( type() == CodeWScope );
        int strSizeWNull = *(int *)( value() + 4 );
        return BSONObj( value() + 4 + 4 + strSizeWNull );
    }

    /** @return a copy of this object; the copy owns (and will free) its buffer. */
    inline BSONObj BSONObj::copy() const {
        char *p = (char*) malloc(objsize());
        memcpy(p, objdata(), objsize());
        return BSONObj(p, true);
    }

// wrap this element up as a singleton object.
    inline BSONObj BSONElement::wrap() const {
        BSONObjBuilder b(size()+6); // element size plus a small overhead hint
        b.append(*this);
        return b.obj();
    }

    /** wrap this element as a singleton object, renamed to newName */
    inline BSONObj BSONElement::wrap( const char * newName ) const {
        BSONObjBuilder b(size()+6+strlen(newName));
        b.appendAs(*this,newName);
        return b.obj();
    }
+
+
+ inline bool BSONObj::hasElement(const char *name) const {
+ if ( !isEmpty() ) {
+ BSONObjIterator it(*this);
+ while ( it.moreWithEOO() ) {
+ BSONElement e = it.next();
+ if ( strcmp(name, e.fieldName()) == 0 )
+ return true;
+ }
+ }
+ return false;
+ }
+
+ inline BSONElement BSONObj::findElement(const char *name) const {
+ if ( !isEmpty() ) {
+ BSONObjIterator it(*this);
+ while ( it.moreWithEOO() ) {
+ BSONElement e = it.next();
+ if ( strcmp(name, e.fieldName()) == 0 )
+ return e;
+ }
+ }
+ return BSONElement();
+ }
+
    /* add all the fields from the object specified to this object */
    inline BSONObjBuilder& BSONObjBuilder::appendElements(BSONObj x) {
        BSONObjIterator it(x);
        while ( it.moreWithEOO() ) {
            BSONElement e = it.next();
            if ( e.eoo() ) break;   // stop at the terminator; never copy EOO
            append(e);
        }
        return *this;
    }

    /** Basic sanity check only: a valid object has a positive size no larger
        than 8MB. Does not validate the object's interior. */
    inline bool BSONObj::isValid(){
        return objsize() > 0 && objsize() <= 1024 * 1024 * 8;
    }

    /** Store the object's _id element in e, if present.
        @return true if an _id field was found */
    inline bool BSONObj::getObjectID(BSONElement& e) const {
        BSONElement f = findElement("_id");
        if( !f.eoo() ) {
            e = f;
            return true;
        }
        return false;
    }
+
    inline BSONObjBuilderValueStream::BSONObjBuilderValueStream( BSONObjBuilder * builder ) {
        _fieldName = 0;
        _builder = builder;
    }

    /** Consume the pending field name and append value under it. */
    template<class T>
    inline BSONObjBuilder& BSONObjBuilderValueStream::operator<<( T value ) {
        _builder->append(_fieldName, value);
        _fieldName = 0;
        return *_builder;
    }

    /** Append element e under the pending field name (not e's own name). */
    inline BSONObjBuilder& BSONObjBuilderValueStream::operator<<( const BSONElement& e ) {
        _builder->appendAs( e , _fieldName );
        _fieldName = 0;
        return *_builder;
    }

    /** Append the current time (jsTime()) as a Date under the pending field name. */
    inline BSONObjBuilder& BSONObjBuilderValueStream::operator<<(DateNowLabeler& id){
        _builder->appendDate(_fieldName, jsTime());
        _fieldName = 0;
        return *_builder;
    }

    /** Begin a labeled subobject expression for the pending field. */
    inline Labeler BSONObjBuilderValueStream::operator<<( const Labeler::Label &l ) {
        return Labeler( l, this );
    }

    /** Flush any labeled subobject accumulated for the current field name into
        the parent builder, then remember nextFieldName for the next value. */
    inline void BSONObjBuilderValueStream::endField( const char *nextFieldName ) {
        if ( _fieldName && haveSubobj() ) {
            _builder->append( _fieldName, subobj()->done() );
        }
        _subobj.reset();
        _fieldName = nextFieldName;
    }

    /** Lazily create (on first use) the builder for a labeled subobject. */
    inline BSONObjBuilder *BSONObjBuilderValueStream::subobj() {
        if ( !haveSubobj() )
            _subobj.reset( new BSONObjBuilder() );
        return _subobj.get();
    }

    /** Append value into the stream's labeled subobject under the label's name. */
    template<class T> inline
    BSONObjBuilder& Labeler::operator<<( T value ) {
        s_->subobj()->append( l_.l_, value );
        return *s_->_builder;
    }

    /** Append element e into the stream's labeled subobject, renamed to the label. */
    inline
    BSONObjBuilder& Labeler::operator<<( const BSONElement& e ) {
        s_->subobj()->appendAs( e, l_.l_ );
        return *s_->_builder;
    }
+
+ // {a: {b:1}} -> {a.b:1}
+ void nested2dotted(BSONObjBuilder& b, const BSONObj& obj, const string& base="");
+ inline BSONObj nested2dotted(const BSONObj& obj){
+ BSONObjBuilder b;
+ nested2dotted(b, obj);
+ return b.obj();
+ }
+
+ // {a.b:1} -> {a: {b:1}}
+ void dotted2nested(BSONObjBuilder& b, const BSONObj& obj);
+ inline BSONObj dotted2nested(const BSONObj& obj){
+ BSONObjBuilder b;
+ dotted2nested(b, obj);
+ return b.obj();
+ }
+
+ /* WARNING: nested/dotted conversions are not 100% reversible
+ * nested2dotted(dotted2nested({a.b: {c:1}})) -> {a.b.c: 1}
+ * also, dotted2nested ignores order
+ */
+
+ typedef map<string, BSONElement> BSONMap;
+ inline BSONMap bson2map(const BSONObj& obj){
+ BSONMap m;
+ BSONObjIterator it(obj);
+ while (it.more()){
+ BSONElement e = it.next();
+ m[e.fieldName()] = e;
+ }
+ return m;
+ }
+
+ struct BSONElementFieldNameCmp {
+ bool operator()( const BSONElement &l, const BSONElement &r ) const {
+ return strcmp( l.fieldName() , r.fieldName() ) <= 0;
+ }
+ };
+
+
+ typedef set<BSONElement, BSONElementFieldNameCmp> BSONSortedElements;
+ inline BSONSortedElements bson2set( const BSONObj& obj ){
+ BSONSortedElements s;
+ BSONObjIterator it(obj);
+ while ( it.more() )
+ s.insert( it.next() );
+ return s;
+ }
+
    /** Iterates a BSONObj's elements in field-name-sorted order. The constructor
        (defined in the .cpp) materializes an array of element pointers. */
    class BSONObjIteratorSorted {
    public:
        BSONObjIteratorSorted( const BSONObj& o );

        ~BSONObjIteratorSorted(){
            assert( _fields );
            // NOTE(review): _fields is indexed as an array (see next() below); if
            // the constructor allocates it with new[], this must be delete[] —
            // plain delete on a new[] allocation is undefined behavior. Confirm
            // the allocation in the .cpp and fix accordingly.
            delete _fields;
            _fields = 0;
        }

        /** @return true while unvisited elements remain. */
        bool more(){
            return _cur < _nfields;
        }

        /** @return the next element in sorted order, or an eoo element when done. */
        BSONElement next(){
            assert( _fields );
            if ( _cur < _nfields )
                return BSONElement( _fields[_cur++] );
            return BSONElement();
        }

    private:
        const char ** _fields;   // owned array of raw element pointers, sorted
        int _nfields;            // number of entries in _fields
        int _cur;                // index of the next element to return
    };
+
+} // namespace mongo
diff --git a/db/jsobjmanipulator.h b/db/jsobjmanipulator.h
new file mode 100644
index 0000000..d534d08
--- /dev/null
+++ b/db/jsobjmanipulator.h
@@ -0,0 +1,78 @@
+/** jsobjManipulator.h */
+
+/**
+ * Copyright (C) 2009 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "jsobj.h"
+
+namespace mongo {
+
/** Manipulate the binary representation of a BSONElement in-place.
    Careful, this casts away const: the element's underlying buffer is mutated
    directly, affecting every reader of the containing object.
 */
class BSONElementManipulator {
public:
    BSONElementManipulator( const BSONElement &element ) :
        element_( element ) {
        assert( !element_.eoo() );
    }
    /** Replace a Timestamp type with a Date type initialized to
        OpTime::now().asDate()
    */
    void initTimestamp();

    /** Change the value, in place, of the number.
        Silently does nothing for types other than NumberDouble / NumberInt. */
    void setNumber(double d) {
        if ( element_.type() == NumberDouble ) *reinterpret_cast< double * >( value() ) = d;
        else if ( element_.type() == NumberInt ) *reinterpret_cast< int * >( value() ) = (int) d;
    }
    /** Change the value in place; silently does nothing unless type is NumberLong. */
    void setLong(long long n) {
        if( element_.type() == NumberLong ) *reinterpret_cast< long long * >( value() ) = n;
    }

    /** Replace the type and value of the element with the type and value of e,
        preserving the original fieldName.
        NOTE(review): assumes e.valuesize() does not exceed the space occupied by
        the current value — the caller must guarantee this; confirm at call sites. */
    void replaceTypeAndValue( const BSONElement &e ) {
        *data() = e.type();
        memcpy( value(), e.value(), e.valuesize() );
    }

    static void lookForTimestamps( const BSONObj& obj ){
        // If have a Timestamp field as the first or second element,
        // update it to a Date field set to OpTime::now().asDate().  The
        // replacement policy is a work in progress.

        BSONObjIterator i( obj );
        for( int j = 0; i.moreWithEOO() && j < 2; ++j ) {
            BSONElement e = i.next();
            if ( e.eoo() )
                break;
            if ( e.type() == Timestamp ){
                // only the first Timestamp found (within the first two elements) is updated
                BSONElementManipulator( e ).initTimestamp();
                break;
            }
        }
    }
private:
    // start of the element: type byte, then name, then value
    char *data() { return nonConst( element_.rawdata() ); }
    // value portion only
    char *value() { return nonConst( element_.value() ); }
    static char *nonConst( const char *s ) { return const_cast< char * >( s ); }
    const BSONElement element_;
};
+
+} // namespace mongo
diff --git a/db/json.cpp b/db/json.cpp
new file mode 100644
index 0000000..b55ddb1
--- /dev/null
+++ b/db/json.cpp
@@ -0,0 +1,569 @@
+// json.cpp
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "stdafx.h"
+#include "json.h"
+#include "../util/builder.h"
+#include "../util/base64.h"
+
+using namespace boost::spirit;
+
+namespace mongo {
+
    /** Shared state for the JSON parser's semantic actions.
        Maintains a stack of BSONObjBuilders (one per open object/array), the
        pending field name and array index for each nesting level, and scratch
        slots for multi-token values (oid, bindata, regex, ...) that must be
        buffered until a production has fully matched. */
    struct ObjectBuilder {
        /** builder for the innermost open object/array */
        BSONObjBuilder *back() {
            return builders.back().get();
        }
        // Storage for field names of elements within builders.back().
        const char *fieldName() {
            return fieldNames.back().c_str();
        }
        /** @return true if no object has been opened yet */
        bool empty() const {
            return builders.size() == 0;
        }
        /** open the top-level object */
        void init() {
            boost::shared_ptr< BSONObjBuilder > b( new BSONObjBuilder() );
            builders.push_back( b );
            fieldNames.push_back( "" );
            indexes.push_back( 0 );
        }
        /** open a subobject under fieldName of the current builder */
        void pushObject( const char *fieldName ) {
            boost::shared_ptr< BSONObjBuilder > b( new BSONObjBuilder( builders.back()->subobjStart( fieldName ) ) );
            builders.push_back( b );
            fieldNames.push_back( "" );
            indexes.push_back( 0 );
        }
        /** open a subarray under fieldName of the current builder */
        void pushArray( const char *fieldName ) {
            boost::shared_ptr< BSONObjBuilder > b( new BSONObjBuilder( builders.back()->subarrayStart( fieldName ) ) );
            builders.push_back( b );
            fieldNames.push_back( "" );
            indexes.push_back( 0 );
        }
        /** close the innermost object/array and return it */
        BSONObj pop() {
            BSONObj ret;
            if ( back()->owned() )
                ret = back()->obj();    // top-level builder: take ownership
            else
                ret = back()->done();   // nested builder writes into its parent
            builders.pop_back();
            fieldNames.pop_back();
            indexes.pop_back();
            return ret;
        }
        /** set the pending field name from the current array index */
        void nameFromIndex() {
            fieldNames.back() = BSONObjBuilder::numStr( indexes.back() );
        }
        /** return the accumulated string data and clear the accumulator */
        string popString() {
            string ret = ss.str();
            ss.str( "" );
            return ret;
        }
        // Cannot use auto_ptr because its copy constructor takes a non const reference.
        vector< boost::shared_ptr< BSONObjBuilder > > builders;
        vector< string > fieldNames;   // pending field name per nesting level
        vector< int > indexes;         // current array index per nesting level
        stringstream ss;               // accumulates string/escape characters
        // scratch slots filled by intermediate actions, consumed by *End actions
        string ns;
        OID oid;
        string binData;
        BinDataType binDataType;
        string regex;
        string regexOptions;
        Date_t date;
    };
+
    /** Parser action for '{': open the top-level object, or a subobject under
        the pending field name. */
    struct objectStart {
        objectStart( ObjectBuilder &_b ) : b( _b ) {}
        void operator() ( const char &c ) const {
            if ( b.empty() )
                b.init();
            else
                b.pushObject( b.fieldName() );
        }
        ObjectBuilder &b;
    };

    /** Parser action for '[': open a subarray and set the pending field name to
        the first array index ("0"). */
    struct arrayStart {
        arrayStart( ObjectBuilder &_b ) : b( _b ) {}
        void operator() ( const char &c ) const {
            b.pushArray( b.fieldName() );
            b.nameFromIndex();
        }
        ObjectBuilder &b;
    };

    /** Parser action for ',' inside an array: advance the index and refresh the
        pending field name. */
    struct arrayNext {
        arrayNext( ObjectBuilder &_b ) : b( _b ) {}
        void operator() ( const char &c ) const {
            ++b.indexes.back();
            b.nameFromIndex();
        }
        ObjectBuilder &b;
    };

    /** Parser action: append a literal character to the string accumulator. */
    struct ch {
        ch( ObjectBuilder &_b ) : b( _b ) {}
        void operator() ( const char c ) const {
            b.ss << c;
        }
        ObjectBuilder &b;
    };

    /** Parser action: append the translation of an escape character (the char
        following a backslash) to the string accumulator. */
    struct chE {
        chE( ObjectBuilder &_b ) : b( _b ) {}
        void operator() ( const char c ) const {
            char o = '\0';
            switch ( c ) {
            case '\"':
                o = '\"';
                break;
            case '\'':
                o = '\'';
                break;
            case '\\':
                o = '\\';
                break;
            case '/':
                o = '/';
                break;
            case 'b':
                o = '\b';
                break;
            case 'f':
                o = '\f';
                break;
            case 'n':
                o = '\n';
                break;
            case 'r':
                o = '\r';
                break;
            case 't':
                o = '\t';
                break;
            case 'v':
                o = '\v';
                break;
            default:
                // the grammar only feeds the escapes above, so this is unreachable
                assert( false );
            }
            b.ss << o;
        }
        ObjectBuilder &b;
    };
+
+ namespace hex {
+ int val( char c ) {
+ if ( '0' <= c && c <= '9' )
+ return c - '0';
+ if ( 'a' <= c && c <= 'f' )
+ return c - 'a' + 10;
+ if ( 'A' <= c && c <= 'F' )
+ return c - 'A' + 10;
+ assert( false );
+ return 0xff;
+ }
+ char val( const char *c ) {
+ return ( val( c[ 0 ] ) << 4 ) | val( c[ 1 ] );
+ }
+ } // namespace hex
+
    /** Parser action for a \uXXXX escape: start points at the four hex digits.
        Encodes the code point (up to U+FFFF) as 1-, 2- or 3-byte UTF-8 and
        appends it to the string accumulator. */
    struct chU {
        chU( ObjectBuilder &_b ) : b( _b ) {}
        void operator() ( const char *start, const char *end ) const {
            unsigned char first = hex::val( start );        // high byte of the code point
            unsigned char second = hex::val( start + 2 );   // low byte of the code point
            if ( first == 0 && second < 0x80 )
                // U+0000..U+007F: single ASCII byte
                b.ss << second;
            else if ( first < 0x08 ) {
                // U+0080..U+07FF: two-byte sequence
                b.ss << char( 0xc0 | ( ( first << 2 ) | ( second >> 6 ) ) );
                b.ss << char( 0x80 | ( ~0xc0 & second ) );
            } else {
                // U+0800..U+FFFF: three-byte sequence
                b.ss << char( 0xe0 | ( first >> 4 ) );
                b.ss << char( 0x80 | ( ~0xc0 & ( ( first << 2 ) | ( second >> 6 ) ) ) );
                b.ss << char( 0x80 | ( ~0xc0 & second ) );
            }
        }
        ObjectBuilder &b;
    };

    /** Parser action: discard the string accumulator (e.g. at an opening quote). */
    struct chClear {
        chClear( ObjectBuilder &_b ) : b( _b ) {}
        void operator() ( const char c ) const {
            b.popString();
        }
        ObjectBuilder &b;
    };
+
    /** Parser action: a quoted field name finished — take it from the string
        accumulator, reject reserved special-syntax names, and make it the
        pending field name. */
    struct fieldNameEnd {
        fieldNameEnd( ObjectBuilder &_b ) : b( _b ) {}
        void operator() ( const char *start, const char *end ) const {
            string name = b.popString();
            massert( 10338 ,  "Invalid use of reserved field name",
                    name != "$oid" &&
                    name != "$binary" &&
                    name != "$type" &&
                    name != "$date" &&
                    name != "$regex" &&
                    name != "$options" );
            b.fieldNames.back() = name;
        }
        ObjectBuilder &b;
    };

    /** Parser action: an unquoted field name finished — [start,end) is the name
        text; make it the pending field name. */
    struct unquotedFieldNameEnd {
        unquotedFieldNameEnd( ObjectBuilder &_b ) : b( _b ) {}
        void operator() ( const char *start, const char *end ) const {
            string name( start, end );
            b.fieldNames.back() = name;
        }
        ObjectBuilder &b;
    };

    /** Parser action: a string value finished — append the accumulated text
        under the pending field name. */
    struct stringEnd {
        stringEnd( ObjectBuilder &_b ) : b( _b ) {}
        void operator() ( const char *start, const char *end ) const {
            b.back()->append( b.fieldName(), b.popString() );
        }
        ObjectBuilder &b;
    };
+
    /** Parser action: append a floating-point value under the pending field name. */
    struct numberValue {
        numberValue( ObjectBuilder &_b ) : b( _b ) {}
        void operator() ( double d ) const {
            b.back()->append( b.fieldName(), d );
        }
        ObjectBuilder &b;
    };

    /** Parser action: append an integer, as NumberInt when it fits in 32 bits
        and NumberLong otherwise. */
    struct intValue {
        intValue( ObjectBuilder &_b ) : b( _b ) {}
        void operator() ( long long num ) const {
            if (num >= numeric_limits<int>::min() && num <= numeric_limits<int>::max())
                b.back()->append( b.fieldName(), (int)num );
            else
                b.back()->append( b.fieldName(), num );
        }
        ObjectBuilder &b;
    };

    /** Parser action: '}' of a nested object — close the innermost builder. */
    struct subobjectEnd {
        subobjectEnd( ObjectBuilder &_b ) : b( _b ) {}
        void operator() ( const char *start, const char *end ) const {
            b.pop();
        }
        ObjectBuilder &b;
    };

    /** Parser action: ']' of an array — close the innermost builder. */
    struct arrayEnd {
        arrayEnd( ObjectBuilder &_b ) : b( _b ) {}
        void operator() ( const char *start, const char *end ) const {
            b.pop();
        }
        ObjectBuilder &b;
    };
+
    /** Parser action: literal "true". */
    struct trueValue {
        trueValue( ObjectBuilder &_b ) : b( _b ) {}
        void operator() ( const char *start, const char *end ) const {
            b.back()->appendBool( b.fieldName(), true );
        }
        ObjectBuilder &b;
    };

    /** Parser action: literal "false". */
    struct falseValue {
        falseValue( ObjectBuilder &_b ) : b( _b ) {}
        void operator() ( const char *start, const char *end ) const {
            b.back()->appendBool( b.fieldName(), false );
        }
        ObjectBuilder &b;
    };

    /** Parser action: literal "null". */
    struct nullValue {
        nullValue( ObjectBuilder &_b ) : b( _b ) {}
        void operator() ( const char *start, const char *end ) const {
            b.back()->appendNull( b.fieldName() );
        }
        ObjectBuilder &b;
    };

    /** Parser action: stash a DBRef's namespace string until dbrefEnd fires. */
    struct dbrefNS {
        dbrefNS( ObjectBuilder &_b ) : b( _b ) {}
        void operator() ( const char *start, const char *end ) const {
            b.ns = b.popString();
        }
        ObjectBuilder &b;
    };

// NOTE s must be 24 characters.
    OID stringToOid( const char *s ) {
        OID oid;
        // fill the 12 OID bytes from 24 hex digits
        char *oidP = (char *)( &oid );
        for ( int i = 0; i < 12; ++i )
            oidP[ i ] = hex::val( s + ( i * 2 ) );
        return oid;
    }
+
    /** Parser action: stash a parsed ObjectId until oidEnd/dbrefEnd fires. */
    struct oidValue {
        oidValue( ObjectBuilder &_b ) : b( _b ) {}
        void operator() ( const char *start, const char *end ) const {
            b.oid = stringToOid( start );
        }
        ObjectBuilder &b;
    };

    /** Parser action: a complete DBRef matched — append it from the stashed
        namespace and oid. */
    struct dbrefEnd {
        dbrefEnd( ObjectBuilder &_b ) : b( _b ) {}
        void operator() ( const char *start, const char *end ) const {
            b.back()->appendDBRef( b.fieldName(), b.ns.c_str(), b.oid );
        }
        ObjectBuilder &b;
    };

    /** Parser action: a complete $oid expression matched — append the stashed oid. */
    struct oidEnd {
        oidEnd( ObjectBuilder &_b ) : b( _b ) {}
        void operator() ( const char *start, const char *end ) const {
            b.back()->appendOID( b.fieldName(), &b.oid );
        }
        ObjectBuilder &b;
    };

    /** Parser action: decode the base64 payload of a $binary expression and
        stash it until binDataEnd fires. */
    struct binDataBinary {
        binDataBinary( ObjectBuilder &_b ) : b( _b ) {}
        void operator() ( const char *start, const char *end ) const {
            massert( 10339 ,  "Badly formatted bindata", ( end - start ) % 4 == 0 );
            string encoded( start, end );
            b.binData = base64::decode( encoded );
        }
        ObjectBuilder &b;
    };

    /** Parser action: stash the $type byte (two hex digits) of a $binary expression. */
    struct binDataType {
        binDataType( ObjectBuilder &_b ) : b( _b ) {}
        void operator() ( const char *start, const char *end ) const {
            b.binDataType = BinDataType( hex::val( start ) );
        }
        ObjectBuilder &b;
    };

    /** Parser action: a complete $binary expression matched — append the stashed
        payload and type. */
    struct binDataEnd {
        binDataEnd( ObjectBuilder &_b ) : b( _b ) {}
        void operator() ( const char *start, const char *end ) const {
            b.back()->appendBinData( b.fieldName(), b.binData.length(),
                                    b.binDataType, b.binData.data() );
        }
        ObjectBuilder &b;
    };
+
    /** Parser action: stash a parsed $date value until dateEnd fires. */
    struct dateValue {
        dateValue( ObjectBuilder &_b ) : b( _b ) {}
        void operator() ( Date_t v ) const {
            b.date = v;
        }
        ObjectBuilder &b;
    };

    /** Parser action: a complete $date expression matched — append the stashed date. */
    struct dateEnd {
        dateEnd( ObjectBuilder &_b ) : b( _b ) {}
        void operator() ( const char *start, const char *end ) const {
            b.back()->appendDate( b.fieldName(), b.date );
        }
        ObjectBuilder &b;
    };

    /** Parser action: stash a regex pattern (from the string accumulator) until
        regexEnd fires. */
    struct regexValue {
        regexValue( ObjectBuilder &_b ) : b( _b ) {}
        void operator() ( const char *start, const char *end ) const {
            b.regex = b.popString();
        }
        ObjectBuilder &b;
    };

    /** Parser action: stash the regex option flags text. */
    struct regexOptions {
        regexOptions( ObjectBuilder &_b ) : b( _b ) {}
        void operator() ( const char *start, const char *end ) const {
            b.regexOptions = string( start, end );
        }
        ObjectBuilder &b;
    };

    /** Parser action: a complete regex expression matched — append the stashed
        pattern and options. */
    struct regexEnd {
        regexEnd( ObjectBuilder &_b ) : b( _b ) {}
        void operator() ( const char *start, const char *end ) const {
            b.back()->appendRegex( b.fieldName(), b.regex.c_str(),
                                  b.regexOptions.c_str() );
        }
        ObjectBuilder &b;
    };
+
+// One gotcha with this parsing library is probably best illustrated with an
+// example. Say we have a production like this:
+// z = ( ch_p( 'a' )[ foo ] >> ch_p( 'b' ) ) | ( ch_p( 'a' )[ foo ] >> ch_p( 'c' ) );
+// On input "ac", action foo() will be called twice -- once as the parser tries
+// to match "ab", again as the parser successfully matches "ac". Sometimes
+// the grammar can be modified to eliminate these situations. Here, for example:
+// z = ch_p( 'a' )[ foo ] >> ( ch_p( 'b' ) | ch_p( 'c' ) );
+// However, this is not always possible. In my implementation I've tried to
+// stick to the following pattern: store fields fed to action callbacks
+// temporarily as ObjectBuilder members, then append to a BSONObjBuilder once
+// the parser has completely matched a nonterminal and won't backtrack. It's
+// worth noting here that this parser follows a short-circuit convention. So,
+// in the original z example on line 3, if the input was "ab", foo() would only
+// be called once.
+// Boost.Spirit (classic) grammar for the extended-JSON dialect accepted by
+// fromjson(): standard JSON plus single-quoted strings, unquoted field
+// names, and the $oid/$ref/$binary/$date/$regex extensions.  All output is
+// accumulated through the shared ObjectBuilder via the action structs above.
+    struct JsonGrammar : public grammar< JsonGrammar > {
+public:
+        JsonGrammar( ObjectBuilder &_b ) : b( _b ) {}
+
+        template < typename ScannerT >
+        struct definition {
+            definition( JsonGrammar const &self ) {
+                // Top-level: an object is '{' [members] '}'.
+                object = ch_p( '{' )[ objectStart( self.b ) ] >> !members >> '}';
+                members = list_p((fieldName >> ':' >> value) , ',');
+                // Field names may be double-quoted, single-quoted, or bare
+                // js-style identifiers.
+                fieldName =
+                    str[ fieldNameEnd( self.b ) ] |
+                    singleQuoteStr[ fieldNameEnd( self.b ) ] |
+                    unquotedFieldName[ unquotedFieldNameEnd( self.b ) ];
+                array = ch_p( '[' )[ arrayStart( self.b ) ] >> !elements >> ']';
+                elements = list_p(value, ch_p(',')[arrayNext( self.b )]);
+                // NOTE ordering matters: extended forms (oid, dbref, ...) are
+                // tried before plain objects/strings so they win the match.
+                value =
+                    oid[ oidEnd( self.b ) ] |
+                    dbref[ dbrefEnd( self.b ) ] |
+                    bindata[ binDataEnd( self.b ) ] |
+                    date[ dateEnd( self.b ) ] |
+                    regex[ regexEnd( self.b ) ] |
+                    str[ stringEnd( self.b ) ] |
+                    singleQuoteStr[ stringEnd( self.b ) ] |
+                    number |
+                    integer |
+                    object[ subobjectEnd( self.b ) ] |
+                    array[ arrayEnd( self.b ) ] |
+                    lexeme_d[ str_p( "true" ) ][ trueValue( self.b ) ] |
+                    lexeme_d[ str_p( "false" ) ][ falseValue( self.b ) ] |
+                    lexeme_d[ str_p( "null" ) ][ nullValue( self.b ) ];
+                // NOTE lexeme_d and rules don't mix well, so we have this mess.
+                // NOTE We use range_p rather than cntrl_p, because the latter is locale dependent.
+                str = lexeme_d[ ch_p( '"' )[ chClear( self.b ) ] >>
+                               *( ( ch_p( '\\' ) >>
+                                    (
+                                     ch_p( 'b' )[ chE( self.b ) ] |
+                                     ch_p( 'f' )[ chE( self.b ) ] |
+                                     ch_p( 'n' )[ chE( self.b ) ] |
+                                     ch_p( 'r' )[ chE( self.b ) ] |
+                                     ch_p( 't' )[ chE( self.b ) ] |
+                                     ch_p( 'v' )[ chE( self.b ) ] |
+                                     ( ch_p( 'u' ) >> ( repeat_p( 4 )[ xdigit_p ][ chU( self.b ) ] ) ) |
+                                     ( ~ch_p('x') & (~range_p('0','9'))[ ch( self.b ) ] ) // hex and octal aren't supported
+                                     )
+                                    ) |
+                                  ( ~range_p( 0x00, 0x1f ) & ~ch_p( '"' ) & ( ~ch_p( '\\' ) )[ ch( self.b ) ] ) ) >> '"' ];
+
+                // Same escape handling as str, but delimited by single quotes.
+                singleQuoteStr = lexeme_d[ ch_p( '\'' )[ chClear( self.b ) ] >>
+                               *( ( ch_p( '\\' ) >>
+                                    (
+                                     ch_p( 'b' )[ chE( self.b ) ] |
+                                     ch_p( 'f' )[ chE( self.b ) ] |
+                                     ch_p( 'n' )[ chE( self.b ) ] |
+                                     ch_p( 'r' )[ chE( self.b ) ] |
+                                     ch_p( 't' )[ chE( self.b ) ] |
+                                     ch_p( 'v' )[ chE( self.b ) ] |
+                                     ( ch_p( 'u' ) >> ( repeat_p( 4 )[ xdigit_p ][ chU( self.b ) ] ) ) |
+                                     ( ~ch_p('x') & (~range_p('0','9'))[ ch( self.b ) ] ) // hex and octal aren't supported
+                                     )
+                                    ) |
+                                  ( ~range_p( 0x00, 0x1f ) & ~ch_p( '\'' ) & ( ~ch_p( '\\' ) )[ ch( self.b ) ] ) ) >> '\'' ];
+
+                // real_p accepts numbers with nonsignificant zero prefixes, which
+                // aren't allowed in JSON. Oh well.
+                number = strict_real_p[ numberValue( self.b ) ];
+
+                static int_parser<long long, 10, 1, numeric_limits<long long>::digits10 + 1> long_long_p;
+                integer = long_long_p[ intValue(self.b) ];
+
+                // We allow a subset of valid js identifier names here.
+                unquotedFieldName = lexeme_d[ ( alpha_p | ch_p( '$' ) | ch_p( '_' ) ) >> *( ( alnum_p | ch_p( '$' ) | ch_p( '_' )) ) ];
+
+                // Two spellings for dbrefs: {"$ref":..., "$id":...} and Dbref(ns, oid).
+                dbref = dbrefS | dbrefT;
+                dbrefS = ch_p( '{' ) >> "\"$ref\"" >> ':' >>
+                         str[ dbrefNS( self.b ) ] >> ',' >> "\"$id\"" >> ':' >> quotedOid >> '}';
+                dbrefT = str_p( "Dbref" ) >> '(' >> str[ dbrefNS( self.b ) ] >> ',' >>
+                         quotedOid >> ')';
+
+                // Two spellings for object ids: {"$oid":"..."} and ObjectId("...").
+                oid = oidS | oidT;
+                oidS = ch_p( '{' ) >> "\"$oid\"" >> ':' >> quotedOid >> '}';
+                oidT = str_p( "ObjectId" ) >> '(' >> quotedOid >> ')';
+
+                quotedOid = lexeme_d[ '"' >> ( repeat_p( 24 )[ xdigit_p ] )[ oidValue( self.b ) ] >> '"' ];
+
+                // {"$binary": "<base64>", "$type": "<2 hex digits>"}
+                bindata = ch_p( '{' ) >> "\"$binary\"" >> ':' >>
+                    lexeme_d[ '"' >> ( *( range_p( 'A', 'Z' ) | range_p( 'a', 'z' ) | range_p( '0', '9' ) | ch_p( '+' ) | ch_p( '/' ) ) >> *ch_p( '=' ) )[ binDataBinary( self.b ) ] >> '"' ] >> ',' >> "\"$type\"" >> ':' >>
+                    lexeme_d[ '"' >> ( repeat_p( 2 )[ xdigit_p ] )[ binDataType( self.b ) ] >> '"' ] >> '}';
+
+                // TODO: this will need to use a signed parser at some point
+                date = dateS | dateT;
+                dateS = ch_p( '{' ) >> "\"$date\"" >> ':' >> uint_parser< Date_t >()[ dateValue( self.b ) ] >> '}';
+                dateT = !str_p("new") >> str_p( "Date" ) >> '(' >> uint_parser< Date_t >()[ dateValue( self.b ) ] >> ')';
+
+                // Two spellings for regexes: {"$regex":..., "$options":...} and /pat/flags.
+                regex = regexS | regexT;
+                regexS = ch_p( '{' ) >> "\"$regex\"" >> ':' >> str[ regexValue( self.b ) ] >> ',' >> "\"$options\"" >> ':' >> lexeme_d[ '"' >> ( *( alpha_p ) )[ regexOptions( self.b ) ] >> '"' ] >> '}';
+                // FIXME Obviously it would be nice to unify this with str.
+                regexT = lexeme_d[ ch_p( '/' )[ chClear( self.b ) ] >>
+                               *( ( ch_p( '\\' ) >>
+                                    ( ch_p( '"' )[ chE( self.b ) ] |
+                                     ch_p( '\\' )[ chE( self.b ) ] |
+                                     ch_p( '/' )[ chE( self.b ) ] |
+                                     ch_p( 'b' )[ chE( self.b ) ] |
+                                     ch_p( 'f' )[ chE( self.b ) ] |
+                                     ch_p( 'n' )[ chE( self.b ) ] |
+                                     ch_p( 'r' )[ chE( self.b ) ] |
+                                     ch_p( 't' )[ chE( self.b ) ] |
+                                     ( ch_p( 'u' ) >> ( repeat_p( 4 )[ xdigit_p ][ chU( self.b ) ] ) ) ) ) |
+                                  ( ~range_p( 0x00, 0x1f ) & ~ch_p( '/' ) & ( ~ch_p( '\\' ) )[ ch( self.b ) ] ) ) >> str_p( "/" )[ regexValue( self.b ) ]
+                                  >> ( *( ch_p( 'i' ) | ch_p( 'g' ) | ch_p( 'm' ) ) )[ regexOptions( self.b ) ] ];
+            }
+            rule< ScannerT > object, members, array, elements, value, str, number, integer,
+            dbref, dbrefS, dbrefT, oid, oidS, oidT, bindata, date, dateS, dateT,
+            regex, regexS, regexT, quotedOid, fieldName, unquotedFieldName, singleQuoteStr;
+            const rule< ScannerT > &start() const {
+                return object;
+            }
+        };
+        ObjectBuilder &b;
+    };
+
+    /** Parse an extended-JSON string into a BSONObj.
+        An empty input yields an empty BSONObj.
+        Throws (via massert 10340) on a parse failure, including up to 10
+        characters of context from where parsing stopped. */
+    BSONObj fromjson( const char *str ) {
+        if ( ! strlen(str) )
+            return BSONObj();
+        ObjectBuilder b;
+        JsonGrammar parser( b );
+        // space_p: ordinary whitespace is skipped between tokens.
+        parse_info<> result = parse( str, parser, space_p );
+        if ( !result.full ) {
+            int len = strlen( result.stop );
+            if ( len > 10 )
+                len = 10;
+            stringstream ss;
+            ss << "Failure parsing JSON string near: " << string( result.stop, len );
+            massert( 10340 , ss.str(), false );
+        }
+        return b.pop();
+    }
+
+    /** std::string convenience overload; delegates to the const char* form. */
+    BSONObj fromjson( const string &str ) {
+        return fromjson( str.c_str() );
+    }
+
+} // namespace mongo
diff --git a/db/json.h b/db/json.h
new file mode 100644
index 0000000..c65785a
--- /dev/null
+++ b/db/json.h
@@ -0,0 +1,40 @@
+/** @file json.h */
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+#include "../stdafx.h"
+#include "jsobj.h"
+
+namespace mongo {
+
+    /** Create a BSONObj from a JSON <http://www.json.org> string. In addition
+     to the JSON extensions described here
+     <http://mongodb.onconfluence.com/display/DOCS/Mongo+Extended+JSON>,
+ this function accepts certain unquoted field names and allows single quotes
+ to optionally be used when specifying field names and string values instead
+ of double quotes. JSON unicode escape sequences (of the form \uXXXX) are
+ converted to utf8.
+ \throws MsgAssertionException if parsing fails. The message included with
+ this assertion includes a rough indication of where parsing failed.
+ */
+ BSONObj fromjson(const string &str);
+
+ BSONObj fromjson(const char *str);
+
+} // namespace mongo
diff --git a/db/lasterror.cpp b/db/lasterror.cpp
new file mode 100644
index 0000000..e8b1fcf
--- /dev/null
+++ b/db/lasterror.cpp
@@ -0,0 +1,193 @@
+// lasterror.cpp
+
+/* Copyright 2009 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "stdafx.h"
+
+#include "../util/unittest.h"
+#include "../util/message.h"
+
+
+#include "lasterror.h"
+#include "jsobj.h"
+
+namespace mongo {
+
+ LastError LastError::noError;
+ LastErrorHolder lastError;
+ boost::mutex LastErrorHolder::_idsmutex;
+
+    /** Serialize this error state into the getLastError reply being built.
+        Appends "err" (null when no error / invalid), optional "code" and
+        "updatedExisting", and "n" (affected-object count). */
+    void LastError::appendSelf( BSONObjBuilder &b ) {
+        if ( !valid ) {
+            // No operation has recorded a result yet.
+            b.appendNull( "err" );
+            b.append( "n", 0 );
+            return;
+        }
+        if ( msg.empty() )
+            b.appendNull( "err" );
+        else
+            b.append( "err", msg );
+        if ( code )
+            b.append( "code" , code );
+        if ( updatedExisting != NotUpdate )
+            b.appendBool( "updatedExisting", updatedExisting == True );
+        b.append( "n", nObjects );
+    }
+
+    /** Select which LastError slot this thread uses; 0 = thread-local storage,
+        nonzero = the shared _ids map keyed by this id. */
+    void LastErrorHolder::setID( int id ){
+        _id.set( id );
+    }
+
+    /** Current slot id for this thread (0 means thread-local management). */
+    int LastErrorHolder::getID(){
+        return _id.get();
+    }
+
+    /** Disable lastError reporting for the current command and return the
+        (disabled) record so the command can still inspect/restore it.
+        Asserts that a LastError already exists for this thread. */
+    LastError * LastErrorHolder::disableForCommand() {
+        LastError *le = _get();
+        assert( le );
+        le->disabled = true;
+        le->nPrev--; // caller is a command that shouldn't count as an operation
+        return le;
+    }
+
+    /** Like _get(), but hides disabled records: returns 0 when there is no
+        record or the record is currently disabled. */
+    LastError * LastErrorHolder::get( bool create ) {
+        LastError *ret = _get( create );
+        if ( ret && !ret->disabled )
+            return ret;
+        return 0;
+    }
+
+    /** Fetch the LastError for the current slot, optionally creating it.
+        id == 0: plain thread-local lookup (no creation here even if
+        'create' is set -- note for review).  id != 0: look up / insert in
+        the shared _ids map under _idsmutex, refreshing the entry's
+        last-touched time. */
+    LastError * LastErrorHolder::_get( bool create ){
+        int id = _id.get();
+        if ( id == 0 )
+            return _tl.get();
+
+        boostlock lock(_idsmutex);
+        map<int,Status>::iterator i = _ids.find( id );
+        if ( i == _ids.end() ){
+            if ( ! create )
+                return 0;
+
+            LastError * le = new LastError();
+            Status s;
+            s.time = time(0);
+            s.lerr = le;
+            _ids[id] = s;
+            return le;
+        }
+
+        Status &status = i->second;
+        status.time = time(0);
+        return status.lerr;
+    }
+
+    /** Delete and erase the map-managed LastError for 'id'; no-op if absent. */
+    void LastErrorHolder::remove( int id ){
+        boostlock lock(_idsmutex);
+        map<int,Status>::iterator i = _ids.find( id );
+        if ( i == _ids.end() )
+            return;
+
+        delete i->second.lerr;
+        _ids.erase( i );
+    }
+
+    /** Drop the current slot's LastError: releases the thread-local pointer
+        when id==0, otherwise removes (and deletes) the map entry. */
+    void LastErrorHolder::release(){
+        int id = _id.get();
+        if ( id == 0 ){
+            _tl.release();
+            return;
+        }
+
+        remove( id );
+    }
+
+    /** Install 'le' as the current slot's LastError (takes ownership).
+        NOTE(review): in the id!=0 path the previous status.lerr pointer is
+        overwritten without a delete -- looks like a potential leak; confirm
+        callers release() first. */
+    void LastErrorHolder::reset( LastError * le ){
+        int id = _id.get();
+        if ( id == 0 ){
+            _tl.reset( le );
+            return;
+        }
+
+        boostlock lock(_idsmutex);
+        Status & status = _ids[id];
+        status.time = time(0);
+        status.lerr = le;
+    }
+
+    /** Ready 'err' for a new incoming request: killCursors messages leave
+        lastError untouched (disabled), everything else re-enables it and
+        bumps nPrev so getPrevError bookkeeping stays correct. */
+    void prepareErrForNewRequest( Message &m, LastError * err ) {
+        // a killCursors message shouldn't affect last error
+        if ( m.data->operation() == dbKillCursors ) {
+            err->disabled = true;
+        } else {
+            err->disabled = false;
+            err->nPrev++;
+        }
+    }
+
+    /** Begin a request whose LastError slot is derived from the message id
+        (top 16 bits), creating the record if needed. */
+    void LastErrorHolder::startRequest( Message& m ) {
+        int id = m.data->id & 0xFFFF0000;
+        setID( id );
+        LastError * le = _get( true );
+        prepareErrForNewRequest( m, le );
+    }
+
+    /** Begin a request using the connection-owned record unless it has been
+        overriden by id, in which case fall back to id-based routing. */
+    void LastErrorHolder::startRequest( Message& m , LastError * connectionOwned ) {
+        if ( !connectionOwned->overridenById ) {
+            prepareErrForNewRequest( m, connectionOwned );
+            return;
+        }
+        startRequest(m);
+    }
+
+    /** Startup-time unit test: exercises ThreadLocalValue round-trips and
+        LastErrorHolder's thread-local vs id-keyed slot switching.
+        NOTE(review): 'a' and 'b' are never deleted here -- acceptable for a
+        process-lifetime test object, but worth confirming. */
+    struct LastErrorHolderTest : public UnitTest {
+    public:
+
+        void test( int i ){
+            _tl.set( i );
+            assert( _tl.get() == i );
+        }
+
+        // set/get round-trip across the int range, including extremes
+        void tlmaptest(){
+            test( 1 );
+            test( 12123123 );
+            test( -123123 );
+            test( numeric_limits<int>::min() );
+            test( numeric_limits<int>::max() );
+        }
+
+        void run(){
+            tlmaptest();
+
+            LastError * a = new LastError();
+            LastError * b = new LastError();
+
+            LastErrorHolder holder;
+            holder.reset( a );
+            assert( a == holder.get() );
+            holder.setID( 1 );
+            assert( 0 == holder.get() );   // id slot 1 has no record yet
+            holder.reset( b );
+            assert( b == holder.get() );
+            holder.setID( 0 );
+            assert( a == holder.get() );   // back to the thread-local record
+
+            holder.remove( 1 );
+        }
+
+        ThreadLocalValue<int> _tl;
+    } lastErrorHolderTest;
+
+} // namespace mongo
diff --git a/db/lasterror.h b/db/lasterror.h
new file mode 100644
index 0000000..8f687bb
--- /dev/null
+++ b/db/lasterror.h
@@ -0,0 +1,130 @@
+// lasterror.h
+
+/* Copyright 2009 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <boost/thread/tss.hpp>
+#undef assert
+#define assert xassert
+
+namespace mongo {
+ class BSONObjBuilder;
+ class Message;
+
+    /** Per-connection record of the outcome of the most recent write
+        operation, reported back via getLastError.  'valid' is false until
+        an operation records something; 'disabled' suppresses reporting
+        (e.g. during command processing). */
+    struct LastError {
+        int code;
+        string msg;
+        enum UpdatedExistingType { NotUpdate, True, False } updatedExisting;
+        /* todo: nObjects should be 64 bit */
+        int nObjects;
+        int nPrev;
+        bool valid;
+        bool overridenById;
+        bool disabled;
+        // Record an error; clears any previously recorded state first.
+        void raiseError(int _code , const char *_msg) {
+            reset( true );
+            code = _code;
+            msg = _msg;
+        }
+        // Record the result of an update (upsert vs in-place distinguished
+        // by _updatedExisting).
+        void recordUpdate( bool _updatedExisting, int nChanged ) {
+            reset( true );
+            nObjects = nChanged;
+            updatedExisting = _updatedExisting ? True : False;
+        }
+        // Record the result of a delete.
+        void recordDelete( int nDeleted ) {
+            reset( true );
+            nObjects = nDeleted;
+        }
+        LastError() {
+            overridenById = false;
+            reset();
+        }
+        // Clear all recorded state; _valid marks whether an op has occurred.
+        // Note overridenById is deliberately NOT reset here.
+        void reset( bool _valid = false ) {
+            code = 0;
+            msg.clear();
+            updatedExisting = NotUpdate;
+            nObjects = 0;
+            nPrev = 1;
+            valid = _valid;
+            disabled = false;
+        }
+        void appendSelf( BSONObjBuilder &b );
+        static LastError noError;
+    };
+
+    /** Routes each thread/connection to its LastError record.  Two modes:
+        id == 0 uses boost thread-specific storage (_tl); a nonzero id uses
+        the shared _ids map guarded by _idsmutex.  The trailing 'lastError'
+        declares the single process-wide instance (defined in lasterror.cpp). */
+    extern class LastErrorHolder {
+    public:
+        LastErrorHolder() : _id( 0 ) {}
+
+        LastError * get( bool create = false );
+
+        LastError * _get( bool create = false ); // may return a disabled LastError
+
+        void reset( LastError * le );
+
+        /**
+         * id of 0 means should use thread local management
+         */
+        void setID( int id );
+        int getID();
+
+        void remove( int id );
+        void release();
+
+        /** when db receives a message/request, call this */
+        void startRequest( Message& m , LastError * connectionOwned );
+        void startRequest( Message& m );
+
+        // used to disable lastError reporting while processing a killCursors message
+        // disable causes get() to return 0.
+        LastError *disableForCommand(); // only call once per command invocation!
+    private:
+        ThreadLocalValue<int> _id;
+        boost::thread_specific_ptr<LastError> _tl;
+
+        // Map entry: the record plus when it was last touched.
+        struct Status {
+            time_t time;
+            LastError *lerr;
+        };
+        static boost::mutex _idsmutex;
+        map<int,Status> _ids;
+    } lastError;
+
+    /** Record an error on the current thread's LastError, logging instead
+        when no record exists or reporting is disabled. */
+    inline void raiseError(int code , const char *msg) {
+        LastError *le = lastError.get();
+        if ( le == 0 ) {
+            DEV log() << "warning: lastError==0 can't report:" << msg << '\n';
+        } else if ( le->disabled ) {
+            log() << "lastError disabled, can't report: " << msg << endl;
+        } else {
+            le->raiseError(code, msg);
+        }
+    }
+
+    /** Record an update result on the current thread's LastError, if any. */
+    inline void recordUpdate( bool updatedExisting, int nChanged ) {
+        LastError *le = lastError.get();
+        if ( le )
+            le->recordUpdate( updatedExisting, nChanged );
+    }
+
+    /** Record a delete result on the current thread's LastError, if any. */
+    inline void recordDelete( int nDeleted ) {
+        LastError *le = lastError.get();
+        if ( le )
+            le->recordDelete( nDeleted );
+    }
+
+} // namespace mongo
diff --git a/db/matcher.cpp b/db/matcher.cpp
new file mode 100644
index 0000000..d71b7ef
--- /dev/null
+++ b/db/matcher.cpp
@@ -0,0 +1,672 @@
+// matcher.cpp
+
+/* Matcher is our boolean expression evaluator for "where" clauses */
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "stdafx.h"
+#include "matcher.h"
+#include "../util/goodies.h"
+#include "../util/unittest.h"
+#include "storage.h"
+#include "../scripting/engine.h"
+#include "db.h"
+#include "client.h"
+
+namespace mongo {
+
+ //#include "minilex.h"
+ //MiniLex minilex;
+
+    /** Holds the compiled JavaScript for a $where clause: the scripting
+        scope, the compiled function, and an optional scope object (from
+        CodeWScope).  The destructor clears the read-only flag set by the
+        Matcher constructor before the pooled scope is returned. */
+    class Where {
+    public:
+        Where() {
+            jsScope = 0;
+            func = 0;
+        }
+        ~Where() {
+
+            if ( scope.get() )
+                scope->execSetup( "_mongo.readOnly = false;" , "make not read only" );
+
+            if ( jsScope ){
+                delete jsScope;
+                jsScope = 0;
+            }
+            func = 0;
+        }
+
+        auto_ptr<Scope> scope;
+        ScriptingFunction func;
+        BSONObj *jsScope;
+
+        // Compile 'code' in the already-created scope; asserts if the scope
+        // hasn't been set up first.
+        void setFunc(const char *code) {
+            massert( 10341 , "scope has to be created first!" , scope.get() );
+            func = scope->createFunction( code );
+        }
+
+    };
+
+    // Frees the optional $where state; 'where' is 0 when no $where clause.
+    Matcher::~Matcher() {
+        delete where;
+        where = 0;
+    }
+
+    /** Build a matcher for operators whose operand is embedded in the query
+        element itself: $mod (extracts divisor/remainder), $type (extracts
+        the BSON type number), and $elemMatch (builds a sub-Matcher). */
+    ElementMatcher::ElementMatcher( BSONElement _e , int _op ) : toMatch( _e ) , compareOp( _op ) {
+        if ( _op == BSONObj::opMOD ){
+            // { field : { $mod : [ divisor, remainder ] } }
+            BSONObj o = _e.embeddedObject().firstElement().embeddedObject();
+            mod = o["0"].numberInt();
+            modm = o["1"].numberInt();
+
+            uassert( 10073 ,  "mod can't be 0" , mod );
+        }
+        else if ( _op == BSONObj::opTYPE ){
+            type = (BSONType)(_e.embeddedObject().firstElement().numberInt());
+        }
+        else if ( _op == BSONObj::opELEM_MATCH ){
+            BSONElement m = toMatch.embeddedObjectUserCheck().firstElement();
+            uassert( 12517 , "$elemMatch needs an Object" , m.type() == Object );
+            subMatcher.reset( new Matcher( m.embeddedObject() ) );
+        }
+    }
+
+
+    // Out-of-line so shared_ptr members (myset, subMatcher) are destroyed
+    // where their types are complete.
+    ElementMatcher::~ElementMatcher(){
+    }
+
+
+
+} // namespace mongo
+
+#include "pdfile.h"
+
+namespace {
+    // Translate BSON regex flag characters ('i', 'm', 'x') into pcrecpp
+    // options; utf8 matching is always enabled.  Unknown flags are ignored.
+    inline pcrecpp::RE_Options flags2options(const char* flags){
+        pcrecpp::RE_Options options;
+        options.set_utf8(true);
+        while ( flags && *flags ) {
+            if ( *flags == 'i' )
+                options.set_caseless(true);
+            else if ( *flags == 'm' )
+                options.set_multiline(true);
+            else if ( *flags == 'x' )
+                options.set_extended(true);
+            flags++;
+        }
+        return options;
+    }
+}
+
+namespace mongo {
+
+    /** Pair of matchers for index scans: _keyMatcher checks what it can
+        against the index key alone, _docMatcher checks the full document.
+        _needRecord is false only when the key covers every queried field,
+        letting matches() skip fetching the record. */
+    CoveredIndexMatcher::CoveredIndexMatcher(const BSONObj &jsobj, const BSONObj &indexKeyPattern) :
+        _keyMatcher(jsobj.filterFieldsUndotted(indexKeyPattern, true),
+        indexKeyPattern),
+        _docMatcher(jsobj)
+    {
+        _needRecord = ! (
+                         _docMatcher.keyMatch() &&
+                         _keyMatcher.jsobj.nFields() == _docMatcher.jsobj.nFields()
+                         );
+    }
+
+    /** Match against the index key first (cheap); only touch the record via
+        recLoc when the key alone can't decide. */
+    bool CoveredIndexMatcher::matches(const BSONObj &key, const DiskLoc &recLoc ) {
+        if ( _keyMatcher.keyMatch() ) {
+            if ( !_keyMatcher.matches(key) ) {
+                return false;
+            }
+        }
+
+        if ( ! _needRecord ){
+            return true;
+        }
+
+        return _docMatcher.matches(recLoc.rec());
+    }
+
+
+    /* _jsobj - the query pattern
+
+       Walks the query object once and compiles each top-level element into
+       one of: a $where script, a RegexMatcher (max 4), or an ElementMatcher
+       in 'basics'.  constrainIndexKey restricts matching to index-key
+       fields when non-empty.
+    */
+    Matcher::Matcher(const BSONObj &_jsobj, const BSONObj &constrainIndexKey) :
+        where(0), jsobj(_jsobj), haveSize(), all(), hasArray(0), _atomic(false), nRegex(0) {
+
+        BSONObjIterator i(jsobj);
+        while ( i.more() ) {
+            BSONElement e = i.next();
+
+            if ( ( e.type() == CodeWScope || e.type() == Code || e.type() == String ) && strcmp(e.fieldName(), "$where")==0 ) {
+                // $where: function()...
+                uassert( 10066 , "$where occurs twice?", where == 0 );
+                uassert( 10067 , "$where query, but no script engine", globalScriptEngine );
+                where = new Where();
+                where->scope = globalScriptEngine->getPooledScope( cc().ns() );
+                where->scope->localConnect( cc().database()->name.c_str() );
+
+                if ( e.type() == CodeWScope ) {
+                    where->setFunc( e.codeWScopeCode() );
+                    where->jsScope = new BSONObj( e.codeWScopeScopeData() , 0 );
+                }
+                else {
+                    const char *code = e.valuestr();
+                    where->setFunc(code);
+                }
+
+                // scope stays read-only until Where's destructor undoes this
+                where->scope->execSetup( "_mongo.readOnly = true;" , "make read only" );
+
+                continue;
+            }
+
+            if ( e.type() == RegEx ) {
+                // hard cap of 4 regexes per query (fixed-size regexs array)
+                if ( nRegex >= 4 ) {
+                    out() << "ERROR: too many regexes in query" << endl;
+                }
+                else {
+                    RegexMatcher& rm = regexs[nRegex];
+                    rm.re = new pcrecpp::RE(e.regex(), flags2options(e.regexFlags()));
+                    rm.fieldName = e.fieldName();
+                    nRegex++;
+                }
+                continue;
+            }
+
+            // greater than / less than...
+            // e.g., e == { a : { $gt : 3 } }
+            // or
+            //       { a : { $in : [1,2,3] } }
+            if ( e.type() == Object ) {
+                // support {$regex:"a|b", $options:"imx"}
+                const char* regex = NULL;
+                const char* flags = "";
+
+                // e.g., fe == { $gt : 3 }
+                BSONObjIterator j(e.embeddedObject());
+                bool isOperator = false;
+                while ( j.more() ) {
+                    BSONElement fe = j.next();
+                    const char *fn = fe.fieldName();
+
+                    if ( fn[0] == '$' && fn[1] ) {
+                        int op = fe.getGtLtOp( -1 );
+
+                        if ( op == -1 ){
+                            if ( fn[1] == 'r' && fn[2] == 'e' && fn[3] == 'f' && fn[4] == 0 ){
+                                break; // { $ref : xxx } - treat as normal object
+                            }
+                            uassert( 10068 ,  (string)"invalid operator: " + fn , op != -1 );
+                        }
+
+                        isOperator = true;
+
+                        switch ( op ){
+                        case BSONObj::GT:
+                        case BSONObj::GTE:
+                        case BSONObj::LT:
+                        case BSONObj::LTE:{
+                            // rebuild the element under the outer field name
+                            // so addBasic sees { a : 3 } with op GT etc.
+                            shared_ptr< BSONObjBuilder > b( new BSONObjBuilder() );
+                            _builders.push_back( b );
+                            b->appendAs(fe, e.fieldName());
+                            addBasic(b->done().firstElement(), op);
+                            isOperator = true;
+                            break;
+                        }
+                        case BSONObj::NE:{
+                            shared_ptr< BSONObjBuilder > b( new BSONObjBuilder() );
+                            _builders.push_back( b );
+                            b->appendAs(fe, e.fieldName());
+                            addBasic(b->done().firstElement(), BSONObj::NE);
+                            break;
+                        }
+                        case BSONObj::opALL:
+                            all = true;
+                            // deliberate fallthrough: $all shares $in's set setup
+                        case BSONObj::opIN:
+                        case BSONObj::NIN:
+                            basics.push_back( ElementMatcher( e , op , fe.embeddedObject() ) );
+                            break;
+                        case BSONObj::opMOD:
+                        case BSONObj::opTYPE:
+                        case BSONObj::opELEM_MATCH:
+                            // these are types where ElementMatcher has all the info
+                            basics.push_back( ElementMatcher( e , op ) );
+                            break;
+                        case BSONObj::opSIZE:{
+                            shared_ptr< BSONObjBuilder > b( new BSONObjBuilder() );
+                            _builders.push_back( b );
+                            b->appendAs(fe, e.fieldName());
+                            addBasic(b->done().firstElement(), BSONObj::opSIZE);
+                            haveSize = true;
+                            break;
+                        }
+                        case BSONObj::opEXISTS:{
+                            shared_ptr< BSONObjBuilder > b( new BSONObjBuilder() );
+                            _builders.push_back( b );
+                            b->appendAs(fe, e.fieldName());
+                            addBasic(b->done().firstElement(), BSONObj::opEXISTS);
+                            break;
+                        }
+                        case BSONObj::opREGEX:{
+                            // pattern collected; RE built after the loop so
+                            // $options (which may follow) is honored
+                            regex = fe.valuestrsafe();
+                            break;
+                        }
+                        case BSONObj::opOPTIONS:{
+                            flags = fe.valuestrsafe();
+                            break;
+                        }
+                        default:
+                            uassert( 10069 ,  (string)"BUG - can't operator for: " + fn , 0 );
+                        }
+
+                    }
+                    else {
+                        isOperator = false;
+                        break;
+                    }
+                }
+                if (regex){
+                    if ( nRegex >= 4 ) {
+                        out() << "ERROR: too many regexes in query" << endl;
+                    } else {
+                        RegexMatcher& rm = regexs[nRegex];
+                        rm.re = new pcrecpp::RE(regex, flags2options(flags));
+                        rm.fieldName = e.fieldName();
+                        nRegex++;
+                    }
+                }
+                if ( isOperator )
+                    continue;
+            }
+
+            if ( e.type() == Array ){
+                hasArray = true;
+            }
+            else if( strcmp(e.fieldName(), "$atomic") == 0 ) {
+                _atomic = e.trueValue();
+                continue;
+            }
+
+            // normal, simple case e.g. { a : "foo" }
+            addBasic(e, BSONObj::Equality);
+        }
+
+        constrainIndexKey_ = constrainIndexKey;
+    }
+
+    /** Compare a document value l against a query value r under operator
+        'op'.  NE and NIN are handled elsewhere (negation needs dotted-path
+        context).  Returns nonzero on match. */
+    inline int Matcher::valuesMatch(const BSONElement& l, const BSONElement& r, int op, const ElementMatcher& bm) {
+        assert( op != BSONObj::NE && op != BSONObj::NIN );
+
+        if ( op == BSONObj::Equality )
+            return l.valuesEqual(r);
+
+        if ( op == BSONObj::opIN ) {
+            // { $in : [1,2,3] }
+            return bm.myset->count(l);
+        }
+
+        if ( op == BSONObj::opSIZE ) {
+            if ( l.type() != Array )
+                return 0;
+            int count = 0;
+            BSONObjIterator i( l.embeddedObject() );
+            while( i.moreWithEOO() ) {
+                BSONElement e = i.next();
+                if ( e.eoo() )
+                    break;
+                ++count;
+            }
+            return count == r.number();
+        }
+
+        if ( op == BSONObj::opMOD ){
+            if ( ! l.isNumber() )
+                return false;
+
+            return l.numberLong() % bm.mod == bm.modm;
+        }
+
+        if ( op == BSONObj::opTYPE ){
+            return bm.type == l.type();
+        }
+
+        /* check LT, GTE, ... */
+        // Relational ops are encoded as bitmasks; map compare result
+        // {-1,0,1} to bit {1,2,4} and test against the op's mask.
+        if ( l.canonicalType() != r.canonicalType() )
+            return false;
+        int c = compareElementValues(l, r);
+        if ( c < -1 ) c = -1;
+        if ( c > 1 ) c = 1;
+        int z = 1 << (c+1);
+        return (op & z);
+    }
+
+    /** $ne: evaluate the equality match and invert.  For non-null operands a
+        mismatch or missing field counts as a $ne match; for null the sign of
+        the equality result is flipped instead (missing stays "missing"). */
+    int Matcher::matchesNe(const char *fieldName, const BSONElement &toMatch, const BSONObj &obj, const ElementMatcher& bm ) {
+        int ret = matchesDotted( fieldName, toMatch, obj, BSONObj::Equality, bm );
+        if ( bm.toMatch.type() != jstNULL )
+            return ( ret <= 0 ) ? 1 : 0;
+        else
+            return -ret;
+    }
+
+    // Result to report when a path component is missing: neutral (0) except
+    // for $exists, where a missing field decides the match outright.
+    int retMissing( const ElementMatcher &bm ) {
+        if ( bm.compareOp != BSONObj::opEXISTS )
+            return 0;
+        return bm.toMatch.boolean() ? -1 : 1;
+    }
+
+    /* Check if a particular field matches.
+
+       fieldName - field to match "a.b" if we are reaching into an embedded object.
+       toMatch - element we want to match.
+       obj - database object to check against
+       compareOp - Equality, LT, GT, etc.
+       isArr - true when obj is actually an array element we recursed into
+
+       Special forms:
+
+       { "a.b" : 3 }             means       obj.a.b == 3
+       { a : { $lt : 3 } }       means       obj.a < 3
+       { a : { $in : [1,2] } }   means       [1,2].contains(obj.a)
+
+       return value
+       -1 mismatch
+        0 missing element
+        1 match
+    */
+    int Matcher::matchesDotted(const char *fieldName, const BSONElement& toMatch, const BSONObj& obj, int compareOp, const ElementMatcher& bm , bool isArr) {
+
+        if ( compareOp == BSONObj::opALL ) {
+            if ( bm.myset->size() == 0 )
+                return -1; // is this desired?
+            // Generate the index-style keys for this path and require every
+            // non-null $all operand to appear among them.
+            BSONObjSetDefaultOrder actualKeys;
+            IndexSpec( BSON( fieldName << 1 ) ).getKeys( obj, actualKeys );
+            if ( actualKeys.size() == 0 )
+                return 0;
+            for( set< BSONElement, element_lt >::const_iterator i = bm.myset->begin(); i != bm.myset->end(); ++i ) {
+                // ignore nulls
+                if ( i->type() == jstNULL )
+                    continue;
+                // parallel traversal would be faster worst case I guess
+                BSONObjBuilder b;
+                b.appendAs( *i, "" );
+                if ( !actualKeys.count( b.done() ) )
+                    return -1;
+            }
+            return 1;
+        }
+
+        if ( compareOp == BSONObj::NE )
+            return matchesNe( fieldName, toMatch, obj, bm );
+        if ( compareOp == BSONObj::NIN ) {
+            // $nin: every set member must fail to match.
+            for( set<BSONElement,element_lt>::const_iterator i = bm.myset->begin(); i != bm.myset->end(); ++i ) {
+                int ret = matchesNe( fieldName, *i, obj, bm );
+                if ( ret != 1 )
+                    return ret;
+            }
+            return 1;
+        }
+
+        BSONElement e;
+        bool indexed = !constrainIndexKey_.isEmpty();
+        if ( indexed ) {
+            // Index-key mode: look the field up by its position in the key.
+            e = obj.getFieldUsingIndexNames(fieldName, constrainIndexKey_);
+            assert( !e.eoo() );
+        } else {
+            if ( isArr ) {
+                // obj is an array: try to match each object element.
+                BSONObjIterator ai(obj);
+                bool found = false;
+                while ( ai.moreWithEOO() ) {
+                    BSONElement z = ai.next();
+                    if ( z.type() == Object ) {
+                        BSONObj eo = z.embeddedObject();
+                        int cmp = matchesDotted(fieldName, toMatch, eo, compareOp, bm, false);
+                        if ( cmp > 0 ) {
+                            return 1;
+                        } else if ( cmp < 0 ) {
+                            found = true;
+                        }
+                    }
+                }
+                return found ? -1 : retMissing( bm );
+            }
+            const char *p = strchr(fieldName, '.');
+            if ( p ) {
+                // Dotted path: descend one level and recurse on the rest.
+                string left(fieldName, p-fieldName);
+
+                BSONElement se = obj.getField(left.c_str());
+                if ( se.eoo() )
+                    return retMissing( bm );
+                if ( se.type() != Object && se.type() != Array )
+                    return retMissing( bm );
+
+                BSONObj eo = se.embeddedObject();
+                return matchesDotted(p+1, toMatch, eo, compareOp, bm, se.type() == Array);
+            } else {
+                e = obj.getField(fieldName);
+            }
+        }
+
+        if ( compareOp == BSONObj::opEXISTS ) {
+            return ( e.eoo() ^ toMatch.boolean() ) ? 1 : -1;
+        } else if ( ( e.type() != Array || indexed || compareOp == BSONObj::opSIZE ) &&
+                   valuesMatch(e, toMatch, compareOp, bm ) ) {
+            return 1;
+        } else if ( e.type() == Array && compareOp != BSONObj::opSIZE ) {
+
+            // Array semantics: the query matches if any element matches.
+            BSONObjIterator ai(e.embeddedObject());
+
+            while ( ai.moreWithEOO() ) {
+                BSONElement z = ai.next();
+
+                if ( compareOp == BSONObj::opELEM_MATCH ){
+                    // SERVER-377
+                    if ( z.type() == Object && bm.subMatcher->matches( z.embeddedObject() ) )
+                        return 1;
+                }
+                else {
+                    if ( valuesMatch( z, toMatch, compareOp, bm) ) {
+                        return 1;
+                    }
+                }
+
+            }
+
+            if ( compareOp == BSONObj::Equality && e.woCompare( toMatch ) == 0 ){
+                // match an entire array to itself
+                return 1;
+            }
+
+        }
+        else if ( e.eoo() ) {
+            // 0 indicates "missing element"
+            return 0;
+        }
+        return -1;
+    }
+
+ extern int dump;
+
+    /** Apply rm's regex to element e: strings/symbols match their text,
+        numbers and dates are formatted into a stack buffer first; all other
+        types never match.
+        NOTE(review): buf is 64 bytes with no explicit bound on the sprintf /
+        time_t_to_String output -- presumably "%f" and the date string always
+        fit; confirm. */
+    inline bool regexMatches(RegexMatcher& rm, const BSONElement& e) {
+        char buf[64];
+        const char *p = buf;
+        if ( e.type() == String || e.type() == Symbol )
+            p = e.valuestr();
+        else if ( e.isNumber() ) {
+            sprintf(buf, "%f", e.number());
+        }
+        else if ( e.type() == Date ) {
+            Date_t d = e.date();
+            time_t t = (d.millis/1000);
+            time_t_to_String(t, buf);
+        }
+        else
+            return false;
+        return rm.re->PartialMatch(p);
+    }
+
+    /* See if an object matches the query.
+       Evaluation order: basic element matchers first, then regexes, then
+       the $where script (the most expensive).  All must succeed.
+    */
+    bool Matcher::matches(const BSONObj& jsobj ) {
+        /* assuming there is usually only one thing to match.  if more this
+        could be slow sometimes. */
+
+        // check normal non-regex cases:
+        for ( unsigned i = 0; i < basics.size(); i++ ) {
+            ElementMatcher& bm = basics[i];
+            BSONElement& m = bm.toMatch;
+            // -1=mismatch. 0=missing element. 1=match
+            int cmp = matchesDotted(m.fieldName(), m, jsobj, bm.compareOp, bm );
+            if ( cmp < 0 )
+                return false;
+            if ( cmp == 0 ) {
+                /* missing is ok iff we were looking for null */
+                if ( m.type() == jstNULL || m.type() == Undefined ) {
+                    if ( bm.compareOp == BSONObj::NE ) {
+                        return false;
+                    }
+                } else {
+                    return false;
+                }
+            }
+        }
+
+        for ( int r = 0; r < nRegex; r++ ) {
+            RegexMatcher& rm = regexs[r];
+            BSONElementSet s;
+            if ( !constrainIndexKey_.isEmpty() ) {
+                BSONElement e = jsobj.getFieldUsingIndexNames(rm.fieldName, constrainIndexKey_);
+                if ( !e.eoo() )
+                    s.insert( e );
+            } else {
+                jsobj.getFieldsDotted( rm.fieldName, s );
+            }
+            // any one value at the (possibly dotted) path may satisfy the regex
+            bool match = false;
+            for( BSONElementSet::const_iterator i = s.begin(); i != s.end(); ++i )
+                if ( regexMatches(rm, *i) )
+                    match = true;
+            if ( !match )
+                return false;
+        }
+
+        if ( where ) {
+            if ( where->func == 0 ) {
+                uassert( 10070 , "$where compile error", false);
+                return false; // didn't compile
+            }
+
+            if ( where->jsScope ){
+                where->scope->init( where->jsScope );
+            }
+            where->scope->setThis( const_cast< BSONObj * >( &jsobj ) );
+            where->scope->setObject( "obj", const_cast< BSONObj & >( jsobj ) );
+            where->scope->setBoolean( "fullObject" , true ); // this is a hack b/c fullObject used to be relevant
+
+            // 60 second cap on $where execution
+            int err = where->scope->invoke( where->func , BSONObj() , 1000 * 60 , false );
+            where->scope->setThis( 0 );
+            if ( err == -3 ) { // INVOKE_ERROR
+                stringstream ss;
+                ss << "error on invocation of $where function:\n"
+                << where->scope->getError();
+                uassert( 10071 , ss.str(), false);
+                return false;
+            } else if ( err != 0 ) { // ! INVOKE_SUCCESS
+                uassert( 10072 , "unknown error in invocation of $where function", false);
+                return false;
+            }
+            return where->scope->getBoolean( "return" ) != 0;
+
+        }
+
+        return true;
+    }
+
+ struct JSObj1 js1;
+
+#pragma pack(1)
+    // Hand-laid-out BSON object used by JSUnitTest below: a single string
+    // field "abcdef" -> "123456789" followed by EOO.  pack(1) keeps the
+    // byte layout identical to wire-format BSON.
+    struct JSObj2 {
+        JSObj2() {
+            totsize=sizeof(JSObj2);
+            s = String;
+            strcpy_s(sname, 7, "abcdef");
+            slen = 10;
+            strcpy_s(sval, 10, "123456789");
+            eoo = EOO;
+        }
+        unsigned totsize;
+        char s;
+        char sname[7];
+        unsigned slen;
+        char sval[10];
+        char eoo;
+    } js2;
+
+    // Startup-time sanity checks for Matcher using the hand-built BSON
+    // structs js1/js2; the empty query j0 must match everything.
+    struct JSUnitTest : public UnitTest {
+        void run() {
+
+            BSONObj j1((const char *) &js1);
+            BSONObj j2((const char *) &js2);
+            Matcher m(j2);
+            assert( m.matches(j1) );
+            js2.sval[0] = 'z';   // mutate the pattern: should no longer match
+            assert( !m.matches(j1) );
+            Matcher n(j1);
+            assert( n.matches(j1) );
+            assert( !n.matches(j2) );
+
+            BSONObj j0 = BSONObj();
+//            BSONObj j0((const char *) &js0);
+            Matcher p(j0);
+            assert( p.matches(j1) );
+            assert( p.matches(j2) );
+        }
+    } jsunittest;
+
+#pragma pack()
+
+    // Startup-time sanity checks for the PCRE build: asserts utf8 support
+    // is compiled in, verifies basic matching, and warns (non-fatally) when
+    // unicode-properties support is missing.
+    struct RXTest : public UnitTest {
+
+        RXTest() {
+        }
+
+        void run() {
+            /*
+            static const boost::regex e("(\\d{4}[- ]){3}\\d{4}");
+            static const boost::regex b(".....");
+            out() << "regex result: " << regex_match("hello", e) << endl;
+            out() << "regex result: " << regex_match("abcoo", b) << endl;
+            */
+
+            int ret = 0;
+
+            pcre_config( PCRE_CONFIG_UTF8 , &ret );
+            massert( 10342 ,  "pcre not compiled with utf8 support" , ret );
+
+            // re1 is deliberately malformed and must not match
+            pcrecpp::RE re1(")({a}h.*o");
+            pcrecpp::RE re("h.llo");
+            assert( re.FullMatch("hello") );
+            assert( !re1.FullMatch("hello") );
+
+
+            pcrecpp::RE_Options options;
+            options.set_utf8(true);
+            pcrecpp::RE part("dwi", options);
+            assert( part.PartialMatch("dwight") );
+
+            pcre_config( PCRE_CONFIG_UNICODE_PROPERTIES , &ret );
+            if ( ! ret )
+                cout << "warning: some regex utf8 things will not work.  pcre build doesn't have --enable-unicode-properties" << endl;
+
+        }
+    } rxtest;
+
+} // namespace mongo
diff --git a/db/matcher.h b/db/matcher.h
new file mode 100644
index 0000000..f1609f9
--- /dev/null
+++ b/db/matcher.h
@@ -0,0 +1,184 @@
+// matcher.h
+
+/* Matcher is our boolean expression evaluator for "where" clauses */
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+#include "jsobj.h"
+#include <pcrecpp.h>
+
+namespace mongo {
+
+ class CoveredIndexMatcher;
+ class Matcher;
+
+    /* One compiled regex term from the query pattern, e.g. { a : /foo/ }.
+       Owns the pcrecpp::RE it points to.
+       NOTE(review): owning raw pointer with copying not suppressed -- a copy
+       would double-delete 're'.  Matcher keeps these in a plain array and
+       never copies them, so this is safe today, but confirm before reusing
+       the type elsewhere. */
+    class RegexMatcher {
+    public:
+        const char *fieldName;   // field of the document this regex applies to
+        pcrecpp::RE *re;         // compiled regex; owned (deleted in dtor)
+        RegexMatcher() {
+            re = 0;
+        }
+        ~RegexMatcher() {
+            delete re;
+        }
+    };
+
+    /* Strict weak ordering for BSONElement values: order primarily by
+       canonical type, and by value only when the canonical types agree.
+       Used as the comparator of the $in/$all candidate sets. */
+    struct element_lt
+    {
+        bool operator()(const BSONElement& l, const BSONElement& r) const
+        {
+            const int typeDiff = (int) l.canonicalType() - (int) r.canonicalType();
+            if ( typeDiff != 0 )
+                return typeDiff < 0;
+            return compareElementValues(l,r) < 0;
+        }
+    };
+
+
+    /* One parsed term of the query pattern, e.g. { a : { $gt : 3 } } becomes
+       an ElementMatcher holding the element to compare and the operator. */
+    class ElementMatcher {
+    public:
+
+        // NOTE(review): leaves compareOp/mod/modm/type uninitialized (POD
+        // members, empty body); safe only if callers assign before use.
+        ElementMatcher() {
+        }
+
+        ElementMatcher( BSONElement _e , int _op );
+
+        // set-based operators: 'array' holds the candidate values, which are
+        // copied into 'myset' for ordered membership tests.
+        ElementMatcher( BSONElement _e , int _op , const BSONObj& array ) : toMatch( _e ) , compareOp( _op ) {
+
+            myset.reset( new set<BSONElement,element_lt>() );
+
+            BSONObjIterator i( array );
+            while ( i.more() ) {
+                BSONElement ie = i.next();
+                myset->insert(ie);
+            }
+        }
+
+        ~ElementMatcher();
+
+        BSONElement toMatch;   // pattern element to compare documents against
+        int compareOp;         // comparison operator code (see BSONObj constants)
+        shared_ptr< set<BSONElement,element_lt> > myset;  // candidates for set ops; null otherwise
+
+        // these are for specific operators
+        int mod;               // presumably the $mod divisor -- confirm in matcher.cpp
+        int modm;              // presumably the $mod expected remainder -- confirm in matcher.cpp
+        BSONType type;         // expected type for the $type operator
+
+        shared_ptr<Matcher> subMatcher;  // nested matcher for sub-object terms
+    };
+
+// SQL where clause equivalent
+    class Where;
+    class DiskLoc;
+
+    /* Match BSON objects against a query pattern.
+
+       e.g.
+       db.foo.find( { a : 3 } );
+
+       { a : 3 } is the pattern object.  See wiki documentation for full info.
+
+       GT/LT:
+       { a : { $gt : 3 } }
+       Not equal:
+       { a : { $ne : 3 } }
+
+       TODO: we should rewrite the matcher to be more an AST style.
+    */
+    class Matcher : boost::noncopyable {
+        // match one (possibly dotted) field of obj; tri-state int result --
+        // see matcher.cpp for the exact semantics
+        int matchesDotted(
+            const char *fieldName,
+            const BSONElement& toMatch, const BSONObj& obj,
+            int compareOp, const ElementMatcher& bm, bool isArr = false);
+
+        // handles the $ne operator for a single field
+        int matchesNe(
+            const char *fieldName,
+            const BSONElement &toMatch, const BSONObj &obj,
+            const ElementMatcher&bm);
+
+    public:
+        // -1 for operators up to LTE (the "less than" family), 1 otherwise
+        static int opDirection(int op) {
+            return op <= BSONObj::LTE ? -1 : 1;
+        }
+
+        // Only specify constrainIndexKey if matches() will be called with
+        // index keys having empty string field names.
+        Matcher(const BSONObj &pattern, const BSONObj &constrainIndexKey = BSONObj());
+
+        ~Matcher();
+
+        // does document 'j' satisfy the pattern?
+        bool matches(const BSONObj& j);
+
+        // can this pattern be evaluated against an index key alone?
+        bool keyMatch() const { return !all && !haveSize && !hasArray; }
+
+        bool atomic() const { return _atomic; }
+
+    private:
+        void addBasic(const BSONElement &e, int c) {
+            // TODO May want to selectively ignore these element types based on op type.
+            if ( e.type() == MinKey || e.type() == MaxKey )
+                return;
+            basics.push_back( ElementMatcher( e , c ) );
+        }
+
+        int valuesMatch(const BSONElement& l, const BSONElement& r, int op, const ElementMatcher& bm);
+
+        Where *where; // set if query uses $where
+        BSONObj jsobj; // the query pattern.  e.g., { name: "joe" }
+        BSONObj constrainIndexKey_;
+        vector<ElementMatcher> basics;   // one entry per simple term of the pattern
+// int n; // # of basicmatcher items
+        bool haveSize;   // presumably: pattern uses $size -- confirm in matcher.cpp
+        bool all;        // presumably: pattern uses $all -- confirm in matcher.cpp
+        bool hasArray;   // presumably: pattern contains an array value -- confirm in matcher.cpp
+
+        /* $atomic - if true, a multi document operation (some removes, updates)
+           should be done atomically.  in that case, we do not yield -
+           i.e. we stay locked the whole time.
+           http://www.mongodb.org/display/DOCS/Removing
+        */
+        bool _atomic;
+
+        RegexMatcher regexs[4];   // regex terms from the pattern (at most 4)
+        int nRegex;               // number of entries of regexs in use
+
+        // so we delete the mem when we're done:
+        vector< shared_ptr< BSONObjBuilder > > _builders;
+
+        friend class CoveredIndexMatcher;
+    };
+
+    // If match succeeds on index key, then attempt to match full document.
+    class CoveredIndexMatcher : boost::noncopyable {
+    public:
+        CoveredIndexMatcher(const BSONObj &pattern, const BSONObj &indexKeyPattern);
+        bool matches(const BSONObj &o){ return _docMatcher.matches( o ); }   // match a full document
+        bool matches(const BSONObj &key, const DiskLoc &recLoc);             // key-first match (defined in matcher.cpp)
+        bool needRecord(){ return _needRecord; }
+
+        Matcher& docMatcher() { return _docMatcher; }
+    private:
+        Matcher _keyMatcher;   // matches against index keys (empty field names)
+        Matcher _docMatcher;   // matches against the full document
+        bool _needRecord;      // true if the key alone cannot decide a match
+    };
+
+} // namespace mongo
diff --git a/db/minilex.h b/db/minilex.h
new file mode 100644
index 0000000..ba8df26
--- /dev/null
+++ b/db/minilex.h
@@ -0,0 +1,160 @@
+// minilex.h
+// mini js lexical analyzer. idea is to be dumb and fast.
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+namespace mongo {
+
+#if defined(_WIN32)
+
+} // namespace mongo
+
+#include <hash_map>
+using namespace stdext;
+
+namespace mongo {
+
+    typedef const char * MyStr;
+    /* ordering predicate for MSVC's stdext::hash_compare, keyed on C strings.
+       NOTE(review): returns x > y, i.e. a descending order -- consistent as a
+       strict weak ordering, though unconventional for a "less" functor. */
+    struct less_str {
+        bool operator()(const MyStr & x, const MyStr & y) const {
+            if ( strcmp(x, y) > 0)
+                return true;
+
+            return false;
+        }
+    };
+
+    // hash map from C-string identifier to int, MSVC flavor
+    typedef hash_map<const char*, int, hash_compare<const char *, less_str> > strhashmap;
+
+#else
+
+} // namespace mongo
+
+#include <ext/hash_map>
+
+namespace mongo {
+
+ using namespace __gnu_cxx;
+
+    typedef const char * MyStr;
+    // equality predicate for __gnu_cxx::hash_map keyed on C strings
+    struct eq_str {
+        bool operator()(const MyStr & x, const MyStr & y) const {
+            if ( strcmp(x, y) == 0)
+                return true;
+
+            return false;
+        }
+    };
+
+    // hash map from C-string identifier to int, gcc flavor
+    typedef hash_map<const char*, int, hash<const char *>, eq_str > strhashmap;
+
+#endif
+
+    /* Mini lexical analyzer for javascript $where expressions.  Intentionally
+       dumb and fast: it only needs to find identifiers, not parse js. */
+    struct MiniLex {
+        strhashmap reserved;   // javascript reserved words -- never reported as variables
+        bool ic[256]; // ic=Identifier Character
+        bool starter[256];   // may this character start an identifier?
+
+        // dm: very dumb about comments and escaped quotes -- but we are faster that way,
+        // albeit returning too much (which is ok for jsbobj current usage).
+        /* scan 'code' and set vars[name]=1 for every non-reserved identifier.
+           'code' is modified in place (NULs overwrite identifier terminators)
+           and must stay in scope while 'vars' is used. */
+        void grabVariables(char *code /*modified and must stay in scope*/, strhashmap& vars) {
+            char *p = code;
+            while ( *p ) {
+                // index the tables via unsigned char: plain char may be signed,
+                // and bytes >= 128 (marked as identifier chars in the ctor)
+                // would otherwise yield a negative array index.
+                if ( starter[(unsigned char)*p] ) {
+                    char *q = p+1;
+                    while ( *q && ic[(unsigned char)*q] ) q++;
+                    const char *identifier = p;
+                    bool done = *q == 0;
+                    *q = 0;
+                    if ( !reserved.count(identifier) ) {
+                        // we try to be smart about 'obj' but have to be careful as obj.obj
+                        // can happen; this is so that nFields is right for simplistic where cases
+                        // so we can stop scanning in jsobj when we find the field of interest.
+                        if ( strcmp(identifier,"obj")==0 && p>code && p[-1] != '.' )
+                            ;
+                        else
+                            vars[identifier] = 1;
+                    }
+                    if ( done )
+                        break;
+                    p = q + 1;
+                    continue;
+                }
+
+                // skip string literals so their contents aren't taken as identifiers
+                if ( *p == '\'' ) {
+                    p++;
+                    while ( *p && *p != '\'' ) p++;
+                }
+                else if ( *p == '"' ) {
+                    p++;
+                    while ( *p && *p != '"' ) p++;
+                }
+                p++;
+            }
+        }
+
+        MiniLex() {
+            // quick self-test of the platform hash map
+            strhashmap atest;
+            atest["foo"] = 3;
+            assert( atest.count("bar") == 0 );
+            assert( atest.count("foo") == 1 );
+            assert( atest["foo"] == 3 );
+
+            // identifier tables: [A-Za-z$_] and high bytes may start one;
+            // digits may continue but not start an identifier.
+            for ( int i = 0; i < 256; i++ ) {
+                ic[i] = starter[i] = false;
+            }
+            for ( int i = 'a'; i <= 'z'; i++ )
+                ic[i] = starter[i] = true;
+            for ( int i = 'A'; i <= 'Z'; i++ )
+                ic[i] = starter[i] = true;
+            for ( int i = '0'; i <= '9'; i++ )
+                ic[i] = true;
+            for ( int i = 128; i < 256; i++ )
+                ic[i] = starter[i] = true;
+            ic['$'] = starter['$'] = true;
+            ic['_'] = starter['_'] = true;
+
+            reserved["break"] = true;
+            reserved["case"] = true;
+            reserved["catch"] = true;
+            reserved["continue"] = true;
+            reserved["default"] = true;
+            reserved["delete"] = true;
+            reserved["do"] = true;
+            reserved["else"] = true;
+            reserved["finally"] = true;
+            reserved["for"] = true;
+            reserved["function"] = true;
+            reserved["if"] = true;
+            reserved["in"] = true;
+            reserved["instanceof"] = true;
+            reserved["new"] = true;
+            reserved["return"] = true;
+            reserved["switch"] = true;
+            reserved["this"] = true;
+            reserved["throw"] = true;
+            reserved["try"] = true;
+            reserved["typeof"] = true;
+            reserved["var"] = true;
+            reserved["void"] = true;
+            reserved["while"] = true;
+            reserved["with"] = true;   // was "with " (trailing space): keyword was never recognized
+        }
+    };
+
+} // namespace mongo
diff --git a/db/module.cpp b/db/module.cpp
new file mode 100644
index 0000000..d218fe6
--- /dev/null
+++ b/db/module.cpp
@@ -0,0 +1,52 @@
+// module.cpp
+
+#include "stdafx.h"
+#include "module.h"
+
+namespace mongo {
+
+    std::list<Module*> * Module::_all;
+
+    /* register this module in the global registry.  _all is heap-allocated
+       on first use (and deliberately never freed) so registration works
+       regardless of static initialization order across translation units. */
+    Module::Module( const string& name )
+        : _name( name ) , _options( (string)"Module " + name + " options" ){
+        if ( ! _all )
+            _all = new list<Module*>();
+        _all->push_back( this );
+    }
+
+    Module::~Module(){}
+
+    /* append every registered module's option description to 'options' */
+    void Module::addOptions( program_options::options_description& options ){
+        if ( ! _all )
+            return;
+        list<Module*>::iterator end = _all->end();
+        for ( list<Module*>::iterator it = _all->begin(); it != end; ++it )
+            options.add( (*it)->_options );
+    }
+
+    /* give each registered module a chance to read its settings from the
+       parsed command line */
+    void Module::configAll( program_options::variables_map& params ){
+        if ( ! _all )
+            return;
+        list<Module*>::iterator end = _all->end();
+        for ( list<Module*>::iterator it = _all->begin(); it != end; ++it )
+            (*it)->config( params );
+    }
+
+
+    /* init() every registered module; called once the server is configured */
+    void Module::initAll(){
+        if ( ! _all )
+            return;
+        list<Module*>::iterator end = _all->end();
+        for ( list<Module*>::iterator it = _all->begin(); it != end; ++it )
+            (*it)->init();
+    }
+
+}
diff --git a/db/module.h b/db/module.h
new file mode 100644
index 0000000..728e861
--- /dev/null
+++ b/db/module.h
@@ -0,0 +1,70 @@
+// module.h
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+#include "../stdafx.h"
+#include <boost/program_options.hpp>
+#include <list>
+
+namespace mongo {
+
+    /**
+     * Module is the base class for adding modules to MongoDB
+     * modules allow adding hooks and features to mongo
+     * the idea is to add hooks into the main code for module support where needed
+     * some ideas are: monitoring, indexes, full text search
+     */
+    class Module {
+    public:
+        Module( const string& name );
+        virtual ~Module();
+
+        /** hook for a subclass to declare its command line options;
+            typically invoked from the subclass constructor */
+        boost::program_options::options_description_easy_init add_options(){
+            return _options.add_options();
+        }
+
+        /**
+         * read config from command line
+         */
+        virtual void config( program_options::variables_map& params ) = 0;
+
+        /**
+         * called after configuration when the server is ready to start
+         */
+        virtual void init() = 0;
+
+        /**
+         * called when the database is about to shutdown
+         */
+        virtual void shutdown() = 0;
+
+        const string& getName(){ return _name; }
+
+        // --- static things
+
+        /** add every registered module's options to the server's option set */
+        static void addOptions( program_options::options_description& options );
+        /** pass the parsed command line to every registered module */
+        static void configAll( program_options::variables_map& params );
+        /** init() every registered module */
+        static void initAll();
+
+    private:
+        static std::list<Module*> * _all;              // global registry (heap-allocated in module.cpp)
+        string _name;
+        program_options::options_description _options; // this module's command line options
+    };
+}
diff --git a/db/modules/mms.cpp b/db/modules/mms.cpp
new file mode 100644
index 0000000..9c00e60
--- /dev/null
+++ b/db/modules/mms.cpp
@@ -0,0 +1,144 @@
+// mms.cpp
+
+#include "stdafx.h"
+#include "../db.h"
+#include "../instance.h"
+#include "../module.h"
+#include "../../util/httpclient.h"
+#include "../../util/background.h"
+
+namespace po = boost::program_options;
+
+namespace mongo {
+
+    /** Mongo Monitoring Service
+        if enabled, this runs in the background and pings mms
+    */
+    class MMS : public BackgroundJob , Module {
+    public:
+
+        MMS()
+            : Module( "mms" ) , _baseurl( "http://mms.10gen.com/ping/" ) ,
+              _secsToSleep(1) , _token( "" ) , _name( "" ) {
+
+            add_options()
+                ( "mms-token" , po::value<string>() , "account token for mongo monitoring server" )
+                ( "mms-name" , po::value<string>() , "server name mongo monitoring server" )
+                ( "mms-interval" , po::value<int>()->default_value(30) , "ping interval for mongo monitoring server" )
+                ;
+        }
+
+        ~MMS(){}
+
+        /** read mms settings from the parsed command line */
+        void config( program_options::variables_map& params ){
+            if ( params.count( "mms-token" ) ){
+                _token = params["mms-token"].as<string>();
+            }
+            if ( params.count( "mms-name" ) ){
+                _name = params["mms-name"].as<string>();
+            }
+            _secsToSleep = params["mms-interval"].as<int>();
+        }
+
+        /** background thread body: every _secsToSleep seconds build a ping
+            url with server stats and GET it.  returns (ending the job) if
+            mms is not fully configured. */
+        void run(){
+            if ( _token.size() == 0 && _name.size() == 0 ){
+                log(1) << "mms not configured" << endl;
+                return;
+            }
+
+            if ( _token.size() == 0 ){
+                log() << "no token for mms - not running" << endl;
+                return;
+            }
+
+            if ( _name.size() == 0 ){
+                log() << "no name for mms - not running" << endl;
+                return;
+            }
+
+            log() << "mms monitor starting... token:" << _token << " name:" << _name << " interval: " << _secsToSleep << endl;
+
+            unsigned long long lastTime = 0;       // wall clock micros at the previous ping
+            unsigned long long lastLockTime = 0;   // db lock micros at the previous ping
+
+            while ( ! inShutdown() ){
+                sleepsecs( _secsToSleep );
+
+                stringstream url;
+                url << _baseurl << _token << "?";
+                url << "monitor_name=" << _name << "&";
+                url << "version=" << versionString << "&";
+                url << "git_hash=" << gitVersion() << "&";
+
+                { //percent_locked
+                    unsigned long long time = curTimeMicros64();
+                    unsigned long long start , lock;
+                    dbMutex.info().getTimingInfo( start , lock );
+                    if ( lastTime ){
+                        double timeDiff = (double) (time - lastTime);
+                        double lockDiff = (double) (lock - lastLockTime);
+                        url << "percent_locked=" << (int)ceil( 100 * ( lockDiff / timeDiff ) ) << "&";
+                    }
+                    lastTime = time;
+                    lastLockTime = lock;
+                }
+
+                // total data size across all databases, reported in MB
+                vector< string > dbNames;
+                getDatabaseNames( dbNames );
+                boost::intmax_t totalSize = 0;
+                for ( vector< string >::iterator i = dbNames.begin(); i != dbNames.end(); ++i ) {
+                    boost::intmax_t size = dbSize( i->c_str() );
+                    totalSize += size;
+                }
+                url << "data_size=" << totalSize / ( 1024 * 1024 ) << "&";
+
+
+
+                /* TODO:
+                   message_operations
+                   update_operations
+                   insert_operations
+                   get_more_operations
+                   delete_operations
+                   kill_cursors_operations
+                */
+
+
+                log(1) << "mms url: " << url.str() << endl;
+
+                // best effort: a failed ping is logged but never fatal
+                try {
+                    HttpClient c;
+                    map<string,string> headers;
+                    stringstream ss;
+                    int rc = c.get( url.str() , headers , ss );
+                    log(1) << "\t response code: " << rc << endl;
+                    if ( rc != 200 ){
+                        log() << "mms error response code:" << rc << endl;
+                        log(1) << "mms error body:" << ss.str() << endl;
+                    }
+                }
+                catch ( std::exception& e ){
+                    log() << "mms get exception: " << e.what() << endl;
+                }
+            }
+        }
+
+        void init(){ go(); }   // start the background thread
+
+        void shutdown(){
+            // TODO
+        }
+
+    private:
+        string _baseurl;     // mms ping endpoint
+        int _secsToSleep;    // ping interval in seconds
+
+        string _token;       // account token; required for the job to run
+        string _name;        // server name as shown in mms; required for the job to run
+
+    } /* mms */;
+
+}
+
+
+
diff --git a/db/mr.cpp b/db/mr.cpp
new file mode 100644
index 0000000..ff88d9e
--- /dev/null
+++ b/db/mr.cpp
@@ -0,0 +1,596 @@
+// mr.cpp
+
+/**
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "stdafx.h"
+#include "db.h"
+#include "instance.h"
+#include "commands.h"
+#include "../scripting/engine.h"
+#include "../client/dbclient.h"
+#include "../client/connpool.h"
+#include "../client/parallel.h"
+
+namespace mongo {
+
+ namespace mr {
+
+        /* orders BSONObjs by their first element (the map/reduce key) */
+        class MyCmp {
+        public:
+            MyCmp(){}
+            bool operator()( const BSONObj &l, const BSONObj &r ) const {
+                const int cmp = l.firstElement().woCompare( r.firstElement() );
+                return cmp < 0;
+            }
+        };
+
+ typedef pair<BSONObj,BSONObj> Data;
+ //typedef list< Data > InMemory;
+ typedef map< BSONObj,list<BSONObj>,MyCmp > InMemory;
+
+        /* run the javascript reduce function over 'values', which must be
+           non-empty and all share the same key (each element looks like
+           { key, value }).
+             final    - true on the last pass: output is { _id, value };
+                        otherwise { 0: key, 1: value } so it can be re-reduced
+             finalize - optional function applied to the reduced result
+           returns the single reduced object. */
+        BSONObj reduceValues( list<BSONObj>& values , Scope * s , ScriptingFunction reduce , bool final , ScriptingFunction finalize ){
+            uassert( 10074 , "need values" , values.size() );
+
+            // rough builder size: count * size of one value, plus slack
+            int sizeEstimate = ( values.size() * values.begin()->getField( "value" ).size() ) + 128;
+            BSONObj key;
+
+            BSONObjBuilder reduceArgs( sizeEstimate );
+
+            BSONObjBuilder valueBuilder( sizeEstimate );
+            int n = 0;
+            for ( list<BSONObj>::iterator i=values.begin(); i!=values.end(); i++){
+                BSONObj o = *i;
+                BSONObjIterator j(o);
+                BSONElement keyE = j.next();
+                if ( n == 0 ){
+                    // first object supplies the shared key
+                    reduceArgs.append( keyE );
+                    BSONObjBuilder temp;
+                    temp.append( keyE );
+                    key = temp.obj();
+                }
+                // second element of each object is the value; array-ify as "0","1",...
+                valueBuilder.appendAs( j.next() , BSONObjBuilder::numStr( n++ ).c_str() );
+            }
+
+            reduceArgs.appendArray( "values" , valueBuilder.obj() );
+            BSONObj args = reduceArgs.obj();
+
+            s->invokeSafe( reduce , args );
+            if ( s->type( "return" ) == Array ){
+                uassert( 10075 , "reduce -> multiple not supported yet",0);
+                return BSONObj();
+            }
+
+            if ( finalize ){
+                BSONObjBuilder b;
+                b.appendAs( key.firstElement() , "_id" );
+                s->append( b , "value" , "return" );
+                s->invokeSafe( finalize , b.obj() );
+            }
+
+            // package the scope's "return" value under the requested field names
+            BSONObjBuilder b;
+            b.appendAs( key.firstElement() , final ? "_id" : "0" );
+            s->append( b , final ? "value" : "1" , "return" );
+            return b.obj();
+        }
+
+        /* parses the mapreduce command object: resolves the source/temp/
+           incremental/final collection names and extracts the map, reduce,
+           finalize code plus query/sort/limit options. */
+        class MRSetup {
+        public:
+            MRSetup( const string& _dbname , const BSONObj& cmdObj , bool markAsTemp = true ){
+                static int jobNumber = 1;   // makes temp collection names unique within this process
+
+                dbname = _dbname;
+                ns = dbname + "." + cmdObj.firstElement().valuestr();
+
+                verbose = cmdObj["verbose"].trueValue();
+                keeptemp = cmdObj["keeptemp"].trueValue();
+
+                { // setup names
+                    stringstream ss;
+                    if ( ! keeptemp )
+                        ss << "tmp.";
+                    ss << "mr." << cmdObj.firstElement().fieldName() << "_" << time(0) << "_" << jobNumber++;
+                    tempShort = ss.str();
+                    tempLong = dbname + "." + tempShort;
+                    incLong = tempLong + "_inc";
+
+                    // temp collections get dropped automatically with the client
+                    if ( ! keeptemp && markAsTemp )
+                        cc().addTempCollection( tempLong );
+
+                    if ( cmdObj["out"].type() == String )
+                        finalShort = cmdObj["out"].valuestr();
+                    else
+                        finalShort = tempShort;
+
+                    finalLong = dbname + "." + finalShort;
+
+                }
+
+                { // code
+                    mapCode = cmdObj["map"].ascode();
+                    reduceCode = cmdObj["reduce"].ascode();
+                    if ( cmdObj["finalize"].type() ){
+                        finalizeCode = cmdObj["finalize"].ascode();
+                    }
+
+
+                    if ( cmdObj["mapparams"].type() == Array ){
+                        mapparams = cmdObj["mapparams"].embeddedObjectUserCheck();
+                    }
+
+                    if ( cmdObj["scope"].type() == Object ){
+                        scopeSetup = cmdObj["scope"].embeddedObjectUserCheck();
+                    }
+
+                }
+
+                { // query options
+                    if ( cmdObj["query"].type() == Object ){
+                        filter = cmdObj["query"].embeddedObjectUserCheck();
+                        q = filter;
+                    }
+
+                    if ( cmdObj["sort"].type() == Object )
+                        q.sort( cmdObj["sort"].embeddedObjectUserCheck() );
+
+                    if ( cmdObj["limit"].isNumber() )
+                        limit = cmdObj["limit"].numberLong();
+                    else
+                        limit = 0;
+                }
+            }
+
+            /**
+               move the temp collection into place as the final collection
+               (dropping any previous final collection) when they differ.
+               @return number objects in collection
+             */
+            long long renameIfNeeded( DBDirectClient& db ){
+                if ( finalLong != tempLong ){
+                    db.dropCollection( finalLong );
+                    if ( db.count( tempLong ) ){
+                        BSONObj info;
+                        uassert( 10076 , "rename failed" , db.runCommand( "admin" , BSON( "renameCollection" << tempLong << "to" << finalLong ) , info ) );
+                    }
+                }
+                return db.count( finalLong );
+            }
+
+            string dbname;
+            string ns;        // source collection: dbname.<collection>
+
+            // options
+            bool verbose;
+            bool keeptemp;    // keep the temp collection after the job?
+
+            // query options
+
+            BSONObj filter;
+            Query q;
+            long long limit;  // 0 means no limit
+
+            // functions
+
+            string mapCode;
+            string reduceCode;
+            string finalizeCode;   // empty if no finalize was given
+
+            BSONObj mapparams;
+            BSONObj scopeSetup;
+
+            // output tables
+            string incLong;        // incremental (spill) collection
+
+            string tempShort;
+            string tempLong;       // temp output collection: dbname.tempShort
+
+            string finalShort;
+            string finalLong;      // final output collection: dbname.finalShort
+
+        }; // end MRsetup
+
+        /* run-time state of one map/reduce job: a scripting scope with the
+           compiled map/reduce/finalize functions, plus a direct client for
+           the temp collections. */
+        class MRState {
+        public:
+            MRState( MRSetup& s ) : setup(s){
+                scope = globalScriptEngine->getPooledScope( setup.dbname );
+                scope->localConnect( setup.dbname.c_str() );
+
+                map = scope->createFunction( setup.mapCode.c_str() );
+                if ( ! map )
+                    throw UserException( 9012, (string)"map compile failed: " + scope->getError() );
+
+                reduce = scope->createFunction( setup.reduceCode.c_str() );
+                if ( ! reduce )
+                    throw UserException( 9013, (string)"reduce compile failed: " + scope->getError() );
+
+                if ( setup.finalizeCode.size() )
+                    finalize = scope->createFunction( setup.finalizeCode.c_str() );
+                else
+                    finalize = 0;
+
+                if ( ! setup.scopeSetup.isEmpty() )
+                    scope->init( &setup.scopeSetup );
+
+                // start from scratch: drop leftovers of any previous run
+                db.dropCollection( setup.tempLong );
+                db.dropCollection( setup.incLong );
+
+                // the incremental collection doesn't need an _id index
+                writelock l( setup.incLong );
+                string err;
+                assert( userCreateNS( setup.incLong.c_str() , BSON( "autoIndexId" << 0 ) , err , false ) );
+
+            }
+
+            /* reduce 'values' (all sharing one key) a final time and write
+               the result to the temp collection */
+            void finalReduce( list<BSONObj>& values ){
+                if ( values.size() == 0 )
+                    return;
+
+                BSONObj key = values.begin()->firstElement().wrap( "_id" );
+                BSONObj res = reduceValues( values , scope.get() , reduce , 1 , finalize );
+
+                writelock l( setup.tempLong );
+                theDataFileMgr.insertAndLog( setup.tempLong.c_str() , res , false );
+            }
+
+
+            MRSetup& setup;
+            auto_ptr<Scope> scope;
+            DBDirectClient db;
+
+            ScriptingFunction map;
+            ScriptingFunction reduce;
+            ScriptingFunction finalize;   // 0 if no finalize function was supplied
+
+        };
+
+        /* thread-local buffer of emitted (key, value) pairs.  Emits
+           accumulate in memory and are periodically reduced and/or spilled
+           to the incremental collection to bound memory use. */
+        class MRTL {
+        public:
+            MRTL( MRState& state ) : _state( state ){
+                _temp = new InMemory();
+                _size = 0;
+                numEmits = 0;
+            }
+            ~MRTL(){
+                delete _temp;
+            }
+
+
+            /* replace the in-memory map with a reduced version of itself:
+               single-value keys go straight to the db, multi-value keys are
+               collapsed with the reduce function and kept in memory. */
+            void reduceInMemory(){
+
+                InMemory * old = _temp;
+                InMemory * n = new InMemory();
+                _temp = n;
+                _size = 0;
+
+                for ( InMemory::iterator i=old->begin(); i!=old->end(); i++ ){
+                    BSONObj key = i->first;
+                    list<BSONObj>& all = i->second;
+
+                    if ( all.size() == 1 ){
+                        // this key has low cardinality, so just write to db
+                        writelock l(_state.setup.incLong);
+                        write( *(all.begin()) );
+                    }
+                    else if ( all.size() > 1 ){
+                        BSONObj res = reduceValues( all , _state.scope.get() , _state.reduce , false , 0 );
+                        insert( res );
+                    }
+                }
+
+                delete( old );
+
+            }
+
+            /* spill the entire in-memory buffer to the incremental collection */
+            void dump(){
+                writelock l(_state.setup.incLong);
+
+                for ( InMemory::iterator i=_temp->begin(); i!=_temp->end(); i++ ){
+                    list<BSONObj>& all = i->second;
+                    if ( all.size() < 1 )
+                        continue;
+
+                    for ( list<BSONObj>::iterator j=all.begin(); j!=all.end(); j++ )
+                        write( *j );
+                }
+                _temp->clear();
+                _size = 0;
+
+            }
+
+            /* buffer one emitted pair; grouped by the object's first element (the key) */
+            void insert( const BSONObj& a ){
+                list<BSONObj>& all = (*_temp)[a];
+                all.push_back( a );
+                _size += a.objsize() + 16;
+            }
+
+            /* if the buffer exceeds ~5k, reduce it in memory; if it still
+               exceeds ~15k after that, spill it to the db */
+            void checkSize(){
+                if ( _size < 1024 * 5 )
+                    return;
+
+                long before = _size;
+                reduceInMemory();
+                log(1) << " mr: did reduceInMemory " << before << " -->> " << _size << endl;
+
+                if ( _size < 1024 * 15 )
+                    return;
+
+                dump();
+                log(1) << " mr: dumping to db" << endl;
+            }
+
+        private:
+            void write( BSONObj& o ){
+                theDataFileMgr.insert( _state.setup.incLong.c_str() , o , true );
+            }
+
+            MRState& _state;
+
+            InMemory * _temp;    // emitted pairs grouped by key
+            long _size;          // rough byte size of _temp
+
+        public:
+            long long numEmits;  // total emit() calls observed
+        };
+
+        boost::thread_specific_ptr<MRTL> _tlmr;   // per-thread emit buffer, set by the command before mapping
+
+        /* native 'emit' implementation injected into the map function's
+           scope; stores the (key, value) pair into the thread-local MRTL.
+           assumes _tlmr is set for this thread. */
+        BSONObj fast_emit( const BSONObj& args ){
+            uassert( 10077 , "fast_emit takes 2 args" , args.nFields() == 2 );
+            _tlmr->insert( args );
+            _tlmr->numEmits++;
+            return BSONObj();
+        }
+
+        /* the 'mapreduce' command: runs map over the query results with
+           periodic in-memory reduce + spill, then a sorted final-reduce pass
+           over the incremental collection, then renames the temp collection
+           into place. */
+        class MapReduceCommand : public Command {
+        public:
+            MapReduceCommand() : Command("mapreduce"){}
+            virtual bool slaveOk() { return true; }
+
+            virtual void help( stringstream &help ) const {
+                help << "see http://www.mongodb.org/display/DOCS/MapReduce";
+            }
+
+            bool run(const char *dbname, BSONObj& cmd, string& errmsg, BSONObjBuilder& result, bool fromRepl ){
+                Timer t;
+                Client::GodScope cg;
+                MRSetup mr( cc().database()->name , cmd );
+
+                log(1) << "mr ns: " << mr.ns << endl;
+
+                if ( ! db.exists( mr.ns ) ){
+                    errmsg = "ns doesn't exist";
+                    return false;
+                }
+
+                bool shouldHaveData = false;   // set once the map phase emits anything
+
+                long long num = 0;             // documents mapped
+                long long inReduce = 0;        // micros spent in periodic in-memory reduces
+
+                BSONObjBuilder countsBuilder;
+                BSONObjBuilder timingBuilder;
+                try {
+
+                    MRState state( mr );
+                    state.scope->injectNative( "emit" , fast_emit );
+
+                    MRTL * mrtl = new MRTL( state );
+                    _tlmr.reset( mrtl );       // thread_specific_ptr owns and deletes mrtl
+
+                    ProgressMeter pm( db.count( mr.ns , mr.filter ) );
+                    auto_ptr<DBClientCursor> cursor = db.query( mr.ns , mr.q );
+                    long long mapTime = 0;
+                    Timer mt;
+                    // --- map phase ---
+                    while ( cursor->more() ){
+                        BSONObj o = cursor->next();
+
+                        if ( mr.verbose ) mt.reset();
+
+                        state.scope->setThis( &o );
+                        if ( state.scope->invoke( state.map , state.setup.mapparams , 0 , true ) )
+                            throw UserException( 9014, (string)"map invoke failed: " + state.scope->getError() );
+
+                        if ( mr.verbose ) mapTime += mt.micros();
+
+                        num++;
+                        // every 100 docs: maybe reduce/spill, and briefly yield the db lock
+                        if ( num % 100 == 0 ){
+                            Timer t;   // intentionally shadows the outer timer
+                            mrtl->checkSize();
+                            inReduce += t.micros();
+                            dbtemprelease temprlease;
+                        }
+                        pm.hit();
+
+                        if ( mr.limit && num >= mr.limit )
+                            break;
+                    }
+
+                    countsBuilder.append( "input" , num );
+                    countsBuilder.append( "emit" , mrtl->numEmits );
+                    if ( mrtl->numEmits )
+                        shouldHaveData = true;
+
+                    timingBuilder.append( "mapTime" , mapTime / 1000 );
+                    timingBuilder.append( "emitLoop" , t.millis() );
+
+                    // final reduce
+
+                    mrtl->reduceInMemory();
+                    mrtl->dump();
+
+                    // index the incremental collection by key so equal keys are adjacent
+                    BSONObj sortKey = BSON( "0" << 1 );
+                    db.ensureIndex( mr.incLong , sortKey );
+
+                    BSONObj prev;
+                    list<BSONObj> all;   // run of objects sharing the current key
+
+                    ProgressMeter fpm( db.count( mr.incLong ) );
+                    cursor = db.query( mr.incLong, Query().sort( sortKey ) );
+
+                    while ( cursor->more() ){
+                        BSONObj o = cursor->next().getOwned();
+
+                        if ( o.woSortOrder( prev , sortKey ) == 0 ){
+                            // same key as the previous object: extend the run
+                            all.push_back( o );
+                            continue;
+                        }
+
+                        // key changed: reduce the finished run into the temp collection
+                        state.finalReduce( all );
+
+                        all.clear();
+                        prev = o;
+                        all.push_back( o );
+                        fpm.hit();
+                        dbtemprelease tl;
+                    }
+
+                    state.finalReduce( all );   // flush the last run
+
+                    _tlmr.reset( 0 );
+                }
+                catch ( ... ){
+                    log() << "mr failed, removing collection" << endl;
+                    db.dropCollection( mr.tempLong );
+                    db.dropCollection( mr.incLong );
+                    throw;
+                }
+
+                db.dropCollection( mr.incLong );
+
+                long long finalCount = mr.renameIfNeeded( db );
+
+                timingBuilder.append( "total" , t.millis() );
+
+                result.append( "result" , mr.finalShort );
+                result.append( "timeMillis" , t.millis() );
+                countsBuilder.append( "output" , finalCount );
+                if ( mr.verbose ) result.append( "timing" , timingBuilder.obj() );
+                result.append( "counts" , countsBuilder.obj() );
+
+                if ( finalCount == 0 && shouldHaveData ){
+                    result.append( "cmd" , cmd );
+                    errmsg = "there were emits but no data!";
+                    return false;
+                }
+
+                return true;
+            }
+
+        private:
+            DBDirectClient db;
+
+        } mapReduceCommand;
+
+        /* sharded map/reduce finish step: merges the per-shard output
+           collections with a sorted parallel cursor, re-reduces runs of
+           equal keys, and renames the temp collection into place. */
+        class MapReduceFinishCommand : public Command {
+        public:
+            MapReduceFinishCommand() : Command( "mapreduce.shardedfinish" ){}
+            virtual bool slaveOk() { return true; }
+
+            bool run(const char *ns, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool){
+                dbtemprelease temprlease; // we don't touch the db directly
+
+                string dbname = cc().database()->name;
+                string shardedOutputCollection = cmdObj["shardedOutputCollection"].valuestrsafe();
+
+                MRSetup mr( dbname , cmdObj.firstElement().embeddedObjectUserCheck() , false );
+
+                set<ServerAndQuery> servers;
+
+                BSONObjBuilder shardCounts;        // per-shard counts sub-objects
+                map<string,long long> counts;      // totals summed across shards
+
+                // collect each shard's result info and accumulate its counts
+                BSONObj shards = cmdObj["shards"].embeddedObjectUserCheck();
+                vector< auto_ptr<DBClientCursor> > shardCursors;
+                BSONObjIterator i( shards );
+                while ( i.more() ){
+                    BSONElement e = i.next();
+                    string shard = e.fieldName();
+
+                    BSONObj res = e.embeddedObjectUserCheck();
+
+                    uassert( 10078 , "something bad happened" , shardedOutputCollection == res["result"].valuestrsafe() );
+                    servers.insert( shard );
+                    shardCounts.appendAs( res["counts"] , shard.c_str() );
+
+                    BSONObjIterator j( res["counts"].embeddedObjectUserCheck() );
+                    while ( j.more() ){
+                        BSONElement temp = j.next();
+                        counts[temp.fieldName()] += temp.numberLong();
+                    }
+
+                }
+
+                BSONObj sortKey = BSON( "_id" << 1 );
+
+                // merge-sorted stream across all shards' output collections
+                ParallelSortClusteredCursor cursor( servers , dbname + "." + shardedOutputCollection ,
+                                                    Query().sort( sortKey ) );
+
+
+                auto_ptr<Scope> s = globalScriptEngine->getPooledScope( ns );
+                ScriptingFunction reduceFunction = s->createFunction( mr.reduceCode.c_str() );
+                ScriptingFunction finalizeFunction = 0;
+                if ( mr.finalizeCode.size() )
+                    finalizeFunction = s->createFunction( mr.finalizeCode.c_str() );
+
+                list<BSONObj> values;   // run of objects sharing the current key
+
+                result.append( "result" , mr.finalShort );
+
+                DBDirectClient db;
+
+                while ( cursor.more() ){
+                    BSONObj t = cursor.next();
+
+                    if ( values.size() == 0 ){
+                        values.push_back( t );
+                        continue;
+                    }
+
+                    if ( t.woSortOrder( *(values.begin()) , sortKey ) == 0 ){
+                        // same key: extend the current run
+                        values.push_back( t );
+                        continue;
+                    }
+
+
+                    // key changed: reduce the finished run into the temp collection
+                    db.insert( mr.tempLong , reduceValues( values , s.get() , reduceFunction , 1 , finalizeFunction ) );
+                    values.clear();
+                    values.push_back( t );
+                }
+
+                if ( values.size() )
+                    db.insert( mr.tempLong , reduceValues( values , s.get() , reduceFunction , 1 , finalizeFunction ) );
+
+                long long finalCount = mr.renameIfNeeded( db );
+                log(0) << " mapreducefinishcommand " << mr.finalLong << " " << finalCount << endl;
+
+                // clean up the per-shard intermediate collections
+                for ( set<ServerAndQuery>::iterator i=servers.begin(); i!=servers.end(); i++ ){
+                    ScopedDbConnection conn( i->_server );
+                    conn->dropCollection( dbname + "." + shardedOutputCollection );
+                }
+
+                result.append( "shardCounts" , shardCounts.obj() );
+
+                {
+                    BSONObjBuilder c;
+                    for ( map<string,long long>::iterator i=counts.begin(); i!=counts.end(); i++ ){
+                        c.append( i->first , i->second );
+                    }
+                    result.append( "counts" , c.obj() );
+                }
+
+                return 1;
+            }
+        } mapReduceFinishCommand;
+
+ }
+
+}
+
diff --git a/db/namespace.cpp b/db/namespace.cpp
new file mode 100644
index 0000000..ecd5f64
--- /dev/null
+++ b/db/namespace.cpp
@@ -0,0 +1,753 @@
+// namespace.cpp
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "stdafx.h"
+#include "pdfile.h"
+#include "db.h"
+#include "../util/mmap.h"
+#include "../util/hashtab.h"
+#include "../scripting/engine.h"
+#include "btree.h"
+#include <algorithm>
+#include <list>
+#include "query.h"
+#include "queryutil.h"
+#include "json.h"
+
+namespace mongo {
+
+ BSONObj idKeyPattern = fromjson("{\"_id\":1}");
+
+ /* deleted lists -- linked lists of deleted records -- are placed in 'buckets' of various sizes
+ so you can look for a deleterecord about the right size.
+ */
+ int bucketSizes[] = {
+ 32, 64, 128, 256, 0x200, 0x400, 0x800, 0x1000, 0x2000, 0x4000,
+ 0x8000, 0x10000, 0x20000, 0x40000, 0x80000, 0x100000, 0x200000,
+ 0x400000, 0x800000
+ };
+
+    bool NamespaceIndex::exists() const {
+        // NOTE(review): the sense is inverted relative to the name -- this
+        // returns true when the .ns file does NOT yet exist on disk.  Callers
+        // appear to depend on that meaning ("needs creation"); confirm before
+        // changing.
+        return !boost::filesystem::exists(path());
+    }
+
+    boost::filesystem::path NamespaceIndex::path() const {
+        // Path of this database's namespace index file: <dir_>/<database_>.ns
+        return boost::filesystem::path( dir_ ) / ( database_ + ".ns" );
+    }
+
+    // size used when creating a brand new .ns file (must be >= 1MB)
+    int lenForNewNsFiles = 16 * 1024 * 1024;
+
+    // Map the .ns file into memory -- creating it at lenForNewNsFiles bytes if
+    // this is a new database -- and wrap the mapping in the namespace hash
+    // table.  Terminates the process if the file cannot be opened/mapped.
+    // Idempotent: returns immediately once ht is set.
+    void NamespaceIndex::init() {
+        if ( ht )
+            return;
+        /* if someone manually deleted the datafiles for a database,
+           we need to be sure to clear any cached info for the database in
+           local.*.
+        */
+        /*
+        if ( "local" != database_ ) {
+            DBInfo i(database_.c_str());
+            i.dbDropped();
+        }
+        */
+        int len = -1;
+        boost::filesystem::path nsPath = path();
+        string pathString = nsPath.string();
+        void *p;
+        if( boost::filesystem::exists(nsPath) ) {
+            // existing database: map the file as-is, but sanity check that its
+            // length is a whole number of megabytes before trusting it
+            p = f.map(pathString.c_str());
+            if( p ) {
+                len = f.length();
+                if ( len % (1024*1024) != 0 ){
+                    log() << "bad .ns file: " << pathString << endl;
+                    uassert( 10079 , "bad .ns file length, cannot open database", len % (1024*1024) == 0 );
+                }
+            }
+        }
+        else {
+            // use lenForNewNsFiles, we are making a new database
+            massert( 10343 , "bad lenForNewNsFiles", lenForNewNsFiles >= 1024*1024 );
+            long l = lenForNewNsFiles;
+            p = f.map(pathString.c_str(), l);
+            if( p ) {
+                len = (int) l;
+                assert( len == lenForNewNsFiles );
+            }
+        }
+
+        if ( p == 0 ) {
+            // cannot run without the namespace index -- fatal
+            problem() << "couldn't open file " << pathString << " terminating" << endl;
+            dbexit( EXIT_FS );
+        }
+        ht = new HashTable<Namespace,NamespaceDetails>(p, len, "namespace index");
+    }
+
+    /* Put a deleted record back on the appropriate free list.
+       Capped collections use a special free-list layout (see the deletedList
+       comments in namespace.h); non-capped records are pushed on the head of
+       the size bucket for this record's length.
+    */
+    void NamespaceDetails::addDeletedRec(DeletedRecord *d, DiskLoc dloc) {
+        {
+            // defensive code: try to make us notice if we reference a deleted record
+            (unsigned&) (((Record *) d)->data) = 0xeeeeeeee;
+        }
+        dassert( dloc.drec() == d );
+        DEBUGGING out() << "TEMP: add deleted rec " << dloc.toString() << ' ' << hex << d->extentOfs << endl;
+        if ( capped ) {
+            if ( !deletedList[ 1 ].isValid() ) {
+                // Initial extent allocation.  Insert at end.
+                d->nextDeleted = DiskLoc();
+                if ( deletedList[ 0 ].isNull() )
+                    deletedList[ 0 ] = dloc;
+                else {
+                    // walk to the tail of the single free list and append
+                    DiskLoc i = deletedList[ 0 ];
+                    for (; !i.drec()->nextDeleted.isNull(); i = i.drec()->nextDeleted );
+                    i.drec()->nextDeleted = dloc;
+                }
+            } else {
+                // push on the front of the current cap extent's free list
+                d->nextDeleted = firstDeletedInCapExtent();
+                firstDeletedInCapExtent() = dloc;
+            }
+        } else {
+            // non-capped: push on the head of the bucket for this record size
+            int b = bucket(d->lengthWithHeaders);
+            DiskLoc& list = deletedList[b];
+            DiskLoc oldHead = list;
+            list = dloc;
+            d->nextDeleted = oldHead;
+        }
+    }
+
+    /*
+       lenToAlloc is WITH header
+    */
+    // Allocate space for a record: round the request up to a 4-byte multiple,
+    // grab a deleted record via _alloc(), and -- in the non-capped case, when
+    // the region found is comfortably larger than needed -- split the
+    // remainder back onto the deleted lists.  Returns a null DiskLoc if no
+    // space is available; on success extentLoc is set to the extent containing
+    // the returned location.
+    DiskLoc NamespaceDetails::alloc(const char *ns, int lenToAlloc, DiskLoc& extentLoc) {
+        lenToAlloc = (lenToAlloc + 3) & 0xfffffffc; // round up to 4-byte boundary
+        DiskLoc loc = _alloc(ns, lenToAlloc);
+        if ( loc.isNull() )
+            return loc;
+
+        DeletedRecord *r = loc.drec();
+
+        /* note we want to grab from the front so our next pointers on disk tend
+           to go in a forward direction which is important for performance. */
+        int regionlen = r->lengthWithHeaders;
+        extentLoc.set(loc.a(), r->extentOfs);
+        assert( r->extentOfs < loc.getOfs() );
+
+        DEBUGGING out() << "TEMP: alloc() returns " << loc.toString() << ' ' << ns << " lentoalloc:" << lenToAlloc << " ext:" << extentLoc.toString() << endl;
+
+        int left = regionlen - lenToAlloc;
+        if ( capped == 0 ) {
+            // keep the whole region if the leftover would be too small to be a
+            // useful deleted record (< 24 bytes or < 1/8 of the request)
+            if ( left < 24 || left < (lenToAlloc >> 3) ) {
+                // you get the whole thing.
+                return loc;
+            }
+        }
+
+        /* split off some for further use. */
+        r->lengthWithHeaders = lenToAlloc;
+        DiskLoc newDelLoc = loc;
+        newDelLoc.inc(lenToAlloc);
+        DeletedRecord *newDel = newDelLoc.drec();
+        newDel->extentOfs = r->extentOfs;
+        newDel->lengthWithHeaders = left;
+        newDel->nextDeleted.Null();
+
+        addDeletedRec(newDel, newDelLoc);
+
+        return loc;
+    }
+
+    /* for non-capped collections.
+       returned item is out of the deleted list upon return
+    */
+    // Best-fit-ish search of the size-bucketed free lists: start at the bucket
+    // for len, walk the chain looking for a record large enough, peek at a few
+    // further candidates ("extra") hoping for a tighter fit, and climb to
+    // larger buckets when a chain is exhausted.  Returns DiskLoc() when no
+    // suitable deleted record exists (the caller then allocates a new extent).
+    DiskLoc NamespaceDetails::__stdAlloc(int len) {
+        DiskLoc *prev;
+        DiskLoc *bestprev = 0;
+        DiskLoc bestmatch;
+        int bestmatchlen = 0x7fffffff;
+        int b = bucket(len);
+        DiskLoc cur = deletedList[b];
+        prev = &deletedList[b];
+        int extra = 5; // look for a better fit, a little.
+        int chain = 0;
+        while ( 1 ) {
+            {
+                // defensive: a wildly out-of-range file number means the list
+                // is corrupt; clip it and keep going
+                int a = cur.a();
+                if ( a < -1 || a >= 100000 ) {
+                    problem() << "~~ Assertion - cur out of range in _alloc() " << cur.toString() <<
+                        " a:" << a << " b:" << b << " chain:" << chain << '\n';
+                    sayDbContext();
+                    if ( cur == *prev )
+                        prev->Null();
+                    cur.Null();
+                }
+            }
+            if ( cur.isNull() ) {
+                // move to next bucket.  if we were doing "extra", just break
+                if ( bestmatchlen < 0x7fffffff )
+                    break;
+                b++;
+                if ( b > MaxBucket ) {
+                    // out of space. alloc a new extent.
+                    return DiskLoc();
+                }
+                cur = deletedList[b];
+                prev = &deletedList[b];
+                continue;
+            }
+            DeletedRecord *r = cur.drec();
+            if ( r->lengthWithHeaders >= len &&
+                 r->lengthWithHeaders < bestmatchlen ) {
+                // smallest adequate record seen so far
+                bestmatchlen = r->lengthWithHeaders;
+                bestmatch = cur;
+                bestprev = prev;
+            }
+            if ( bestmatchlen < 0x7fffffff && --extra <= 0 )
+                break;
+            if ( ++chain > 30 && b < MaxBucket ) {
+                // too slow, force move to next bucket to grab a big chunk
+                //b++;
+                chain = 0;
+                cur.Null();
+            }
+            else {
+                /*this defensive check only made sense for the mmap storage engine:
+                  if ( r->nextDeleted.getOfs() == 0 ) {
+                      problem() << "~~ Assertion - bad nextDeleted " << r->nextDeleted.toString() <<
+                          " b:" << b << " chain:" << chain << ", fixing.\n";
+                      r->nextDeleted.Null();
+                  }*/
+                cur = r->nextDeleted;
+                prev = &r->nextDeleted;
+            }
+        }
+
+        /* unlink ourself from the deleted list */
+        {
+            DeletedRecord *bmr = bestmatch.drec();
+            *bestprev = bmr->nextDeleted;
+            bmr->nextDeleted.setInvalid(); // defensive.
+            assert(bmr->extentOfs < bestmatch.getOfs());
+        }
+
+        return bestmatch;
+    }
+
+    // Debugging aid: print every deleted record on every bucket's free list,
+    // optionally filtered/annotated against the given set of extents.
+    void NamespaceDetails::dumpDeleted(set<DiskLoc> *extents) {
+        for ( int i = 0; i < Buckets; i++ ) {
+            DiskLoc dl = deletedList[i];
+            while ( !dl.isNull() ) {
+                DeletedRecord *r = dl.drec();
+                DiskLoc extLoc(dl.a(), r->extentOfs);
+                if ( extents == 0 || extents->count(extLoc) <= 0 ) {
+                    out() << "  bucket " << i << endl;
+                    out() << "   " << dl.toString() << " ext:" << extLoc.toString();
+                    if ( extents && extents->count(extLoc) <= 0 )
+                        out() << '?'; // record's extent not in the supplied set
+                    out() << " len:" << r->lengthWithHeaders << endl;
+                }
+                dl = r->nextDeleted;
+            }
+        }
+    }
+
+    /* combine adjacent deleted records
+
+       this is O(n^2) but we call it for capped tables where typically n==1 or 2!
+       (or 3...there will be a little unused sliver at the end of the extent.)
+    */
+    // Capped collections only: pull the current cap extent's deleted records
+    // off the free list, sort them by disk position, merge runs that are
+    // physically adjacent, and push the merged records back on the list.
+    void NamespaceDetails::compact() {
+        assert(capped);
+
+        list<DiskLoc> drecs;
+
+        // Pull out capExtent's DRs from deletedList
+        DiskLoc i = firstDeletedInCapExtent();
+        for (; !i.isNull() && inCapExtent( i ); i = i.drec()->nextDeleted )
+            drecs.push_back( i );
+        firstDeletedInCapExtent() = i;
+
+        // This is the O(n^2) part.
+        drecs.sort();
+
+        list<DiskLoc>::iterator j = drecs.begin();
+        assert( j != drecs.end() );
+        DiskLoc a = *j;
+        while ( 1 ) {
+            j++;
+            if ( j == drecs.end() ) {
+                // last record: nothing left to merge with
+                DEBUGGING out() << "TEMP: compact adddelrec\n";
+                addDeletedRec(a.drec(), a);
+                break;
+            }
+            DiskLoc b = *j;
+            // merge b (and any further records) into a while they are
+            // byte-adjacent within the same file
+            while ( a.a() == b.a() && a.getOfs() + a.drec()->lengthWithHeaders == b.getOfs() ) {
+                // a & b are adjacent.  merge.
+                a.drec()->lengthWithHeaders += b.drec()->lengthWithHeaders;
+                j++;
+                if ( j == drecs.end() ) {
+                    DEBUGGING out() << "temp: compact adddelrec2\n";
+                    addDeletedRec(a.drec(), a);
+                    return;
+                }
+                b = *j;
+            }
+            DEBUGGING out() << "temp: compact adddelrec3\n";
+            addDeletedRec(a.drec(), a);
+            a = b;
+        }
+    }
+
+    // First record of the collection, skipping empty extents.  The extent scan
+    // begins at startExtent when given, else at firstExtent.
+    DiskLoc NamespaceDetails::firstRecord( const DiskLoc &startExtent ) const {
+        for (DiskLoc i = startExtent.isNull() ? firstExtent : startExtent;
+             !i.isNull(); i = i.ext()->xnext ) {
+            if ( !i.ext()->firstRecord.isNull() )
+                return i.ext()->firstRecord;
+        }
+        return DiskLoc(); // collection is empty
+    }
+
+    // Last record of the collection, scanning extents backwards from
+    // startExtent when given, else from lastExtent.
+    DiskLoc NamespaceDetails::lastRecord( const DiskLoc &startExtent ) const {
+        for (DiskLoc i = startExtent.isNull() ? lastExtent : startExtent;
+             !i.isNull(); i = i.ext()->xprev ) {
+            if ( !i.ext()->lastRecord.isNull() )
+                return i.ext()->lastRecord;
+        }
+        return DiskLoc(); // collection is empty
+    }
+
+    /* --- capped collection free-list helpers ---
+       deletedList[ 0 ] is the free list for all extents; deletedList[ 1 ]
+       bookmarks the last deleted record BEFORE the current cap extent (null
+       when capExtent's records start the list).  See namespace.h.
+    */
+    DiskLoc &NamespaceDetails::firstDeletedInCapExtent() {
+        if ( deletedList[ 1 ].isNull() )
+            return deletedList[ 0 ];
+        else
+            return deletedList[ 1 ].drec()->nextDeleted;
+    }
+
+    // True if dl lies inside the current cap extent.
+    bool NamespaceDetails::inCapExtent( const DiskLoc &dl ) const {
+        assert( !dl.isNull() );
+        // We could have a rec or drec, doesn't matter.
+        return dl.drec()->myExtent( dl ) == capExtent.ext();
+    }
+
+    // True if the deleted record following dl lies in the current cap extent.
+    bool NamespaceDetails::nextIsInCapExtent( const DiskLoc &dl ) const {
+        assert( !dl.isNull() );
+        DiskLoc next = dl.drec()->nextDeleted;
+        if ( next.isNull() )
+            return false;
+        return inCapExtent( next );
+    }
+
+    // Move capExtent to the next extent (wrapping around to firstExtent),
+    // keeping the deletedList[ 1 ] bookmark in sync and resetting
+    // capFirstNewRecord for the new extent.
+    void NamespaceDetails::advanceCapExtent( const char *ns ) {
+        // We want deletedList[ 1 ] to be the last DeletedRecord of the prev cap extent
+        // (or DiskLoc() if new capExtent == firstExtent)
+        if ( capExtent == lastExtent )
+            deletedList[ 1 ] = DiskLoc();
+        else {
+            DiskLoc i = firstDeletedInCapExtent();
+            for (; !i.isNull() && nextIsInCapExtent( i ); i = i.drec()->nextDeleted );
+            deletedList[ 1 ] = i;
+        }
+
+        capExtent = theCapExtent()->xnext.isNull() ? firstExtent : theCapExtent()->xnext;
+
+        /* this isn't true if a collection has been renamed...that is ok just used for diagnostics */
+        //dassert( theCapExtent()->ns == ns );
+
+        theCapExtent()->assertOk();
+        capFirstNewRecord = DiskLoc();
+    }
+
+    int n_complaints_cap = 0; // throttle: only the first few failures are logged
+    // Log diagnostics (at most ~8 times per process) when a capped collection
+    // cannot make room for a record of the given length, dumping per-extent
+    // state to aid debugging.
+    void NamespaceDetails::maybeComplain( const char *ns, int len ) const {
+        if ( ++n_complaints_cap < 8 ) {
+            out() << "couldn't make room for new record (len: " << len << ") in capped ns " << ns << '\n';
+            int i = 0;
+            for ( DiskLoc e = firstExtent; !e.isNull(); e = e.ext()->xnext, ++i ) {
+                out() << "  Extent " << i;
+                if ( e == capExtent )
+                    out() << " (capExtent)";
+                out() << '\n';
+                out() << "    magic: " << hex << e.ext()->magic << dec << " extent->ns: " << e.ext()->nsDiagnostic.buf << '\n';
+                out() << "    fr: " << e.ext()->firstRecord.toString() <<
+                    " lr: " << e.ext()->lastRecord.toString() << " extent->len: " << e.ext()->length << '\n';
+            }
+            assert( len * 5 > lastExtentSize ); // assume it is unusually large record; if not, something is broken
+        }
+    }
+
+    // Allocate len bytes from the current cap extent's free list.  A candidate
+    // must have at least len + 24 bytes so a DeletedRecord can remain at the
+    // end of the extent; returns DiskLoc() if nothing in this extent fits.
+    // On success the chosen record is unlinked from the free list.
+    DiskLoc NamespaceDetails::__capAlloc( int len ) {
+        DiskLoc prev = deletedList[ 1 ];
+        DiskLoc i = firstDeletedInCapExtent();
+        DiskLoc ret;
+        for (; !i.isNull() && inCapExtent( i ); prev = i, i = i.drec()->nextDeleted ) {
+            // We need to keep at least one DR per extent in deletedList[ 0 ],
+            // so make sure there's space to create a DR at the end.
+            if ( i.drec()->lengthWithHeaders >= len + 24 ) {
+                ret = i;
+                break;
+            }
+        }
+
+        /* unlink ourself from the deleted list */
+        if ( !ret.isNull() ) {
+            if ( prev.isNull() )
+                deletedList[ 0 ] = ret.drec()->nextDeleted;
+            else
+                prev.drec()->nextDeleted = ret.drec()->nextDeleted;
+            ret.drec()->nextDeleted.setInvalid(); // defensive.
+            assert( ret.drec()->extentOfs < ret.getOfs() );
+        }
+
+        return ret;
+    }
+
+    // One-time migration of the old on-disk NamespaceDetails format for capped
+    // collections (detected by a zero capExtent): collapse all per-size
+    // deleted-record buckets into deletedList[ 0 ] and initialize the cap
+    // bookkeeping fields.  No-op for already-migrated or non-capped namespaces.
+    void NamespaceDetails::checkMigrate() {
+        // migrate old NamespaceDetails format
+        if ( capped && capExtent.a() == 0 && capExtent.getOfs() == 0 ) {
+            capFirstNewRecord = DiskLoc();
+            capFirstNewRecord.setInvalid();
+            // put all the DeletedRecords in deletedList[ 0 ]
+            for ( int i = 1; i < Buckets; ++i ) {
+                DiskLoc first = deletedList[ i ];
+                if ( first.isNull() )
+                    continue;
+                // splice this bucket's chain onto the front of bucket 0
+                DiskLoc last = first;
+                for (; !last.drec()->nextDeleted.isNull(); last = last.drec()->nextDeleted );
+                last.drec()->nextDeleted = deletedList[ 0 ];
+                deletedList[ 0 ] = first;
+                deletedList[ i ] = DiskLoc();
+            }
+            // NOTE deletedList[ 1 ] set to DiskLoc() in above
+
+            // Last, in case we're killed before getting here
+            capExtent = firstExtent;
+        }
+    }
+
+    /* alloc with capped table handling. */
+    // Non-capped collections defer to __stdAlloc.  For capped collections,
+    // loop: try __capAlloc; otherwise advance through extents (first pass
+    // deletes nothing) and then delete the oldest records until space opens
+    // up.  Gives up with DiskLoc() if every extent is seen empty (the record
+    // is larger than any extent can hold).
+    DiskLoc NamespaceDetails::_alloc(const char *ns, int len) {
+        if ( !capped )
+            return __stdAlloc(len);
+
+        // capped.
+
+        // signal done allocating new extents.
+        if ( !deletedList[ 1 ].isValid() )
+            deletedList[ 1 ] = DiskLoc();
+
+        assert( len < 400000000 );
+        int passes = 0;
+        DiskLoc loc;
+
+        // delete records until we have room and the max # objects limit achieved.
+
+        /* this fails on a rename -- that is ok but must keep commented out */
+        //assert( theCapExtent()->ns == ns );
+
+        theCapExtent()->assertOk();
+        DiskLoc firstEmptyExtent;
+        while ( 1 ) {
+            if ( nrecords < max ) {
+                loc = __capAlloc( len );
+                if ( !loc.isNull() )
+                    break;
+            }
+
+            // If on first iteration through extents, don't delete anything.
+            if ( !capFirstNewRecord.isValid() ) {
+                advanceCapExtent( ns );
+                if ( capExtent != firstExtent )
+                    capFirstNewRecord.setInvalid();
+                // else signal done with first iteration through extents.
+                continue;
+            }
+
+            if ( !capFirstNewRecord.isNull() &&
+                 theCapExtent()->firstRecord == capFirstNewRecord ) {
+                // We've deleted all records that were allocated on the previous
+                // iteration through this extent.
+                advanceCapExtent( ns );
+                continue;
+            }
+
+            if ( theCapExtent()->firstRecord.isNull() ) {
+                // remember the first empty extent we hit; seeing it again means
+                // we wrapped all the way around without finding room
+                if ( firstEmptyExtent.isNull() )
+                    firstEmptyExtent = capExtent;
+                advanceCapExtent( ns );
+                if ( firstEmptyExtent == capExtent ) {
+                    maybeComplain( ns, len );
+                    return DiskLoc();
+                }
+                continue;
+            }
+
+            // evict the oldest record in the current cap extent to make room
+            massert( 10344 , "Capped collection full and delete not allowed", cappedMayDelete() );
+            DiskLoc fr = theCapExtent()->firstRecord;
+            theDataFileMgr.deleteRecord(ns, fr.rec(), fr, true);
+            compact();
+            if( ++passes >= 5000 ) {
+                log() << "passes ns:" << ns << " len:" << len << '\n';
+                log() << "passes max:" << max << " nrecords:" << nrecords << " datasize: " << datasize << endl;
+                massert( 10345 , "passes >= 5000 in capped collection alloc", false );
+            }
+        }
+
+        // Remember first record allocated on this iteration through capExtent.
+        if ( capFirstNewRecord.isValid() && capFirstNewRecord.isNull() )
+            capFirstNewRecord = loc;
+
+        return loc;
+    }
+
+    /* you MUST call when adding an index. see pdfile.cpp */
+    // Reserve the next IndexDetails slot -- allocating the $extra block when
+    // the base array of NIndexesBase (10) is full -- bump nIndexes, and notify
+    // the transient cache.  The caller populates the returned slot.
+    IndexDetails& NamespaceDetails::addIndex(const char *thisns) {
+        assert( nsdetails(thisns) == this );
+
+        if( nIndexes == NIndexesBase && extraOffset == 0 ) {
+            nsindex(thisns)->allocExtra(thisns);
+        }
+
+        IndexDetails& id = idx(nIndexes);
+        nIndexes++;
+        NamespaceDetailsTransient::get_w(thisns).addedIndex();
+        return id;
+    }
+
+    // must be called when renaming a NS to fix up extra
+    // Re-allocates the $extra index block under this (new) name and copies the
+    // extra index details over from src.
+    void NamespaceDetails::copyingFrom(const char *thisns, NamespaceDetails *src) {
+        if( extraOffset ) {
+            extraOffset = 0; // so allocExtra() doesn't assert.
+            Extra *e = nsindex(thisns)->allocExtra(thisns);
+            memcpy(e, src->extra(), sizeof(Extra));
+        }
+    }
+
+    /* returns index of the first index in which the field is present. -1 if not present.
+       (aug08 - this method not currently used)
+    */
+    int NamespaceDetails::fieldIsIndexed(const char *fieldName) {
+        massert( 10346 , "not implemented", false); // dead code: always asserts
+        /*
+        for ( int i = 0; i < nIndexes; i++ ) {
+            IndexDetails& idx = indexes[i];
+            BSONObj idxKey = idx.info.obj().getObjectField("key"); // e.g., { ts : -1 }
+            if ( !idxKey.findElement(fieldName).eoo() )
+                return i;
+        }*/
+        return -1;
+    }
+
+    // Total on-disk size in bytes of all extents belonging to this namespace.
+    long long NamespaceDetails::storageSize(){
+        Extent * e = firstExtent.ext();
+        assert( e );
+
+        long long total = 0;
+        while ( e ){
+            total += e->length;
+            e = e->getNextExtent();
+        }
+        return total;
+    }
+
+ /* ------------------------------------------------------------------------- */
+
+    boost::mutex NamespaceDetailsTransient::_qcMutex;
+    map< string, shared_ptr< NamespaceDetailsTransient > > NamespaceDetailsTransient::_map;
+    typedef map< string, shared_ptr< NamespaceDetailsTransient > >::iterator ouriter;
+
+    // Drop all cached per-namespace state: query cache, computed index keys,
+    // cached index specs.
+    void NamespaceDetailsTransient::reset() {
+        clearQueryCache();
+        _keysComputed = false;
+        _indexSpecs.clear();
+    }
+
+/*    NamespaceDetailsTransient& NamespaceDetailsTransient::get(const char *ns) {
+        shared_ptr< NamespaceDetailsTransient > &t = map_[ ns ];
+        if ( t.get() == 0 )
+            t.reset( new NamespaceDetailsTransient(ns) );
+        return *t;
+    }
+*/
+    // Clear every cached entry whose namespace starts with prefix (e.g. all
+    // collections of a database).  Requires the write lock.
+    void NamespaceDetailsTransient::clearForPrefix(const char *prefix) {
+        assertInWriteLock();
+        vector< string > found;
+        for( ouriter i = _map.begin(); i != _map.end(); ++i )
+            if ( strncmp( i->first.c_str(), prefix, strlen( prefix ) ) == 0 )
+                found.push_back( i->first );
+        for( vector< string >::iterator i = found.begin(); i != found.end(); ++i ) {
+            // NOTE: this is shared_ptr::reset() -- it releases the cached
+            // object entirely (leaving a null entry), it does not call
+            // NamespaceDetailsTransient::reset() on it.
+            _map[ *i ].reset();
+        }
+    }
+
+    // Recompute the set of indexed field names from this namespace's current
+    // index key patterns.
+    void NamespaceDetailsTransient::computeIndexKeys() {
+        _keysComputed = true;
+        _indexKeys.clear();
+        NamespaceDetails *d = nsdetails(_ns.c_str());
+        NamespaceDetails::IndexIterator i = d->ii();
+        while( i.more() )
+            i.next().keyPattern().getFieldNames(_indexKeys);
+    }
+
+    // Begin capped logging for this ns: create a capped collection
+    // "local.temp.oplog.<ns>" of logSizeMb megabytes with deletes disallowed.
+    void NamespaceDetailsTransient::cllStart( int logSizeMb ) {
+        assertInWriteLock();
+        _cll_ns = "local.temp.oplog." + _ns;
+        _cll_enabled = true;
+        stringstream spec;
+        // 128MB
+        spec << "{size:" << logSizeMb * 1024 * 1024 << ",capped:true,autoIndexId:false}";
+        setClient( _cll_ns.c_str() );
+        string err;
+        massert( 10347 , "Could not create log ns", userCreateNS( _cll_ns.c_str(), fromjson( spec.str() ), err, false ) );
+        NamespaceDetails *d = nsdetails( _cll_ns.c_str() );
+        d->cappedDisallowDelete();
+    }
+
+    // Abort logging: drop the log collection and mark logging disabled.
+    void NamespaceDetailsTransient::cllInvalidate() {
+        assertInWriteLock();
+        cllDrop();
+        _cll_enabled = false;
+    }
+
+    // Finish logging: drop the log collection and report whether logging was
+    // still enabled (i.e. had not been invalidated in the meantime).
+    bool NamespaceDetailsTransient::cllValidateComplete() {
+        assertInWriteLock();
+        cllDrop();
+        bool ret = _cll_enabled;
+        _cll_enabled = false;
+        _cll_ns = "";
+        return ret;
+    }
+
+    // Drop the log collection if logging is enabled; flags are left unchanged.
+    void NamespaceDetailsTransient::cllDrop() {
+        assertInWriteLock();
+        if ( !_cll_enabled )
+            return;
+        setClient( _cll_ns.c_str() );
+        dropNS( _cll_ns );
+    }
+
+ /* ------------------------------------------------------------------------- */
+
+    /* add a new namespace to the system catalog (<dbname>.system.namespaces).
+       options: { capped : ..., size : ... }
+    */
+    void addNewNamespaceToCatalog(const char *ns, const BSONObj *options = 0) {
+        log(1) << "New namespace: " << ns << '\n';
+        if ( strstr(ns, "system.namespaces") ) {
+            // system.namespaces holds all the others, so it is not explicitly listed in the catalog.
+            // TODO: fix above should not be strstr!
+            return;
+        }
+
+        {
+            // build { name: <ns> [, options: <options>] } and insert it into
+            // <dbname>.system.namespaces
+            BSONObjBuilder b;
+            b.append("name", ns);
+            if ( options )
+                b.append("options", *options);
+            BSONObj j = b.done();
+            char database[256];
+            nsToDatabase(ns, database);
+            string s = database;
+            s += ".system.namespaces";
+            theDataFileMgr.insert(s.c_str(), j.objdata(), j.objsize(), true);
+        }
+    }
+
+    // Rename a collection within the same database: moves the NamespaceDetails
+    // entry, rewrites the system.namespaces catalog row, and rewrites every
+    // system.indexes row (renaming each index's btree namespace as well).
+    // Precondition: `from` exists and `to` does not.
+    void renameNamespace( const char *from, const char *to ) {
+        NamespaceIndex *ni = nsindex( from );
+        assert( ni && ni->details( from ) && !ni->details( to ) );
+
+        // Our namespace and index details will move to a different
+        // memory location.  The only references to namespace and
+        // index details across commands are in cursors and nsd
+        // transient (including query cache) so clear these.
+        ClientCursor::invalidate( from );
+        NamespaceDetailsTransient::clearForPrefix( from );
+
+        // copy the details under the new name, then kill the old entry
+        NamespaceDetails *details = ni->details( from );
+        ni->add_ns( to, *details );
+        NamespaceDetails *todetails = ni->details( to );
+        try {
+            todetails->copyingFrom(to, details); // fixes extraOffset
+        }
+        catch( DBException& ) {
+            // could end up here if .ns is full - if so try to clean up / roll back a little
+            ni->kill_ns(to);
+            throw;
+        }
+        ni->kill_ns( from );
+        details = todetails;
+
+        // rewrite the system.namespaces catalog entry, preserving options but
+        // updating any "create" field to the new name
+        BSONObj oldSpec;
+        char database[MaxDatabaseLen];
+        nsToDatabase(from, database);
+        string s = database;
+        s += ".system.namespaces";
+        assert( Helpers::findOne( s.c_str(), BSON( "name" << from ), oldSpec ) );
+
+        BSONObjBuilder newSpecB;
+        BSONObjIterator i( oldSpec.getObjectField( "options" ) );
+        while( i.more() ) {
+            BSONElement e = i.next();
+            if ( strcmp( e.fieldName(), "create" ) != 0 )
+                newSpecB.append( e );
+            else
+                newSpecB << "create" << to;
+        }
+        BSONObj newSpec = newSpecB.done();
+        addNewNamespaceToCatalog( to, newSpec.isEmpty() ? 0 : &newSpec );
+
+        deleteObjects( s.c_str(), BSON( "name" << from ), false, false, true );
+        // oldSpec variable no longer valid memory
+
+        // rewrite each system.indexes entry, pointing its "ns" at the new name
+        // and renaming the underlying index namespace
+        BSONObj oldIndexSpec;
+        s = database;
+        s += ".system.indexes";
+        while( Helpers::findOne( s.c_str(), BSON( "ns" << from ), oldIndexSpec ) ) {
+            BSONObjBuilder newIndexSpecB;
+            BSONObjIterator i( oldIndexSpec );
+            while( i.more() ) {
+                BSONElement e = i.next();
+                if ( strcmp( e.fieldName(), "ns" ) != 0 )
+                    newIndexSpecB.append( e );
+                else
+                    newIndexSpecB << "ns" << to;
+            }
+            BSONObj newIndexSpec = newIndexSpecB.done();
+            DiskLoc newIndexSpecLoc = theDataFileMgr.insert( s.c_str(), newIndexSpec.objdata(), newIndexSpec.objsize(), true, BSONElement(), false );
+            int indexI = details->findIndexByName( oldIndexSpec.getStringField( "name" ) );
+            IndexDetails &indexDetails = details->idx(indexI);
+            string oldIndexNs = indexDetails.indexNamespace();
+            indexDetails.info = newIndexSpecLoc;
+            string newIndexNs = indexDetails.indexNamespace();
+
+            BtreeBucket::renameIndexNamespace( oldIndexNs.c_str(), newIndexNs.c_str() );
+            deleteObjects( s.c_str(), oldIndexSpec.getOwned(), true, false, true );
+        }
+    }
+
+    // True if clients may write directly to this system collection:
+    // *.system.users always; *.system.js too, with the stored-function cache
+    // invalidated on write.
+    bool legalClientSystemNS( const string& ns , bool write ){
+        // NOTE(review): these are substring matches, so e.g.
+        // "db.system.usersfoo" also qualifies -- an exact suffix match would
+        // be stricter; confirm whether that matters to callers.
+        if ( ns.find( ".system.users" ) != string::npos )
+            return true;
+
+        if ( ns.find( ".system.js" ) != string::npos ){
+            if ( write )
+                Scope::storedFuncMod(); // invalidate cached stored JS functions
+            return true;
+        }
+
+        return false;
+    }
+
+} // namespace mongo
diff --git a/db/namespace.h b/db/namespace.h
new file mode 100644
index 0000000..df4c62f
--- /dev/null
+++ b/db/namespace.h
@@ -0,0 +1,653 @@
+// namespace.h
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+#include "../stdafx.h"
+#include "jsobj.h"
+#include "queryutil.h"
+#include "storage.h"
+#include "../util/hashtab.h"
+#include "../util/mmap.h"
+
+namespace mongo {
+
+ class Cursor;
+
+#pragma pack(1)
+
+ /* in the mongo source code, "client" means "database". */
+
+    const int MaxDatabaseLen = 256; // max str len for the db name, including null char
+
+    // "database.a.b.c" -> "database"
+    // Copies the database prefix of ns (everything before the first '.') into
+    // the caller-supplied buffer, which must hold at least MaxDatabaseLen
+    // bytes.  Terminates the process if the database component is too long.
+    inline void nsToDatabase(const char *ns, char *database) {
+        const char *p = ns;
+        char *q = database;
+        while ( *p != '.' ) {
+            if ( *p == 0 )
+                break;
+            if ( q - database >= MaxDatabaseLen - 1 ) {
+                // bail out BEFORE writing past the fixed-size buffer; the
+                // previous version only checked after the copy, by which point
+                // the overrun had already happened.
+                log() << "nsToDatabase: ns too long. terminating, buf overrun condition" << endl;
+                dbexit( EXIT_POSSIBLE_CORRUPTION );
+            }
+            *q++ = *p++;
+        }
+        *q = 0;
+    }
+    // Convenience overload returning the database prefix as a string.
+    inline string nsToDatabase(const char *ns) {
+        char buf[MaxDatabaseLen];
+        nsToDatabase(ns, buf);
+        return buf;
+    }
+
+    /* e.g.
+       NamespaceString ns("acme.orders");
+       cout << ns.coll; // "orders"
+    */
+    // Splits a full namespace "db.collection" into its two components.
+    class NamespaceString {
+    public:
+        string db;
+        string coll; // note collection names can have periods in them for organizing purposes (e.g. "system.indexes")
+    private:
+        void init(const char *ns) {
+            const char *p = strchr(ns, '.');
+            if( p == 0 ) return; // no '.': both components stay empty
+            db = string(ns, p - ns);
+            coll = p + 1;
+        }
+    public:
+        NamespaceString( const char * ns ) { init(ns); }
+        NamespaceString( const string& ns ) { init(ns.c_str()); }
+
+        // true for reserved "system.*" collections
+        bool isSystem() {
+            return strncmp(coll.c_str(), "system.", 7) == 0;
+        }
+    };
+
+    /* This helper class is used to make the HashMap below in NamespaceDetails */
+    // Fixed-size (128-byte, NUL-terminated) namespace name, stored inline in
+    // the .ns file as the hashtable key.
+    class Namespace {
+    public:
+        enum MaxNsLenValue { MaxNsLen = 128 };
+        Namespace(const char *ns) {
+            *this = ns;
+        }
+        Namespace& operator=(const char *ns) {
+            uassert( 10080 , "ns name too long, max size is 128", strlen(ns) < MaxNsLen);
+            //memset(buf, 0, MaxNsLen); /* this is just to keep stuff clean in the files for easy dumping and reading */
+            strcpy_s(buf, MaxNsLen, ns);
+            return *this;
+        }
+
+        /* for more than 10 indexes -- see NamespaceDetails::Extra */
+        string extraName() {
+            string s = string(buf) + "$extra";
+            massert( 10348 , "ns name too long", s.size() < MaxNsLen);
+            return s;
+        }
+
+        // Mark this entry as deleted; 0x7f in the first byte is presumably the
+        // tombstone sentinel interpreted by HashTable (see util/hashtab.h) --
+        // confirm there before changing.
+        void kill() {
+            buf[0] = 0x7f;
+        }
+
+        bool operator==(const char *r) {
+            return strcmp(buf, r) == 0;
+        }
+        bool operator==(const Namespace& r) {
+            return strcmp(buf, r.buf) == 0;
+        }
+        // Simple multiplicative string hash over the name bytes.
+        int hash() const {
+            unsigned x = 0;
+            const char *p = buf;
+            while ( *p ) {
+                x = x * 131 + *p;
+                p++;
+            }
+            // NOTE(review): the OR constant is 0x8000000 (27-bit), not
+            // 0x80000000; it still guarantees a positive result, which is all
+            // that is required here.
+            return (x & 0x7fffffff) | 0x8000000; // must be > 0
+        }
+
+        /**
+           ( foo.bar ).getSisterNS( "blah" ) == foo.blah
+           perhaps this should move to the NamespaceString helper?
+        */
+        string getSisterNS( const char * local ) {
+            assert( local && local[0] != '.' );
+            string old(buf);
+            if ( old.find( "." ) != string::npos )
+                old = old.substr( 0 , old.find( "." ) );
+            return old + "." + local;
+        }
+
+        operator string() const {
+            return (string)buf;
+        }
+
+        char buf[MaxNsLen];
+    };
+
+}
+
+#include "index.h"
+
+namespace mongo {
+
+ /**
+ @return true if a client can modify this namespace
+ things like *.system.users
+ */
+ bool legalClientSystemNS( const string& ns , bool write );
+
+
+ /* deleted lists -- linked lists of deleted records -- are placed in 'buckets' of various sizes
+ so you can look for a deleterecord about the right size.
+ */
+ const int Buckets = 19;
+ const int MaxBucket = 18;
+
+ extern int bucketSizes[];
+
+ /* this is the "header" for a collection that has all its details. in the .ns file.
+ */
+ class NamespaceDetails {
+ friend class NamespaceIndex;
+ enum { NIndexesExtra = 30,
+ NIndexesBase = 10
+ };
+ struct Extra {
+ // note we could use this field for more chaining later, so don't waste it:
+ unsigned long long reserved1;
+ IndexDetails details[NIndexesExtra];
+ unsigned reserved2;
+ unsigned reserved3;
+ };
+ Extra* extra() {
+ assert( extraOffset );
+ return (Extra *) (((char *) this) + extraOffset);
+ }
+ public:
+ void copyingFrom(const char *thisns, NamespaceDetails *src); // must be called when renaming a NS to fix up extra
+
+ enum { NIndexesMax = 40 };
+
+ BOOST_STATIC_ASSERT( NIndexesMax == NIndexesBase + NIndexesExtra );
+
+ NamespaceDetails( const DiskLoc &loc, bool _capped ) {
+ /* be sure to initialize new fields here -- doesn't default to zeroes the way we use it */
+ firstExtent = lastExtent = capExtent = loc;
+ datasize = nrecords = 0;
+ lastExtentSize = 0;
+ nIndexes = 0;
+ capped = _capped;
+ max = 0x7fffffff;
+ paddingFactor = 1.0;
+ flags = 0;
+ capFirstNewRecord = DiskLoc();
+ // Signal that we are on first allocation iteration through extents.
+ capFirstNewRecord.setInvalid();
+ // For capped case, signal that we are doing initial extent allocation.
+ if ( capped )
+ deletedList[ 1 ].setInvalid();
+ assert( sizeof(dataFileVersion) == 2 );
+ dataFileVersion = 0;
+ indexFileVersion = 0;
+ multiKeyIndexBits = 0;
+ reservedA = 0;
+ extraOffset = 0;
+ backgroundIndexBuildInProgress = 0;
+ memset(reserved, 0, sizeof(reserved));
+ }
+ DiskLoc firstExtent;
+ DiskLoc lastExtent;
+
+ /* NOTE: capped collections override the meaning of deleted list.
+ deletedList[0] points to a list of free records (DeletedRecord's) for all extents in
+ the namespace.
+ deletedList[1] points to the last record in the prev extent. When the "current extent"
+ changes, this value is updated. !deletedList[1].isValid() when this value is not
+ yet computed.
+ */
+ DiskLoc deletedList[Buckets];
+
+ long long datasize;
+ long long nrecords;
+ int lastExtentSize;
+ int nIndexes;
+ private:
+ IndexDetails _indexes[NIndexesBase];
+ public:
+ int capped;
+ int max; // max # of objects for a capped table.
+ double paddingFactor; // 1.0 = no padding.
+ int flags;
+ DiskLoc capExtent;
+ DiskLoc capFirstNewRecord;
+
+ /* NamespaceDetails version. So we can do backward compatibility in the future.
+ See filever.h
+ */
+ unsigned short dataFileVersion;
+ unsigned short indexFileVersion;
+
+ unsigned long long multiKeyIndexBits;
+ private:
+ unsigned long long reservedA;
+ long long extraOffset; // where the $extra info is located (bytes relative to this)
+ public:
+ int backgroundIndexBuildInProgress; // 1 if in prog
+ char reserved[76];
+
+ /* NOTE: be careful with flags. are we manipulating them in read locks? if so,
+ this isn't thread safe. TODO
+ */
+ enum NamespaceFlags {
+ Flag_HaveIdIndex = 1 << 0, // set when we have _id index (ONLY if ensureIdIndex was called -- 0 if that has never been called)
+ Flag_CappedDisallowDelete = 1 << 1 // set when deletes not allowed during capped table allocation.
+ };
+
+ IndexDetails& idx(int idxNo) {
+ if( idxNo < NIndexesBase )
+ return _indexes[idxNo];
+ return extra()->details[idxNo-NIndexesBase];
+ }
+
+ class IndexIterator {
+ friend class NamespaceDetails;
+ int i;
+ int n;
+ NamespaceDetails *d;
+ Extra *e;
+ IndexIterator(NamespaceDetails *_d) {
+ d = _d;
+ i = 0;
+ n = d->nIndexes;
+ if( n > NIndexesBase )
+ e = d->extra();
+ }
+ public:
+ int pos() { return i; } // note this is the next one to come
+ bool more() { return i < n; }
+ IndexDetails& next() {
+ int k = i;
+ i++;
+ return k < NIndexesBase ? d->_indexes[k] :
+ e->details[k-10];
+ }
+ };
+
+ IndexIterator ii() {
+ return IndexIterator(this);
+ }
+
+ /* hackish - find our index # in the indexes array
+ */
+ int idxNo(IndexDetails& idx) {
+ IndexIterator i = ii();
+ while( i.more() ) {
+ if( &i.next() == &idx )
+ return i.pos()-1;
+ }
+ massert( 10349 , "E12000 idxNo fails", false);
+ return -1;
+ }
+
+ /* multikey indexes are indexes where there are more than one key in the index
+ for a single document. see multikey in wiki.
+ for these, we have to do some dedup work on queries.
+ */
+ bool isMultikey(int i) {
+ return (multiKeyIndexBits & (((unsigned long long) 1) << i)) != 0;
+ }
+ void setIndexIsMultikey(int i) {
+ dassert( i < NIndexesMax );
+ multiKeyIndexBits |= (((unsigned long long) 1) << i);
+ }
+ void clearIndexIsMultikey(int i) {
+ dassert( i < NIndexesMax );
+ multiKeyIndexBits &= ~(((unsigned long long) 1) << i);
+ }
+
+ /* add a new index. does not add to system.indexes etc. - just to NamespaceDetails.
+ caller must populate returned object.
+ */
+ IndexDetails& addIndex(const char *thisns);
+
+ void aboutToDeleteAnIndex() {
+ flags &= ~Flag_HaveIdIndex;
+ }
+
+ void cappedDisallowDelete() {
+ flags |= Flag_CappedDisallowDelete;
+ }
+
+ /* returns index of the first index in which the field is present. -1 if not present. */
+ int fieldIsIndexed(const char *fieldName);
+
+ void paddingFits() {
+ double x = paddingFactor - 0.01;
+ if ( x >= 1.0 )
+ paddingFactor = x;
+ }
+ void paddingTooSmall() {
+ double x = paddingFactor + 0.6;
+ if ( x <= 2.0 )
+ paddingFactor = x;
+ }
+
+        //returns offset in indexes[], or -1 if no index has that name
+        int findIndexByName(const char *name) {
+            IndexIterator i = ii();
+            while( i.more() ) {
+                if ( strcmp(i.next().info.obj().getStringField("name"),name) == 0 )
+                    return i.pos()-1;
+            }
+            return -1;
+        }
+
+        //returns offset in indexes[], or -1 if no index has that key pattern
+        int findIndexByKeyPattern(const BSONObj& keyPattern) {
+            IndexIterator i = ii();
+            while( i.more() ) {
+                if( i.next().keyPattern() == keyPattern )
+                    return i.pos()-1;
+            }
+            return -1;
+        }
+
+        /* @return -1 = not found
+           generally id is first index, so not that expensive an operation (assuming present).
+        */
+        int findIdIndex() {
+            IndexIterator i = ii();
+            while( i.more() ) {
+                if( i.next().isIdIndex() )
+                    return i.pos()-1;
+            }
+            return -1;
+        }
+
+        /* return which "deleted bucket" for this size object: the first bucket
+           whose size bound exceeds n, or the last bucket for oversized objects */
+        static int bucket(int n) {
+            int b = 0;
+            while ( b < Buckets-1 && bucketSizes[b] <= n )
+                ++b;
+            return b;
+        }
+
+        /* allocate a new record. lenToAlloc includes headers. */
+        DiskLoc alloc(const char *ns, int lenToAlloc, DiskLoc& extentLoc);
+
+        /* add a given record to the deleted chains for this NS */
+        void addDeletedRec(DeletedRecord *d, DiskLoc dloc);
+
+        // diagnostic dump of the deleted-record chains (optionally collects extent locs)
+        void dumpDeleted(set<DiskLoc> *extents = 0);
+
+        // a capped collection has "looped" once capFirstNewRecord holds a valid loc
+        bool capLooped() const {
+            return capped && capFirstNewRecord.isValid();
+        }
+
+        // Start from firstExtent by default.
+        DiskLoc firstRecord( const DiskLoc &startExtent = DiskLoc() ) const;
+
+        // Start from lastExtent by default.
+        DiskLoc lastRecord( const DiskLoc &startExtent = DiskLoc() ) const;
+
+        bool inCapExtent( const DiskLoc &dl ) const;
+
+        void checkMigrate();
+
+        long long storageSize();
+
+    private:
+        // deletes are permitted unless Flag_CappedDisallowDelete has been set
+        bool cappedMayDelete() const {
+            return !( flags & Flag_CappedDisallowDelete );
+        }
+        Extent *theCapExtent() const {
+            return capExtent.ext();
+        }
+        void advanceCapExtent( const char *ns );
+        void maybeComplain( const char *ns, int len ) const;
+        DiskLoc __stdAlloc(int len);
+        DiskLoc __capAlloc(int len);
+        DiskLoc _alloc(const char *ns, int len);
+        void compact(); // combine adjacent deleted records
+
+        DiskLoc &firstDeletedInCapExtent();
+        bool nextIsInCapExtent( const DiskLoc &dl ) const;
+    };
+
+#pragma pack()
+
+ /* these are things we know / compute about a namespace that are transient -- things
+ we don't actually store in the .ns file. so mainly caching of frequently used
+ information.
+
+ CAUTION: Are you maintaining this properly on a collection drop()? A dropdatabase()? Be careful.
+ The current field "allIndexKeys" may have too many keys in it on such an occurrence;
+ as currently used that does not cause anything terrible to happen.
+
+ todo: cleanup code, need abstractions and separation
+ */
+    class NamespaceDetailsTransient : boost::noncopyable {
+        /* general ------------------------------------------------------------- */
+    private:
+        string _ns;
+        void reset();
+        // registry of per-namespace transient state, keyed by full namespace name
+        static std::map< string, shared_ptr< NamespaceDetailsTransient > > _map;
+    public:
+        NamespaceDetailsTransient(const char *ns) : _ns(ns), _keysComputed(false), _qcWriteCount(), _cll_enabled() { }
+        /* _get() is not threadsafe */
+        static NamespaceDetailsTransient& _get(const char *ns);
+        /* use get_w() when doing write operations */
+        static NamespaceDetailsTransient& get_w(const char *ns) {
+            DEV assertInWriteLock();
+            return _get(ns);
+        }
+        // index set changed: discard all cached per-namespace state via reset()
+        void addedIndex() { reset(); }
+        void deletedIndex() { reset(); }
+        /* Drop cached information on all namespaces beginning with the specified prefix.
+           Can be useful as index namespaces share the same start as the regular collection.
+           SLOW - sequential scan of all NamespaceDetailsTransient objects */
+        static void clearForPrefix(const char *prefix);
+
+        /* indexKeys() cache ---------------------------------------------------- */
+        /* assumed to be in write lock for this */
+    private:
+        bool _keysComputed;   // guards lazy computation of _indexKeys
+        set<string> _indexKeys;
+        void computeIndexKeys();
+    public:
+        /* get set of index keys for this namespace. handy to quickly check if a given
+           field is indexed (Note it might be a secondary component of a compound index.)
+        */
+        set<string>& indexKeys() {
+            DEV assertInWriteLock();
+            if ( !_keysComputed )
+                computeIndexKeys();
+            return _indexKeys;
+        }
+
+        /* IndexSpec caching */
+    private:
+        map<const IndexDetails*,IndexSpec> _indexSpecs;
+    public:
+        // lazily parse and cache the IndexSpec for an index; an empty meta
+        // object marks a not-yet-initialized cache slot
+        const IndexSpec& getIndexSpec( const IndexDetails * details ){
+            DEV assertInWriteLock();
+            IndexSpec& spec = _indexSpecs[details];
+            if ( spec.meta.isEmpty() ){
+                spec.reset( details->info );
+            }
+            return spec;
+        }
+
+        /* query cache (for query optimizer) ------------------------------------- */
+    private:
+        int _qcWriteCount;   // writes seen since the cache was last cleared
+        map< QueryPattern, pair< BSONObj, long long > > _qcCache; // pattern -> (index key, nScanned)
+    public:
+        static boost::mutex _qcMutex;
+        /* you must be in the qcMutex when calling this (and using the returned val): */
+        static NamespaceDetailsTransient& get_inlock(const char *ns) {
+            return _get(ns);
+        }
+        void clearQueryCache() { // public for unit tests
+            _qcCache.clear();
+            _qcWriteCount = 0;
+        }
+        /* you must notify the cache if you are doing writes, as query plan optimality will change */
+        void notifyOfWriteOp() {
+            if ( _qcCache.empty() )
+                return;
+            // after 100 writes the cached plans are considered stale
+            if ( ++_qcWriteCount >= 100 )
+                clearQueryCache();
+        }
+        // both lookups default-insert an empty entry for unknown patterns
+        BSONObj indexForPattern( const QueryPattern &pattern ) {
+            return _qcCache[ pattern ].first;
+        }
+        long long nScannedForPattern( const QueryPattern &pattern ) {
+            return _qcCache[ pattern ].second;
+        }
+        void registerIndexForPattern( const QueryPattern &pattern, const BSONObj &indexKey, long long nScanned ) {
+            _qcCache[ pattern ] = make_pair( indexKey, nScanned );
+        }
+
+        /* for collection-level logging -- see CmdLogCollection ----------------- */
+        /* assumed to be in write lock for this */
+    private:
+        string _cll_ns; // "local.temp.oplog." + _ns;
+        bool _cll_enabled;
+        void cllDrop(); // drop _cll_ns
+    public:
+        string cllNS() const { return _cll_ns; }
+        bool cllEnabled() const { return _cll_enabled; }
+        void cllStart( int logSizeMb = 256 ); // begin collection level logging
+        void cllInvalidate();
+        bool cllValidateComplete();
+
+    }; /* NamespaceDetailsTransient */
+
+    /* find-or-create the transient record for ns. operator[] default-constructs
+       an empty shared_ptr on first use; not threadsafe (see declaration). */
+    inline NamespaceDetailsTransient& NamespaceDetailsTransient::_get(const char *ns) {
+        shared_ptr< NamespaceDetailsTransient > &entry = _map[ ns ];
+        if ( !entry )
+            entry.reset( new NamespaceDetailsTransient(ns) );
+        return *entry;
+    }
+
+ /* NamespaceIndex is the ".ns" file you see in the data directory. It is the "system catalog"
+ if you will: at least the core parts. (Additional info in system.* collections.)
+ */
+    class NamespaceIndex {
+        friend class NamespaceCursor;
+        // Extra records are stored in the same hashtable slots as NamespaceDetails
+        BOOST_STATIC_ASSERT( sizeof(NamespaceDetails::Extra) <= sizeof(NamespaceDetails) );
+    public:
+        NamespaceIndex(const string &dir, const string &database) :
+            ht( 0 ),
+            dir_( dir ),
+            database_( database ) {}
+
+        /* returns true if new db will be created if we init lazily */
+        bool exists() const;
+
+        void init();
+
+        // build a NamespaceDetails whose first extent is 'loc' and register it
+        void add_ns(const char *ns, DiskLoc& loc, bool capped) {
+            NamespaceDetails details( loc, capped );
+            add_ns( ns, details );
+        }
+
+        void add_ns( const char *ns, const NamespaceDetails &details ) {
+            init();
+            Namespace n(ns);
+            uassert( 10081 , "too many namespaces/collections", ht->put(n, details));
+        }
+
+        /* just for diagnostics */
+        // NOTE(review): declared size_t but returns -1 (i.e. (size_t)-1) when the
+        // hashtable is not allocated -- callers must treat that as "no offset"
+        size_t detailsOffset(NamespaceDetails *d) {
+            if ( !ht )
+                return -1;
+            return ((char *) d) - (char *) ht->nodes;
+        }
+
+        /* extra space for indexes when more than 10 */
+        NamespaceDetails::Extra* allocExtra(const char *ns) {
+            Namespace n(ns);
+            Namespace extra(n.extraName().c_str()); // throws userexception if ns name too long
+            NamespaceDetails *d = details(ns);
+            massert( 10350 , "allocExtra: base ns missing?", d );
+            assert( d->extraOffset == 0 );
+            massert( 10351 , "allocExtra: extra already exists", ht->get(extra) == 0 );
+            NamespaceDetails::Extra temp;
+            memset(&temp, 0, sizeof(temp));
+            uassert( 10082 , "allocExtra: too many namespaces/collections", ht->put(extra, (NamespaceDetails&) temp));
+            NamespaceDetails::Extra *e = (NamespaceDetails::Extra *) ht->get(extra);
+            // store the relative offset so NamespaceDetails::extra() can locate it
+            d->extraOffset = ((char *) e) - ((char *) d);
+            assert( d->extra() == e );
+            return e;
+        }
+
+        NamespaceDetails* details(const char *ns) {
+            if ( !ht )
+                return 0;
+            Namespace n(ns);
+            NamespaceDetails *d = ht->get(n);
+            if ( d )
+                d->checkMigrate(); // lazily upgrade on-disk format on access
+            return d;
+        }
+
+        void kill_ns(const char *ns) {
+            if ( !ht )
+                return;
+            Namespace n(ns);
+            ht->kill(n);
+
+            // also remove the "$extra" companion entry if one exists;
+            // extraName() may throw for long names, meaning there is none
+            try {
+                Namespace extra(n.extraName().c_str());
+                ht->kill(extra);
+            }
+            catch(DBException&) { }
+        }
+
+        // look up a namespace; on success 'loc' is set to its first extent
+        bool find(const char *ns, DiskLoc& loc) {
+            NamespaceDetails *l = details(ns);
+            if ( l ) {
+                loc = l->firstExtent;
+                return true;
+            }
+            return false;
+        }
+
+        bool allocated() const {
+            return ht != 0;
+        }
+
+    private:
+        boost::filesystem::path path() const;
+
+        MemoryMappedFile f;
+        HashTable<Namespace,NamespaceDetails> *ht;  // view over the memory-mapped .ns file
+        string dir_;
+        string database_;
+    };
+
+ extern string dbpath; // --dbpath parm
+
+ // Rename a namespace within current 'client' db.
+ // (Arguments should include db name)
+ void renameNamespace( const char *from, const char *to );
+
+} // namespace mongo
diff --git a/db/nonce.cpp b/db/nonce.cpp
new file mode 100644
index 0000000..4c677be
--- /dev/null
+++ b/db/nonce.cpp
@@ -0,0 +1,74 @@
+// nonce.cpp
+
+/* Copyright 2009 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "stdafx.h"
+#include "nonce.h"
+
+extern int do_md5_test(void);
+
+namespace mongo {
+
+    Security::Security() {
+        static int n;
+        // enforce singleton usage. NOTE(review): the counter is not synchronized;
+        // presumably construction happens during single-threaded static init -- confirm
+        massert( 10352 , "Security is a singleton class", ++n == 1);
+        init();
+    }
+
+    /* one-time setup of the platform random source. idempotent: may be invoked
+       both by the constructor and by getNonceInitSafe() during static init. */
+    void Security::init(){
+        if( _initialized ) return;
+        _initialized = true;
+
+#if defined(__linux__)
+        _devrandom = new ifstream("/dev/urandom", ios::binary|ios::in);
+        // error text previously said "dev/urandom"; report the actual path opened
+        massert( 10353 , "can't open /dev/urandom", _devrandom->is_open() );
+#elif defined(_WIN32)
+        srand(curTimeMicros());
+#else
+        srandomdev();
+#endif
+        assert( sizeof(nonce) == 8 );
+
+#ifndef NDEBUG
+        if ( do_md5_test() )
+            massert( 10354 , "md5 unit test fails", false);
+#endif
+    }
+
+    nonce Security::getNonce(){
+        // serialize access to the shared random source
+        static boost::mutex m;
+        boostlock lk(m);
+
+        /* question/todo: /dev/random works on OS X. is it better
+           to use that than random() / srandom()?
+        */
+
+        nonce n;
+#if defined(__linux__)
+        _devrandom->read((char*)&n, sizeof(n));
+        massert( 10355 , "devrandom failed", !_devrandom->fail());
+#elif defined(_WIN32)
+        // NOTE(review): rand() typically yields only ~15 random bits per call, so
+        // this nonce carries far fewer than 64 bits of entropy -- confirm acceptable
+        n = (((unsigned long long)rand())<<32) | rand();
+#else
+        // random() returns 31-bit values; same entropy caveat as above
+        n = (((unsigned long long)random())<<32) | random();
+#endif
+        return n;
+    }
+
+ bool Security::_initialized;
+ Security security;
+
+} // namespace mongo
diff --git a/db/nonce.h b/db/nonce.h
new file mode 100644
index 0000000..593931f
--- /dev/null
+++ b/db/nonce.h
@@ -0,0 +1,42 @@
+// nonce.h
+
+/* Copyright 2009 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+namespace mongo {
+
+ typedef unsigned long long nonce;
+
+    /* process-wide source of 64-bit nonces, backed by /dev/urandom on linux
+       and the C PRNG elsewhere (see nonce.cpp). intended as a singleton. */
+    struct Security {
+        Security();
+
+        nonce getNonce();
+
+        /** safe during global var initialization */
+        nonce getNonceInitSafe() {
+            init();
+            return getNonce();
+        }
+    private:
+        ifstream *_devrandom; // open handle to /dev/urandom (linux builds only)
+        static bool _initialized;
+        void init(); // can call more than once
+    };
+
+ extern Security security;
+
+} // namespace mongo
diff --git a/db/pcre.txt b/db/pcre.txt
new file mode 100644
index 0000000..3e21047
--- /dev/null
+++ b/db/pcre.txt
@@ -0,0 +1,15 @@
+
+
+You need to install pcre.
+
+This could be scripted:
+
+cd /tmp
+curl -O ftp://ftp.csx.cam.ac.uk/pub/software/programming/pcre/pcre-7.4.tar.gz
+tar -xzf pcre-7.4.tar.gz
+cd pcre-7.4
+./configure --enable-utf8 --with-match-limit=200000 --with-match-limit-recursion=4000
+make
+make install
+
+
+At that point it will be installed in /usr/*. The version in p/pcre-7.4 is for VC++.
diff --git a/db/pdfile.cpp b/db/pdfile.cpp
new file mode 100644
index 0000000..18df5f1
--- /dev/null
+++ b/db/pdfile.cpp
@@ -0,0 +1,1649 @@
+// pdfile.cpp
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/*
+todo:
+_ table scans must be sequential, not next/prev pointers
+_ coalesce deleted
+
+_ disallow system* manipulations from the database.
+*/
+
+#include "stdafx.h"
+#include "pdfile.h"
+#include "db.h"
+#include "../util/mmap.h"
+#include "../util/hashtab.h"
+#include "../util/file_allocator.h"
+#include "btree.h"
+#include <algorithm>
+#include <list>
+#include "query.h"
+#include "repl.h"
+#include "dbhelpers.h"
+#include "namespace.h"
+#include "queryutil.h"
+#include "extsort.h"
+#include "curop.h"
+
+namespace mongo {
+
+ string dbpath = "/data/db/";
+
+ DataFileMgr theDataFileMgr;
+ DatabaseHolder dbHolder;
+ int MAGIC = 0x1000;
+// int curOp = -2;
+
+ extern int otherTraceLevel;
+ void addNewNamespaceToCatalog(const char *ns, const BSONObj *options = 0);
+    /* auto-create an _id index for a freshly created collection; system
+       collections and the $freelist pseudo-collection are exempt */
+    void ensureIdIndexForNewNs(const char *ns) {
+        bool isSystem = strstr( ns, ".system." ) != 0;
+        bool isFreeList = strstr( ns, ".$freelist" ) != 0;
+        if ( !isSystem && !isFreeList ) {
+            log( 1 ) << "adding _id index for new collection" << endl;
+            ensureHaveIdIndex( ns );
+        }
+    }
+
+    /* describe the current client's database and namespace for diagnostics;
+       returns "" when no client context is attached to this thread */
+    string getDbContext() {
+        stringstream out;
+        Client *client = currentClient.get();
+        if ( client ) {
+            Database *db = client->database();
+            if ( db )
+                out << db->name << ' ' << cc().ns() << ' ';
+        }
+        return out.str();
+    }
+
+    /* view a stored Record's payload as a BSONObj.
+       second init() arg presumably means "do not take ownership of the buffer"
+       -- verify against init()'s declaration */
+    BSONObj::BSONObj(const Record *r) {
+        init(r->data, false);
+    }
+
+ /*---------------------------------------------------------------------*/
+
+    /* choose the first extent size for a collection given a record length:
+       64x the length for small records, 16x otherwise, capped at ~1GB and
+       rounded down to a 256-byte boundary */
+    int initialExtentSize(int len) {
+        long long sz = len * 16;  // NOTE(review): multiply happens in int before widening -- confirm callers bound len
+        if ( len < 1000 ) sz = len * 64;
+        if ( sz > 1000000000 )
+            sz = 1000000000;
+        int z = ((int)sz) & 0xffffff00; // round down to a multiple of 256
+        assert( z > len );
+        DEV log() << "initialExtentSize(" << len << ") returns " << z << endl;
+        return z;
+    }
+
+    /* create a new collection: register it in system.namespaces, allocate its
+       initial extent(s), and (unless capped or disabled) ensure an _id index.
+       returns false with err set if the collection already exists. */
+    bool _userCreateNS(const char *ns, const BSONObj& j, string& err) {
+        if ( nsdetails(ns) ) {
+            err = "collection already exists";
+            return false;
+        }
+
+        log(1) << "create collection " << ns << ' ' << j << '\n';
+
+        /* todo: do this only when we have allocated space successfully? or we could insert with a { ok: 0 } field
+           and then go back and set to ok : 1 after we are done.
+        */
+        bool isFreeList = strstr(ns, ".$freelist") != 0;
+        if( !isFreeList )
+            addNewNamespaceToCatalog(ns, j.isEmpty() ? 0 : &j);
+
+        // default size when the spec does not supply one
+        long long size = initialExtentSize(128);
+        BSONElement e = j.findElement("size");
+        if ( e.isNumber() ) {
+            size = (long long) e.number();
+            // round the requested size up to a 256-byte boundary
+            size += 256;
+            size &= 0xffffffffffffff00LL;
+        }
+
+        uassert( 10083 , "invalid size spec", size > 0 );
+
+        bool newCapped = false;
+        int mx = 0;
+        e = j.findElement("capped");
+        if ( e.type() == Bool && e.boolean() ) {
+            newCapped = true;
+            e = j.findElement("max"); // max document count for capped collections
+            if ( e.isNumber() ) {
+                mx = (int) e.number();
+            }
+        }
+
+        // $nExtents just for debug/testing. We create '$nExtents' extents,
+        // each of size 'size'.
+        e = j.findElement( "$nExtents" );
+        int nExtents = int( e.number() );
+        Database *database = cc().database();
+        if ( nExtents > 0 ) {
+            assert( size <= 0x7fffffff );
+            for ( int i = 0; i < nExtents; ++i ) {
+                assert( size <= 0x7fffffff );
+                // $nExtents is just for testing - always allocate new extents
+                // rather than reuse existing extents so we have some predictibility
+                // in the extent size used by our tests
+                database->suitableFile( (int) size )->createExtent( ns, (int) size, newCapped );
+            }
+        } else {
+            // allocate extents until the requested total size is covered,
+            // bounded per-extent by the data file's maximum usable size
+            while ( size > 0 ) {
+                int max = MongoDataFile::maxSize() - MDFHeader::headerSize();
+                int desiredExtentSize = (int) (size > max ? max : size);
+                Extent *e = database->allocExtent( ns, desiredExtentSize, newCapped );
+                size -= e->length;
+            }
+            if ( !newCapped ) {
+                // check if it's time to preallocate a new file, and if so queue that job for a bg thread
+                // safe to call this multiple times - the implementation will only preallocate one file
+                database->preallocateAFile();
+            }
+        }
+
+        NamespaceDetails *d = nsdetails(ns);
+        assert(d);
+
+        // explicit autoIndexId wins; otherwise non-capped collections get an _id index
+        if ( j.getField( "autoIndexId" ).type() ) {
+            if ( j["autoIndexId"].trueValue() ){
+                ensureIdIndexForNewNs( ns );
+            }
+        } else {
+            if ( !newCapped ) {
+                ensureIdIndexForNewNs( ns );
+            }
+        }
+
+        if ( mx > 0 )
+            d->max = mx;
+
+        return true;
+    }
+
+ // { ..., capped: true, size: ..., max: ... }
+ // returns true if successful
+    /* wrapper around _userCreateNS that additionally writes a "create" command
+       to the oplog when logForReplication is set and creation succeeded */
+    bool userCreateNS(const char *ns, BSONObj j, string& err, bool logForReplication) {
+        const char *coll = strchr( ns, '.' ) + 1;
+        massert( 10356 , "invalid ns", coll && *coll );
+        char cl[ 256 ];
+        nsToDatabase( ns, cl );
+        bool ok = _userCreateNS(ns, j, err);
+        if ( logForReplication && ok ) {
+            // normalize the spec into a { create: <coll>, ... } command object
+            if ( j.getField( "create" ).eoo() ) {
+                BSONObjBuilder b;
+                b << "create" << coll;
+                b.appendElements( j );
+                j = b.obj();
+            }
+            string logNs = string( cl ) + ".$cmd";
+            logOp("c", logNs.c_str(), j);
+        }
+        return ok;
+    }
+
+ /*---------------------------------------------------------------------*/
+
+    /* largest data file we will create: 512MB on 32 bit builds (limited by
+       mmap address space), just under 2GB otherwise */
+    int MongoDataFile::maxSize() {
+        return ( sizeof( int* ) == 4 ) ? 512 * 1024 * 1024 : 0x7ff00000;
+    }
+
+    /* default size for the fileNo'th data file: doubles from 64MB up to the
+       cap, with overrides for the _hudsonSmall test fixture and --smallfiles */
+    int MongoDataFile::defaultSize( const char *filename ) const {
+        int size;
+
+        if ( fileNo <= 4 )
+            size = (64*1024*1024) << fileNo;
+        else
+            size = 0x7ff00000;
+
+        if ( strstr(filename, "_hudsonSmall") ) {
+            int mult = 1;
+            if ( fileNo > 1 && fileNo < 1000 )
+                mult = fileNo;
+            size = 1024 * 512 * mult;
+            log() << "Warning : using small files for _hudsonSmall" << endl;
+        }
+        else if ( cmdLine.smallfiles ){
+            size = size >> 2; // --smallfiles: quarter the default
+        }
+
+
+        return size;
+    }
+
+    /* map (or preallocate) this data file. grows the default size by doubling
+       until minSize is met, enforces the per-db quota, and initializes the
+       header after mapping. */
+    void MongoDataFile::open( const char *filename, int minSize, bool preallocateOnly ) {
+        {
+            /* check quotas
+               very simple temporary implementation - we will in future look up
+               the quota from the grid database
+            */
+            if ( cmdLine.quota && fileNo > cmdLine.quotaFiles && !boost::filesystem::exists(filename) ) {
+                /* todo: if we were adding / changing keys in an index did we do some
+                   work previously that needs cleaning up?  Possible.  We should
+                   check code like that and have it catch the exception and do
+                   something reasonable.
+                */
+                string s = "db disk space quota exceeded ";
+                Database *database = cc().database();
+                if ( database )
+                    s += database->name;
+                uasserted(12501,s);
+            }
+        }
+
+        // double from the default until minSize fits, clamped to maxSize()
+        long size = defaultSize( filename );
+        while ( size < minSize ) {
+            if ( size < maxSize() / 2 )
+                size *= 2;
+            else {
+                size = maxSize();
+                break;
+            }
+        }
+        if ( size > maxSize() )
+            size = maxSize();
+
+        assert( ( size >= 64*1024*1024 ) || cmdLine.smallfiles || ( strstr( filename, "_hudsonSmall" ) ) );
+        assert( size % 4096 == 0 ); // mapped files must be page aligned
+
+        if ( preallocateOnly ) {
+            if ( cmdLine.prealloc ) {
+                theFileAllocator().requestAllocation( filename, size );
+            }
+            return;
+        }
+
+        header = (MDFHeader *) mmf.map(filename, size);
+        if( sizeof(char *) == 4 )
+            uassert( 10084 , "can't map file memory - mongo requires 64 bit build for larger datasets", header);
+        else
+            uassert( 10085 , "can't map file memory", header);
+        header->init(fileNo, size); // no-op if the header is already initialized? see MDFHeader::init
+    }
+
+    /* link a freshly initialized extent into ns's extent chain (creating the
+       namespace entry if needed) and seed its deleted-record chain */
+    void addNewExtentToNamespace(const char *ns, Extent *e, DiskLoc eloc, DiskLoc emptyLoc, bool capped) {
+        DiskLoc oldExtentLoc;
+        NamespaceIndex *ni = nsindex(ns);
+        NamespaceDetails *details = ni->details(ns);
+        if ( details ) {
+            // append to the existing doubly-linked extent list
+            assert( !details->lastExtent.isNull() );
+            assert( !details->firstExtent.isNull() );
+            e->xprev = details->lastExtent;
+            details->lastExtent.ext()->xnext = eloc;
+            assert( !eloc.isNull() );
+            details->lastExtent = eloc;
+        }
+        else {
+            // first extent for this ns: register the namespace itself
+            ni->add_ns(ns, eloc, capped);
+            details = ni->details(ns);
+        }
+
+        details->lastExtentSize = e->length;
+        DEBUGGING out() << "temp: newextent adddelrec " << ns << endl;
+        details->addDeletedRec(emptyLoc.drec(), emptyLoc);
+    }
+
+    /* carve a new extent for ns out of this file's unused space; if this file
+       is full, recurse into a newly added data file (loops guards runaway
+       recursion). returns the initialized, namespace-linked extent. */
+    Extent* MongoDataFile::createExtent(const char *ns, int approxSize, bool newCapped, int loops) {
+        massert( 10357 , "shutdown in progress", !goingAway );
+        massert( 10358 , "bad new extent size", approxSize >= 0 && approxSize <= 0x7ff00000 );
+        massert( 10359 , "header==0 on new extent: 32 bit mmap space exceeded?", header ); // null if file open failed
+        int ExtentSize = approxSize <= header->unusedLength ? approxSize : header->unusedLength;
+        DiskLoc loc;
+        if ( ExtentSize <= 0 ) {
+            /* not there could be a lot of looping here is db just started and
+               no files are open yet.  we might want to do something about that. */
+            if ( loops > 8 ) {
+                assert( loops < 10000 );
+                out() << "warning: loops=" << loops << " fileno:" << fileNo << ' ' << ns << '\n';
+            }
+            log() << "newExtent: " << ns << " file " << fileNo << " full, adding a new file\n";
+            return cc().database()->addAFile( 0, true )->createExtent(ns, approxSize, newCapped, loops+1);
+        }
+        // claim ExtentSize bytes from this file's unused region
+        int offset = header->unused.getOfs();
+        header->unused.setOfs( fileNo, offset + ExtentSize );
+        header->unusedLength -= ExtentSize;
+        loc.setOfs(fileNo, offset);
+        Extent *e = _getExtent(loc);
+        DiskLoc emptyLoc = e->init(ns, ExtentSize, fileNo, offset);
+
+        addNewExtentToNamespace(ns, e, loc, emptyLoc, newCapped);
+
+        DEV log() << "new extent " << ns << " size: 0x" << hex << ExtentSize << " loc: 0x" << hex << offset
+                  << " emptyLoc:" << hex << emptyLoc.getOfs() << dec << endl;
+        return e;
+    }
+
+    /* try to satisfy an extent request from the database's $freelist of
+       previously freed extents. scans for the best size fit within a tolerance
+       band (tighter for capped collections), unlinks the winner from the free
+       list and attaches it to ns. returns 0 if nothing suitable is found. */
+    Extent* DataFileMgr::allocFromFreeList(const char *ns, int approxSize, bool capped) {
+        string s = cc().database()->name + ".$freelist";
+        NamespaceDetails *f = nsdetails(s.c_str());
+        if( f ) {
+            int low, high;
+            if( capped ) {
+                // be strict about the size
+                low = approxSize;
+                if( low > 2048 ) low -= 256;
+                high = (int) (approxSize * 1.05) + 256;
+            }
+            else {
+                low = (int) (approxSize * 0.8);
+                high = (int) (approxSize * 1.4);
+            }
+            if( high < 0 ) high = approxSize; // int overflow in the band computation
+            int n = 0;
+            Extent *best = 0;
+            int bestDiff = 0x7fffffff;
+            {
+                // linear best-fit scan over the free extent chain
+                DiskLoc L = f->firstExtent;
+                while( !L.isNull() ) {
+                    Extent * e = L.ext();
+                    if( e->length >= low && e->length <= high ) {
+                        int diff = abs(e->length - approxSize);
+                        if( diff < bestDiff ) {
+                            bestDiff = diff;
+                            best = e;
+                            if( diff == 0 )
+                                break; // exact fit; stop early
+                        }
+                    }
+                    L = e->xnext;
+                    ++n;
+
+                }
+            }
+            OCCASIONALLY if( n > 512 ) log() << "warning: newExtent " << n << " scanned\n";
+            if( best ) {
+                Extent *e = best;
+                // remove from the free list
+                if( !e->xprev.isNull() )
+                    e->xprev.ext()->xnext = e->xnext;
+                if( !e->xnext.isNull() )
+                    e->xnext.ext()->xprev = e->xprev;
+                if( f->firstExtent == e->myLoc )
+                    f->firstExtent = e->xnext;
+                if( f->lastExtent == e->myLoc )
+                    f->lastExtent = e->xprev;
+
+                // use it
+                OCCASIONALLY if( n > 512 ) log() << "warning: newExtent " << n << " scanned\n";
+                DiskLoc emptyLoc = e->reuse(ns);
+                addNewExtentToNamespace(ns, e, e->myLoc, emptyLoc, capped);
+                return e;
+            }
+        }
+
+        return 0;
+        // return createExtent(ns, approxSize, capped);
+    }
+
+ /*---------------------------------------------------------------------*/
+
+    /* recycle a freed extent for a new namespace: reset its links and hand the
+       whole data area back as one deleted record. returns the loc of that
+       deleted record. */
+    DiskLoc Extent::reuse(const char *nsname) {
+        log(3) << "reset extent was:" << nsDiagnostic.buf << " now:" << nsname << '\n';
+        massert( 10360 , "Extent::reset bad magic value", magic == 0x41424344 );
+        xnext.Null();
+        xprev.Null();
+        nsDiagnostic = nsname;
+        firstRecord.Null();
+        lastRecord.Null();
+
+        // the reclaimable region starts just past the extent header
+        DiskLoc emptyLoc = myLoc;
+        emptyLoc.inc( (extentData-(char*)this) );
+
+        int delRecLength = length - (extentData - (char *) this);
+        DeletedRecord *empty1 = (DeletedRecord *) extentData;
+        DeletedRecord *empty = (DeletedRecord *) getRecord(emptyLoc);
+        assert( empty == empty1 );
+        // scribble over the old contents of the region. the previous call had
+        // the memset value/length arguments transposed and wrote only one byte.
+        memset(empty, 1, delRecLength);
+
+        empty->lengthWithHeaders = delRecLength;
+        empty->extentOfs = myLoc.getOfs();
+        empty->nextDeleted.Null();
+
+        return emptyLoc;
+    }
+
+ /* assumes already zeroed -- insufficient for block 'reuse' perhaps */
+ DiskLoc Extent::init(const char *nsname, int _length, int _fileNo, int _offset) {
+ magic = 0x41424344;
+ myLoc.setOfs(_fileNo, _offset);
+ xnext.Null();
+ xprev.Null();
+ nsDiagnostic = nsname;
+ length = _length;
+ firstRecord.Null();
+ lastRecord.Null();
+
+ DiskLoc emptyLoc = myLoc;
+ emptyLoc.inc( (extentData-(char*)this) );
+
+ DeletedRecord *empty1 = (DeletedRecord *) extentData;
+ DeletedRecord *empty = (DeletedRecord *) getRecord(emptyLoc);
+ assert( empty == empty1 );
+ empty->lengthWithHeaders = _length - (extentData - (char *) this);
+ empty->extentOfs = myLoc.getOfs();
+ return emptyLoc;
+ }
+
+ /*
+ Record* Extent::newRecord(int len) {
+ if( firstEmptyRegion.isNull() )
+ return 0;
+
+ assert(len > 0);
+ int newRecSize = len + Record::HeaderSize;
+ DiskLoc newRecordLoc = firstEmptyRegion;
+ Record *r = getRecord(newRecordLoc);
+ int left = r->netLength() - len;
+ if( left < 0 ) {
+ //
+ firstEmptyRegion.Null();
+ return 0;
+ }
+
+ DiskLoc nextEmpty = r->next.getNextEmpty(firstEmptyRegion);
+ r->lengthWithHeaders = newRecSize;
+ r->next.markAsFirstOrLastInExtent(this); // we're now last in the extent
+ if( !lastRecord.isNull() ) {
+ assert(getRecord(lastRecord)->next.lastInExtent()); // it was the last one
+ getRecord(lastRecord)->next.set(newRecordLoc); // until now
+ r->prev.set(lastRecord);
+ }
+ else {
+ r->prev.markAsFirstOrLastInExtent(this); // we are the first in the extent
+ assert( firstRecord.isNull() );
+ firstRecord = newRecordLoc;
+ }
+ lastRecord = newRecordLoc;
+
+ if( left < Record::HeaderSize + 32 ) {
+ firstEmptyRegion.Null();
+ }
+ else {
+ firstEmptyRegion.inc(newRecSize);
+ Record *empty = getRecord(firstEmptyRegion);
+ empty->next.set(nextEmpty); // not for empty records, unless in-use records, next and prev can be null.
+ empty->prev.Null();
+ empty->lengthWithHeaders = left;
+ }
+
+ return r;
+ }
+ */
+
+ /*---------------------------------------------------------------------*/
+
+    /* forward table-scan cursor over ns, optionally starting at startLoc.
+       unknown namespaces yield an exhausted cursor rather than an error. */
+    auto_ptr<Cursor> DataFileMgr::findAll(const char *ns, const DiskLoc &startLoc) {
+        DiskLoc loc;
+        bool found = nsindex(ns)->find(ns, loc);
+        if ( !found ) {
+            // out() << "info: findAll() namespace does not exist: " << ns << endl;
+            return auto_ptr<Cursor>(new BasicCursor(DiskLoc()));
+        }
+
+        Extent *e = getExtent(loc);
+
+        DEBUGGING {
+            out() << "listing extents for " << ns << endl;
+            DiskLoc tmp = loc;
+            set<DiskLoc> extents;
+
+            while ( 1 ) {
+                Extent *f = getExtent(tmp);
+                out() << "extent: " << tmp.toString() << endl;
+                extents.insert(tmp);
+                tmp = f->xnext;
+                if ( tmp.isNull() )
+                    break;
+                f = f->getNextExtent();
+            }
+
+            out() << endl;
+            nsdetails(ns)->dumpDeleted(&extents);
+        }
+
+        if ( !nsdetails( ns )->capped ) {
+            if ( !startLoc.isNull() )
+                return auto_ptr<Cursor>(new BasicCursor( startLoc ));
+            // skip leading empty extents so the cursor starts on a real record
+            while ( e->firstRecord.isNull() && !e->xnext.isNull() ) {
+                /* todo: if extent is empty, free it for reuse elsewhere.
+                   that is a bit complicated have to clean up the freelists.
+                */
+                RARELY out() << "info DFM::findAll(): extent " << loc.toString() << " was empty, skipping ahead " << ns << endl;
+                // find a nonempty extent
+                // it might be nice to free the whole extent here! but have to clean up free recs then.
+                e = e->getNextExtent();
+            }
+            return auto_ptr<Cursor>(new BasicCursor( e->firstRecord ));
+        } else {
+            // capped collections need insertion-order-aware traversal
+            return auto_ptr< Cursor >( new ForwardCappedCursor( nsdetails( ns ), startLoc ) );
+        }
+    }
+
+ /* get a table scan cursor, but can be forward or reverse direction.
+ order.$natural - if set, > 0 means forward (asc), < 0 backward (desc).
+ */
+    /* table scan honoring $natural direction: non-negative (or absent) means
+       forward via findAll(); negative means reverse natural order */
+    auto_ptr<Cursor> findTableScan(const char *ns, const BSONObj& order, const DiskLoc &startLoc) {
+        BSONElement el = order.findElement("$natural"); // e.g., { $natural : -1 }
+
+        if ( el.number() >= 0 )
+            return DataFileMgr::findAll(ns, startLoc);
+
+        // "reverse natural order"
+        NamespaceDetails *d = nsdetails(ns);
+        if ( !d )
+            return auto_ptr<Cursor>(new BasicCursor(DiskLoc()));
+        if ( !d->capped ) {
+            if ( !startLoc.isNull() )
+                return auto_ptr<Cursor>(new ReverseCursor( startLoc ));
+            // skip trailing empty extents so we start on a real record
+            Extent *e = d->lastExtent.ext();
+            while ( e->lastRecord.isNull() && !e->xprev.isNull() ) {
+                OCCASIONALLY out() << "  findTableScan: extent empty, skipping ahead" << endl;
+                e = e->getPrevExtent();
+            }
+            return auto_ptr<Cursor>(new ReverseCursor( e->lastRecord ));
+        } else {
+            return auto_ptr< Cursor >( new ReverseCappedCursor( d, startLoc ) );
+        }
+    }
+
+    /* diagnostic: log every extent on the current database's $freelist */
+    void printFreeList() {
+        string s = cc().database()->name + ".$freelist";
+        log() << "dump freelist " << s << '\n';
+        NamespaceDetails *freeExtents = nsdetails(s.c_str());
+        if( freeExtents == 0 ) {
+            log() << "  freeExtents==0" << endl;
+            return;
+        }
+        DiskLoc a = freeExtents->firstExtent;
+        while( !a.isNull() ) {
+            Extent *e = a.ext();
+            log() << "  " << a.toString() << " len:" << e->length << " prev:" << e->xprev.toString() << '\n';
+            a = e->xnext;
+        }
+
+        log() << "  end freelist" << endl;
+    }
+
+ /* drop a collection/namespace */
+    /* drop a collection/namespace: forbid system collections, remove the
+       catalog entry, donate all extents to the $freelist, and delete the
+       hashtable entry. indexes/cursors are the caller's responsibility. */
+    void dropNS(const string& nsToDrop) {
+        NamespaceDetails* d = nsdetails(nsToDrop.c_str());
+        uassert( 10086 , (string)"ns not found: " + nsToDrop , d );
+
+        NamespaceString s(nsToDrop);
+        assert( s.db == cc().database()->name );
+        if( s.isSystem() ) {
+            if( s.coll == "system.profile" )
+                uassert( 10087 , "turn off profiling before dropping system.profile collection", cc().database()->profile == 0 );
+            else
+                uasserted( 12502, "can't drop system ns" );
+        }
+
+        {
+            // remove from the system catalog
+            BSONObj cond = BSON( "name" << nsToDrop );   // { name: "colltodropname" }
+            string system_namespaces = cc().database()->name + ".system.namespaces";
+            /*int n = */ deleteObjects(system_namespaces.c_str(), cond, false, false, true);
+            // no check of return code as this ns won't exist for some of the new storage engines
+        }
+
+        // free extents
+        if( !d->firstExtent.isNull() ) {
+            string s = cc().database()->name + ".$freelist";
+            NamespaceDetails *freeExtents = nsdetails(s.c_str());
+            if( freeExtents == 0 ) {
+                // lazily create the $freelist pseudo-collection
+                string err;
+                _userCreateNS(s.c_str(), BSONObj(), err);
+                freeExtents = nsdetails(s.c_str());
+                massert( 10361 , "can't create .$freelist", freeExtents);
+            }
+            if( freeExtents->firstExtent.isNull() ) {
+                freeExtents->firstExtent = d->firstExtent;
+                freeExtents->lastExtent = d->lastExtent;
+            }
+            else {
+                // prepend the dropped collection's extent chain to the free list
+                DiskLoc a = freeExtents->firstExtent;
+                assert( a.ext()->xprev.isNull() );
+                a.ext()->xprev = d->lastExtent;
+                d->lastExtent.ext()->xnext = a;
+                freeExtents->firstExtent = d->firstExtent;
+
+                d->firstExtent.setInvalid();
+                d->lastExtent.setInvalid();
+            }
+        }
+
+        // remove from the catalog hashtable
+        cc().database()->namespaceIndex.kill_ns(nsToDrop.c_str());
+    }
+
+    /* full collection drop: delete all indexes, invalidate open cursors,
+       then drop the namespace itself. appends the ns to 'result'. */
+    void dropCollection( const string &name, string &errmsg, BSONObjBuilder &result ) {
+        log(1) << "dropCollection: " << name << endl;
+        NamespaceDetails *d = nsdetails(name.c_str());
+        assert( d );
+        if ( d->nIndexes != 0 ) {
+            try {
+                assert( deleteIndexes(d, name.c_str(), "*", errmsg, result, true) );
+            }
+            catch( DBException& ) {
+                uasserted(12503,"drop: deleteIndexes for collection failed - consider trying repair");
+            }
+            assert( d->nIndexes == 0 );
+        }
+        log(1) << "\t deleteIndexes done" << endl;
+        result.append("ns", name.c_str());
+        ClientCursor::invalidate(name.c_str());
+        dropNS(name);
+    }
+
+ int nUnindexes = 0;
+
+    /* remove every key that 'obj' contributes to a single index. btree
+       assertion failures are logged and swallowed so a corrupt index does not
+       abort the enclosing delete. */
+    void _unindexRecord(IndexDetails& id, BSONObj& obj, const DiskLoc& dl, bool logMissing = true) {
+        BSONObjSetDefaultOrder keys;
+        id.getKeysFromObject(obj, keys);
+        for ( BSONObjSetDefaultOrder::iterator i=keys.begin(); i != keys.end(); i++ ) {
+            BSONObj j = *i;
+            // out() << "UNINDEX: j:" << j.toString() << " head:" << id.head.toString() << dl.toString() << endl;
+            if ( otherTraceLevel >= 5 ) {
+                out() << "_unindexRecord() " << obj.toString();
+                out() << "\n  unindex:" << j.toString() << endl;
+            }
+            nUnindexes++;
+            bool ok = false;
+            try {
+                ok = id.head.btree()->unindex(id.head, id, j, dl);
+            }
+            catch (AssertionException&) {
+                problem() << "Assertion failure: _unindex failed " << id.indexNamespace() << endl;
+                out() << "Assertion failure: _unindex failed" << '\n';
+                out() << "  obj:" << obj.toString() << '\n';
+                out() << "  key:" << j.toString() << '\n';
+                out() << "  dl:" << dl.toString() << endl;
+                sayDbContext();
+            }
+
+            // unindex() returning false means the key was not found in the btree
+            if ( !ok && logMissing ) {
+                out() << "unindex failed (key too big?) " << id.indexNamespace() << '\n';
+            }
+        }
+    }
+
+    /* unindex all keys in all indexes for this record.
+       noWarn suppresses the "unindex failed" warning for each key. */
+    void unindexRecord(NamespaceDetails *d, Record *todelete, const DiskLoc& dl, bool noWarn = false) {
+        if ( d->nIndexes == 0 )
+            return;
+        BSONObj deletedObj(todelete);
+        // walk every index and strip this record's keys from each one
+        for( NamespaceDetails::IndexIterator it = d->ii(); it.more(); ) {
+            _unindexRecord(it.next(), deletedObj, dl, !noWarn);
+        }
+    }
+
+    /* deletes a record, just the pdfile portion -- no index cleanup, no cursor cleanup, etc.
+       caller must check if capped
+       Steps: unlink from the record chain, fix up the extent's first/last
+       pointers, then account for the space and add it to the free list.
+    */
+    void DataFileMgr::_deleteRecord(NamespaceDetails *d, const char *ns, Record *todelete, const DiskLoc& dl)
+    {
+        /* remove ourself from the record next/prev chain */
+        {
+            if ( todelete->prevOfs != DiskLoc::NullOfs )
+                todelete->getPrev(dl).rec()->nextOfs = todelete->nextOfs;
+            if ( todelete->nextOfs != DiskLoc::NullOfs )
+                todelete->getNext(dl).rec()->prevOfs = todelete->prevOfs;
+        }
+
+        /* remove ourself from extent pointers */
+        {
+            Extent *e = todelete->myExtent(dl);
+            if ( e->firstRecord == dl ) {
+                if ( todelete->nextOfs == DiskLoc::NullOfs )
+                    e->firstRecord.Null();  // extent is now empty
+                else
+                    e->firstRecord.setOfs(dl.a(), todelete->nextOfs);
+            }
+            if ( e->lastRecord == dl ) {
+                if ( todelete->prevOfs == DiskLoc::NullOfs )
+                    e->lastRecord.Null();
+                else
+                    e->lastRecord.setOfs(dl.a(), todelete->prevOfs);
+            }
+        }
+
+        /* add to the free list */
+        {
+            d->nrecords--;
+            d->datasize -= todelete->netLength();
+            /* temp: if in system.indexes, don't reuse, and zero out: we want to be
+               careful until validated more, as IndexDetails has pointers
+               to this disk location.  so an incorrectly done remove would cause
+               a lot of problems.
+            */
+            if ( strstr(ns, ".system.indexes") ) {
+                memset(todelete, 0, todelete->lengthWithHeaders);
+            }
+            else {
+                DEV memset(todelete->data, 0, todelete->netLength()); // attempt to notice invalid reuse.
+                d->addDeletedRec((DeletedRecord*)todelete, dl);
+            }
+        }
+    }
+
+    /* Full record delete: enforces the capped-collection restriction,
+       advances any cursors positioned on the record, removes its index keys,
+       then frees the record itself.
+       cappedOK - permit deletion from a capped collection (internal callers)
+       noWarn   - suppress "unindex failed" warnings during key removal */
+    void DataFileMgr::deleteRecord(const char *ns, Record *todelete, const DiskLoc& dl, bool cappedOK, bool noWarn)
+    {
+        dassert( todelete == dl.rec() );
+
+        NamespaceDetails* d = nsdetails(ns);
+        if ( d->capped && !cappedOK ) {
+            out() << "failing remove on a capped ns " << ns << endl;
+            uassert( 10089 ,  "can't remove from a capped collection" , 0 );
+            return;
+        }
+
+        /* check if any cursors point to us.  if so, advance them. */
+        ClientCursor::aboutToDelete(dl);
+
+        unindexRecord(d, todelete, dl, noWarn);
+
+        _deleteRecord(d, ns, todelete, dl);
+        NamespaceDetailsTransient::get_w( ns ).notifyOfWriteOp();
+    }
+
+
+    /** Update a record.  If the new object fits in the existing record the
+        update is done in place; otherwise the record is deleted and the new
+        object reinserted (the record "moves").  Returns the record's
+        location, which differs from 'dl' only when the record moved.
+        Note: if the object shrinks a lot, we don't free up space, we leave extra at end of the record.
+    */
+    const DiskLoc DataFileMgr::update(const char *ns,
+                                      Record *toupdate, const DiskLoc& dl,
+                                      const char *_buf, int _len, OpDebug& debug)
+    {
+        StringBuilder& ss = debug.str;
+        dassert( toupdate == dl.rec() );
+
+        NamespaceDetails *d = nsdetails(ns);
+
+        BSONObj objOld(toupdate);
+        BSONObj objNew(_buf);
+        assert( objNew.objsize() == _len );
+        assert( objNew.objdata() == _buf );
+
+        if( !objNew.hasElement("_id") && objOld.hasElement("_id") ) {
+            /* add back the old _id value if the update removes it.  Note this implementation is slow
+               (copies entire object multiple times), but this shouldn't happen often, so going for simple
+               code, not speed.
+            */
+            BSONObjBuilder b;
+            BSONElement e;
+            assert( objOld.getObjectID(e) );
+            b.append(e); // put _id first, for best performance
+            b.appendElements(objNew);
+            objNew = b.obj();
+        }
+
+        /* duplicate key check. we descend the btree twice - once for this check, and once for the actual inserts, further
+           below.  that is suboptimal, but it's pretty complicated to do it the other way without rollbacks...
+        */
+        vector<IndexChanges> changes;
+        getIndexChanges(changes, *d, objNew, objOld);
+        dupCheck(changes, *d);
+
+        if ( toupdate->netLength() < objNew.objsize() ) {
+            // doesn't fit.  reallocate -----------------------------------------------------
+            uassert( 10003 , "E10003 failing update: objects in a capped ns cannot grow", !(d && d->capped));
+            d->paddingTooSmall();
+            if ( cc().database()->profile )
+                ss << " moved ";
+            deleteRecord(ns, toupdate, dl);
+            return insert(ns, objNew.objdata(), objNew.objsize(), false);
+        }
+
+        NamespaceDetailsTransient::get_w( ns ).notifyOfWriteOp();
+        d->paddingFits();
+
+        /* have any index keys changed? */
+        {
+            unsigned keyUpdates = 0;
+            for ( int x = 0; x < d->nIndexes; x++ ) {
+                IndexDetails& idx = d->idx(x);
+                // remove keys no longer generated by the new object ...
+                for ( unsigned i = 0; i < changes[x].removed.size(); i++ ) {
+                    try {
+                        idx.head.btree()->unindex(idx.head, idx, *changes[x].removed[i], dl);
+                    }
+                    catch (AssertionException&) {
+                        ss << " exception update unindex ";
+                        problem() << " caught assertion update unindex " << idx.indexNamespace() << endl;
+                    }
+                }
+                assert( !dl.isNull() );
+                BSONObj idxKey = idx.info.obj().getObjectField("key");
+                keyUpdates += changes[x].added.size();
+                // ... and insert keys that are new
+                for ( unsigned i = 0; i < changes[x].added.size(); i++ ) {
+                    try {
+                        /* we did the dupCheck() above.  so we don't have to worry about it here. */
+                        idx.head.btree()->bt_insert(
+                            idx.head,
+                            dl, *changes[x].added[i], idxKey, /*dupsAllowed*/true, idx);
+                    }
+                    catch (AssertionException&) {
+                        ss << " exception update index ";
+                        out() << " caught assertion update index " << idx.indexNamespace() << '\n';
+                        problem() << " caught assertion update index " << idx.indexNamespace() << endl;
+                    }
+                }
+            }
+            if( keyUpdates && cc().database()->profile )
+                ss << '\n' << keyUpdates << " key updates ";
+        }
+
+        //  update in place
+        memcpy(toupdate->data, objNew.objdata(), objNew.objsize());
+        return dl;
+    }
+
+    /* Size for the next extent of a growing collection: the larger of the
+       initial size for 'len' and the previous extent grown by 4x (below 4MB)
+       or 1.2x (above), rounded down to a 256-byte boundary. */
+    int followupExtentSize(int len, int lastExtentLen) {
+        int fromLen = initialExtentSize(len);
+        double growth = lastExtentLen < 4000000 ? 4.0 : 1.2;
+        int fromLast = (int) (lastExtentLen * growth);
+        int sz = fromLast > fromLen ? fromLast : fromLen;
+        sz &= 0xffffff00;   // round down to a multiple of 256
+        assert( sz > len );
+        return sz;
+    }
+
+    int deb=0;  // debug scratch variable (unused in normal operation)
+
+    /* add keys to indexes for a new record
+       Inserts every key 'obj' generates for index idxNo into that index's
+       btree.  Marks the index multikey if a single document produces more
+       than one key.  A dup-key assertion is rethrown when dups are not
+       allowed; other assertion failures are logged and swallowed. */
+    inline void _indexRecord(NamespaceDetails *d, int idxNo, BSONObj& obj, DiskLoc newRecordLoc, bool dupsAllowed) {
+        IndexDetails& idx = d->idx(idxNo);
+        BSONObjSetDefaultOrder keys;
+        idx.getKeysFromObject(obj, keys);
+        BSONObj order = idx.keyPattern();
+        int n = 0;
+        for ( BSONObjSetDefaultOrder::iterator i=keys.begin(); i != keys.end(); i++ ) {
+            if( ++n == 2 ) {
+                // second key from one document => array-valued field
+                d->setIndexIsMultikey(idxNo);
+            }
+            assert( !newRecordLoc.isNull() );
+            try {
+                idx.head.btree()->bt_insert(idx.head, newRecordLoc,
+                                            *i, order, dupsAllowed, idx);
+            }
+            catch (AssertionException& ) {
+                if( !dupsAllowed ) {
+                    // dup key exception, presumably.
+                    throw;
+                }
+                problem() << " caught assertion _indexRecord " << idx.indexNamespace() << endl;
+            }
+        }
+    }
+
+    /* Ad hoc sanity check for BSONObjExternalSorter: sorts a handful of
+       identical null keys with distinct locations and prints the result.
+       Not called in production (see the commented-out call in fastBuildIndex). */
+    void testSorting()
+    {
+        BSONObjBuilder bb;
+        bb.appendNull("");
+        BSONObj nullKey = bb.obj();
+
+        BSONObjExternalSorter sorter;
+        sorter.add(nullKey, DiskLoc(3,7));
+        sorter.add(nullKey, DiskLoc(4,7));
+        sorter.add(nullKey, DiskLoc(2,7));
+        sorter.add(nullKey, DiskLoc(1,7));
+        sorter.add(nullKey, DiskLoc(3,77));
+
+        sorter.sort();
+
+        auto_ptr<BSONObjExternalSorter::Iterator> it = sorter.iterator();
+        while( it->more() ) {
+            BSONObjExternalSorter::Data kv = it->next();
+            cout << kv.second.toString() << endl;
+            cout << kv.first.objsize() << endl;
+            cout<<"SORTER next:" << kv.first.toString() << endl;
+        }
+    }
+
+    // throws DBException
+    /* Build an index over all existing records of 'ns' by bulk loading:
+       scan the collection, externally sort every generated key, then build
+       the btree bottom-up with BtreeBuilder.  When the index is unique with
+       dropDups, duplicate records are collected and deleted at the end.
+       Returns the number of records scanned.
+       _ TODO dropDups
+    */
+    unsigned long long fastBuildIndex(const char *ns, NamespaceDetails *d, IndexDetails& idx, int idxNo) {
+        // testSorting();
+        Timer t;
+
+        log() << "Buildindex " << ns << " idxNo:" << idxNo << ' ' << idx.info.obj().toString() << endl;
+
+        bool dupsAllowed = !idx.unique();
+        bool dropDups = idx.dropDups();
+        BSONObj order = idx.keyPattern();
+
+        idx.head.Null();
+
+        /* get and sort all the keys ----- */
+        unsigned long long n = 0;
+        auto_ptr<Cursor> c = theDataFileMgr.findAll(ns);
+        BSONObjExternalSorter sorter(order);
+        unsigned long long nkeys = 0;
+        ProgressMeter pm( d->nrecords , 10 );
+        while ( c->ok() ) {
+            BSONObj o = c->current();
+            DiskLoc loc = c->currLoc();
+
+            BSONObjSetDefaultOrder keys;
+            idx.getKeysFromObject(o, keys);
+            int k = 0;
+            for ( BSONObjSetDefaultOrder::iterator i=keys.begin(); i != keys.end(); i++ ) {
+                if( ++k == 2 )
+                    d->setIndexIsMultikey(idxNo);   // one doc produced >1 key
+                //cout<<"SORTER ADD " << i->toString() << ' ' << loc.toString() << endl;
+                sorter.add(*i, loc);
+                nkeys++;
+            }
+
+            c->advance();
+            n++;
+            pm.hit();
+        };
+        sorter.sort();
+
+        log(t.seconds() > 5 ? 0 : 1) << "\t external sort used : " << sorter.numFiles() << " files " << " in " << t.seconds() << " secs" << endl;
+
+        list<DiskLoc> dupsToDrop;
+
+        /* build index --- */
+        {
+            BtreeBuilder btBuilder(dupsAllowed, idx);
+            BSONObj keyLast;
+            auto_ptr<BSONObjExternalSorter::Iterator> i = sorter.iterator();
+            ProgressMeter pm2( nkeys , 10 );
+            while( i->more() ) {
+                RARELY killCurrentOp.checkForInterrupt();
+                BSONObjExternalSorter::Data d = i->next();
+
+                //cout<<"TEMP SORTER next " << d.first.toString() << endl;
+                try {
+                    btBuilder.addKey(d.first, d.second);
+                }
+                catch( AssertionException& ) {
+                    if ( dupsAllowed ){
+                        // unknown exception? can't be a dup key if dups are allowed - rethrow
+                        throw;
+                    }
+
+                    if ( ! dropDups )
+                        throw;
+
+                    /* we could queue these on disk, but normally there are very few dups, so instead we
+                       keep in ram and have a limit.
+                    */
+                    dupsToDrop.push_back(d.second);
+                    uassert( 10092 , "too many dups on index build with dropDups=true", dupsToDrop.size() < 1000000 );
+                }
+                pm2.hit();
+            }
+            btBuilder.commit();
+            wassert( btBuilder.getn() == nkeys || dropDups );
+        }
+
+        log(1) << "\t fastBuildIndex dupsToDrop:" << dupsToDrop.size() << endl;
+
+        // physically remove duplicate records now that the btree is built
+        for( list<DiskLoc>::iterator i = dupsToDrop.begin(); i != dupsToDrop.end(); i++ )
+            theDataFileMgr.deleteRecord( ns, i->rec(), *i, false, true );
+
+        return n;
+    }
+
+    /* Tracks background index builds in progress and performs them.
+       NOTE(review): the success path of go() never erases 'd' from
+       bgJobsInProgress nor clears backgroundIndexBuildInProgress - only the
+       catch block does.  Confirm whether cleanup is expected elsewhere. */
+    static class BackgroundIndexBuildJobs {
+
+        /* Insert every existing record of 'ns' into the new index, one at a
+           time (unlike fastBuildIndex's bulk load).  With dropDups, records
+           that raise an exception (presumed dup keys) are deleted. */
+        unsigned long long addExistingToIndex(const char *ns, NamespaceDetails *d, IndexDetails& idx, int idxNo) {
+            bool dupsAllowed = !idx.unique();
+            bool dropDups = idx.dropDups();
+
+            unsigned long long n = 0;
+            auto_ptr<Cursor> c = theDataFileMgr.findAll(ns);
+            while ( c->ok() ) {
+                BSONObj js = c->current();
+                try {
+                    _indexRecord(d, idxNo, js, c->currLoc(),dupsAllowed);
+                    c->advance();
+                } catch( AssertionException& e ) {
+                    if ( dropDups ) {
+                        // advance past the record before deleting it
+                        DiskLoc toDelete = c->currLoc();
+                        c->advance();
+                        theDataFileMgr.deleteRecord( ns, toDelete.rec(), toDelete, false, true );
+                    } else {
+                        _log() << endl;
+                        log(2) << "addExistingToIndex exception " << e.what() << endl;
+                        throw;
+                    }
+                }
+                n++;
+            };
+            return n;
+        }
+
+        /* we do set a flag in the namespace for quick checking, but this is our authoritative info -
+           that way on a crash/restart, we don't think we are still building one. */
+        set<NamespaceDetails*> bgJobsInProgress;
+
+        // register 'd' as having a background build in progress
+        void prep(NamespaceDetails *d) {
+            assertInWriteLock();
+            assert( bgJobsInProgress.count(d) == 0 );
+            bgJobsInProgress.insert(d);
+            d->backgroundIndexBuildInProgress = 1;
+        }
+
+    public:
+        /* Note you cannot even do a foreground index build if a background is in progress,
+           as bg build assumes it is the last index in the array!
+        */
+        void checkInProg(NamespaceDetails *d) {
+            assertInWriteLock();
+            uassert(12580, "already building an index for this namespace in background", bgJobsInProgress.count(d) == 0);
+        }
+
+/* todo: clean bg flag on loading of NamespaceDetails */
+
+        // run the background build; on exception, deregister and clear the flag
+        unsigned long long go(string ns, NamespaceDetails *d, IndexDetails& idx, int idxNo) {
+            unsigned long long n;
+            prep(d);
+            try {
+                idx.head = BtreeBucket::addBucket(idx);
+                n = addExistingToIndex(ns.c_str(), d, idx, idxNo);
+            }
+            catch(...) {
+                assertInWriteLock();
+                bgJobsInProgress.erase(d);
+                d->backgroundIndexBuildInProgress = 0;
+                throw;
+            }
+            return n;
+        }
+    } backgroundIndex;
+
+    // throws DBException
+    /* Build one index: dispatches to the bulk loader (foreground) or the
+       record-at-a-time background path.  Note the "background" option is
+       only partially implemented here - a warning is logged. */
+    static void buildAnIndex(string ns, NamespaceDetails *d, IndexDetails& idx, int idxNo) {
+        log() << "building new index on " << idx.keyPattern() << " for " << ns << "..." << endl;
+        Timer t;
+        unsigned long long n;
+
+        BSONObj info = idx.info.obj();
+        bool background = info["background"].trueValue();
+        if( background ) {
+            log() << "WARNING: background index build not yet implemented" << endl;
+        }
+
+        if( !background ) {
+            n = fastBuildIndex(ns.c_str(), d, idx, idxNo);
+            assert( !idx.head.isNull() );
+        }
+        else {
+            n = backgroundIndex.go(ns, d, idx, idxNo);
+        }
+        log() << "done for " << n << " records " << t.millis() / 1000.0 << "secs" << endl;
+    }
+
+    /* add keys to indexes for a new record
+       On failure (e.g. a dup key on a unique index) rolls back the entries
+       already added to earlier indexes, then rethrows. */
+    void indexRecord(NamespaceDetails *d, const void *buf, int len, DiskLoc newRecordLoc) {
+        BSONObj obj((const char *)buf);
+
+        /*UNIQUE*/
+        for ( int i = 0; i < d->nIndexes; i++ ) {
+            try {
+                bool unique = d->idx(i).unique();
+                _indexRecord(d, i, obj, newRecordLoc, /*dupsAllowed*/!unique);
+            }
+            catch( DBException& ) {
+                /* try to roll back previously added index entries
+                   note <= i (not < i) is important here as the index we were just attempted
+                   may be multikey and require some cleanup.
+                */
+                for( int j = 0; j <= i; j++ ) {
+                    try {
+                        // logMissing=false: some keys may never have been added
+                        _unindexRecord(d->idx(j), obj, newRecordLoc, false);
+                    }
+                    catch(...) {
+                        log(3) << "unindex fails on rollback after unique failure\n";
+                    }
+                }
+                throw;
+            }
+        }
+    }
+
+    extern BSONObj id_obj; // { _id : ObjectId("000000000000000000000000") }
+
+    /* Ensure 'ns' has an _id index, creating it (via an insert into
+       system.indexes) if absent.  Sets Flag_HaveIdIndex up front so repeat
+       calls short-circuit; if an _id index already exists we return early. */
+    void ensureHaveIdIndex(const char *ns) {
+        NamespaceDetails *d = nsdetails(ns);
+        if ( d == 0 || (d->flags & NamespaceDetails::Flag_HaveIdIndex) )
+            return;
+
+        // set the flag first - whether we find one below or create one, the
+        // collection will have an _id index after this call
+        d->flags |= NamespaceDetails::Flag_HaveIdIndex;
+
+        {
+            NamespaceDetails::IndexIterator i = d->ii();
+            while( i.more() ) {
+                if( i.next().isIdIndex() )
+                    return;
+            }
+        }
+
+        string system_indexes = cc().database()->name + ".system.indexes";
+
+        BSONObjBuilder b;
+        b.append("name", "_id_");
+        b.append("ns", ns);
+        b.append("key", id_obj);
+        BSONObj o = b.done();
+
+        /* edge case: note the insert could fail if we have hit maxindexes already */
+        theDataFileMgr.insert(system_indexes.c_str(), o.objdata(), o.objsize(), true);
+    }
+
+#pragma pack(1)
+    /* In-memory image of a BSON ObjectId element, laid out byte-for-byte as
+       it appears inside a BSON object: type byte (jstOID), "_id\0" field
+       name, then the 12-byte OID.  Packed so sizeof == 17 exactly. */
+    struct IDToInsert_ {
+        char type;     // BSON type tag, set to jstOID
+        char _id[4];   // field name "_id" plus NUL terminator
+        OID oid;       // 12-byte ObjectId payload
+        IDToInsert_() {
+            type = (char) jstOID;
+            strcpy(_id, "_id");
+            assert( sizeof(IDToInsert_) == 17 );
+        }
+    } idToInsert_;
+    // BSONElement view over idToInsert_, used when prepending _id on insert
+    struct IDToInsert : public BSONElement {
+        IDToInsert() : BSONElement( ( char * )( &idToInsert_ ) ) {}
+    } idToInsert;
+#pragma pack()
+
+    /* Insert 'o' and record the operation in the replication oplog.
+       A local copy is made because insert() rebinds its argument to the
+       on-disk image, which is what must be logged. */
+    void DataFileMgr::insertAndLog( const char *ns, const BSONObj &o, bool god ) {
+        BSONObj written = o;
+        insert( ns, written, god );
+        logOp( "i", ns, written );
+    }
+
+    /* Insert 'o'; on success rebind it to the stored on-disk image, which
+       may differ from the input (e.g. an _id field was prepended). */
+    DiskLoc DataFileMgr::insert(const char *ns, BSONObj &o, bool god) {
+        DiskLoc loc = insert( ns, o.objdata(), o.objsize(), god );
+        if ( loc.isNull() )
+            return loc;
+        o = BSONObj( loc.rec() );
+        return loc;
+    }
+
+    bool prepareToBuildIndex(const BSONObj& io, bool god, string& sourceNS, NamespaceDetails *&sourceCollection);
+
+    /* note: if god==true, you may pass in obuf of NULL and then populate the returned DiskLoc
+       after the call -- that will prevent a double buffer copy in some cases (btree.cpp).
+
+       Main insert path.  Validates the namespace, creates the collection on
+       first insert, handles inserts into system.indexes (which trigger an
+       index build), adds a missing _id, allocates (and if needed extends)
+       space, copies the object in, links it into the record chain, and adds
+       its index keys.  Returns DiskLoc() (null) on refused/failed inserts.
+    */
+    DiskLoc DataFileMgr::insert(const char *ns, const void *obuf, int len, bool god, const BSONElement &writeId, bool mayAddIndex) {
+        bool wouldAddIndex = false;
+        uassert( 10093 , "cannot insert into reserved $ collection", god || strchr(ns, '$') == 0 );
+        uassert( 10094 , "invalid ns", strchr( ns , '.' ) > 0 );
+        const char *sys = strstr(ns, "system.");
+        if ( sys ) {
+            uassert( 10095 , "attempt to insert in reserved database name 'system'", sys != ns);
+            if ( strstr(ns, ".system.") ) {
+                // later:check for dba-type permissions here if have that at some point separate
+                if ( strstr(ns, ".system.indexes" ) )
+                    wouldAddIndex = true;
+                else if ( legalClientSystemNS( ns , true ) )
+                    ;
+                else if ( !god ) {
+                    out() << "ERROR: attempt to insert in system namespace " << ns << endl;
+                    return DiskLoc();
+                }
+            }
+            else
+                sys = 0;
+        }
+
+        bool addIndex = wouldAddIndex && mayAddIndex;
+
+        NamespaceDetails *d = nsdetails(ns);
+        if ( d == 0 ) {
+            // first insert for this collection: create it
+            addNewNamespaceToCatalog(ns);
+            /* todo: shouldn't be in the namespace catalog until after the allocations here work.
+               also if this is an addIndex, those checks should happen before this!
+            */
+            // This creates first file in the database.
+            cc().database()->newestFile()->createExtent(ns, initialExtentSize(len));
+            d = nsdetails(ns);
+            if ( !god )
+                ensureIdIndexForNewNs(ns);
+        }
+        d->paddingFits();
+
+        NamespaceDetails *tableToIndex = 0;
+
+        string tabletoidxns;
+        if ( addIndex ) {
+            // inserting an index spec: validate it and find the collection to index
+            BSONObj io((const char *) obuf);
+            backgroundIndex.checkInProg(d);
+            if( !prepareToBuildIndex(io, god, tabletoidxns, tableToIndex) ) {
+                return DiskLoc();
+            }
+        }
+
+        const BSONElement *newId = &writeId;
+        int addID = 0;
+        if( !god ) {
+            /* Check if we have an _id field. If we don't, we'll add it.
+               Note that btree buckets which we insert aren't BSONObj's, but in that case god==true.
+            */
+            BSONObj io((const char *) obuf);
+            BSONElement idField = io.getField( "_id" );
+            uassert( 10099 ,  "_id cannot be an array", idField.type() != Array );
+            if( idField.eoo() && !wouldAddIndex && strstr(ns, ".local.") == 0 ) {
+                addID = len;   // nonzero addID also remembers the original length
+                if ( writeId.eoo() ) {
+                    // Very likely we'll add this elt, so little harm in init'ing here.
+                    idToInsert_.oid.init();
+                    newId = &idToInsert;
+                }
+                len += newId->size();
+            }
+
+            BSONElementManipulator::lookForTimestamps( io );
+        }
+
+        DiskLoc extentLoc;
+        int lenWHdr = len + Record::HeaderSize;
+        // over-allocate by the collection's padding factor to absorb growth
+        lenWHdr = (int) (lenWHdr * d->paddingFactor);
+        if ( lenWHdr == 0 ) {
+            // old datafiles, backward compatible here.
+            assert( d->paddingFactor == 0 );
+            d->paddingFactor = 1.0;
+            lenWHdr = len + Record::HeaderSize;
+        }
+        DiskLoc loc = d->alloc(ns, lenWHdr, extentLoc);
+        if ( loc.isNull() ) {
+            // out of space
+            if ( d->capped == 0 ) { // size capped doesn't grow
+                log(1) << "allocating new extent for " << ns << " padding:" << d->paddingFactor << " lenWHdr: " << lenWHdr << endl;
+                cc().database()->allocExtent(ns, followupExtentSize(lenWHdr, d->lastExtentSize), false);
+                loc = d->alloc(ns, lenWHdr, extentLoc);
+                if ( loc.isNull() ){
+                    log() << "WARNING: alloc() failed after allocating new extent. lenWHdr: " << lenWHdr << " last extent size:" << d->lastExtentSize << "; trying again\n";
+                    for ( int zzz=0; zzz<10 && lenWHdr > d->lastExtentSize; zzz++ ){
+                        log() << "try #" << zzz << endl;
+                        cc().database()->allocExtent(ns, followupExtentSize(len, d->lastExtentSize), false);
+                        loc = d->alloc(ns, lenWHdr, extentLoc);
+                        if ( ! loc.isNull() )
+                            break;
+                    }
+                }
+            }
+            if ( loc.isNull() ) {
+                log() << "out of space in datafile " << ns << " capped:" << d->capped << endl;
+                assert(d->capped);
+                return DiskLoc();
+            }
+        }
+
+        Record *r = loc.rec();
+        assert( r->lengthWithHeaders >= lenWHdr );
+        if( addID ) {
+            /* a little effort was made here to avoid a double copy when we add an ID */
+            // new total size, then _id element, then the original fields
+            ((int&)*r->data) = *((int*) obuf) + newId->size();
+            memcpy(r->data+4, newId->rawdata(), newId->size());
+            memcpy(r->data+4+newId->size(), ((char *)obuf)+4, addID-4);
+        }
+        else {
+            if( obuf )
+                memcpy(r->data, obuf, len);
+        }
+        // link the record into its extent's doubly linked list
+        Extent *e = r->myExtent(loc);
+        if ( e->lastRecord.isNull() ) {
+            e->firstRecord = e->lastRecord = loc;
+            r->prevOfs = r->nextOfs = DiskLoc::NullOfs;
+        }
+        else {
+
+            Record *oldlast = e->lastRecord.rec();
+            r->prevOfs = e->lastRecord.getOfs();
+            r->nextOfs = DiskLoc::NullOfs;
+            oldlast->nextOfs = loc.getOfs();
+            e->lastRecord = loc;
+        }
+
+        d->nrecords++;
+        d->datasize += r->netLength();
+
+        // we don't bother clearing those stats for the god tables - also god is true when adding a btree bucket
+        if ( !god )
+            NamespaceDetailsTransient::get_w( ns ).notifyOfWriteOp();
+
+        if ( tableToIndex ) {
+            // the inserted object was an index spec: build the index now
+            int idxNo = tableToIndex->nIndexes;
+            IndexDetails& idx = tableToIndex->addIndex(tabletoidxns.c_str()); // clear transient info caches so they refresh; increments nIndexes
+            idx.info = loc;
+            try {
+                buildAnIndex(tabletoidxns, tableToIndex, idx, idxNo);
+            } catch( DBException& ) {
+                // save our error msg string as an exception on deleteIndexes will overwrite our message
+                LastError *le = lastError.get();
+                assert( le );
+                string saveerrmsg = le->msg;
+                assert( !saveerrmsg.empty() );
+
+                // roll back this index
+                string name = idx.indexName();
+                BSONObjBuilder b;
+                string errmsg;
+                bool ok = deleteIndexes(tableToIndex, tabletoidxns.c_str(), name.c_str(), errmsg, b, true);
+                if( !ok ) {
+                    log() << "failed to drop index after a unique key error building it: " << errmsg << ' ' << tabletoidxns << ' ' << name << endl;
+                }
+                raiseError(12506,saveerrmsg.c_str());
+                throw;
+            }
+        }
+
+        /* add this record to our indexes */
+        if ( d->nIndexes ) {
+            try {
+                indexRecord(d, r->data/*buf*/, len, loc);
+            }
+            catch( AssertionException& e ) {
+                // should be a dup key error on _id index
+                if( tableToIndex || d->capped ) {
+                    // can't roll back here: the data and index would disagree
+                    string s = e.toString();
+                    s += " : on addIndex/capped - collection and its index will not match";
+                    uassert_nothrow(s.c_str());
+                    log() << s << '\n';
+                }
+                else {
+                    // normal case -- we can roll back
+                    _deleteRecord(d, ns, r, loc);
+                    throw;
+                }
+            }
+        }
+
+        //  out() << "   inserted at loc:" << hex << loc.getOfs() << " lenwhdr:" << hex << lenWHdr << dec << ' ' << ns << endl;
+        return loc;
+    }
+
+    /* special version of insert for transaction logging -- streamlined a bit.
+       assumes ns is capped and no indexes
+       Allocates a record of 'len' bytes and links it into the extent chain;
+       the caller fills in the record data afterwards.  Returns 0 only if
+       allocation fails (which asserts first - capped collections reuse space).
+    */
+    Record* DataFileMgr::fast_oplog_insert(NamespaceDetails *d, const char *ns, int len) {
+        RARELY assert( d == nsdetails(ns) );
+
+        DiskLoc extentLoc;
+        int lenWHdr = len + Record::HeaderSize;
+        DiskLoc loc = d->alloc(ns, lenWHdr, extentLoc);
+        if ( loc.isNull() ) {
+            assert(false);
+            return 0;
+        }
+
+        Record *r = loc.rec();
+        assert( r->lengthWithHeaders >= lenWHdr );
+
+        // link the record at the tail of its extent's record chain
+        Extent *e = r->myExtent(loc);
+        if ( e->lastRecord.isNull() ) {
+            e->firstRecord = e->lastRecord = loc;
+            r->prevOfs = r->nextOfs = DiskLoc::NullOfs;
+        }
+        else {
+            Record *oldlast = e->lastRecord.rec();
+            r->prevOfs = e->lastRecord.getOfs();
+            r->nextOfs = DiskLoc::NullOfs;
+            oldlast->nextOfs = loc.getOfs();
+            e->lastRecord = loc;
+        }
+
+        d->nrecords++;
+
+        return r;
+    }
+
+    // currently a no-op; earlier versions opened a scratch file here (see
+    // the retained commented-out code)
+    void DataFileMgr::init(const string& path ) {
+        /*	boost::filesystem::path path( dir );
+        path /= "temp.dat";
+        string pathString = path.string();
+        temp.open(pathString.c_str(), 64 * 1024 * 1024);
+        */
+    }
+
+    // one-time startup initialization for the data file layer
+    void pdfileInit() {
+        //	namespaceIndex.init(dbpath);
+        theDataFileMgr.init(dbpath);
+    }
+
+} // namespace mongo
+
+#include "clientcursor.h"
+
+namespace mongo {
+
+    /* Drop an entire database: close it, then remove its data files from
+       disk.  ns is of the form "<dbname>.$cmd". */
+    void dropDatabase(const char *ns) {
+        char dbname[256];
+        nsToDatabase(ns, dbname);
+        log(1) << "dropDatabase " << dbname << endl;
+        assert( cc().database()->name == dbname );
+
+        closeDatabase( dbname );
+        _deleteDataFiles(dbname);
+    }
+
+ typedef boost::filesystem::path Path;
+
+    // back up original database files to 'temp' dir
+    // (each file is moved into reservedPath with a ".bak" suffix)
+    void _renameForBackup( const char *database, const Path &reservedPath ) {
+        class Renamer : public FileOp {
+        public:
+            Renamer( const Path &reservedPath ) : reservedPath_( reservedPath ) {}
+        private:
+            const boost::filesystem::path &reservedPath_;
+            // returns false if the file didn't exist (stops the caller's scan)
+            virtual bool apply( const Path &p ) {
+                if ( !boost::filesystem::exists( p ) )
+                    return false;
+                boost::filesystem::rename( p, reservedPath_ / ( p.leaf() + ".bak" ) );
+                return true;
+            }
+            virtual const char * op() const {
+                return "renaming";
+            }
+        } renamer( reservedPath );
+        _applyOpToDataFiles( database, renamer, true );
+    }
+
+    // move temp files to standard data dir
+    // (moves each repaired file from the reserved dir back under dbpath)
+    void _replaceWithRecovered( const char *database, const char *reservedPathString ) {
+        class : public FileOp {
+            // returns false if the file didn't exist (stops the caller's scan)
+            virtual bool apply( const Path &p ) {
+                if ( !boost::filesystem::exists( p ) )
+                    return false;
+                boost::filesystem::rename( p, boost::filesystem::path(dbpath) / p.leaf() );
+                return true;
+            }
+            virtual const char * op() const {
+                return "renaming";
+            }
+        } renamer;
+        _applyOpToDataFiles( database, renamer, true, reservedPathString );
+    }
+
+    /* Generate a directory path under dbpath, named
+       "<prefix>_repairDatabase_<n>", that does not yet exist on disk -
+       used for storing temp/backup data files during repair. */
+    Path uniqueReservedPath( const char *prefix ) {
+        const Path base( dbpath );
+        Path candidate;
+        bool taken = false;
+        int seq = 0;
+        do {
+            stringstream name;
+            name << prefix << "_repairDatabase_" << seq++;
+            candidate = base / name.str();
+            BOOST_CHECK_EXCEPTION( taken = boost::filesystem::exists( candidate ) );
+        } while ( taken );
+        return candidate;
+    }
+
+    // total on-disk size in bytes of all data files belonging to 'database'
+    boost::intmax_t dbSize( const char *database ) {
+        class SizeAccumulator : public FileOp {
+        public:
+            SizeAccumulator() : totalSize_( 0 ) {}
+            boost::intmax_t size() const {
+                return totalSize_;
+            }
+        private:
+            // returns false if the file didn't exist (stops the caller's scan)
+            virtual bool apply( const boost::filesystem::path &p ) {
+                if ( !boost::filesystem::exists( p ) )
+                    return false;
+                totalSize_ += boost::filesystem::file_size( p );
+                return true;
+            }
+            virtual const char *op() const {
+                return "checking size";
+            }
+            boost::intmax_t totalSize_;
+        };
+        SizeAccumulator sa;
+        _applyOpToDataFiles( database, sa );
+        return sa.size();
+    }
+
+#if !defined(_WIN32)
+} // namespace mongo
+#include <sys/statvfs.h>
+namespace mongo {
+#endif
+    // free disk space (bytes) on the filesystem containing dbpath;
+    // -1 on platforms without statvfs (Windows)
+    boost::intmax_t freeSpace() {
+#if !defined(_WIN32)
+        struct statvfs info;
+        assert( !statvfs( dbpath.c_str() , &info ) );
+        // blocks available to unprivileged users * fundamental block size
+        return boost::intmax_t( info.f_bavail ) * info.f_frsize;
+#else
+        return -1;
+#endif
+    }
+
+    /* Repair a database by cloning it from ourselves (localhost) into a
+       reserved temp directory, then swapping the repaired files into place.
+       preserveClonedFilesOnFailure - keep the temp files if the clone fails
+       backupOriginalFiles          - keep the originals as .bak instead of deleting
+       Returns false (with errmsg set) on insufficient disk space or clone
+       failure. */
+    bool repairDatabase( const char *ns, string &errmsg,
+                         bool preserveClonedFilesOnFailure, bool backupOriginalFiles ) {
+        stringstream ss;
+        ss << "localhost:" << cmdLine.port;
+        string localhost = ss.str();
+
+        // ns is of the form "<dbname>.$cmd"
+        char dbName[256];
+        nsToDatabase(ns, dbName);
+        problem() << "repairDatabase " << dbName << endl;
+        assert( cc().database()->name == dbName );
+
+        // repair needs room for a full copy of the database
+        boost::intmax_t totalSize = dbSize( dbName );
+        boost::intmax_t freeSize = freeSpace();
+        if ( freeSize > -1 && freeSize < totalSize ) {
+            stringstream ss;
+            ss << "Cannot repair database " << dbName << " having size: " << totalSize
+               << " (bytes) because free disk space is: " << freeSize << " (bytes)";
+            errmsg = ss.str();
+            problem() << errmsg << endl;
+            return false;
+        }
+
+        Path reservedPath =
+            uniqueReservedPath( ( preserveClonedFilesOnFailure || backupOriginalFiles ) ?
+                                "backup" : "tmp" );
+        BOOST_CHECK_EXCEPTION( boost::filesystem::create_directory( reservedPath ) );
+        string reservedPathString = reservedPath.native_directory_string();
+        // open the database anew under the reserved path and clone into it
+        assert( setClient( dbName, reservedPathString.c_str() ) );
+
+        bool res = cloneFrom(localhost.c_str(), errmsg, dbName,
+                             /*logForReplication=*/false, /*slaveok*/false, /*replauth*/false, /*snapshot*/false);
+        closeDatabase( dbName, reservedPathString.c_str() );
+
+        if ( !res ) {
+            problem() << "clone failed for " << dbName << " with error: " << errmsg << endl;
+            if ( !preserveClonedFilesOnFailure )
+                BOOST_CHECK_EXCEPTION( boost::filesystem::remove_all( reservedPath ) );
+            return false;
+        }
+
+        // NOTE(review): asserts setClient() returns false here - presumably
+        // re-opening the original (existing) db; confirm setClient's contract
+        assert( !setClient( dbName ) );
+        closeDatabase( dbName );
+
+        if ( backupOriginalFiles )
+            _renameForBackup( dbName, reservedPath );
+        else
+            _deleteDataFiles( dbName );
+
+        _replaceWithRecovered( dbName, reservedPathString.c_str() );
+
+        if ( !backupOriginalFiles )
+            BOOST_CHECK_EXCEPTION( boost::filesystem::remove_all( reservedPath ) );
+
+        return true;
+    }
+
+    /* Apply FileOp 'fo' to each of a database's files under 'path': the .ns
+       file then the numbered data files (<db>.0, <db>.1, ...).  Continues a
+       few files past the first missing one ('extra') in case of gaps.
+       afterAllocator - wait for the background file allocator first. */
+    void _applyOpToDataFiles( const char *database, FileOp &fo, bool afterAllocator, const string& path ) {
+        if ( afterAllocator )
+            theFileAllocator().waitUntilFinished();
+        string c = database;
+        c += '.';
+        boost::filesystem::path p(path);
+        boost::filesystem::path q;
+        q = p / (c+"ns");
+        bool ok = false;
+        BOOST_CHECK_EXCEPTION( ok = fo.apply( q ) );
+        if ( ok )
+            log(2) << fo.op() << " file " << q.string() << '\n';
+        int i = 0;
+        int extra = 10; // should not be necessary, this is defensive in case there are missing files
+        while ( 1 ) {
+            assert( i <= DiskLoc::MaxFiles );
+            stringstream ss;
+            ss << c << i;
+            q = p / ss.str();
+            BOOST_CHECK_EXCEPTION( ok = fo.apply(q) );
+            if ( ok ) {
+                if ( extra != 10 ){
+                    // a file existed after a gap - log it as unusual
+                    log(1) << fo.op() << " file " << q.string() << '\n';
+                    log() << "  _applyOpToDataFiles() warning: extra == " << extra << endl;
+                }
+            }
+            else if ( --extra <= 0 )
+                break;
+            i++;
+        }
+    }
+
+    // out-of-line wrapper for the inline nsdetails(), for callers that need
+    // a real (non-inlined) symbol
+    NamespaceDetails* nsdetails_notinline(const char *ns) { return nsdetails(ns); }
+
+    /* Close every open database under 'path'.  Appends the closed database
+       names to result as array "dbs".  Caller must hold the write lock. */
+    bool DatabaseHolder::closeAll( const string& path , BSONObjBuilder& result ){
+        log(2) << "DatabaseHolder::closeAll path:" << path << endl;
+        dbMutex.assertWriteLocked();
+
+        map<string,Database*>& m = _paths[path];
+        _size -= m.size();
+
+        // snapshot the names first: closeDatabase() mutates the map we would
+        // otherwise be iterating over
+        set< string > dbs;
+        for ( map<string,Database*>::iterator i = m.begin(); i != m.end(); i++ ) {
+            dbs.insert( i->first );
+        }
+
+        BSONObjBuilder bb( result.subarrayStart( "dbs" ) );
+        int n = 0;
+        for( set< string >::iterator i = dbs.begin(); i != dbs.end(); ++i ) {
+            string name = *i;
+            log(2) << "DatabaseHolder::closeAll path:" << path << " name:" << name << endl;
+            setClient( name.c_str() , path );
+            closeDatabase( name.c_str() , path );
+            bb.append( bb.numStr( n++ ).c_str() , name );
+        }
+        bb.done();
+
+        return true;
+    }
+
+
+} // namespace mongo
diff --git a/db/pdfile.h b/db/pdfile.h
new file mode 100644
index 0000000..19a8322
--- /dev/null
+++ b/db/pdfile.h
@@ -0,0 +1,448 @@
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/* pdfile.h
+
+ Files:
+ database.ns - namespace index
+ database.1 - data files
+ database.2
+ ...
+*/
+
+#pragma once
+
+#include "../stdafx.h"
+#include "../util/mmap.h"
+#include "storage.h"
+#include "jsobjmanipulator.h"
+#include "namespace.h"
+#include "client.h"
+
+namespace mongo {
+
+ class MDFHeader;
+ class Extent;
+ class Record;
+ class Cursor;
+ class OpDebug;
+
+ void dropDatabase(const char *ns);
+ bool repairDatabase(const char *ns, string &errmsg, bool preserveClonedFilesOnFailure = false, bool backupOriginalFiles = false);
+
+ /* low level - only drops this ns */
+ void dropNS(const string& dropNs);
+
+ /* deletes this ns, indexes and cursors */
+ void dropCollection( const string &name, string &errmsg, BSONObjBuilder &result );
+ bool userCreateNS(const char *ns, BSONObj j, string& err, bool logForReplication);
+ auto_ptr<Cursor> findTableScan(const char *ns, const BSONObj& order, const DiskLoc &startLoc=DiskLoc());
+
+// -1 if library unavailable.
+ boost::intmax_t freeSpace();
+
+ /*---------------------------------------------------------------------*/
+
+ class MDFHeader;
+ class MongoDataFile {
+ friend class DataFileMgr;
+ friend class BasicCursor;
+ public:
+ MongoDataFile(int fn) : fileNo(fn) { }
+ void open(const char *filename, int requestedDataSize = 0, bool preallocateOnly = false);
+
+ /* allocate a new extent from this datafile.
+ @param capped - true if capped collection
+ @param loops is our recursion check variable - you want to pass in zero
+ */
+ Extent* createExtent(const char *ns, int approxSize, bool capped = false, int loops = 0);
+
+ MDFHeader *getHeader() {
+ return header;
+ }
+
+ /* return max size an extent may be */
+ static int maxSize();
+
+ private:
+ int defaultSize( const char *filename ) const;
+
+ Extent* getExtent(DiskLoc loc);
+ Extent* _getExtent(DiskLoc loc);
+ Record* recordAt(DiskLoc dl);
+
+ MemoryMappedFile mmf;
+ MDFHeader *header;
+ int fileNo;
+ };
+
+ class DataFileMgr {
+ friend class BasicCursor;
+ public:
+ void init(const string& path );
+
+ /* see if we can find an extent of the right size in the freelist. */
+ static Extent* allocFromFreeList(const char *ns, int approxSize, bool capped = false);
+
+ /** @return DiskLoc where item ends up */
+ const DiskLoc update(
+ const char *ns,
+ Record *toupdate, const DiskLoc& dl,
+ const char *buf, int len, OpDebug& debug);
+ // The object o may be updated if modified on insert.
+ void insertAndLog( const char *ns, const BSONObj &o, bool god = false );
+ DiskLoc insert(const char *ns, BSONObj &o, bool god = false);
+ DiskLoc insert(const char *ns, const void *buf, int len, bool god = false, const BSONElement &writeId = BSONElement(), bool mayAddIndex = true);
+ void deleteRecord(const char *ns, Record *todelete, const DiskLoc& dl, bool cappedOK = false, bool noWarn = false);
+ static auto_ptr<Cursor> findAll(const char *ns, const DiskLoc &startLoc = DiskLoc());
+
+ /* special version of insert for transaction logging -- streamlined a bit.
+ assumes ns is capped and no indexes
+ no _id field check
+ */
+ Record* fast_oplog_insert(NamespaceDetails *d, const char *ns, int len);
+
+ static Extent* getExtent(const DiskLoc& dl);
+ static Record* getRecord(const DiskLoc& dl);
+
+ /* does not clean up indexes, etc. : just deletes the record in the pdfile. */
+ void _deleteRecord(NamespaceDetails *d, const char *ns, Record *todelete, const DiskLoc& dl);
+
+ private:
+ vector<MongoDataFile *> files;
+ };
+
+ extern DataFileMgr theDataFileMgr;
+
+#pragma pack(1)
+
+ class DeletedRecord {
+ public:
+ int lengthWithHeaders;
+ int extentOfs;
+ DiskLoc nextDeleted;
+ Extent* myExtent(const DiskLoc& myLoc) {
+ return DataFileMgr::getExtent(DiskLoc(myLoc.a(), extentOfs));
+ }
+ };
+
+ /* Record is a record in a datafile. DeletedRecord is similar but for deleted space.
+
+ (11:03:20 AM) dm10gen: regarding extentOfs...
+ (11:03:42 AM) dm10gen: an extent is a contiguous disk area, which contains many Records and DeleteRecords
+ (11:03:56 AM) dm10gen: a DiskLoc has two pieces, the fileno and ofs. (64 bit total)
+ (11:04:16 AM) dm10gen: to keep the headers small, instead of storing a 64 bit ptr to the full extent address, we keep just the offset
+ (11:04:29 AM) dm10gen: we can do this as we know the record's address, and it has the same fileNo
+ (11:04:33 AM) dm10gen: see class DiskLoc for more info
+ (11:04:43 AM) dm10gen: so that is how Record::myExtent() works
+ (11:04:53 AM) dm10gen: on an alloc(), when we build a new Record, we must populate its extentOfs then
+ */
+ class Record {
+ public:
+ enum HeaderSizeValue { HeaderSize = 16 };
+ int lengthWithHeaders;
+ int extentOfs;
+ int nextOfs;
+ int prevOfs;
+ char data[4];
+ int netLength() {
+ return lengthWithHeaders - HeaderSize;
+ }
+ //void setNewLength(int netlen) { lengthWithHeaders = netlen + HeaderSize; }
+
+ /* use this when a record is deleted. basically a union with next/prev fields */
+ DeletedRecord& asDeleted() {
+ return *((DeletedRecord*) this);
+ }
+
+ Extent* myExtent(const DiskLoc& myLoc) {
+ return DataFileMgr::getExtent(DiskLoc(myLoc.a(), extentOfs));
+ }
+ /* get the next record in the namespace, traversing extents as necessary */
+ DiskLoc getNext(const DiskLoc& myLoc);
+ DiskLoc getPrev(const DiskLoc& myLoc);
+ };
+
+ /* extents are datafile regions where all the records within the region
+ belong to the same namespace.
+
+ (11:12:35 AM) dm10gen: when the extent is allocated, all its empty space is stuck into one big DeletedRecord
+ (11:12:55 AM) dm10gen: and that is placed on the free list
+ */
+ class Extent {
+ public:
+ unsigned magic;
+ DiskLoc myLoc;
+ DiskLoc xnext, xprev; /* next/prev extent for this namespace */
+
+ /* which namespace this extent is for. this is just for troubleshooting really
+ and won't even be correct if the collection were renamed!
+ */
+ Namespace nsDiagnostic;
+
+ int length; /* size of the extent, including these fields */
+ DiskLoc firstRecord, lastRecord;
+ char extentData[4];
+
+ bool validates() {
+ return !(firstRecord.isNull() ^ lastRecord.isNull()) &&
+ length >= 0 && !myLoc.isNull();
+ }
+
+ void dump(iostream& s) {
+ s << " loc:" << myLoc.toString() << " xnext:" << xnext.toString() << " xprev:" << xprev.toString() << '\n';
+ s << " nsdiag:" << nsDiagnostic.buf << '\n';
+ s << " size:" << length << " firstRecord:" << firstRecord.toString() << " lastRecord:" << lastRecord.toString() << '\n';
+ }
+
+ /* assumes already zeroed -- insufficient for block 'reuse' perhaps
+ Returns a DeletedRecord location which is the data in the extent ready for us.
+ Caller will need to add that to the freelist structure in namespacedetail.
+ */
+ DiskLoc init(const char *nsname, int _length, int _fileNo, int _offset);
+
+ /* like init(), but for a reuse case */
+ DiskLoc reuse(const char *nsname);
+
+ void assertOk() {
+ assert(magic == 0x41424344);
+ }
+
+ Record* newRecord(int len);
+
+ Record* getRecord(DiskLoc dl) {
+ assert( !dl.isNull() );
+ assert( dl.sameFile(myLoc) );
+ int x = dl.getOfs() - myLoc.getOfs();
+ assert( x > 0 );
+ return (Record *) (((char *) this) + x);
+ }
+
+ Extent* getNextExtent() {
+ return xnext.isNull() ? 0 : DataFileMgr::getExtent(xnext);
+ }
+ Extent* getPrevExtent() {
+ return xprev.isNull() ? 0 : DataFileMgr::getExtent(xprev);
+ }
+ };
+
+ /*
+ ----------------------
+ Header
+ ----------------------
+ Extent (for a particular namespace)
+ Record
+ ...
+ Record (some chained for unused space)
+ ----------------------
+ more Extents...
+ ----------------------
+ */
+
+ /* data file header */
+ class MDFHeader {
+ public:
+ int version;
+ int versionMinor;
+ int fileLength;
+ DiskLoc unused; /* unused is the portion of the file that doesn't belong to any allocated extents. -1 = no more */
+ int unusedLength;
+ char reserved[8192 - 4*4 - 8];
+
+ char data[4];
+
+ static int headerSize() {
+ return sizeof(MDFHeader) - 4;
+ }
+
+ bool currentVersion() const {
+ return ( version == VERSION ) && ( versionMinor == VERSION_MINOR );
+ }
+
+ bool uninitialized() const {
+ if ( version == 0 ) return true;
+ return false;
+ }
+
+ Record* getRecord(DiskLoc dl) {
+ int ofs = dl.getOfs();
+ assert( ofs >= headerSize() );
+ return (Record*) (((char *) this) + ofs);
+ }
+
+ void init(int fileno, int filelength) {
+ if ( uninitialized() ) {
+ assert(filelength > 32768 );
+ assert( headerSize() == 8192 );
+ fileLength = filelength;
+ version = VERSION;
+ versionMinor = VERSION_MINOR;
+ unused.setOfs( fileno, headerSize() );
+ assert( (data-(char*)this) == headerSize() );
+ unusedLength = fileLength - headerSize() - 16;
+ memcpy(data+unusedLength, " \nthe end\n", 16);
+ }
+ }
+
+ bool isEmpty() const {
+ return uninitialized() || ( unusedLength == fileLength - headerSize() - 16 );
+ }
+ };
+
+#pragma pack()
+
+ inline Extent* MongoDataFile::_getExtent(DiskLoc loc) {
+ loc.assertOk();
+ Extent *e = (Extent *) (((char *)header) + loc.getOfs());
+ return e;
+ }
+
+ inline Extent* MongoDataFile::getExtent(DiskLoc loc) {
+ Extent *e = _getExtent(loc);
+ e->assertOk();
+ return e;
+ }
+
+} // namespace mongo
+
+#include "cursor.h"
+
+namespace mongo {
+
+ inline Record* MongoDataFile::recordAt(DiskLoc dl) {
+ return header->getRecord(dl);
+ }
+
+ inline DiskLoc Record::getNext(const DiskLoc& myLoc) {
+ if ( nextOfs != DiskLoc::NullOfs ) {
+ /* defensive */
+ if ( nextOfs >= 0 && nextOfs < 10 ) {
+ sayDbContext("Assertion failure - Record::getNext() referencing a deleted record?");
+ return DiskLoc();
+ }
+
+ return DiskLoc(myLoc.a(), nextOfs);
+ }
+ Extent *e = myExtent(myLoc);
+ while ( 1 ) {
+ if ( e->xnext.isNull() )
+ return DiskLoc(); // end of table.
+ e = e->xnext.ext();
+ if ( !e->firstRecord.isNull() )
+ break;
+ // entire extent could be empty, keep looking
+ }
+ return e->firstRecord;
+ }
+ inline DiskLoc Record::getPrev(const DiskLoc& myLoc) {
+ if ( prevOfs != DiskLoc::NullOfs )
+ return DiskLoc(myLoc.a(), prevOfs);
+ Extent *e = myExtent(myLoc);
+ if ( e->xprev.isNull() )
+ return DiskLoc();
+ return e->xprev.ext()->lastRecord;
+ }
+
+ inline Record* DiskLoc::rec() const {
+ return DataFileMgr::getRecord(*this);
+ }
+ inline BSONObj DiskLoc::obj() const {
+ return BSONObj(rec());
+ }
+ inline DeletedRecord* DiskLoc::drec() const {
+ assert( fileNo != -1 );
+ return (DeletedRecord*) rec();
+ }
+ inline Extent* DiskLoc::ext() const {
+ return DataFileMgr::getExtent(*this);
+ }
+
+ /*---------------------------------------------------------------------*/
+
+} // namespace mongo
+
+#include "rec.h"
+#include "database.h"
+
+namespace mongo {
+
+ // Heritable class to implement an operation that may be applied to all
+ // files in a database using _applyOpToDataFiles()
+ class FileOp {
+ public:
+ virtual ~FileOp() {}
+ // Return true if file exists and operation successful
+ virtual bool apply( const boost::filesystem::path &p ) = 0;
+ virtual const char * op() const = 0;
+ };
+
+ void _applyOpToDataFiles( const char *database, FileOp &fo, bool afterAllocator = false, const string& path = dbpath );
+
+ inline void _deleteDataFiles(const char *database) {
+ class : public FileOp {
+ virtual bool apply( const boost::filesystem::path &p ) {
+ return boost::filesystem::remove( p );
+ }
+ virtual const char * op() const {
+ return "remove";
+ }
+ } deleter;
+ _applyOpToDataFiles( database, deleter, true );
+ }
+
+ boost::intmax_t dbSize( const char *database );
+
+ inline NamespaceIndex* nsindex(const char *ns) {
+ Database *database = cc().database();
+ assert( database );
+ DEV {
+ char buf[256];
+ nsToDatabase(ns, buf);
+ if ( database->name != buf ) {
+ out() << "ERROR: attempt to write to wrong database database\n";
+ out() << " ns:" << ns << '\n';
+ out() << " database->name:" << database->name << endl;
+ assert( database->name == buf );
+ }
+ }
+ return &database->namespaceIndex;
+ }
+
+ inline NamespaceDetails* nsdetails(const char *ns) {
+ // if this faults, did you set the current db first? (Client::Context + dblock)
+ return nsindex(ns)->details(ns);
+ }
+
+ inline MongoDataFile& DiskLoc::pdf() const {
+ assert( fileNo != -1 );
+ return *cc().database()->getFile(fileNo);
+ }
+
+ inline Extent* DataFileMgr::getExtent(const DiskLoc& dl) {
+ assert( dl.a() != -1 );
+ return cc().database()->getFile(dl.a())->getExtent(dl);
+ }
+
+ inline Record* DataFileMgr::getRecord(const DiskLoc& dl) {
+ assert( dl.a() != -1 );
+ return cc().database()->getFile(dl.a())->recordAt(dl);
+ }
+
+ void ensureHaveIdIndex(const char *ns);
+
+ bool deleteIndexes( NamespaceDetails *d, const char *ns, const char *name, string &errmsg, BSONObjBuilder &anObjBuilder, bool maydeleteIdIndex );
+
+} // namespace mongo
diff --git a/db/query.cpp b/db/query.cpp
new file mode 100644
index 0000000..9c82609
--- /dev/null
+++ b/db/query.cpp
@@ -0,0 +1,921 @@
+// query.cpp
+
+/**
+ * Copyright (C) 2008 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "stdafx.h"
+#include "query.h"
+#include "pdfile.h"
+#include "jsobjmanipulator.h"
+#include "../util/builder.h"
+#include <time.h>
+#include "introspect.h"
+#include "btree.h"
+#include "../util/lruishmap.h"
+#include "json.h"
+#include "repl.h"
+#include "replset.h"
+#include "scanandorder.h"
+#include "security.h"
+#include "curop.h"
+#include "commands.h"
+#include "queryoptimizer.h"
+#include "lasterror.h"
+
+namespace mongo {
+
+ /* We cut off further objects once we cross this threshold; thus, you might get
+ a little bit more than this, it is a threshold rather than a limit.
+ */
+ const int MaxBytesToReturnToClientAtOnce = 4 * 1024 * 1024;
+
+ //ns->query->DiskLoc
+// LRUishMap<BSONObj,DiskLoc,5> lrutest(123);
+
+ extern bool useCursors;
+ extern bool useHints;
+
+ // Just try to identify best plan.
+ class DeleteOp : public QueryOp {
+ public:
+ DeleteOp( bool justOne, int& bestCount ) :
+ justOne_( justOne ),
+ count_(),
+ bestCount_( bestCount ),
+ nScanned_() {
+ }
+ virtual void init() {
+ c_ = qp().newCursor();
+ matcher_.reset( new CoveredIndexMatcher( qp().query(), qp().indexKey() ) );
+ }
+ virtual void next() {
+ if ( !c_->ok() ) {
+ setComplete();
+ return;
+ }
+
+ DiskLoc rloc = c_->currLoc();
+
+ if ( matcher_->matches(c_->currKey(), rloc ) ) {
+ if ( !c_->getsetdup(rloc) )
+ ++count_;
+ }
+
+ c_->advance();
+ ++nScanned_;
+ if ( count_ > bestCount_ )
+ bestCount_ = count_;
+
+ if ( count_ > 0 ) {
+ if ( justOne_ )
+ setComplete();
+ else if ( nScanned_ >= 100 && count_ == bestCount_ )
+ setComplete();
+ }
+ }
+ virtual bool mayRecordPlan() const { return !justOne_; }
+ virtual QueryOp *clone() const {
+ return new DeleteOp( justOne_, bestCount_ );
+ }
+ auto_ptr< Cursor > newCursor() const { return qp().newCursor(); }
+ private:
+ bool justOne_;
+ int count_;
+ int &bestCount_;
+ long long nScanned_;
+ auto_ptr< Cursor > c_;
+ auto_ptr< CoveredIndexMatcher > matcher_;
+ };
+
+ /* ns: namespace, e.g. <database>.<collection>
+ pattern: the "where" clause / criteria
+ justOne: stop after 1 match
+ */
+ int deleteObjects(const char *ns, BSONObj pattern, bool justOne, bool logop, bool god) {
+ if( !god ) {
+ if ( strstr(ns, ".system.") ) {
+ /* note a delete from system.indexes would corrupt the db
+ if done here, as there are pointers into those objects in
+ NamespaceDetails.
+ */
+ uassert(12050, "cannot delete from system namespace", legalClientSystemNS( ns , true ) );
+ }
+ if ( strchr( ns , '$' ) ){
+ log() << "cannot delete from collection with reserved $ in name: " << ns << endl;
+ uassert( 10100 , "cannot delete from collection with reserved $ in name", strchr(ns, '$') == 0 );
+ }
+ }
+
+ NamespaceDetails *d = nsdetails( ns );
+ if ( ! d )
+ return 0;
+ uassert( 10101 , "can't remove from a capped collection" , ! d->capped );
+
+ int nDeleted = 0;
+ QueryPlanSet s( ns, pattern, BSONObj() );
+ int best = 0;
+ DeleteOp original( justOne, best );
+ shared_ptr< DeleteOp > bestOp = s.runOp( original );
+ auto_ptr< Cursor > creal = bestOp->newCursor();
+
+ if( !creal->ok() )
+ return nDeleted;
+
+ CoveredIndexMatcher matcher(pattern, creal->indexKeyPattern());
+
+ auto_ptr<ClientCursor> cc;
+ cc.reset( new ClientCursor() );
+ cc->c = creal;
+ cc->ns = ns;
+ cc->noTimeout();
+ cc->setDoingDeletes( true );
+
+ CursorId id = cc->cursorid;
+
+ unsigned long long nScanned = 0;
+ do {
+ if ( ++nScanned % 128 == 0 && !matcher.docMatcher().atomic() ) {
+ if ( ! cc->yield() ){
+ cc.release(); // has already been deleted elsewhere
+ break;
+ }
+ }
+
+ // this way we can avoid calling updateLocation() every time (expensive)
+ // as well as some other nuances handled
+ cc->setDoingDeletes( true );
+
+ DiskLoc rloc = cc->c->currLoc();
+ BSONObj key = cc->c->currKey();
+
+ cc->c->advance();
+
+ if ( ! matcher.matches( key , rloc ) )
+ continue;
+
+ assert( !cc->c->getsetdup(rloc) ); // can't be a dup, we deleted it!
+
+ if ( !justOne ) {
+ /* NOTE: this is SLOW. this is not good, noteLocation() was designed to be called across getMore
+ blocks. here we might call millions of times which would be bad.
+ */
+ cc->c->noteLocation();
+ }
+
+ if ( logop ) {
+ BSONElement e;
+ if( BSONObj( rloc.rec() ).getObjectID( e ) ) {
+ BSONObjBuilder b;
+ b.append( e );
+ bool replJustOne = true;
+ logOp( "d", ns, b.done(), 0, &replJustOne );
+ } else {
+ problem() << "deleted object without id, not logging" << endl;
+ }
+ }
+
+ theDataFileMgr.deleteRecord(ns, rloc.rec(), rloc);
+ nDeleted++;
+ if ( justOne )
+ break;
+ cc->c->checkLocation();
+
+ } while ( cc->c->ok() );
+
+ if ( cc.get() && ClientCursor::find( id , false ) == 0 ){
+ cc.release();
+ }
+
+ return nDeleted;
+ }
+
+ int otherTraceLevel = 0;
+
+ int initialExtentSize(int len);
+
+ bool runCommands(const char *ns, BSONObj& jsobj, CurOp& curop, BufBuilder &b, BSONObjBuilder& anObjBuilder, bool fromRepl, int queryOptions) {
+ try {
+ return _runCommands(ns, jsobj, b, anObjBuilder, fromRepl, queryOptions);
+ }
+ catch ( AssertionException& e ) {
+ if ( !e.msg.empty() )
+ anObjBuilder.append("assertion", e.msg);
+ }
+ curop.debug().str << " assertion ";
+ anObjBuilder.append("errmsg", "db assertion failure");
+ anObjBuilder.append("ok", 0.0);
+ BSONObj x = anObjBuilder.done();
+ b.append((void*) x.objdata(), x.objsize());
+ return true;
+ }
+
+ int nCaught = 0;
+
+ void killCursors(int n, long long *ids) {
+ int k = 0;
+ for ( int i = 0; i < n; i++ ) {
+ if ( ClientCursor::erase(ids[i]) )
+ k++;
+ }
+ log( k == n ) << "killcursors: found " << k << " of " << n << '\n';
+ }
+
+ BSONObj id_obj = fromjson("{\"_id\":ObjectId( \"000000000000000000000000\" )}");
+ BSONObj empty_obj = fromjson("{}");
+
+ /* This is for languages whose "objects" are not well ordered (JSON is well ordered).
+ [ { a : ... } , { b : ... } ] -> { a : ..., b : ... }
+ */
+ inline BSONObj transformOrderFromArrayFormat(BSONObj order) {
+ /* note: this is slow, but that is ok as order will have very few pieces */
+ BSONObjBuilder b;
+ char p[2] = "0";
+
+ while ( 1 ) {
+ BSONObj j = order.getObjectField(p);
+ if ( j.isEmpty() )
+ break;
+ BSONElement e = j.firstElement();
+ uassert( 10102 , "bad order array", !e.eoo());
+ uassert( 10103 , "bad order array [2]", e.isNumber());
+ b.append(e);
+ (*p)++;
+ uassert( 10104 , "too many ordering elements", *p <= '9');
+ }
+
+ return b.obj();
+ }
+
+
+ //int dump = 0;
+
+ /* empty result for error conditions */
+ QueryResult* emptyMoreResult(long long cursorid) {
+ BufBuilder b(32768);
+ b.skip(sizeof(QueryResult));
+ QueryResult *qr = (QueryResult *) b.buf();
+ qr->cursorId = 0; // 0 indicates no more data to retrieve.
+ qr->startingFrom = 0;
+ qr->len = b.len();
+ qr->setOperation(opReply);
+ qr->nReturned = 0;
+ b.decouple();
+ return qr;
+ }
+
+ QueryResult* getMore(const char *ns, int ntoreturn, long long cursorid , CurOp& curop ) {
+ StringBuilder& ss = curop.debug().str;
+ ClientCursor::Pointer p(cursorid);
+ ClientCursor *cc = p._c;
+
+ int bufSize = 512;
+ if ( cc ){
+ bufSize += sizeof( QueryResult );
+ bufSize += ( ntoreturn ? 4 : 1 ) * 1024 * 1024;
+ }
+ BufBuilder b( bufSize );
+
+ b.skip(sizeof(QueryResult));
+
+ int resultFlags = 0; //QueryResult::ResultFlag_AwaitCapable;
+ int start = 0;
+ int n = 0;
+
+ if ( !cc ) {
+ log() << "getMore: cursorid not found " << ns << " " << cursorid << endl;
+ cursorid = 0;
+ resultFlags = QueryResult::ResultFlag_CursorNotFound;
+ }
+ else {
+ ss << " query: " << cc->query << " ";
+ start = cc->pos;
+ Cursor *c = cc->c.get();
+ c->checkLocation();
+ while ( 1 ) {
+ if ( !c->ok() ) {
+ if ( c->tailable() ) {
+ if ( c->advance() ) {
+ continue;
+ }
+ break;
+ }
+ p.release();
+ bool ok = ClientCursor::erase(cursorid);
+ assert(ok);
+ cursorid = 0;
+ cc = 0;
+ break;
+ }
+ if ( !cc->matcher->matches(c->currKey(), c->currLoc() ) ) {
+ }
+ else {
+ //out() << "matches " << c->currLoc().toString() << '\n';
+ if( c->getsetdup(c->currLoc()) ) {
+ //out() << " but it's a dup \n";
+ }
+ else {
+ BSONObj js = c->current();
+ fillQueryResultFromObj(b, cc->filter.get(), js);
+ n++;
+ if ( (ntoreturn>0 && (n >= ntoreturn || b.len() > MaxBytesToReturnToClientAtOnce)) ||
+ (ntoreturn==0 && b.len()>1*1024*1024) ) {
+ c->advance();
+ cc->pos += n;
+ //cc->updateLocation();
+ break;
+ }
+ }
+ }
+ c->advance();
+ }
+ if ( cc ) {
+ cc->updateLocation();
+ cc->mayUpgradeStorage();
+ }
+ }
+
+ QueryResult *qr = (QueryResult *) b.buf();
+ qr->len = b.len();
+ qr->setOperation(opReply);
+ qr->_resultFlags() = resultFlags;
+ qr->cursorId = cursorid;
+ qr->startingFrom = start;
+ qr->nReturned = n;
+ b.decouple();
+
+ return qr;
+ }
+
+ class CountOp : public QueryOp {
+ public:
+ CountOp( const BSONObj &spec ) : spec_( spec ), count_(), bc_() {}
+ virtual void init() {
+ query_ = spec_.getObjectField( "query" );
+ c_ = qp().newCursor();
+ matcher_.reset( new CoveredIndexMatcher( query_, c_->indexKeyPattern() ) );
+ if ( qp().exactKeyMatch() && ! matcher_->needRecord() ) {
+ query_ = qp().simplifiedQuery( qp().indexKey() );
+ bc_ = dynamic_cast< BtreeCursor* >( c_.get() );
+ bc_->forgetEndKey();
+ }
+
+ skip_ = spec_["skip"].numberLong();
+ limit_ = spec_["limit"].numberLong();
+ }
+
+ virtual void next() {
+ if ( !c_->ok() ) {
+ setComplete();
+ return;
+ }
+ if ( bc_ ) {
+ if ( firstMatch_.isEmpty() ) {
+ firstMatch_ = bc_->currKeyNode().key;
+ // if not match
+ if ( query_.woCompare( firstMatch_, BSONObj(), false ) ) {
+ setComplete();
+ return;
+ }
+ _gotOne();
+ } else {
+ if ( !firstMatch_.woEqual( bc_->currKeyNode().key ) ) {
+ setComplete();
+ return;
+ }
+ _gotOne();
+ }
+ } else {
+ if ( !matcher_->matches(c_->currKey(), c_->currLoc() ) ) {
+ }
+ else if( !c_->getsetdup(c_->currLoc()) ) {
+ _gotOne();
+ }
+ }
+ c_->advance();
+ }
+ virtual QueryOp *clone() const {
+ return new CountOp( spec_ );
+ }
+ long long count() const { return count_; }
+ virtual bool mayRecordPlan() const { return true; }
+ private:
+
+ void _gotOne(){
+ if ( skip_ ){
+ skip_--;
+ return;
+ }
+
+ if ( limit_ > 0 && count_ >= limit_ ){
+ setComplete();
+ return;
+ }
+
+ count_++;
+ }
+
+ BSONObj spec_;
+ long long count_;
+ long long skip_;
+ long long limit_;
+ auto_ptr< Cursor > c_;
+ BSONObj query_;
+ BtreeCursor *bc_;
+ auto_ptr< CoveredIndexMatcher > matcher_;
+ BSONObj firstMatch_;
+ };
+
+ /* { count: "collectionname"[, query: <query>] }
+ returns -1 on ns does not exist error.
+ */
+ long long runCount( const char *ns, const BSONObj &cmd, string &err ) {
+ NamespaceDetails *d = nsdetails( ns );
+ if ( !d ) {
+ err = "ns missing";
+ return -1;
+ }
+ BSONObj query = cmd.getObjectField("query");
+
+ // count of all objects
+ if ( query.isEmpty() ){
+ long long num = d->nrecords;
+ num = num - cmd["skip"].numberLong();
+ if ( num < 0 ) {
+ num = 0;
+ }
+ if ( cmd["limit"].isNumber() ){
+ long long limit = cmd["limit"].numberLong();
+ if ( limit < num ){
+ num = limit;
+ }
+ }
+ return num;
+ }
+ QueryPlanSet qps( ns, query, BSONObj() );
+ CountOp original( cmd );
+ shared_ptr< CountOp > res = qps.runOp( original );
+ if ( !res->complete() ) {
+ log() << "Count with ns: " << ns << " and query: " << query
+ << " failed with exception: " << res->exceptionMessage()
+ << endl;
+ return 0;
+ }
+ return res->count();
+ }
+
+ // Implements database 'query' requests using the query optimizer's QueryOp interface
+ class UserQueryOp : public QueryOp {
+ public:
+ UserQueryOp( int ntoskip, int ntoreturn, const BSONObj &order, bool wantMore,
+ bool explain, FieldMatcher *filter, int queryOptions ) :
+ b_( 32768 ),
+ ntoskip_( ntoskip ),
+ ntoreturn_( ntoreturn ),
+ order_( order ),
+ wantMore_( wantMore ),
+ explain_( explain ),
+ filter_( filter ),
+ ordering_(),
+ nscanned_(),
+ queryOptions_( queryOptions ),
+ n_(),
+ soSize_(),
+ saveClientCursor_(),
+ findingStart_( (queryOptions & QueryOption_OplogReplay) != 0 ),
+ findingStartCursor_()
+ {
+ uassert( 10105 , "bad skip value in query", ntoskip >= 0);
+ }
+
+ virtual void init() {
+ b_.skip( sizeof( QueryResult ) );
+
+ // findingStart mode is used to find the first operation of interest when
+ // we are scanning through a repl log. For efficiency in the common case,
+ // where the first operation of interest is closer to the tail than the head,
+ // we start from the tail of the log and work backwards until we find the
+ // first operation of interest. Then we scan forward from that first operation,
+ // actually returning results to the client. During the findingStart phase,
+ // we release the db mutex occasionally to avoid blocking the db process for
+ // an extended period of time.
+ if ( findingStart_ ) {
+ // Use a ClientCursor here so we can release db mutex while scanning
+ // oplog (can take quite a while with large oplogs).
+ findingStartCursor_ = new ClientCursor();
+ findingStartCursor_->noTimeout();
+ findingStartCursor_->c = qp().newReverseCursor();
+ findingStartCursor_->ns = qp().ns();
+ } else {
+ c_ = qp().newCursor();
+ }
+
+ matcher_.reset(new CoveredIndexMatcher(qp().query(), qp().indexKey()));
+
+ if ( qp().scanAndOrderRequired() ) {
+ ordering_ = true;
+ so_.reset( new ScanAndOrder( ntoskip_, ntoreturn_, order_ ) );
+ wantMore_ = false;
+ }
+ }
+ virtual void next() {
+ if ( findingStart_ ) {
+ if ( !findingStartCursor_ || !findingStartCursor_->c->ok() ) {
+ findingStart_ = false;
+ c_ = qp().newCursor();
+ } else if ( !matcher_->matches( findingStartCursor_->c->currKey(), findingStartCursor_->c->currLoc() ) ) {
+ findingStart_ = false;
+ c_ = qp().newCursor( findingStartCursor_->c->currLoc() );
+ } else {
+ findingStartCursor_->c->advance();
+ RARELY {
+ CursorId id = findingStartCursor_->cursorid;
+ findingStartCursor_->updateLocation();
+ {
+ dbtemprelease t;
+ }
+ findingStartCursor_ = ClientCursor::find( id, false );
+ }
+ return;
+ }
+ }
+
+ if ( findingStartCursor_ ) {
+ ClientCursor::erase( findingStartCursor_->cursorid );
+ findingStartCursor_ = 0;
+ }
+
+ if ( !c_->ok() ) {
+ finish();
+ return;
+ }
+
+ bool mayCreateCursor1 = wantMore_ && ntoreturn_ != 1 && useCursors;
+
+ if( 0 ) {
+ BSONObj js = c_->current();
+ cout << "SCANNING " << js << endl;
+ }
+
+ nscanned_++;
+ if ( !matcher_->matches(c_->currKey(), c_->currLoc() ) ) {
+ ;
+ }
+ else {
+ DiskLoc cl = c_->currLoc();
+ if( !c_->getsetdup(cl) ) {
+ BSONObj js = c_->current();
+ // got a match.
+ assert( js.objsize() >= 0 ); //defensive for segfaults
+ if ( ordering_ ) {
+ // note: no cursors for non-indexed, ordered results. results must be fairly small.
+ so_->add(js);
+ }
+ else if ( ntoskip_ > 0 ) {
+ ntoskip_--;
+ } else {
+ if ( explain_ ) {
+ n_++;
+ if ( n_ >= ntoreturn_ && !wantMore_ ) {
+ // .limit() was used, show just that much.
+ finish();
+ return;
+ }
+ }
+ else {
+ fillQueryResultFromObj(b_, filter_, js);
+ n_++;
+ if ( (ntoreturn_>0 && (n_ >= ntoreturn_ || b_.len() > MaxBytesToReturnToClientAtOnce)) ||
+ (ntoreturn_==0 && (b_.len()>1*1024*1024 || n_>=101)) ) {
+ /* if ntoreturn is zero, we return up to 101 objects. on the subsequent getmore, there
+ is only a size limit. The idea is that on a find() where one doesn't use much results,
+ we don't return much, but once getmore kicks in, we start pushing significant quantities.
+
+ The n limit (vs. size) is important when someone fetches only one small field from big
+ objects, which causes massive scanning server-side.
+ */
+ /* if only 1 requested, no cursor saved for efficiency...we assume it is findOne() */
+ if ( mayCreateCursor1 ) {
+ c_->advance();
+ if ( c_->ok() ) {
+ // more...so save a cursor
+ saveClientCursor_ = true;
+ }
+ }
+ finish();
+ return;
+ }
+ }
+ }
+ }
+ }
+ c_->advance();
+ }
+ void finish() {
+ if ( explain_ ) {
+ n_ = ordering_ ? so_->size() : n_;
+ } else if ( ordering_ ) {
+ so_->fill(b_, filter_, n_);
+ }
+ if ( mayCreateCursor2() ) {
+ c_->setTailable();
+ }
+ // If the tailing request succeeded.
+ if ( c_->tailable() ) {
+ saveClientCursor_ = true;
+ }
+ setComplete();
+ }
+ virtual bool mayRecordPlan() const { return ntoreturn_ != 1; }
+ virtual QueryOp *clone() const {
+ return new UserQueryOp( ntoskip_, ntoreturn_, order_, wantMore_, explain_, filter_, queryOptions_ );
+ }
+ BufBuilder &builder() { return b_; }
+ bool scanAndOrderRequired() const { return ordering_; }
+ auto_ptr< Cursor > cursor() { return c_; }
+ auto_ptr< CoveredIndexMatcher > matcher() { return matcher_; }
+ int n() const { return n_; }
+ long long nscanned() const { return nscanned_; }
+ bool saveClientCursor() const { return saveClientCursor_; }
+ bool mayCreateCursor2() const { return ( queryOptions_ & QueryOption_CursorTailable ) && ntoreturn_ != 1; }
+ private:
+ BufBuilder b_;
+ int ntoskip_;
+ int ntoreturn_;
+ BSONObj order_;
+ bool wantMore_;
+ bool explain_;
+ FieldMatcher *filter_;
+ bool ordering_;
+ auto_ptr< Cursor > c_;
+ long long nscanned_;
+ int queryOptions_;
+ auto_ptr< CoveredIndexMatcher > matcher_;
+ int n_;
+ int soSize_;
+ bool saveClientCursor_;
+ auto_ptr< ScanAndOrder > so_;
+ bool findingStart_;
+ ClientCursor * findingStartCursor_;
+ };
+
+ /* run a query -- includes checking for and running a Command */
+ // Answer a single OP_QUERY message. Dispatch order: (1) command execution when
+ // ntoreturn == 1 (findOne-style), (2) the "_id hack" fast path for a simple
+ // single-field _id equality, (3) the general query-optimizer path. Returns the
+ // serialized QueryResult buffer; 'curop' accumulates profiling text.
+ auto_ptr< QueryResult > runQuery(Message& m, QueryMessage& q, CurOp& curop ) {
+ StringBuilder& ss = curop.debug().str;
+ const char *ns = q.ns;
+ int ntoskip = q.ntoskip;
+ int _ntoreturn = q.ntoreturn;
+ BSONObj jsobj = q.query;
+ auto_ptr< FieldMatcher > filter = q.fields; // what fields to return (unspecified = full object)
+ int queryOptions = q.queryOptions;
+ BSONObj snapshotHint;
+
+ Timer t;
+ if( logLevel >= 2 )
+ log() << "runQuery: " << ns << jsobj << endl;
+
+ long long nscanned = 0;
+ bool wantMore = true;
+ int ntoreturn = _ntoreturn;
+ if ( _ntoreturn < 0 ) {
+ /* _ntoreturn greater than zero is simply a hint on how many objects to send back per
+ "cursor batch".
+ A negative number indicates a hard limit.
+ */
+ ntoreturn = -_ntoreturn;
+ wantMore = false;
+ }
+ ss << "query " << ns << " ntoreturn:" << ntoreturn;
+ curop.setQuery(jsobj);
+
+ BufBuilder bb;
+ BSONObjBuilder cmdResBuf;
+ long long cursorid = 0;
+
+ // Reserve space for the QueryResult header; the header fields are written
+ // in place after the result documents have been appended.
+ bb.skip(sizeof(QueryResult));
+
+ auto_ptr< QueryResult > qr;
+ int n = 0;
+
+ Client& c = cc();
+ /* we assume you are using findOne() for running a cmd... */
+ if ( ntoreturn == 1 && runCommands(ns, jsobj, curop, bb, cmdResBuf, false, queryOptions) ) {
+ n = 1;
+ qr.reset( (QueryResult *) bb.buf() );
+ bb.decouple();
+ qr->setResultFlagsToOk();
+ qr->len = bb.len();
+ ss << " reslen:" << bb.len();
+ // qr->channel = 0;
+ qr->setOperation(opReply);
+ qr->cursorId = cursorid;
+ qr->startingFrom = 0;
+ qr->nReturned = n;
+ }
+ else {
+ /* regular query */
+
+ AuthenticationInfo *ai = currentClient.get()->ai;
+ uassert( 10106 , "unauthorized", ai->isAuthorized(c.database()->name.c_str()));
+
+ /* we allow queries to SimpleSlave's -- but not to the slave (nonmaster) member of a replica pair
+ so that queries to a pair are realtime consistent as much as possible. use setSlaveOk() to
+ query the nonmaster member of a replica pair.
+ */
+ uassert( 10107 , "not master", isMaster() || (queryOptions & QueryOption_SlaveOk) || slave == SimpleSlave );
+
+ // Pull the special top-level operators ($query/$orderby/$hint/$min/$max/
+ // $explain/$snapshot) out of the message, if present.
+ BSONElement hint;
+ BSONObj min;
+ BSONObj max;
+ bool explain = false;
+ bool _gotquery = false;
+ bool snapshot = false;
+ BSONObj query;
+ {
+ BSONElement e = jsobj.findElement("$query");
+ if ( e.eoo() )
+ e = jsobj.findElement("query");
+ if ( !e.eoo() && (e.type() == Object || e.type() == Array) ) {
+ query = e.embeddedObject();
+ _gotquery = true;
+ }
+ }
+ BSONObj order;
+ {
+ BSONElement e = jsobj.findElement("$orderby");
+ if ( e.eoo() )
+ e = jsobj.findElement("orderby");
+ if ( !e.eoo() ) {
+ order = e.embeddedObjectUserCheck();
+ if ( e.type() == Array )
+ order = transformOrderFromArrayFormat(order);
+ }
+ }
+ if ( !_gotquery && order.isEmpty() )
+ query = jsobj;
+ else {
+ explain = jsobj.getBoolField("$explain");
+ if ( useHints )
+ hint = jsobj.getField("$hint");
+ min = jsobj.getObjectField("$min");
+ max = jsobj.getObjectField("$max");
+ BSONElement e = jsobj.getField("$snapshot");
+ snapshot = !e.eoo() && e.trueValue();
+ if( snapshot ) {
+ uassert( 12001 , "E12001 can't sort with $snapshot", order.isEmpty());
+ uassert( 12002 , "E12002 can't use hint with $snapshot", hint.eoo());
+ NamespaceDetails *d = nsdetails(ns);
+ if ( d ){
+ int i = d->findIdIndex();
+ if( i < 0 ) {
+ if ( strstr( ns , ".system." ) == 0 )
+ log() << "warning: no _id index on $snapshot query, ns:" << ns << endl;
+ }
+ else {
+ /* [dm] the name of an _id index tends to vary, so we build the hint the hard way here.
+ probably need a better way to specify "use the _id index" as a hint. if someone is
+ in the query optimizer please fix this then!
+ */
+ BSONObjBuilder b;
+ b.append("$hint", d->idx(i).indexName());
+ snapshotHint = b.obj();
+ hint = snapshotHint.firstElement();
+ }
+ }
+ }
+ }
+
+ /* The ElemIter will not be happy if this isn't really an object. So throw exception
+ here when that is true.
+ (Which may indicate bad data from client.)
+ */
+ if ( query.objsize() == 0 ) {
+ out() << "Bad query object?\n jsobj:";
+ out() << jsobj.toString() << "\n query:";
+ out() << query.toString() << endl;
+ uassert( 10110 , "bad query object", false);
+ }
+
+ bool idHackWorked = false;
+
+ // Fast path: a query that is exactly { _id: <simple value> } can be
+ // resolved directly via Helpers::findById, bypassing the optimizer.
+ if ( strcmp( query.firstElement().fieldName() , "_id" ) == 0 && query.nFields() == 1 && query.firstElement().isSimpleType() ){
+ nscanned = 1;
+
+ bool nsFound = false;
+ bool indexFound = false;
+
+ BSONObj resObject;
+ bool found = Helpers::findById( c, ns , query , resObject , &nsFound , &indexFound );
+ // Only usable when the namespace is missing (trivially no result) or an
+ // _id index actually exists; otherwise fall through to the normal path.
+ if ( nsFound == false || indexFound == true ){
+ idHackWorked = true;
+ if ( found ){
+ n = 1;
+ fillQueryResultFromObj( bb , filter.get() , resObject );
+ }
+ qr.reset( (QueryResult *) bb.buf() );
+ bb.decouple();
+ qr->setResultFlagsToOk();
+ qr->len = bb.len();
+ ss << " reslen:" << bb.len();
+ qr->setOperation(opReply);
+ qr->cursorId = cursorid;
+ qr->startingFrom = 0;
+ qr->nReturned = n;
+ }
+ }
+
+ if ( ! idHackWorked ){ // non-simple _id lookup
+ BSONObj oldPlan;
+ if ( explain && hint.eoo() && min.isEmpty() && max.isEmpty() ) {
+ QueryPlanSet qps( ns, query, order );
+ if ( qps.usingPrerecordedPlan() )
+ oldPlan = qps.explain();
+ }
+ QueryPlanSet qps( ns, query, order, &hint, !explain, min, max );
+ UserQueryOp original( ntoskip, ntoreturn, order, wantMore, explain, filter.get(), queryOptions );
+ shared_ptr< UserQueryOp > o = qps.runOp( original );
+ UserQueryOp &dqo = *o;
+ massert( 10362 , dqo.exceptionMessage(), dqo.complete() );
+ n = dqo.n();
+ nscanned = dqo.nscanned();
+ if ( dqo.scanAndOrderRequired() )
+ ss << " scanAndOrder ";
+ auto_ptr< Cursor > c = dqo.cursor();
+ log( 5 ) << " used cursor: " << c.get() << endl;
+ if ( dqo.saveClientCursor() ) {
+ ClientCursor *cc = new ClientCursor();
+ if ( queryOptions & QueryOption_NoCursorTimeout )
+ cc->noTimeout();
+ cc->c = c;
+ cursorid = cc->cursorid;
+ cc->query = jsobj.getOwned();
+ // NOTE(review): duplicate of the DEV trace emitted a few lines below.
+ DEV out() << " query has more, cursorid: " << cursorid << endl;
+ cc->matcher = dqo.matcher();
+ cc->ns = ns;
+ cc->pos = n;
+ cc->filter = filter;
+ cc->originalMessage = m;
+ cc->updateLocation();
+ if ( !cc->c->ok() && cc->c->tailable() ) {
+ DEV out() << " query has no more but tailable, cursorid: " << cursorid << endl;
+ } else {
+ DEV out() << " query has more, cursorid: " << cursorid << endl;
+ }
+ }
+ // NOTE(review): if a client cursor was saved above, 'cc->c = c' transferred
+ // ownership (auto_ptr assignment), leaving local 'c' null -- the explain
+ // block below dereferences 'c'. Confirm explain and saveClientCursor()
+ // cannot both be true on the same request.
+ if ( explain ) {
+ BSONObjBuilder builder;
+ builder.append("cursor", c->toString());
+ builder.append("startKey", c->prettyStartKey());
+ builder.append("endKey", c->prettyEndKey());
+ builder.append("nscanned", double( dqo.nscanned() ) );
+ builder.append("n", n);
+ if ( dqo.scanAndOrderRequired() )
+ builder.append("scanAndOrder", true);
+ builder.append("millis", t.millis());
+ if ( !oldPlan.isEmpty() )
+ builder.append( "oldPlan", oldPlan.firstElement().embeddedObject().firstElement().embeddedObject() );
+ if ( hint.eoo() )
+ builder.appendElements(qps.explain());
+ BSONObj obj = builder.done();
+ fillQueryResultFromObj(dqo.builder(), 0, obj);
+ n = 1;
+ }
+ qr.reset( (QueryResult *) dqo.builder().buf() );
+ dqo.builder().decouple();
+ qr->cursorId = cursorid;
+ qr->setResultFlagsToOk();
+ qr->len = dqo.builder().len();
+ ss << " reslen:" << qr->len;
+ qr->setOperation(opReply);
+ qr->startingFrom = 0;
+ qr->nReturned = n;
+ }
+ }
+
+ // Profiling / slow-query logging: record extra detail when the database
+ // profiler is on or the query took at least 100ms.
+ int duration = t.millis();
+ Database *database = c.database();
+ if ( (database && database->profile) || duration >= 100 ) {
+ ss << " nscanned:" << nscanned << ' ';
+ if ( ntoskip )
+ ss << " ntoskip:" << ntoskip;
+ if ( database && database->profile )
+ ss << " \nquery: ";
+ ss << jsobj << ' ';
+ }
+ ss << " nreturned:" << n;
+ return qr;
+ }
+
+} // namespace mongo
diff --git a/db/query.h b/db/query.h
new file mode 100644
index 0000000..d69b6d9
--- /dev/null
+++ b/db/query.h
@@ -0,0 +1,115 @@
+// query.h
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+#include "../stdafx.h"
+#include "../util/message.h"
+#include "dbmessage.h"
+#include "jsobj.h"
+#include "storage.h"
+
+/* db request message format
+
+ unsigned opid; // arbitrary; will be echoed back
+ byte operation;
+ int options;
+
+ then for:
+
+ dbInsert:
+ string collection;
+ a series of JSObjects
+ dbDelete:
+ string collection;
+ int flags=0; // 1=DeleteSingle
+ JSObject query;
+ dbUpdate:
+ string collection;
+ int flags; // 1=upsert
+ JSObject query;
+ JSObject objectToUpdate;
+ objectToUpdate may include { $inc: <field> } or { $set: ... }, see struct Mod.
+ dbQuery:
+ string collection;
+ int nToSkip;
+ int nToReturn; // how many you want back as the beginning of the cursor data (0=no limit)
+ // greater than zero is simply a hint on how many objects to send back per "cursor batch".
+ // a negative number indicates a hard limit.
+ JSObject query;
+ [JSObject fieldsToReturn]
+ dbGetMore:
+ string collection; // redundant, might use for security.
+ int nToReturn;
+ int64 cursorID;
+ dbKillCursors=2007:
+ int n;
+ int64 cursorIDs[n];
+
+ Note that on Update, there is only one object, which is different
+ from insert where you can pass a list of objects to insert in the db.
+ Note that the update field layout is very similar to Query.
+*/
+
+// struct QueryOptions, QueryResult, QueryResultFlags in:
+#include "../client/dbclient.h"
+
+namespace mongo {
+
+ // for an existing query (ie a ClientCursor), send back additional information.
+ QueryResult* getMore(const char *ns, int ntoreturn, long long cursorid , CurOp& op);
+
+ // Outcome of an update operation: whether an existing document matched,
+ // whether $-modifiers were used, and how many documents were affected.
+ struct UpdateResult {
+ bool existing; // true if the update matched an existing object
+ bool mod; // true if the update used $-modifiers
+ unsigned long long num; // number of documents affected
+
+ UpdateResult( bool e, bool m, unsigned long long n )
+ : existing(e) , mod(m), num(n ){}
+
+ // Legacy integer result code derived from the flags:
+ // 0 = nothing affected, 1 = existing (no mod), 2 = existing + mod,
+ // 3 = mod only (no existing match), 4 = neither flag set.
+ int oldCode(){
+ if ( ! num )
+ return 0;
+
+ if ( existing ){
+ if ( mod )
+ return 2;
+ return 1;
+ }
+
+ if ( mod )
+ return 3;
+ return 4;
+ }
+ };
+
+ /* returns true if an existing object was updated, false if no existing object was found.
+ multi - update multiple objects - mostly useful with things like $set
+ */
+ UpdateResult updateObjects(const char *ns, BSONObj updateobj, BSONObj pattern, bool upsert, bool multi , bool logop , OpDebug& debug );
+
+ // If justOne is true, deletedId is set to the id of the deleted object.
+ int deleteObjects(const char *ns, BSONObj pattern, bool justOne, bool logop = false, bool god=false);
+
+ long long runCount(const char *ns, const BSONObj& cmd, string& err);
+
+ auto_ptr< QueryResult > runQuery(Message& m, QueryMessage& q, CurOp& curop );
+
+} // namespace mongo
+
+#include "clientcursor.h"
diff --git a/db/queryoptimizer.cpp b/db/queryoptimizer.cpp
new file mode 100644
index 0000000..499417a
--- /dev/null
+++ b/db/queryoptimizer.cpp
@@ -0,0 +1,624 @@
+/* queryoptimizer.cpp */
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "stdafx.h"
+
+#include "db.h"
+#include "btree.h"
+#include "pdfile.h"
+#include "queryoptimizer.h"
+#include "cmdline.h"
+
+namespace mongo {
+
+ // Enforce the --notablescan server option: abort (uassert) any table scan on
+ // a user namespace. System and local namespaces are exempt, as are namespaces
+ // with no NamespaceDetails yet.
+ void checkTableScanAllowed( const char * ns ){
+ if ( ! cmdLine.notablescan )
+ return;
+
+ // NOTE(review): strstr matches these fragments anywhere in ns, not just as
+ // a prefix -- presumably prefix matching was intended; confirm.
+ if ( strstr( ns , ".system." ) ||
+ strstr( ns , "local." ) )
+ return;
+
+ if ( ! nsdetails( ns ) )
+ return;
+
+ // notablescan is known true here (early return above), so this condition is
+ // always false and the assertion always fires -- i.e. this is the failure path.
+ uassert( 10111 , (string)"table scans not allowed:" + ns , ! cmdLine.notablescan );
+ }
+
+ // Sort/index direction encoded in an element: its numeric value if it is a
+ // number, otherwise ascending (1).
+ double elementDirection( const BSONElement &e ) {
+ if ( e.isNumber() )
+ return e.number();
+ return 1;
+ }
+
+ // Build a candidate plan for running the query (described by field ranges
+ // 'fbs' and sort 'order') over index _idxNo of _d (-1 = table scan).
+ // Computes whether scan-and-order is needed, whether the plan is optimal or
+ // unhelpful, the traversal direction, and the index bounds to scan.
+ QueryPlan::QueryPlan(
+ NamespaceDetails *_d, int _idxNo,
+ const FieldRangeSet &fbs, const BSONObj &order, const BSONObj &startKey, const BSONObj &endKey ) :
+ d(_d), idxNo(_idxNo),
+ fbs_( fbs ),
+ order_( order ),
+ index_( 0 ),
+ optimal_( false ),
+ scanAndOrderRequired_( true ),
+ exactKeyMatch_( false ),
+ direction_( 0 ),
+ endKeyInclusive_( endKey.isEmpty() ),
+ unhelpful_( false ) {
+
+ // No document can match: nothing to scan or order.
+ if ( !fbs_.matchPossible() ) {
+ unhelpful_ = true;
+ scanAndOrderRequired_ = false;
+ return;
+ }
+
+ if( idxNo >= 0 ) {
+ index_ = &d->idx(idxNo);
+ } else {
+ // full table scan case
+ if ( order_.isEmpty() || !strcmp( order_.firstElement().fieldName(), "$natural" ) )
+ scanAndOrderRequired_ = false;
+ return;
+ }
+
+ // Walk the requested sort against the index key pattern to decide whether
+ // the index can supply the ordering (possibly reversed) without a sort step.
+ BSONObj idxKey = index_->keyPattern();
+ BSONObjIterator o( order );
+ BSONObjIterator k( idxKey );
+ if ( !o.moreWithEOO() )
+ scanAndOrderRequired_ = false;
+ while( o.moreWithEOO() ) {
+ BSONElement oe = o.next();
+ if ( oe.eoo() ) {
+ scanAndOrderRequired_ = false;
+ break;
+ }
+ if ( !k.moreWithEOO() )
+ break;
+ BSONElement ke;
+ while( 1 ) {
+ ke = k.next();
+ if ( ke.eoo() )
+ goto doneCheckOrder;
+ if ( strcmp( oe.fieldName(), ke.fieldName() ) == 0 )
+ break;
+ // Index fields may be skipped over only if constrained to equality.
+ if ( !fbs.range( ke.fieldName() ).equality() )
+ goto doneCheckOrder;
+ }
+ // NOTE(review): this 'int d' shadows the NamespaceDetails* member 'd'.
+ int d = elementDirection( oe ) == elementDirection( ke ) ? 1 : -1;
+ if ( direction_ == 0 )
+ direction_ = d;
+ else if ( direction_ != d )
+ break;
+ }
+ doneCheckOrder:
+ if ( scanAndOrderRequired_ )
+ direction_ = 0;
+ // Classify each index field's constraint to detect an optimal plan (all
+ // nontrivial ranges consumed by an equality prefix) and an exact key match
+ // (index alone can answer the query without fetching documents).
+ BSONObjIterator i( idxKey );
+ int exactIndexedQueryCount = 0;
+ int optimalIndexedQueryCount = 0;
+ bool stillOptimalIndexedQueryCount = true;
+ set< string > orderFieldsUnindexed;
+ order.getFieldNames( orderFieldsUnindexed );
+ while( i.moreWithEOO() ) {
+ BSONElement e = i.next();
+ if ( e.eoo() )
+ break;
+ const FieldRange &fb = fbs.range( e.fieldName() );
+ if ( stillOptimalIndexedQueryCount ) {
+ if ( fb.nontrivial() )
+ ++optimalIndexedQueryCount;
+ if ( !fb.equality() )
+ stillOptimalIndexedQueryCount = false;
+ } else {
+ if ( fb.nontrivial() )
+ optimalIndexedQueryCount = -1;
+ }
+ if ( fb.equality() ) {
+ BSONElement e = fb.max();
+ if ( !e.isNumber() && !e.mayEncapsulate() && e.type() != RegEx )
+ ++exactIndexedQueryCount;
+ }
+ orderFieldsUnindexed.erase( e.fieldName() );
+ }
+ if ( !scanAndOrderRequired_ &&
+ ( optimalIndexedQueryCount == fbs.nNontrivialRanges() ) )
+ optimal_ = true;
+ if ( exactIndexedQueryCount == fbs.nNontrivialRanges() &&
+ orderFieldsUnindexed.size() == 0 &&
+ exactIndexedQueryCount == index_->keyPattern().nFields() &&
+ exactIndexedQueryCount == fbs.query().nFields() ) {
+ exactKeyMatch_ = true;
+ }
+ indexBounds_ = fbs.indexBounds( idxKey, direction_ );
+ // Explicit $min/$max override the computed bounds, collapsing them to a
+ // single interval.
+ if ( !startKey.isEmpty() || !endKey.isEmpty() ) {
+ BSONObj newStart, newEnd;
+ if ( !startKey.isEmpty() )
+ newStart = startKey;
+ else
+ newStart = indexBounds_[ 0 ].first;
+ if ( !endKey.isEmpty() )
+ newEnd = endKey;
+ else
+ newEnd = indexBounds_[ indexBounds_.size() - 1 ].second;
+ BoundList newBounds;
+ newBounds.push_back( make_pair( newStart, newEnd ) );
+ indexBounds_ = newBounds;
+ }
+ // The plan neither narrows the scan on its leading field nor helps sorting.
+ if ( ( scanAndOrderRequired_ || order_.isEmpty() ) &&
+ !fbs.range( idxKey.firstElement().fieldName() ).nontrivial() )
+ unhelpful_ = true;
+ }
+
+ // Create a cursor implementing this plan: a dummy cursor when no match is
+ // possible, a table scan when no index was chosen, else a BtreeCursor over
+ // the computed bounds.
+ auto_ptr< Cursor > QueryPlan::newCursor( const DiskLoc &startLoc ) const {
+ if ( !fbs_.matchPossible() ){
+ if ( fbs_.nNontrivialRanges() )
+ checkTableScanAllowed( fbs_.ns() );
+ // Empty BasicCursor: yields no documents.
+ return auto_ptr< Cursor >( new BasicCursor( DiskLoc() ) );
+ }
+ if ( !index_ ){
+ if ( fbs_.nNontrivialRanges() )
+ checkTableScanAllowed( fbs_.ns() );
+ return findTableScan( fbs_.ns(), order_, startLoc );
+ }
+
+ massert( 10363 , "newCursor() with start location not implemented for indexed plans", startLoc.isNull() );
+
+ if ( indexBounds_.size() < 2 ) {
+ // we are sure to spec endKeyInclusive_
+ return auto_ptr< Cursor >( new BtreeCursor( d, idxNo, *index_, indexBounds_[ 0 ].first, indexBounds_[ 0 ].second, endKeyInclusive_, direction_ >= 0 ? 1 : -1 ) );
+ } else {
+ return auto_ptr< Cursor >( new BtreeCursor( d, idxNo, *index_, indexBounds_, direction_ >= 0 ? 1 : -1 ) );
+ }
+ }
+
+ // Create a cursor traversing this plan in reverse. Only supported for
+ // unindexed (natural-order) plans; asserts for indexed plans.
+ auto_ptr< Cursor > QueryPlan::newReverseCursor() const {
+ if ( !fbs_.matchPossible() )
+ return auto_ptr< Cursor >( new BasicCursor( DiskLoc() ) );
+ if ( !index_ ) {
+ // Flip the requested $natural direction (default 1 if unspecified).
+ int orderSpec = order_.getIntField( "$natural" );
+ if ( orderSpec == INT_MIN )
+ orderSpec = 1;
+ return findTableScan( fbs_.ns(), BSON( "$natural" << -orderSpec ) );
+ }
+ massert( 10364 , "newReverseCursor() not implemented for indexed plans", false );
+ return auto_ptr< Cursor >( 0 );
+ }
+
+ // Key pattern of the plan's index, or { $natural: 1 } for a table scan.
+ BSONObj QueryPlan::indexKey() const {
+ if ( !index_ )
+ return BSON( "$natural" << 1 );
+ return index_->keyPattern();
+ }
+
+ // Record this plan's index in the per-namespace query cache (keyed by the
+ // query/sort pattern) so future queries can reuse it without racing all plans.
+ void QueryPlan::registerSelf( long long nScanned ) const {
+ if ( fbs_.matchPossible() ) {
+ boostlock lk(NamespaceDetailsTransient::_qcMutex);
+ NamespaceDetailsTransient::get_inlock( ns() ).registerIndexForPattern( fbs_.pattern( order_ ), indexKey(), nScanned );
+ }
+ }
+
+ // Assemble the candidate plans for (query, order) on namespace _ns.
+ // 'hint' forces a specific index, 'honorRecordedPlan' allows reusing a
+ // previously cached winning plan, and min/max bound the index scan.
+ QueryPlanSet::QueryPlanSet( const char *_ns, const BSONObj &query, const BSONObj &order, const BSONElement *hint, bool honorRecordedPlan, const BSONObj &min, const BSONObj &max ) :
+ ns(_ns),
+ fbs_( _ns, query ),
+ mayRecordPlan_( true ),
+ usingPrerecordedPlan_( false ),
+ hint_( BSONObj() ),
+ order_( order.getOwned() ),
+ oldNScanned_( 0 ),
+ honorRecordedPlan_( honorRecordedPlan ),
+ min_( min.getOwned() ),
+ max_( max.getOwned() ) {
+ if ( hint && !hint->eoo() ) {
+ // Copy the hint element into an owned object so it outlives the caller.
+ BSONObjBuilder b;
+ b.append( *hint );
+ hint_ = b.obj();
+ }
+ init();
+ }
+
+ // Add the single plan for a hinted index. When $min/$max are present they are
+ // validated against (and reformatted for) the hinted index's key pattern.
+ void QueryPlanSet::addHint( IndexDetails &id ) {
+ if ( !min_.isEmpty() || !max_.isEmpty() ) {
+ string errmsg;
+ BSONObj keyPattern = id.keyPattern();
+ // This reformats min_ and max_ to be used for index lookup.
+ massert( 10365 , errmsg, indexDetailsForRange( fbs_.ns(), errmsg, min_, max_, keyPattern ) );
+ }
+ NamespaceDetails *d = nsdetails(ns);
+ plans_.push_back( PlanPtr( new QueryPlan( d, d->idxNo(id), fbs_, order_, min_, max_ ) ) );
+ }
+
+ // (Re)build the plan list. Precedence: impossible-match/missing-collection
+ // table scan > explicit hint > $min/$max index > cached (prerecorded) plan >
+ // all remaining candidate plans via addOtherPlans().
+ void QueryPlanSet::init() {
+ plans_.clear();
+ mayRecordPlan_ = true;
+ usingPrerecordedPlan_ = false;
+
+ const char *ns = fbs_.ns();
+ NamespaceDetails *d = nsdetails( ns );
+ if ( !d || !fbs_.matchPossible() ) {
+ // Table scan plan, when no matches are possible
+ plans_.push_back( PlanPtr( new QueryPlan( d, -1, fbs_, order_ ) ) );
+ return;
+ }
+
+ BSONElement hint = hint_.firstElement();
+ if ( !hint.eoo() ) {
+ // A hinted plan is forced, so there is no race to record a winner.
+ mayRecordPlan_ = false;
+ if( hint.type() == String ) {
+ // Hint by index name.
+ string hintstr = hint.valuestr();
+ NamespaceDetails::IndexIterator i = d->ii();
+ while( i.more() ) {
+ IndexDetails& ii = i.next();
+ if ( ii.indexName() == hintstr ) {
+ addHint( ii );
+ return;
+ }
+ }
+ }
+ else if( hint.type() == Object ) {
+ // Hint by key pattern; { $natural: ... } forces a table scan.
+ BSONObj hintobj = hint.embeddedObject();
+ uassert( 10112 , "bad hint", !hintobj.isEmpty() );
+ if ( !strcmp( hintobj.firstElement().fieldName(), "$natural" ) ) {
+ massert( 10366 , "natural order cannot be specified with $min/$max", min_.isEmpty() && max_.isEmpty() );
+ // Table scan plan
+ plans_.push_back( PlanPtr( new QueryPlan( d, -1, fbs_, order_ ) ) );
+ return;
+ }
+ NamespaceDetails::IndexIterator i = d->ii();
+ while( i.more() ) {
+ IndexDetails& ii = i.next();
+ if( ii.keyPattern().woCompare(hintobj) == 0 ) {
+ addHint( ii );
+ return;
+ }
+ }
+ }
+ // Hint did not resolve to any index.
+ uassert( 10113 , "bad hint", false );
+ }
+
+ if ( !min_.isEmpty() || !max_.isEmpty() ) {
+ // $min/$max without a hint: pick the index matching the key bounds.
+ string errmsg;
+ BSONObj keyPattern;
+ IndexDetails *idx = indexDetailsForRange( ns, errmsg, min_, max_, keyPattern );
+ massert( 10367 , errmsg, idx );
+ plans_.push_back( PlanPtr( new QueryPlan( d, d->idxNo(*idx), fbs_, order_, min_, max_ ) ) );
+ return;
+ }
+
+ if ( honorRecordedPlan_ ) {
+ // Try the query cache: reuse the index previously recorded as best for
+ // this query pattern, if any.
+ boostlock lk(NamespaceDetailsTransient::_qcMutex);
+ NamespaceDetailsTransient& nsd = NamespaceDetailsTransient::get_inlock( ns );
+ BSONObj bestIndex = nsd.indexForPattern( fbs_.pattern( order_ ) );
+ if ( !bestIndex.isEmpty() ) {
+ usingPrerecordedPlan_ = true;
+ mayRecordPlan_ = false;
+ oldNScanned_ = nsd.nScannedForPattern( fbs_.pattern( order_ ) );
+ if ( !strcmp( bestIndex.firstElement().fieldName(), "$natural" ) ) {
+ // Table scan plan
+ plans_.push_back( PlanPtr( new QueryPlan( d, -1, fbs_, order_ ) ) );
+ return;
+ }
+
+ NamespaceDetails::IndexIterator i = d->ii();
+ while( i.more() ) {
+ int j = i.pos();
+ IndexDetails& ii = i.next();
+ if( ii.keyPattern().woCompare(bestIndex) == 0 ) {
+ plans_.push_back( PlanPtr( new QueryPlan( d, j, fbs_, order_ ) ) );
+ return;
+ }
+ }
+ // Cached index no longer exists (e.g. dropped).
+ massert( 10368 , "Unable to locate previously recorded index", false );
+ }
+ }
+
+ addOtherPlans( false );
+ }
+
+ // Add candidate plans for every helpful index (plus a trailing table scan).
+ // If one index is optimal it becomes the only plan. 'checkFirst' skips a plan
+ // whose index duplicates the already-present first plan (used when extending
+ // a prerecorded-plan run).
+ void QueryPlanSet::addOtherPlans( bool checkFirst ) {
+ const char *ns = fbs_.ns();
+ NamespaceDetails *d = nsdetails( ns );
+ if ( !d )
+ return;
+
+ // If table scan is optimal or natural order requested
+ if ( !fbs_.matchPossible() || ( fbs_.nNontrivialRanges() == 0 && order_.isEmpty() ) ||
+ ( !order_.isEmpty() && !strcmp( order_.firstElement().fieldName(), "$natural" ) ) ) {
+ // Table scan plan
+ addPlan( PlanPtr( new QueryPlan( d, -1, fbs_, order_ ) ), checkFirst );
+ return;
+ }
+
+ PlanSet plans;
+ for( int i = 0; i < d->nIndexes; ++i ) {
+ PlanPtr p( new QueryPlan( d, i, fbs_, order_ ) );
+ if ( p->optimal() ) {
+ // An optimal plan short-circuits: no need to race alternatives.
+ addPlan( p, checkFirst );
+ return;
+ } else if ( !p->unhelpful() ) {
+ plans.push_back( p );
+ }
+ }
+ for( PlanSet::iterator i = plans.begin(); i != plans.end(); ++i )
+ addPlan( *i, checkFirst );
+
+ // Table scan plan
+ addPlan( PlanPtr( new QueryPlan( d, -1, fbs_, order_ ) ), checkFirst );
+ }
+
+ // Run 'op' against the plan set. If a prerecorded plan fails to complete, the
+ // cache entry for this pattern is cleared and the op is re-run against the
+ // full set of candidate plans.
+ shared_ptr< QueryOp > QueryPlanSet::runOp( QueryOp &op ) {
+ if ( usingPrerecordedPlan_ ) {
+ Runner r( *this, op );
+ shared_ptr< QueryOp > res = r.run();
+ // plans_.size() > 1 if addOtherPlans was called in Runner::run().
+ if ( res->complete() || plans_.size() > 1 )
+ return res;
+ {
+ // Invalidate the cached plan so it is not retried next time.
+ boostlock lk(NamespaceDetailsTransient::_qcMutex);
+ NamespaceDetailsTransient::get_inlock( fbs_.ns() ).registerIndexForPattern( fbs_.pattern( order_ ), BSONObj(), 0 );
+ }
+ init();
+ }
+ Runner r( *this, op );
+ return r.run();
+ }
+
+ // Summarize every candidate plan (cursor type and key range) for $explain
+ // output, as { allPlans: [...] }.
+ BSONObj QueryPlanSet::explain() const {
+ vector< BSONObj > arr;
+ for( PlanSet::const_iterator i = plans_.begin(); i != plans_.end(); ++i ) {
+ auto_ptr< Cursor > c = (*i)->newCursor();
+ arr.push_back( BSON( "cursor" << c->toString() << "startKey" << c->prettyStartKey() << "endKey" << c->prettyEndKey() ) );
+ }
+ BSONObjBuilder b;
+ b.append( "allPlans", arr );
+ return b.obj();
+ }
+
+ // Bind a prototype op to the plan set; run() clones the op per plan.
+ QueryPlanSet::Runner::Runner( QueryPlanSet &plans, QueryOp &op ) :
+ op_( op ),
+ plans_( plans ) {
+ }
+
+ // Race one clone of the prototype op per candidate plan, advancing them in
+ // round-robin; the first clone to complete wins and (optionally) records its
+ // plan in the query cache. If every clone errors out, the first clone is
+ // returned carrying its error state.
+ shared_ptr< QueryOp > QueryPlanSet::Runner::run() {
+ massert( 10369 , "no plans", plans_.plans_.size() > 0 );
+
+ if ( plans_.plans_.size() > 1 )
+ log(1) << " running multiple plans" << endl;
+
+ // One clone of the op per plan.
+ vector< shared_ptr< QueryOp > > ops;
+ for( PlanSet::iterator i = plans_.plans_.begin(); i != plans_.plans_.end(); ++i ) {
+ shared_ptr< QueryOp > op( op_.clone() );
+ op->setQueryPlan( i->get() );
+ ops.push_back( op );
+ }
+
+ for( vector< shared_ptr< QueryOp > >::iterator i = ops.begin(); i != ops.end(); ++i ) {
+ initOp( **i );
+ if ( (*i)->complete() )
+ return *i;
+ }
+
+ long long nScanned = 0;
+ long long nScannedBackup = 0;
+ while( 1 ) {
+ ++nScanned;
+ unsigned errCount = 0;
+ bool first = true;
+ for( vector< shared_ptr< QueryOp > >::iterator i = ops.begin(); i != ops.end(); ++i ) {
+ QueryOp &op = **i;
+ nextOp( op );
+ if ( op.complete() ) {
+ // Include scans done before the prerecorded plan was abandoned.
+ if ( first )
+ nScanned += nScannedBackup;
+ if ( plans_.mayRecordPlan_ && op.mayRecordPlan() )
+ op.qp().registerSelf( nScanned );
+ return *i;
+ }
+ if ( op.error() )
+ ++errCount;
+ first = false;
+ }
+ if ( errCount == ops.size() )
+ break;
+ // A prerecorded plan that is scanning 10x more than it did historically
+ // is abandoned: bring in the other candidate plans mid-flight.
+ if ( plans_.usingPrerecordedPlan_ && nScanned > plans_.oldNScanned_ * 10 ) {
+ plans_.addOtherPlans( true );
+ PlanSet::iterator i = plans_.plans_.begin();
+ ++i;
+ for( ; i != plans_.plans_.end(); ++i ) {
+ shared_ptr< QueryOp > op( op_.clone() );
+ op->setQueryPlan( i->get() );
+ ops.push_back( op );
+ initOp( *op );
+ if ( op->complete() )
+ return op;
+ }
+ plans_.mayRecordPlan_ = true;
+ plans_.usingPrerecordedPlan_ = false;
+ nScannedBackup = nScanned;
+ nScanned = 0;
+ }
+ }
+ return ops[ 0 ];
+ }
+
+ // Initialize an op, converting any thrown exception into the op's error
+ // state so one failing plan does not abort the race.
+ void QueryPlanSet::Runner::initOp( QueryOp &op ) {
+ try {
+ op.init();
+ } catch ( const std::exception &e ) {
+ op.setExceptionMessage( e.what() );
+ } catch ( ... ) {
+ op.setExceptionMessage( "Caught unknown exception" );
+ }
+ }
+
+ // Advance an op one step (no-op if it already errored), converting any
+ // thrown exception into the op's error state.
+ void QueryPlanSet::Runner::nextOp( QueryOp &op ) {
+ try {
+ if ( !op.error() )
+ op.next();
+ } catch ( const std::exception &e ) {
+ op.setExceptionMessage( e.what() );
+ } catch ( ... ) {
+ op.setExceptionMessage( "Caught unknown exception" );
+ }
+ }
+
+ // True if idxPattern has exactly the same field names (in order) as
+ // sampleKey, and the index direction at 'firstSignificantField' agrees with
+ // the requested scan 'direction'.
+ bool indexWorks( const BSONObj &idxPattern, const BSONObj &sampleKey, int direction, int firstSignificantField ) {
+ BSONObjIterator p( idxPattern );
+ BSONObjIterator k( sampleKey );
+ int i = 0;
+ while( 1 ) {
+ BSONElement pe = p.next();
+ BSONElement ke = k.next();
+ if ( pe.eoo() && ke.eoo() )
+ return true;
+ if ( pe.eoo() || ke.eoo() )
+ return false;
+ if ( strcmp( pe.fieldName(), ke.fieldName() ) != 0 )
+ return false;
+ if ( ( i == firstSignificantField ) && !( ( direction > 0 ) == ( pe.number() > 0 ) ) )
+ return false;
+ ++i;
+ }
+ // Unreachable: the loop above always returns.
+ return false;
+ }
+
+ // Build the extreme key for an index: per field, MaxKey or MinKey depending
+ // on the field's index direction combined with baseDirection (1 = the
+ // greatest key in scan order, -1 = the least).
+ BSONObj extremeKeyForIndex( const BSONObj &idxPattern, int baseDirection ) {
+ BSONObjIterator i( idxPattern );
+ BSONObjBuilder b;
+ while( i.moreWithEOO() ) {
+ BSONElement e = i.next();
+ if ( e.eoo() )
+ break;
+ int idxDirection = e.number() >= 0 ? 1 : -1;
+ int direction = idxDirection * baseDirection;
+ switch( direction ) {
+ case 1:
+ b.appendMaxKey( e.fieldName() );
+ break;
+ case -1:
+ b.appendMinKey( e.fieldName() );
+ break;
+ default:
+ assert( false );
+ }
+ }
+ return b.obj();
+ }
+
+ // Compare min and max keys field by field. Returns (direction,
+ // firstSignificantField): direction is 1/-1 per the first differing value
+ // (0 if all equal), and firstSignificantField is that field's index.
+ // Returns (-1, -1) when the two keys do not share the same field pattern.
+ pair< int, int > keyAudit( const BSONObj &min, const BSONObj &max ) {
+ int direction = 0;
+ int firstSignificantField = 0;
+ BSONObjIterator i( min );
+ BSONObjIterator a( max );
+ while( 1 ) {
+ BSONElement ie = i.next();
+ BSONElement ae = a.next();
+ if ( ie.eoo() && ae.eoo() )
+ break;
+ if ( ie.eoo() || ae.eoo() || strcmp( ie.fieldName(), ae.fieldName() ) != 0 ) {
+ return make_pair( -1, -1 );
+ }
+ int cmp = ie.woCompare( ae );
+ if ( cmp < 0 )
+ direction = 1;
+ if ( cmp > 0 )
+ direction = -1;
+ if ( direction != 0 )
+ break;
+ ++firstSignificantField;
+ }
+ return make_pair( direction, firstSignificantField );
+ }
+
+ // keyAudit() variant that tolerates a one-sided range: with only min or only
+ // max supplied, default to ascending with no significant field, i.e. (1, -1).
+ pair< int, int > flexibleKeyAudit( const BSONObj &min, const BSONObj &max ) {
+ if ( min.isEmpty() || max.isEmpty() ) {
+ return make_pair( 1, -1 );
+ } else {
+ return keyAudit( min, max );
+ }
+ }
+
+ // NOTE min, max, and keyPattern will be updated to be consistent with the selected index.
+ // Resolve the index to use for a $min/$max range on 'ns'. On success returns
+ // the IndexDetails and rewrites min/max (filling the empty side with the
+ // index's extreme key, and un-dotting both against the key pattern). On
+ // failure returns 0 with 'errmsg' set.
+ IndexDetails *indexDetailsForRange( const char *ns, string &errmsg, BSONObj &min, BSONObj &max, BSONObj &keyPattern ) {
+ if ( min.isEmpty() && max.isEmpty() ) {
+ errmsg = "one of min or max must be specified";
+ return 0;
+ }
+
+ setClient( ns );
+ IndexDetails *id = 0;
+ NamespaceDetails *d = nsdetails( ns );
+ if ( !d ) {
+ errmsg = "ns not found";
+ return 0;
+ }
+
+ // Determine scan direction and first differing field from min/max.
+ pair< int, int > ret = flexibleKeyAudit( min, max );
+ if ( ret == make_pair( -1, -1 ) ) {
+ errmsg = "min and max keys do not share pattern";
+ return 0;
+ }
+ if ( keyPattern.isEmpty() ) {
+ // No pattern given: pick the first index compatible with the keys.
+ NamespaceDetails::IndexIterator i = d->ii();
+ while( i.more() ) {
+ IndexDetails& ii = i.next();
+ if ( indexWorks( ii.keyPattern(), min.isEmpty() ? max : min, ret.first, ret.second ) ) {
+ id = &ii;
+ keyPattern = ii.keyPattern();
+ break;
+ }
+ }
+
+ } else {
+ // Pattern given: verify it matches the keys, then find that exact index
+ // (accepting the _id index for a single-field _id pattern).
+ if ( !indexWorks( keyPattern, min.isEmpty() ? max : min, ret.first, ret.second ) ) {
+ errmsg = "requested keyPattern does not match specified keys";
+ return 0;
+ }
+ NamespaceDetails::IndexIterator i = d->ii();
+ while( i.more() ) {
+ IndexDetails& ii = i.next();
+ if( ii.keyPattern().woCompare(keyPattern) == 0 ) {
+ id = &ii;
+ break;
+ }
+ if ( keyPattern.nFields() == 1 && ii.keyPattern().nFields() == 1 &&
+ IndexDetails::isIdIndexPattern( keyPattern ) &&
+ ii.isIdIndex() ){
+ id = &ii;
+ break;
+ }
+
+ }
+ }
+
+ // Fill in whichever bound was omitted with the index's extreme key.
+ if ( min.isEmpty() ) {
+ min = extremeKeyForIndex( keyPattern, -1 );
+ } else if ( max.isEmpty() ) {
+ max = extremeKeyForIndex( keyPattern, 1 );
+ }
+
+ if ( !id ) {
+ errmsg = (string)"no index found for specified keyPattern: " + keyPattern.toString();
+ return 0;
+ }
+
+ min = min.extractFieldsUnDotted( keyPattern );
+ max = max.extractFieldsUnDotted( keyPattern );
+
+ return id;
+ }
+
+} // namespace mongo
diff --git a/db/queryoptimizer.h b/db/queryoptimizer.h
new file mode 100644
index 0000000..e4a79d8
--- /dev/null
+++ b/db/queryoptimizer.h
@@ -0,0 +1,161 @@
+/* queryoptimizer.h */
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+#include "cursor.h"
+#include "jsobj.h"
+#include "queryutil.h"
+
+namespace mongo {
+
+ class IndexDetails;
+ // One candidate strategy for answering a query: either a specific index
+ // (idxNo >= 0) or a table scan (idxNo == -1), together with the derived
+ // traversal direction and index bounds. Constructed by QueryPlanSet.
+ class QueryPlan : boost::noncopyable {
+ public:
+ QueryPlan(NamespaceDetails *_d,
+ int _idxNo, // -1 = no index
+ const FieldRangeSet &fbs,
+ const BSONObj &order,
+ const BSONObj &startKey = BSONObj(),
+ const BSONObj &endKey = BSONObj() );
+
+ /* If true, no other index can do better. */
+ bool optimal() const { return optimal_; }
+ /* ScanAndOrder processing will be required if true */
+ bool scanAndOrderRequired() const { return scanAndOrderRequired_; }
+ /* When true, the index we are using has keys such that it can completely resolve the
+ query expression to match by itself without ever checking the main object.
+ */
+ bool exactKeyMatch() const { return exactKeyMatch_; }
+ /* If true, the startKey and endKey are unhelpful and the index order doesn't match the
+ requested sort order */
+ bool unhelpful() const { return unhelpful_; }
+ int direction() const { return direction_; }
+ // Create a cursor implementing this plan (see queryoptimizer.cpp).
+ auto_ptr< Cursor > newCursor( const DiskLoc &startLoc = DiskLoc() ) const;
+ auto_ptr< Cursor > newReverseCursor() const;
+ // Key pattern of the chosen index, or { $natural: 1 } for a table scan.
+ BSONObj indexKey() const;
+ const char *ns() const { return fbs_.ns(); }
+ BSONObj query() const { return fbs_.query(); }
+ BSONObj simplifiedQuery( const BSONObj& fields = BSONObj() ) const { return fbs_.simplifiedQuery( fields ); }
+ const FieldRange &range( const char *fieldName ) const { return fbs_.range( fieldName ); }
+ // Record this plan in the query cache as the winner for its pattern.
+ void registerSelf( long long nScanned ) const;
+ // just for testing
+ BoundList indexBounds() const { return indexBounds_; }
+ private:
+ NamespaceDetails *d;
+ int idxNo; // -1 = table scan
+ const FieldRangeSet &fbs_;
+ const BSONObj &order_;
+ const IndexDetails *index_; // 0 for a table scan
+ bool optimal_;
+ bool scanAndOrderRequired_;
+ bool exactKeyMatch_;
+ int direction_; // 1 / -1, or 0 if no useful index order
+ BoundList indexBounds_;
+ bool endKeyInclusive_;
+ bool unhelpful_;
+ };
+
+ // Inherit from this interface to implement a new query operation.
+ // The query optimizer will clone the QueryOp that is provided, giving
+ // each clone its own query plan.
+ class QueryOp {
+ public:
+ QueryOp() : complete_(), qp_(), error_() {}
+ virtual ~QueryOp() {}
+ // Prepare the op to run against its assigned query plan.
+ virtual void init() = 0;
+ // Advance the op by one step; call setComplete() when done.
+ virtual void next() = 0;
+ // Whether a win by this op may be recorded in the query plan cache.
+ virtual bool mayRecordPlan() const = 0;
+ // Return a copy of the inheriting class, which will be run with its own
+ // query plan.
+ virtual QueryOp *clone() const = 0;
+ bool complete() const { return complete_; }
+ bool error() const { return error_; }
+ string exceptionMessage() const { return exceptionMessage_; }
+ const QueryPlan &qp() const { return *qp_; }
+ // To be called by QueryPlanSet::Runner only.
+ void setQueryPlan( const QueryPlan *qp ) { qp_ = qp; }
+ // Mark the op as failed; the runner calls this when init()/next() throws.
+ void setExceptionMessage( const string &exceptionMessage ) {
+ error_ = true;
+ exceptionMessage_ = exceptionMessage;
+ }
+ protected:
+ void setComplete() { complete_ = true; }
+ private:
+ bool complete_;
+ string exceptionMessage_;
+ const QueryPlan *qp_; // not owned; set by the runner
+ bool error_;
+ };
+
+ // Set of candidate query plans for a particular query. Used for running
+ // a QueryOp on these plans.
+ class QueryPlanSet {
+ public:
+ QueryPlanSet( const char *ns,
+ const BSONObj &query,
+ const BSONObj &order,
+ const BSONElement *hint = 0,
+ bool honorRecordedPlan = true,
+ const BSONObj &min = BSONObj(),
+ const BSONObj &max = BSONObj() );
+ int nPlans() const { return plans_.size(); }
+ // Race 'op' across the candidate plans; returns the winning clone.
+ shared_ptr< QueryOp > runOp( QueryOp &op );
+ // Typed convenience wrapper around runOp( QueryOp & ).
+ template< class T >
+ shared_ptr< T > runOp( T &op ) {
+ return dynamic_pointer_cast< T >( runOp( static_cast< QueryOp& >( op ) ) );
+ }
+ const FieldRangeSet &fbs() const { return fbs_; }
+ // { allPlans: [...] } summary of every candidate plan, for $explain.
+ BSONObj explain() const;
+ bool usingPrerecordedPlan() const { return usingPrerecordedPlan_; }
+ private:
+ void addOtherPlans( bool checkFirst );
+ typedef boost::shared_ptr< QueryPlan > PlanPtr;
+ typedef vector< PlanPtr > PlanSet;
+ // Append a plan; with checkFirst, skip it if its index duplicates plans_[0].
+ void addPlan( PlanPtr plan, bool checkFirst ) {
+ if ( checkFirst && plan->indexKey().woCompare( plans_[ 0 ]->indexKey() ) == 0 )
+ return;
+ plans_.push_back( plan );
+ }
+ void init();
+ void addHint( IndexDetails &id );
+ // Advances clones of a QueryOp across all plans in round-robin.
+ struct Runner {
+ Runner( QueryPlanSet &plans, QueryOp &op );
+ shared_ptr< QueryOp > run();
+ QueryOp &op_;
+ QueryPlanSet &plans_;
+ static void initOp( QueryOp &op );
+ static void nextOp( QueryOp &op );
+ };
+ const char *ns;
+ FieldRangeSet fbs_;
+ PlanSet plans_;
+ bool mayRecordPlan_; // may the winner be cached for this pattern
+ bool usingPrerecordedPlan_;
+ BSONObj hint_; // owned copy of the hint element, or empty
+ BSONObj order_;
+ long long oldNScanned_; // nscanned recorded with the cached plan
+ bool honorRecordedPlan_;
+ BSONObj min_;
+ BSONObj max_;
+ };
+
+ // NOTE min, max, and keyPattern will be updated to be consistent with the selected index.
+ IndexDetails *indexDetailsForRange( const char *ns, string &errmsg, BSONObj &min, BSONObj &max, BSONObj &keyPattern );
+
+} // namespace mongo
diff --git a/db/queryutil.cpp b/db/queryutil.cpp
new file mode 100644
index 0000000..d8854be
--- /dev/null
+++ b/db/queryutil.cpp
@@ -0,0 +1,594 @@
+// queryutil.cpp
+
+/* Copyright 2009 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "stdafx.h"
+
+#include "btree.h"
+#include "matcher.h"
+#include "pdfile.h"
+#include "queryoptimizer.h"
+#include "../util/unittest.h"
+
+namespace mongo {
+ namespace {
+ /** returns a string that when used as a matcher, would match a super set of regex()
+ returns "" for complex regular expressions
+ used to optimize queries in some simple regex cases that start with '^'
+ */
+        /** Extract a plain-string prefix from a regex so an index range scan can be
+            used: the returned string, as a lower bound, matches a superset of the
+            regex.  Returns "" (no optimization possible) unless the pattern is
+            anchored with '^' and begins with literal characters.  Only the 'm' and
+            'x' flags are tolerated; any other flag disables the optimization.
+        */
+        inline string simpleRegexHelper(const char* regex, const char* flags){
+            string r = "";
+
+            bool extended = false;
+            while (*flags){
+                switch (*(flags++)){
+                    case 'm': // multiline
+                        continue;
+                    case 'x': // extended
+                        extended = true;
+                        break;
+                    default:
+                        return r; // cant use index
+                }
+            }
+
+            if ( *(regex++) != '^' )
+                return r;
+
+            stringstream ss;
+
+            while(*regex){
+                char c = *(regex++);
+                if ( c == '*' || c == '?' ){
+                    // These are the only two symbols that make the last char optional
+                    r = ss.str();
+                    r = r.substr( 0 , r.size() - 1 );
+                    return r; //breaking here fails with /^a?/
+                } else if (c == '\\'){
+                    // backslash followed by an alphanumeric is a character-class
+                    // escape (\d, \w, ...), a backreference (\1-\9) or similar --
+                    // none of which is a literal char, so stop the prefix here.
+                    // A backslash before a non-alphanumeric just escapes that char.
+                    c = *(regex++);
+                    if ((c >= 'A' && c <= 'Z') ||
+                        (c >= 'a' && c <= 'z') ||
+                        (c >= '0' && c <= '9') || // BUGFIX: was (c >= '0' && c <= '0'), which let \1-\9 leak into the prefix as literals
+                        (c == '\0'))
+                    {
+                        r = ss.str();
+                        break;
+                    } else {
+                        ss << c;
+                    }
+                } else if (strchr("^$.[|()+{", c)){
+                    // list of "metacharacters" from man pcrepattern
+                    r = ss.str();
+                    break;
+                } else if (extended && c == '#'){
+                    // comment
+                    r = ss.str();
+                    break;
+                } else if (extended && isspace(c)){
+                    // in extended mode, unescaped whitespace is ignored
+                    continue;
+                } else {
+                    // self-matching char
+                    ss << c;
+                }
+            }
+
+            // pattern consumed with no metacharacters at all: whole thing is literal
+            if ( r.size() == 0 && *regex == 0 )
+                r = ss.str();
+
+            return r;
+        }
+        // Dispatch to simpleRegexHelper for either a native RegEx element or the
+        // {$regex: ..., $options: ...} object form.  Any other element type is a
+        // caller error (asserts).
+        inline string simpleRegex(const BSONElement& e){
+            switch(e.type()){
+                case RegEx:
+                    return simpleRegexHelper(e.regex(), e.regexFlags());
+                case Object:{
+                    BSONObj o = e.embeddedObject();
+                    return simpleRegexHelper(o["$regex"].valuestrsafe(), o["$options"].valuestrsafe());
+                }
+                default: assert(false); return ""; //return squashes compiler warning
+            }
+        }
+ }
+
+    // Build the set of value intervals implied by a single query element e,
+    // e.g. {$gt:1}, a literal equality value, an $in list, or a regex.
+    // If optimize is true, open-ended bounds are tightened to the min/max of the
+    // known endpoint's BSON type so an index scan covers less.
+    FieldRange::FieldRange( const BSONElement &e, bool optimize ) {
+        // $in: one point interval per distinct value, in sorted order
+        if ( !e.eoo() && e.type() != RegEx && e.getGtLtOp() == BSONObj::opIN ) {
+            set< BSONElement, element_lt > vals;
+            BSONObjIterator i( e.embeddedObject() );
+            while( i.more() )
+                vals.insert( i.next() );
+
+            for( set< BSONElement, element_lt >::const_iterator i = vals.begin(); i != vals.end(); ++i )
+                intervals_.push_back( FieldInterval(*i) );
+
+            return;
+        }
+
+        // equality against an array matches both the whole array and its first
+        // element (multikey index semantics); keep the intervals sorted
+        if ( e.type() == Array && e.getGtLtOp() == BSONObj::Equality ){
+
+            intervals_.push_back( FieldInterval(e) );
+
+            const BSONElement& temp = e.embeddedObject().firstElement();
+            if ( ! temp.eoo() ){
+                if ( temp < e )
+                    intervals_.insert( intervals_.begin() , temp );
+                else
+                    intervals_.push_back( FieldInterval(temp) );
+            }
+
+            return;
+        }
+
+        // general case: a single interval, initially unbounded (minKey..maxKey)
+        intervals_.push_back( FieldInterval() );
+        FieldInterval &initial = intervals_[ 0 ];
+        BSONElement &lower = initial.lower_.bound_;
+        bool &lowerInclusive = initial.lower_.inclusive_;
+        BSONElement &upper = initial.upper_.bound_;
+        bool &upperInclusive = initial.upper_.inclusive_;
+        lower = minKey.firstElement();
+        lowerInclusive = true;
+        upper = maxKey.firstElement();
+        upperInclusive = true;
+
+        if ( e.eoo() )
+            return;
+        // regex: if a literal prefix can be extracted, scan [prefix, prefix+1)
+        if ( e.type() == RegEx
+             || (e.type() == Object && !e.embeddedObject()["$regex"].eoo())
+           )
+        {
+            const string r = simpleRegex(e);
+            if ( r.size() ) {
+                lower = addObj( BSON( "" << r ) ).firstElement();
+                upper = addObj( BSON( "" << simpleRegexEnd( r ) ) ).firstElement();
+                upperInclusive = false;
+            }
+            return;
+        }
+        switch( e.getGtLtOp() ) {
+        case BSONObj::Equality:
+            lower = upper = e;
+            break;
+        case BSONObj::LT:
+            upperInclusive = false;
+            // fall through: LT and LTE share the upper bound value
+        case BSONObj::LTE:
+            upper = e;
+            break;
+        case BSONObj::GT:
+            lowerInclusive = false;
+            // fall through: GT and GTE share the lower bound value
+        case BSONObj::GTE:
+            lower = e;
+            break;
+        case BSONObj::opALL: {
+            massert( 10370 , "$all requires array", e.type() == Array );
+            BSONObjIterator i( e.embeddedObject() );
+            // only the first $all element constrains the range; the matcher
+            // enforces the rest
+            if ( i.more() )
+                lower = upper = i.next();
+            break;
+        }
+        case BSONObj::opMOD: {
+            // $mod only applies to numbers: restrict to the numeric type range
+            {
+                BSONObjBuilder b;
+                b.appendMinForType( "" , NumberDouble );
+                lower = addObj( b.obj() ).firstElement();
+            }
+            {
+                BSONObjBuilder b;
+                b.appendMaxForType( "" , NumberDouble );
+                upper = addObj( b.obj() ).firstElement();
+            }
+            break;
+        }
+        case BSONObj::opTYPE: {
+            // $type: restrict to the canonical range for that BSON type
+            BSONType t = (BSONType)e.numberInt();
+            {
+                BSONObjBuilder b;
+                b.appendMinForType( "" , t );
+                lower = addObj( b.obj() ).firstElement();
+            }
+            {
+                BSONObjBuilder b;
+                b.appendMaxForType( "" , t );
+                upper = addObj( b.obj() ).firstElement();
+            }
+
+            break;
+        }
+        case BSONObj::opELEM_MATCH: {
+            // $elemMatch is expanded by FieldRangeSet before reaching here
+            log() << "warning: shouldn't get here?" << endl;
+            break;
+        }
+        default:
+            break;
+        }
+
+        // tighten one-sided ranges to the known endpoint's type bounds
+        if ( optimize ){
+            if ( lower.type() != MinKey && upper.type() == MaxKey && lower.isSimpleType() ){ // TODO: get rid of isSimpleType
+                BSONObjBuilder b;
+                b.appendMaxForType( lower.fieldName() , lower.type() );
+                upper = addObj( b.obj() ).firstElement();
+            }
+            else if ( lower.type() == MinKey && upper.type() != MaxKey && upper.isSimpleType() ){ // TODO: get rid of isSimpleType
+                BSONObjBuilder b;
+                b.appendMinForType( upper.fieldName() , upper.type() );
+                lower = addObj( b.obj() ).firstElement();
+            }
+        }
+
+    }
+
+ // as called, these functions find the max/min of a bound in the
+ // opposite direction, so inclusive bounds are considered less
+ // superlative
+    // Return the larger of two bounds; on a value tie the exclusive bound wins
+    // (an exclusive bound is "more superlative" when intersecting ranges).
+    FieldBound maxFieldBound( const FieldBound &a, const FieldBound &b ) {
+        int cmp = a.bound_.woCompare( b.bound_, false );
+        if ( ( cmp == 0 && !b.inclusive_ ) || cmp < 0 )
+            return b;
+        return a;
+    }
+
+    // Mirror of maxFieldBound for the smaller bound.
+    FieldBound minFieldBound( const FieldBound &a, const FieldBound &b ) {
+        int cmp = a.bound_.woCompare( b.bound_, false );
+        if ( ( cmp == 0 && !b.inclusive_ ) || cmp > 0 )
+            return b;
+        return a;
+    }
+
+    // Compute the intersection of two intervals in 'result'; returns false if
+    // the intersection is empty (result left in an invalid state).
+    bool fieldIntervalOverlap( const FieldInterval &one, const FieldInterval &two, FieldInterval &result ) {
+        result.lower_ = maxFieldBound( one.lower_, two.lower_ );
+        result.upper_ = minFieldBound( one.upper_, two.upper_ );
+        return result.valid();
+    }
+
+ // NOTE Not yet tested for complex $or bounds, just for simple bounds generated by $in
+    // Intersect this range with another: standard sorted-interval sweep, keeping
+    // only overlapping pieces.  Also adopts the other range's backing objects so
+    // retained BSONElements stay valid.
+    const FieldRange &FieldRange::operator&=( const FieldRange &other ) {
+        vector< FieldInterval > newIntervals;
+        vector< FieldInterval >::const_iterator i = intervals_.begin();
+        vector< FieldInterval >::const_iterator j = other.intervals_.begin();
+        while( i != intervals_.end() && j != other.intervals_.end() ) {
+            FieldInterval overlap;
+            if ( fieldIntervalOverlap( *i, *j, overlap ) )
+                newIntervals.push_back( overlap );
+            // advance whichever interval ends first
+            if ( i->upper_ == minFieldBound( i->upper_, j->upper_ ) )
+                ++i;
+            else
+                ++j;
+        }
+        intervals_ = newIntervals;
+        // keep other's owned BSONObjs alive, since overlap bounds may point into them
+        for( vector< BSONObj >::const_iterator i = other.objData_.begin(); i != other.objData_.end(); ++i )
+            objData_.push_back( *i );
+        return *this;
+    }
+
+    // Exclusive upper bound for a regex prefix scan: increment the last byte of
+    // the prefix so [prefix, simpleRegexEnd(prefix)) covers all strings starting
+    // with the prefix.  NOTE(review): assumes the last byte is not 0xff; confirm
+    // callers never pass such a prefix.
+    string FieldRange::simpleRegexEnd( string regex ) {
+        ++regex[ regex.length() - 1 ];
+        return regex;
+    }
+
+    // Retain o so elements pointing into it remain valid for this range's lifetime.
+    BSONObj FieldRange::addObj( const BSONObj &o ) {
+        objData_.push_back( o );
+        return o;
+    }
+
+    // Build per-field ranges from a full query object, intersecting constraints
+    // that target the same field.  $where is skipped (matcher-only); $elemMatch
+    // constraints are flattened onto dotted sub-field names.
+    FieldRangeSet::FieldRangeSet( const char *ns, const BSONObj &query , bool optimize )
+    : ns_( ns ), query_( query.getOwned() ) {
+        BSONObjIterator i( query_ );
+
+        while( i.more() ) {
+            BSONElement e = i.next();
+            // e could be x:1 or x:{$gt:1}
+
+            if ( strcmp( e.fieldName(), "$where" ) == 0 )
+                continue;
+
+            int op = getGtLtOp( e );
+
+            if ( op == BSONObj::Equality || op == BSONObj::opREGEX || op == BSONObj::opOPTIONS ) {
+                // single-value constraint: hand the element to FieldRange directly
+                ranges_[ e.fieldName() ] &= FieldRange( e , optimize );
+            }
+            else if ( op == BSONObj::opELEM_MATCH ){
+                // {a: {$elemMatch: {b: ..., c: ...}}} -> ranges on "a.b", "a.c"
+                BSONObjIterator i( e.embeddedObjectUserCheck().firstElement().embeddedObjectUserCheck() );
+                while ( i.more() ){
+                    BSONElement f = i.next();
+                    StringBuilder buf(32);
+                    buf << e.fieldName() << "." << f.fieldName();
+                    string fullname = buf.str();
+
+                    int op2 = getGtLtOp( f );
+                    if ( op2 == BSONObj::Equality ){
+                        ranges_[ fullname ] &= FieldRange( f , optimize );
+                    }
+                    else {
+                        // sub-object of operators, e.g. {b: {$gt: 1, $lt: 5}}
+                        BSONObjIterator j( f.embeddedObject() );
+                        while ( j.more() ){
+                            ranges_[ fullname ] &= FieldRange( j.next() , optimize );
+                        }
+                    }
+                }
+            }
+            else {
+                // operator object, e.g. {x: {$gt: 1, $lt: 5}}: intersect each op
+                BSONObjIterator i( e.embeddedObject() );
+                while( i.more() ) {
+                    BSONElement f = i.next();
+                    ranges_[ e.fieldName() ] &= FieldRange( f , optimize );
+                }
+            }
+        }
+    }
+
+    FieldRange *FieldRangeSet::trivialRange_ = 0;
+    // Shared unconstrained (minKey..maxKey) range returned for fields with no
+    // constraint.  Lazily allocated and intentionally never freed.
+    // NOTE(review): initialization is not thread safe -- presumably callers hold
+    // the db lock; verify.
+    FieldRange &FieldRangeSet::trivialRange() {
+        if ( trivialRange_ == 0 )
+            trivialRange_ = new FieldRange();
+        return *trivialRange_;
+    }
+
+    // Re-express this range set as a query object using only $gt/$gte/$lt/$lte
+    // and equality.  If 'fields' is given, output fields follow its order.
+    BSONObj FieldRangeSet::simplifiedQuery( const BSONObj &_fields ) const {
+        BSONObj fields = _fields;
+        if ( fields.isEmpty() ) {
+            BSONObjBuilder b;
+            for( map< string, FieldRange >::const_iterator i = ranges_.begin(); i != ranges_.end(); ++i ) {
+                b.append( i->first.c_str(), 1 );
+            }
+            fields = b.obj();
+        }
+        BSONObjBuilder b;
+        BSONObjIterator i( fields );
+        while( i.more() ) {
+            BSONElement e = i.next();
+            const char *name = e.fieldName();
+            // ranges_ is declared mutable: operator[] default-inserts a trivial
+            // range for any requested field without a constraint
+            const FieldRange &range = ranges_[ name ];
+            assert( !range.empty() );
+            if ( range.equality() )
+                b.appendAs( range.min(), name );
+            else if ( range.nontrivial() ) {
+                BSONObjBuilder c;
+                if ( range.min().type() != MinKey )
+                    c.appendAs( range.min(), range.minInclusive() ? "$gte" : "$gt" );
+                if ( range.max().type() != MaxKey )
+                    c.appendAs( range.max(), range.maxInclusive() ? "$lte" : "$lt" );
+                b.append( name, c.done() );
+            }
+        }
+        return b.obj();
+    }
+
+    // Abstract this query's shape (which fields are equality / bounded, plus the
+    // normalized sort) into a QueryPattern used as a key for plan recording.
+    QueryPattern FieldRangeSet::pattern( const BSONObj &sort ) const {
+        QueryPattern qp;
+        for( map< string, FieldRange >::const_iterator i = ranges_.begin(); i != ranges_.end(); ++i ) {
+            assert( !i->second.empty() );
+            if ( i->second.equality() ) {
+                qp.fieldTypes_[ i->first ] = QueryPattern::Equality;
+            } else if ( i->second.nontrivial() ) {
+                bool upper = i->second.max().type() != MaxKey;
+                bool lower = i->second.min().type() != MinKey;
+                if ( upper && lower )
+                    qp.fieldTypes_[ i->first ] = QueryPattern::UpperAndLowerBound;
+                else if ( upper )
+                    qp.fieldTypes_[ i->first ] = QueryPattern::UpperBound;
+                else if ( lower )
+                    qp.fieldTypes_[ i->first ] = QueryPattern::LowerBound;
+            }
+        }
+        qp.setSort( sort );
+        return qp;
+    }
+
+    // Translate this range set into concrete (start,end) key pairs for a scan of
+    // the index described by keyPattern in the given direction.  Leading equality
+    // fields are accumulated into a shared prefix; the first multi-interval field
+    // fans out into one bound pair per interval.
+    BoundList FieldRangeSet::indexBounds( const BSONObj &keyPattern, int direction ) const {
+        BSONObjBuilder equalityBuilder;
+        typedef vector< pair< shared_ptr< BSONObjBuilder >, shared_ptr< BSONObjBuilder > > > BoundBuilders;
+        BoundBuilders builders;
+        BSONObjIterator i( keyPattern );
+        while( i.more() ) {
+            BSONElement e = i.next();
+            const FieldRange &fr = range( e.fieldName() );
+            int number = (int) e.number(); // returns 0.0 if not numeric
+            // effective scan direction for this key component
+            bool forward = ( ( number >= 0 ? 1 : -1 ) * ( direction >= 0 ? 1 : -1 ) > 0 );
+            if ( builders.empty() ) {
+                if ( fr.equality() ) {
+                    // still in the equality prefix
+                    equalityBuilder.appendAs( fr.min(), "" );
+                } else {
+                    // first non-point field: one bound pair per interval,
+                    // each seeded with the equality prefix
+                    BSONObj equalityObj = equalityBuilder.done();
+                    const vector< FieldInterval > &intervals = fr.intervals();
+                    if ( forward ) {
+                        for( vector< FieldInterval >::const_iterator j = intervals.begin(); j != intervals.end(); ++j ) {
+                            builders.push_back( make_pair( shared_ptr< BSONObjBuilder >( new BSONObjBuilder() ), shared_ptr< BSONObjBuilder >( new BSONObjBuilder() ) ) );
+                            builders.back().first->appendElements( equalityObj );
+                            builders.back().second->appendElements( equalityObj );
+                            builders.back().first->appendAs( j->lower_.bound_, "" );
+                            builders.back().second->appendAs( j->upper_.bound_, "" );
+                        }
+                    } else {
+                        // reversed scan: iterate intervals backwards and swap ends
+                        for( vector< FieldInterval >::const_reverse_iterator j = intervals.rbegin(); j != intervals.rend(); ++j ) {
+                            builders.push_back( make_pair( shared_ptr< BSONObjBuilder >( new BSONObjBuilder() ), shared_ptr< BSONObjBuilder >( new BSONObjBuilder() ) ) );
+                            builders.back().first->appendElements( equalityObj );
+                            builders.back().second->appendElements( equalityObj );
+                            builders.back().first->appendAs( j->upper_.bound_, "" );
+                            builders.back().second->appendAs( j->lower_.bound_, "" );
+                        }
+                    }
+                }
+            } else {
+                // trailing fields: extend every pair with this field's min/max
+                for( BoundBuilders::const_iterator j = builders.begin(); j != builders.end(); ++j ) {
+                    j->first->appendAs( forward ? fr.min() : fr.max(), "" );
+                    j->second->appendAs( forward ? fr.max() : fr.min(), "" );
+                }
+            }
+        }
+        if ( builders.empty() ) {
+            // all fields were equalities: a single point bound pair
+            BSONObj equalityObj = equalityBuilder.done();
+            assert( !equalityObj.isEmpty() );
+            builders.push_back( make_pair( shared_ptr< BSONObjBuilder >( new BSONObjBuilder() ), shared_ptr< BSONObjBuilder >( new BSONObjBuilder() ) ) );
+            builders.back().first->appendElements( equalityObj );
+            builders.back().second->appendElements( equalityObj );
+        }
+        BoundList ret;
+        for( BoundBuilders::const_iterator i = builders.begin(); i != builders.end(); ++i )
+            ret.push_back( make_pair( i->first->obj(), i->second->obj() ) );
+        return ret;
+    }
+
+ ///////////////////
+ // FieldMatcher //
+ ///////////////////
+
+    // Load a projection spec like {a:1, b:1} or {a:0}.  All fields must agree on
+    // include vs exclude; a mix sets errmsg rather than throwing.  May only be
+    // called once per FieldMatcher.
+    void FieldMatcher::add( const BSONObj& o ){
+        massert( 10371 , "can only add to FieldMatcher once", source_.isEmpty());
+        source_ = o;
+
+        BSONObjIterator i( o );
+        int true_false = -1;
+        while ( i.more() ){
+            BSONElement e = i.next();
+            add (e.fieldName(), e.trueValue());
+
+            // validate input
+            if (true_false == -1){
+                true_false = e.trueValue();
+                // default for unlisted fields is the opposite of the spec's mode
+                include_ = !e.trueValue();
+            }else{
+                if((bool) true_false != e.trueValue())
+                    errmsg = "You cannot currently mix including and excluding fields. Contact us if this is an issue.";
+            }
+        }
+    }
+
+    // Recursively register a dotted field path, building one nested FieldMatcher
+    // per path component; the empty path marks the leaf the user referred to.
+    void FieldMatcher::add(const string& field, bool include){
+        if (field.empty()){ // this is the field the user referred to
+            include_ = include;
+        } else {
+            const size_t dot = field.find('.');
+            const string subfield = field.substr(0,dot);
+            const string rest = (dot == string::npos ? "" : field.substr(dot+1,string::npos));
+
+            boost::shared_ptr<FieldMatcher>& fm = fields_[subfield];
+            if (!fm)
+                fm.reset(new FieldMatcher(!include));
+
+            fm->add(rest, include);
+        }
+    }
+
+    // Return the original projection spec passed to add().
+    BSONObj FieldMatcher::getSpec() const{
+        return source_;
+    }
+
+ //b will be the value part of an array-typed BSONElement
+    //b will be the value part of an array-typed BSONElement
+    // Project an array's elements: nested arrays/objects recurse; scalars are
+    // kept only when include_ is set.  Note the output is re-numbered with a
+    // fresh counter, so excluded scalars compact the array indices.
+    void FieldMatcher::appendArray( BSONObjBuilder& b , const BSONObj& a ) const {
+        int i=0;
+        BSONObjIterator it(a);
+        while (it.more()){
+            BSONElement e = it.next();
+
+            switch(e.type()){
+            case Array:{
+                BSONObjBuilder subb;
+                appendArray(subb , e.embeddedObject());
+                b.appendArray(b.numStr(i++).c_str(), subb.obj());
+                break;
+            }
+            case Object:{
+                BSONObjBuilder subb;
+                BSONObjIterator jt(e.embeddedObject());
+                while (jt.more()){
+                    append(subb , jt.next());
+                }
+                b.append(b.numStr(i++), subb.obj());
+                break;
+            }
+            default:
+                if (include_)
+                    b.appendAs(e, b.numStr(i++).c_str());
+            }
+
+
+        }
+    }
+
+    // Append element e to b, applying this matcher's projection: unlisted fields
+    // follow include_; listed leaf fields follow their own flag; listed objects
+    // and arrays recurse into the sub-matcher.
+    void FieldMatcher::append( BSONObjBuilder& b , const BSONElement& e ) const {
+        FieldMap::const_iterator field = fields_.find( e.fieldName() );
+
+        if (field == fields_.end()){
+            if (include_)
+                b.append(e);
+        } else {
+            FieldMatcher& subfm = *field->second;
+
+            // leaf of the spec, or a scalar where the spec expected a sub-object
+            if (subfm.fields_.empty() || !(e.type()==Object || e.type()==Array) ){
+                if (subfm.include_)
+                    b.append(e);
+            } else if (e.type() == Object){
+                BSONObjBuilder subb;
+                BSONObjIterator it(e.embeddedObject());
+                while (it.more()){
+                    subfm.append(subb, it.next());
+                }
+                b.append(e.fieldName(), subb.obj());
+
+            } else { //Array
+                BSONObjBuilder subb;
+                subfm.appendArray(subb, e.embeddedObject());
+                b.appendArray(e.fieldName(), subb.obj());
+            }
+        }
+    }
+
+    // Startup-time sanity checks for simpleRegex prefix extraction: anchored
+    // literals yield the literal, optional/complex patterns yield "", and
+    // extended ('x') mode strips whitespace and comments.
+    struct SimpleRegexUnitTest : UnitTest {
+        void run(){
+            {
+                BSONObjBuilder b;
+                b.appendRegex("r", "^foo");
+                BSONObj o = b.done();
+                assert( simpleRegex(o.firstElement()) == "foo" );
+            }
+            {
+                BSONObjBuilder b;
+                b.appendRegex("r", "^f?oo");
+                BSONObj o = b.done();
+                assert( simpleRegex(o.firstElement()) == "" );
+            }
+            {
+                BSONObjBuilder b;
+                b.appendRegex("r", "^fz?oo");
+                BSONObj o = b.done();
+                assert( simpleRegex(o.firstElement()) == "f" );
+            }
+            {
+                BSONObjBuilder b;
+                b.appendRegex("r", "^f", "");
+                BSONObj o = b.done();
+                assert( simpleRegex(o.firstElement()) == "f" );
+            }
+            {
+                BSONObjBuilder b;
+                b.appendRegex("r", "^f", "m");
+                BSONObj o = b.done();
+                assert( simpleRegex(o.firstElement()) == "f" );
+            }
+            {
+                // 'i' (case-insensitive) disables the optimization entirely
+                BSONObjBuilder b;
+                b.appendRegex("r", "^f", "mi");
+                BSONObj o = b.done();
+                assert( simpleRegex(o.firstElement()) == "" );
+            }
+            {
+                BSONObjBuilder b;
+                b.appendRegex("r", "^f \t\vo\n\ro  \\ \\# #comment", "mx");
+                BSONObj o = b.done();
+                assert( simpleRegex(o.firstElement()) == "foo #" );
+            }
+        }
+    } simple_regex_unittest;
+} // namespace mongo
diff --git a/db/queryutil.h b/db/queryutil.h
new file mode 100644
index 0000000..2122a7f
--- /dev/null
+++ b/db/queryutil.h
@@ -0,0 +1,210 @@
+// queryutil.h
+
+/* Copyright 2009 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "jsobj.h"
+
+namespace mongo {
+
+    // One endpoint of an interval: a BSON value plus whether the endpoint
+    // itself is part of the interval.
+    struct FieldBound {
+        BSONElement bound_;
+        bool inclusive_;
+        bool operator==( const FieldBound &other ) const {
+            return bound_.woCompare( other.bound_ ) == 0 &&
+            inclusive_ == other.inclusive_;
+        }
+    };
+
+    // A closed/open interval of BSON values; constructing from a single element
+    // yields the point interval [e, e].
+    struct FieldInterval {
+        FieldInterval(){}
+        FieldInterval( const BSONElement& e ){
+            lower_.bound_ = upper_.bound_ = e;
+            lower_.inclusive_ = upper_.inclusive_ = true;
+        }
+        FieldBound lower_;
+        FieldBound upper_;
+        // non-empty: lower < upper, or lower == upper with both ends inclusive
+        bool valid() const {
+            int cmp = lower_.bound_.woCompare( upper_.bound_, false );
+            return ( cmp < 0 || ( cmp == 0 && lower_.inclusive_ && upper_.inclusive_ ) );
+        }
+    };
+
+ // range of a field's value that may be determined from query -- used to
+ // determine index limits
+    class FieldRange {
+    public:
+        FieldRange( const BSONElement &e = BSONObj().firstElement() , bool optimize=true );
+        // intersect in place with another field's constraints
+        const FieldRange &operator&=( const FieldRange &other );
+        // overall extremes across all intervals (intervals are kept sorted)
+        BSONElement min() const { assert( !empty() ); return intervals_[ 0 ].lower_.bound_; }
+        BSONElement max() const { assert( !empty() ); return intervals_[ intervals_.size() - 1 ].upper_.bound_; }
+        bool minInclusive() const { assert( !empty() ); return intervals_[ 0 ].lower_.inclusive_; }
+        bool maxInclusive() const { assert( !empty() ); return intervals_[ intervals_.size() - 1 ].upper_.inclusive_; }
+        // true if the range pins the field to a single value
+        bool equality() const {
+            return
+            !empty() &&
+            min().woCompare( max(), false ) == 0 &&
+            maxInclusive() &&
+            minInclusive();
+        }
+        // true if the range constrains the field at all (not minKey..maxKey)
+        bool nontrivial() const {
+            return
+            ! empty() &&
+            ( minKey.firstElement().woCompare( min(), false ) != 0 ||
+              maxKey.firstElement().woCompare( max(), false ) != 0 );
+        }
+        // empty == no possible match for this field
+        bool empty() const { return intervals_.empty(); }
+        const vector< FieldInterval > &intervals() const { return intervals_; }
+    private:
+        BSONObj addObj( const BSONObj &o );
+        string simpleRegexEnd( string regex );
+        vector< FieldInterval > intervals_;
+        // owns BSONObjs that back bound elements built on the fly
+        vector< BSONObj > objData_;
+    };
+
+ // implements query pattern matching, used to determine if a query is
+ // similar to an earlier query and should use the same plan
+    class QueryPattern {
+    public:
+        friend class FieldRangeSet;
+        enum Type {
+            Equality,
+            LowerBound,
+            UpperBound,
+            UpperAndLowerBound
+        };
+        // for testing only, speed unimportant
+        bool operator==( const QueryPattern &other ) const {
+            bool less = operator<( other );
+            bool more = other.operator<( *this );
+            assert( !( less && more ) );
+            return !( less || more );
+        }
+        bool operator!=( const QueryPattern &other ) const {
+            return !operator==( other );
+        }
+        // lexicographic order over (field, type) pairs, then the normalized sort;
+        // provides the strict weak ordering needed to key a map of recorded plans
+        bool operator<( const QueryPattern &other ) const {
+            map< string, Type >::const_iterator i = fieldTypes_.begin();
+            map< string, Type >::const_iterator j = other.fieldTypes_.begin();
+            while( i != fieldTypes_.end() ) {
+                if ( j == other.fieldTypes_.end() )
+                    return false;
+                if ( i->first < j->first )
+                    return true;
+                else if ( i->first > j->first )
+                    return false;
+                if ( i->second < j->second )
+                    return true;
+                else if ( i->second > j->second )
+                    return false;
+                ++i;
+                ++j;
+            }
+            if ( j != other.fieldTypes_.end() )
+                return true;
+            return sort_.woCompare( other.sort_ ) < 0;
+        }
+    private:
+        QueryPattern() {}
+        void setSort( const BSONObj sort ) {
+            sort_ = normalizeSort( sort );
+        }
+        // canonicalize sort direction so {a:1,b:-1} and {a:-1,b:1} collapse to
+        // the same pattern (an index serves both via forward/reverse scan)
+        BSONObj static normalizeSort( const BSONObj &spec ) {
+            if ( spec.isEmpty() )
+                return spec;
+            int direction = ( spec.firstElement().number() >= 0 ) ? 1 : -1;
+            BSONObjIterator i( spec );
+            BSONObjBuilder b;
+            while( i.moreWithEOO() ) {
+                BSONElement e = i.next();
+                if ( e.eoo() )
+                    break;
+                b.append( e.fieldName(), direction * ( ( e.number() >= 0 ) ? -1 : 1 ) );
+            }
+            return b.obj();
+        }
+        map< string, Type > fieldTypes_;
+        BSONObj sort_;
+    };
+
+ // ranges of fields' value that may be determined from query -- used to
+ // determine index limits
+    class FieldRangeSet {
+    public:
+        FieldRangeSet( const char *ns, const BSONObj &query , bool optimize=true );
+        // range for a field; unconstrained fields share a static trivial range
+        const FieldRange &range( const char *fieldName ) const {
+            map< string, FieldRange >::const_iterator f = ranges_.find( fieldName );
+            if ( f == ranges_.end() )
+                return trivialRange();
+            return f->second;
+        }
+        int nNontrivialRanges() const {
+            int count = 0;
+            for( map< string, FieldRange >::const_iterator i = ranges_.begin(); i != ranges_.end(); ++i )
+                if ( i->second.nontrivial() )
+                    ++count;
+            return count;
+        }
+        const char *ns() const { return ns_; }
+        BSONObj query() const { return query_; }
+        // if fields is specified, order fields of returned object to match those of 'fields'
+        BSONObj simplifiedQuery( const BSONObj &fields = BSONObj() ) const;
+        // false if any field's range is empty (query can match nothing)
+        bool matchPossible() const {
+            for( map< string, FieldRange >::const_iterator i = ranges_.begin(); i != ranges_.end(); ++i )
+                if ( i->second.empty() )
+                    return false;
+            return true;
+        }
+        QueryPattern pattern( const BSONObj &sort = BSONObj() ) const;
+        BoundList indexBounds( const BSONObj &keyPattern, int direction ) const;
+    private:
+        static FieldRange *trivialRange_;
+        static FieldRange &trivialRange();
+        // mutable: simplifiedQuery() uses operator[] from a const method
+        mutable map< string, FieldRange > ranges_;
+        const char *ns_;
+        BSONObj query_;
+    };
+
+ /**
+ used for doing field limiting
+ */
+    class FieldMatcher {
+    public:
+
+        // include: default disposition for fields not named at this level
+        FieldMatcher(bool include=false) : errmsg(NULL), include_(include) {}
+
+        // parse a projection spec object; sets errmsg on invalid mixed specs
+        void add( const BSONObj& o );
+
+        // append e to b, filtered through the projection
+        void append( BSONObjBuilder& b , const BSONElement& e ) const;
+
+        BSONObj getSpec() const;
+
+        const char* errmsg; //null if FieldMatcher is valid
+    private:
+
+        void add( const string& field, bool include );
+        void appendArray( BSONObjBuilder& b , const BSONObj& a ) const;
+
+        bool include_; // true if default at this level is to include
+        //TODO: benchmark vector<pair> vs map
+        typedef map<string, boost::shared_ptr<FieldMatcher> > FieldMap;
+        FieldMap fields_;
+        BSONObj source_;
+    };
+
+
+} // namespace mongo
diff --git a/db/rec.h b/db/rec.h
new file mode 100644
index 0000000..b749dd8
--- /dev/null
+++ b/db/rec.h
@@ -0,0 +1,119 @@
+// rec.h
+
+/* TODO for _RECSTORE
+
+ _ support > 2GB data per file
+ _ multiple files, not just indexes.dat
+ _ lazier writes? (may be done?)
+ _ configurable cache size
+ _ fix on abnormal terminations to be able to restart some
+*/
+
+#pragma once
+
+#include "reci.h"
+#include "reccache.h"
+
+namespace mongo {
+
+/* --------------------------------------------------------------------------
+ A RecStoreInterface for the normal mongo mem mapped file (MongoDataFile)
+ storage
+*/
+
+NamespaceDetails* nsdetails_notinline(const char *ns);
+
+// RecStoreInterface adapter over the standard memory-mapped data files;
+// each call simply forwards to theDataFileMgr / namespace machinery.
+class MongoMemMapped_RecStore : public RecStoreInterface {
+public:
+    // len is unused: the record header already knows its size
+    virtual char* get(DiskLoc d, unsigned len) { return d.rec()->data; }
+
+    virtual DiskLoc insert(const char *ns, const void *obuf, int len, bool god) {
+        return theDataFileMgr.insert(ns, obuf, len, god);
+    }
+
+    virtual void deleteRecord(const char *ns, DiskLoc d) {
+        theDataFileMgr._deleteRecord(nsdetails_notinline(ns), ns, d.rec(), d);
+    }
+
+    // no-op: mmap'd pages are flushed by the OS, not tracked per record
+    virtual void modified(DiskLoc d) { }
+
+    virtual void drop(const char *ns) {
+        dropNS(ns);
+    }
+
+    virtual void rename(const char *fromNs, const char *toNs) {
+        renameNamespace( fromNs, toNs );
+    }
+
+    /* close datafiles associated with the db specified. */
+    virtual void closeFiles(string dbname, string path) {
+        /* as this is only used for indexes so far, and we are in the same
+           PDFiles as the nonindex data, we just rely on them having been closed
+           at the same time.  one day this may need to change.
+        */
+    }
+
+};
+
+/* An in memory RecStoreInterface implementation ----------------------------
+*/
+
+#if 0
+// DISABLED prototype: stores records in malloc'd heap memory, encoding the
+// pointer directly in DiskLoc's offset (32-bit only; asserts on 64-bit).
+// Records are never freed; drop() is unimplemented.
+class InMem_RecStore : public RecStoreInterface {
+    enum InmemfileValue { INMEMFILE = 0x70000000 };
+public:
+    static char* get(DiskLoc d, unsigned len) {
+        assert( d.a() == INMEMFILE );
+#ifdef __LP64__
+        massert( 10372 ,  "64 bit not done", false);
+        return 0;
+#else
+        return (char *) d.getOfs();
+#endif
+    }
+
+    static DiskLoc insert(const char *ns, const void *obuf, int len, bool god) {
+#ifdef __LP64__
+        assert( 0 );
+        throw -1;
+#else
+        char *p = (char *) malloc(len);
+        assert( p );
+        memcpy(p, obuf, len);
+        int b = (int) p;
+        assert( b > 0 );
+        return DiskLoc(INMEMFILE, b);
+#endif
+    }
+
+    static void modified(DiskLoc d) { }
+
+    static void drop(const char *ns) {
+        log() << "warning: drop() not yet implemented for InMem_RecStore" << endl;
+    }
+
+    virtual void rename(const char *fromNs, const char *toNs) {
+        massert( 10373 ,  "rename not yet implemented for InMem_RecStore", false );
+    }
+};
+#endif
+
+/* Glue btree to RecStoreInterface: ---------------------------- */
+
+extern RecStoreInterface *btreeStore;
+
+const int BucketSize = 8192;
+
+// Fetch the btree bucket at this location via the pluggable record store.
+inline BtreeBucket* DiskLoc::btree() const {
+    assert( fileNo != -1 );
+    return (BtreeBucket*) btreeStore->get(*this, BucketSize);
+}
+
+// Same as btree(), but marks the bucket dirty for the lazy writer.
+inline BtreeBucket* DiskLoc::btreemod() const {
+    assert( fileNo != -1 );
+    BtreeBucket *b = (BtreeBucket*) btreeStore->get(*this, BucketSize);
+    btreeStore->modified(*this);
+    return b;
+}
+
+}
diff --git a/db/reccache.cpp b/db/reccache.cpp
new file mode 100644
index 0000000..66dd4e3
--- /dev/null
+++ b/db/reccache.cpp
@@ -0,0 +1,401 @@
+// storage.cpp
+
+#include "stdafx.h"
+#include "pdfile.h"
+#include "reccache.h"
+#include "rec.h"
+#include "db.h"
+
+namespace mongo {
+
+RecCache theRecCache(BucketSize);
+
+// 100k * 8KB = 800MB
+unsigned RecCache::MAXNODES = 50000;
+
+// Configure the record cache size; mb is megabytes, converted to a node
+// count at 8KB per node.
+void setRecCacheSize(unsigned mb) {
+    unsigned long long MB = mb;
+    log(2) << "reccache size: " << MB << "MB\n";
+    uassert( 10114 ,  "bad cache size", MB > 0 && MB < 1000000 );
+    RecCache::MAXNODES = (unsigned) MB * 1024 * 1024 / 8192;
+    log(3) << "RecCache::MAXNODES=" << RecCache::MAXNODES << '\n';
+}
+
+// Background thread: periodically flushes dirty cache pages.  Exceptions are
+// swallowed (with a short back-off) so the thread never dies.
+void writerThread() {
+    sleepsecs(10);
+    while( 1 ) {
+        try {
+            theRecCache.writeLazily();
+        }
+        catch(...) {
+            log() << "exception in writerThread()" << endl;
+            sleepsecs(3);
+        }
+    }
+}
+
+// called on program exit.
+// called on program exit.  Flushes and closes all record stores (only when
+// built with the _RECSTORE storage engine).
+void recCacheCloseAll() {
+#if defined(_RECSTORE)
+    theRecCache.closing();
+#endif
+}
+
+int ndirtywritten;
+
+// Make a namespace safe for use as a filename by replacing '$' with '~'.
+// NOTE(review): the bounds assert runs only after the copy completes, so a
+// namespace >= 256 chars would overflow buf before being caught -- presumably
+// namespaces are length-limited upstream; verify.
+inline static string escape(const char *ns) {
+    char buf[256];
+    char *p = buf;
+    while( 1 ) {
+        if( *ns == '$' ) *p = '~';
+        else
+            *p = *ns;
+        if( *ns == 0 )
+            break;
+        p++; ns++;
+    }
+    assert( p - buf < (int) sizeof(buf) );
+    return buf;
+}
+
+// Inverse of escape(): '~' back to '$'.  Same after-the-fact bounds assert as
+// escape() -- see note there.
+inline static string unescape(const char *ns) {
+    char buf[256];
+    char *p = buf;
+    while( 1 ) {
+        if( *ns == '~' ) *p = '$';
+        else
+            *p = *ns;
+        if( *ns == 0 )
+            break;
+        p++; ns++;
+    }
+    assert( p - buf < (int) sizeof(buf) );
+    return buf;
+}
+
+// Directory holding this database's .idx store files (the current client's
+// database path).
+string RecCache::directory() {
+    return cc().database()->path;
+}
+
+/* filename format is
+
+ <n>-<ns>.idx
+*/
+
+// Open the store file named fname (format "<n>-<escaped_ns>.idx"), register it
+// in stores[n] and storesByNsKey, and return it.  fname must be a bare
+// filename, not a path.
+BasicRecStore* RecCache::_initStore(string fname) {
+
+    assert( strchr(fname.c_str(), '/') == 0 );
+    assert( strchr(fname.c_str(), '\\') == 0 );
+
+    stringstream ss(fname);
+    int n;
+    ss >> n;
+    assert( n >= 0 );
+    char ch;
+    ss >> ch;
+    assert( ch == '-' );
+    string rest;
+    ss >> rest;
+    const char *p = rest.c_str();
+    const char *q = strstr(p, ".idx");
+    assert( q );
+    string escaped_ns(p, q-p);
+
+    // arbitrary limit.  if you are hitting, we should use fewer files and put multiple 
+    // indexes in a single file (which is easy to do)
+    massert( 10374 ,  "too many index files", n < 10000 );
+
+    if( stores.size() < (unsigned)n+1 )
+        stores.resize(n+1);
+    assert( stores[n] == 0 );
+    BasicRecStore *rs = new BasicRecStore(n);
+    path pf(directory());
+    pf /= fname;
+    string full = pf.string();
+    rs->init(full.c_str(), recsize);
+    stores[n] = rs;
+    string ns = unescape(escaped_ns.c_str());
+    storesByNsKey[mknskey(ns.c_str())] = rs;
+    return rs;
+}
+
+// Locate and open store file number n by scanning the db directory for a file
+// matching "<n>-*.idx".  uasserts if the datafile is missing.
+BasicRecStore* RecCache::initStore(int n) {
+    string ns;
+    {
+        stringstream ss;
+        ss << '/' << n << '-';
+        ns = ss.str();
+    }
+
+    /* this will be slow if there are thousands of files */
+    path dir(directory());
+    directory_iterator end;
+    try {
+        directory_iterator i(dir);
+        while ( i != end ) {
+            string s = i->string();
+            const char *p = strstr(s.c_str(), ns.c_str());
+            if( p && strstr(p, ".idx") ) {
+                // found it
+                path P = *i;
+                return _initStore(P.leaf());
+            }
+            i++;
+        }
+    }
+    catch( DBException & ) {
+        throw;
+    }
+    catch (...) {
+        string s = string("i/o error looking for .idx file in ") + directory();
+        massert( 10375 , s, false);
+    }
+    stringstream ss;
+    ss << "index datafile missing? n=" << n;
+    uasserted(12500,ss.str());
+    return 0;
+}
+
+/* find the filename for a given ns.
+ format is
+ <n>-<escaped_ns>.idx
+ returns filename. found is true if found. If false, a proposed name is returned for (optional) creation
+ of the file.
+*/
+/* find the filename for a given ns.
+   format is 
+     <n>-<escaped_ns>.idx
+   returns filename.  found is true if found.  If false, a proposed name is returned for (optional) creation
+   of the file.
+*/
+string RecCache::findStoreFilename(const char *_ns, bool& found) {
+    string namefrag;
+    {
+        stringstream ss;
+        ss << '-';
+        ss << escape(_ns);
+        ss << ".idx";
+        namefrag = ss.str();
+    }
+
+    path dir(directory());
+    directory_iterator end;
+    // nmax tracks the highest store number seen, so a fresh name can be proposed
+    int nmax = -1;
+    try {
+        directory_iterator i(dir);
+        while ( i != end ) {
+            string s = path(*i).leaf();
+            const char *p = strstr(s.c_str(), namefrag.c_str());
+            if( p ) {
+                found = true;
+                return s;
+            }
+            if( strstr(s.c_str(), ".idx") ) {
+                stringstream ss(s);
+                int n = -1;
+                ss >> n;
+                if( n > nmax )
+                    nmax = n;
+            }
+            i++;
+        }
+    }
+    catch (...) {
+        string s = string("i/o error looking for .idx file in ") + directory();
+        massert( 10376 , s, false);
+    }
+
+    // DNE.  return a name that would work.
+    stringstream ss;
+    ss << nmax+1 << namefrag;
+    found = false;
+    return ss.str();
+}
+
+// Open (or create, via the proposed name) the store for a namespace; the
+// 'found' flag is intentionally ignored because _initStore creates the file.
+void RecCache::initStoreByNs(const char *_ns, const string& nskey) {
+    bool found;
+    string fn = findStoreFilename(_ns, found);
+    _initStore(fn);
+}
+
+// Flush a single node to its backing store if dirty; clears the dirty flag.
+inline void RecCache::writeIfDirty(Node *n) {
+    if( n->dirty ) {
+        ndirtywritten++;
+        n->dirty = false;
+        store(n->loc).update(fileOfs(n->loc), n->data, recsize);
+    }
+}
+
+// Flush all dirty pages, then close and delete every store belonging to the
+// given database (matched by "<path><dbname>." key prefix).
+// Requires the write lock; also takes rcmutex to exclude the lazy writer.
+void RecCache::closeFiles(string dbname, string path) {
+    assertInWriteLock();
+    boostlock lk(rcmutex);
+
+    // first we write all dirty pages.  it is not easy to check which Nodes are for a particular
+    // db, so we just write them all.
+    writeDirty( dirtyl.begin(), true );
+
+    string key = path + dbname + '.';
+    unsigned sz = key.size();
+    // BUGFIX: the iterator was incremented both here in the for header and in
+    // the loop body (j = i; i++;), which skipped every other entry and could
+    // increment past end() -- undefined behavior.  Advance only in the body,
+    // before the possible erase(j), so i stays valid.
+    for( map<string, BasicRecStore*>::iterator i = storesByNsKey.begin(); i != storesByNsKey.end(); ) {
+        map<string, BasicRecStore*>::iterator j = i;
+        i++;
+        if( strncmp(j->first.c_str(), key.c_str(), sz) == 0 ) {
+            assert( stores[j->second->fileNumber] != 0 );
+            stores[j->second->fileNumber] = 0;
+            delete j->second;
+            storesByNsKey.erase(j);
+        }
+    }
+}
+
+// Shutdown path: flush every dirty page and close every open store.
+// Maps are not cleared -- the process is exiting.
+void RecCache::closing() {
+    boostlock lk(rcmutex);
+    (cout << "TEMP: recCacheCloseAll() writing dirty pages...\n").flush();
+    writeDirty( dirtyl.begin(), true );
+    for( unsigned i = 0; i < stores.size(); i++ ) {
+        if( stores[i] ) {
+            delete stores[i];
+        }
+    }
+    (cout << "TEMP: write dirty done\n").flush();
+}
+
+/* note that this is written in order, as much as possible, given that dirtyl is of type set. */
+/* note that this is written in order, as much as possible, given that dirtyl is of type set. */
+// Flush nodes recorded in dirtyl from startAt to the end.  rawLog selects the
+// signal-safe logger for use during abnormal shutdown.
+// NOTE(review): dirtyl is cleared even when an i/o error was caught, so the
+// dirty bookkeeping for unwritten pages is lost on failure.
+void RecCache::writeDirty( set<DiskLoc>::iterator startAt, bool rawLog ) {
+    try {
+        ndirtywritten=0;
+        for( set<DiskLoc>::iterator i = startAt; i != dirtyl.end(); i++ ) {
+            // entries may have been evicted/closed since being marked dirty
+            map<DiskLoc, Node*>::iterator j = m.find(*i);
+            if( j != m.end() )
+                writeIfDirty(j->second);
+        }
+        OCCASIONALLY out() << "TEMP: ndirtywritten: " << ndirtywritten << endl;
+    }
+    catch(...) {
+        const char *message = "Problem: bad() in RecCache::writeDirty, file io error\n";
+
+        if ( rawLog )
+            rawOut( message );
+        else
+            ( log() << message ).flush();
+    }
+    dirtyl.clear();
+}
+
+// One pass of the background writer: flush up to the last 100 dirty entries,
+// then sleep -- longer when the backlog is small, proportional to write time
+// when it is large.
+void RecCache::writeLazily() {
+    int sleep = 0;
+    int k;
+    {
+        boostlock lk(rcmutex);
+        Timer t;
+        // walk back at most 100 entries from the end of the dirty set
+        set<DiskLoc>::iterator i = dirtyl.end();
+        for( k = 0; k < 100; k++ ) {
+            if( i == dirtyl.begin() ) {
+                // we're not very far behind
+                sleep = k < 20 ? 2000 : 1000;
+                break;
+            }
+            i--;
+        }
+        writeDirty(i);
+        if( sleep == 0 ) {
+            sleep = t.millis() * 4 + 10;
+        }
+    }
+
+    OCCASIONALLY cout << "writeLazily " << k << " sleep:" << sleep << '\n';
+    sleepmillis(sleep);
+}
+
+// Evict least-recently-used nodes (flushing dirty ones) until the cache is a
+// few nodes under MAXNODES; no-op while under the limit.
+void RecCache::_ejectOld() {
+    boostlock lk(rcmutex);
+    if( nnodes <= MAXNODES )
+        return;
+    Node *n = oldest;
+    while( 1 ) {
+        if( nnodes <= MAXNODES - 4 ) {
+            // stop evicting; n becomes the new tail of the LRU list
+            n->older = 0;
+            oldest = n;
+            assert( oldest ) ;
+            break;
+        }
+        nnodes--;
+        assert(n);
+        Node *nxt = n->newer;
+        writeIfDirty(n);
+        m.erase(n->loc);
+        delete n;
+        n = nxt;
+    }
+}
+
+// Debug helper: walk the LRU list oldest->newest, asserting the back links
+// and the newest pointer are consistent.
+void RecCache::dump() {
+    Node *n = oldest;
+    Node *last = 0;
+    while( n ) {
+        assert( n->older == last );
+        last = n;
+//        cout << n << ' ' << n->older << ' ' << n->newer << '\n';
+        n=n->newer;
+    }
+    assert( newest == last );
+//    cout << endl;
+}
+
+/* cleans up everything EXCEPT storesByNsKey.
+ note this function is slow should not be invoked often
+*/
+/* cleans up everything EXCEPT storesByNsKey.
+   note this function is slow should not be invoked often
+*/
+// Purge all dirty-list entries and cached nodes belonging to this store's
+// file number, unregister it from stores[], and delete it (closing the file).
+void RecCache::closeStore(BasicRecStore *rs) {
+    int n = rs->fileNumber + Base;
+    for( set<DiskLoc>::iterator i = dirtyl.begin(); i != dirtyl.end(); ) {
+        DiskLoc k = *i++;
+        if( k.a() == n )
+            dirtyl.erase(k);
+    }
+
+    for( map<DiskLoc,Node*>::iterator i = m.begin(); i != m.end(); ) {
+        DiskLoc k = i->first;
+        i++;
+        if( k.a() == n )
+            m.erase(k);
+    }
+
+    assert( stores[rs->fileNumber] != 0 );
+    stores[rs->fileNumber] = 0;
+/*
+    for( unsigned i = 0; i < stores.size(); i++ ) { 
+        if( stores[i] == rs ) { 
+            stores[i] = 0;
+            break;
+        }
+    }*/
+    delete rs; // closes file
+}
+
+// Drop a namespace's store: close it if open (or locate its file on disk if
+// not), then remove the file.  Failures to remove are logged, not thrown.
+void RecCache::drop(const char *_ns) { 
+    // todo: test with a non clean shutdown file
+    boostlock lk(rcmutex);
+
+    map<string, BasicRecStore*>::iterator it = storesByNsKey.find(mknskey(_ns));
+    string fname;
+    if( it != storesByNsKey.end() ) {
+        fname = it->second->filename;
+        closeStore(it->second); // cleans up stores[] etc.
+        storesByNsKey.erase(it);
+    }
+    else { 
+        // store not open; find its file on disk so we can still delete it
+        bool found;
+        fname = findStoreFilename(_ns, found);
+        if( !found ) { 
+            log() << "RecCache::drop: no idx file found for " << _ns << endl;
+            return;
+        }
+        path pf(directory());
+        pf /= fname;
+        fname = pf.string();
+    }
+    try {
+        if( !boost::filesystem::exists(fname) ) 
+            log() << "RecCache::drop: can't find file to remove " << fname << endl;
+        boost::filesystem::remove(fname);
+    } 
+    catch(...) { 
+        log() << "RecCache::drop: exception removing file " << fname << endl;
+    }
+}
+
+}
diff --git a/db/reccache.h b/db/reccache.h
new file mode 100644
index 0000000..42943c5
--- /dev/null
+++ b/db/reccache.h
@@ -0,0 +1,242 @@
+// reccache.h
+
+/* CachedBasicRecStore
+ This is our store which implements a traditional page-cache type of storage
+ (not memory mapped files).
+*/
+
+/* LOCK HIERARCHY
+
+ dblock
+ RecCache::rcmutex
+
+ i.e. always lock dblock first if you lock both
+
+*/
+
+#pragma once
+
+#include "reci.h"
+#include "recstore.h"
+
+namespace mongo {
+
+class RecCache {
+ struct Node {
+ Node(void* _data) : data((char *) _data) { dirty = false; newer = 0; }
+ ~Node() {
+ free(data);
+ data = 0;
+ }
+ char *data;
+ DiskLoc loc;
+ bool dirty;
+ Node *older, *newer; // lru
+ };
+ boost::mutex &rcmutex; // mainly to coordinate with the lazy writer thread
+ unsigned recsize;
+ map<DiskLoc, Node*> m; // the cache
+ Node *newest, *oldest;
+ unsigned nnodes;
+ set<DiskLoc> dirtyl;
+ vector<BasicRecStore*> stores; // DiskLoc::a() indicates the index into this vector
+ map<string, BasicRecStore*> storesByNsKey; // nskey -> BasicRecStore*
+public:
+ static unsigned MAXNODES;
+ enum BaseValue { Base = 10000 };
+private:
+ BasicRecStore* _initStore(string fname);
+ BasicRecStore* initStore(int n);
+ string findStoreFilename(const char *_ns, bool& found);
+ void initStoreByNs(const char *ns, const string& nskey);
+ void closeStore(BasicRecStore *rs);
+
+ static string directory();
+ static string mknskey(const char *ns) {
+ return directory() + ns;
+ }
+
+ /* get the right file for a given diskloc */
+ BasicRecStore& store(DiskLoc& d) {
+ int n = d.a() - Base;
+ if( (int) stores.size() > n ) {
+ BasicRecStore *rs = stores[n];
+ if( rs ) {
+ assert( rs->fileNumber == n );
+ return *rs;
+ }
+ }
+ return *initStore(n);
+ }
+ BasicRecStore& store(const char *ns) {
+ string nskey = mknskey(ns);
+ BasicRecStore *&rs = storesByNsKey[nskey];
+ if( rs )
+ return *rs;
+ initStoreByNs(ns, nskey);
+ return *rs;
+ }
+
+ void writeDirty( set<DiskLoc>::iterator i, bool rawLog = false );
+ void writeIfDirty(Node *n);
+ void touch(Node* n) {
+ if( n == newest )
+ return;
+ if( n == oldest ) {
+ oldest = oldest->newer;
+ assert( oldest || nnodes == 1 );
+ }
+ if( n->older )
+ n->older->newer = n->newer;
+ if( n->newer )
+ n->newer->older = n->older;
+ n->newer = 0;
+ n->older = newest;
+ newest->newer = n;
+ newest = n;
+ }
+ Node* mkNode() {
+ Node *n = new Node(calloc(recsize,1)); // calloc is TEMP for testing. change to malloc
+ n->older = newest;
+ if( newest )
+ newest->newer = n;
+ else {
+ assert( oldest == 0 );
+ oldest = n;
+ }
+ newest = n;
+ nnodes++;
+ return n;
+ }
+ fileofs fileOfs(DiskLoc d) {
+ return ((fileofs) d.getOfs()) * recsize;
+ }
+
+ void dump();
+ void _ejectOld();
+
+public:
+ /* all public functions (except constructor) should use the mutex */
+
+ RecCache(unsigned recsz) : rcmutex( *( new boost::mutex() ) ), recsize(recsz) {
+ nnodes = 0;
+ newest = oldest = 0;
+ }
+
+ /* call this after doing some work, after you are sure you are done with modifications.
+ we call it from dbunlocking().
+ */
+ void ejectOld() {
+ if( nnodes > MAXNODES ) // just enough here to be inlineable for speed reasons. _ejectOld does the real work
+ _ejectOld();
+ }
+
+ /* bg writer thread invokes this */
+ void writeLazily();
+
+ /* Note that this may be called BEFORE the actual writing to the node
+ takes place. We do flushing later on a dbunlocking() call, which happens
+ after the writing.
+ */
+ void dirty(DiskLoc d) {
+ assert( d.a() >= Base );
+ boostlock lk(rcmutex);
+ map<DiskLoc, Node*>::iterator i = m.find(d);
+ if( i != m.end() ) {
+ Node *n = i->second;
+ if( !n->dirty ) {
+ n->dirty = true;
+ dirtyl.insert(n->loc);
+ }
+ }
+ }
+
+ char* get(DiskLoc d, unsigned len) {
+ assert( d.a() >= Base );
+ assert( len == recsize );
+
+ boostlock lk(rcmutex);
+ map<DiskLoc, Node*>::iterator i = m.find(d);
+ if( i != m.end() ) {
+ touch(i->second);
+ return i->second->data;
+ }
+
+ Node *n = mkNode();
+ n->loc = d;
+ store(d).get(fileOfs(d), n->data, recsize); // could throw exception
+ m.insert( pair<DiskLoc, Node*>(d, n) );
+ return n->data;
+ }
+
+ void drop(const char *ns);
+
+ DiskLoc insert(const char *ns, const void *obuf, int len, bool god) {
+ boostlock lk(rcmutex);
+ BasicRecStore& rs = store(ns);
+ fileofs o = rs.insert((const char *) obuf, len);
+ assert( o % recsize == 0 );
+ fileofs recnum = o / recsize;
+ massert( 10377 , "RecCache file too large?", recnum <= 0x7fffffff );
+ Node *n = mkNode();
+ memcpy(n->data, obuf, len);
+ DiskLoc d(rs.fileNumber + Base, (int) recnum);
+ n->loc = d;
+ m[d] = n;
+ return d;
+ }
+
+ void closeFiles(string dbname, string path);
+
+ // at termination: write dirty pages and close all files
+ void closing();
+};
+
+extern RecCache theRecCache;
+
+class CachedBasicRecStore : public RecStoreInterface {
+public:
+ virtual char* get(DiskLoc d, unsigned len) {
+ return theRecCache.get(d, len);
+ }
+
+ virtual DiskLoc insert(const char *ns, const void *obuf, int len, bool god) {
+ return theRecCache.insert(ns, obuf, len, god);
+ }
+
+ virtual void modified(DiskLoc d) {
+ theRecCache.dirty(d);
+ }
+
+ /* drop collection */
+ virtual void drop(const char *ns) {
+ theRecCache.drop(ns);
+ }
+
+ virtual void rename(const char *fromNs, const char *toNs) {
+ massert( 10378 , "rename not yet implemented for CachedBasicRecStore", false );
+ }
+
+ /* close datafiles associated with the db specified. */
+ virtual void closeFiles(string dbname, string path) {
+ theRecCache.closeFiles(dbname, dbpath);
+ }
+};
+
+/* see concurrency.h - note on a lock reset from read->write we don't
+ call dbunlocking_read, we just wait for the final dbunlocking_write
+ call
+*/
+
+inline void dbunlocking_read() {
+ Client *c = currentClient.get();
+ if ( c )
+ c->top.clientStop();
+}
+
/* Hook invoked when a write lock is released: give the record cache a
   chance to evict old pages (cheap size check; real work only happens past
   MAXNODES), then perform the same bookkeeping as a read unlock. */
inline void dbunlocking_write() {
    theRecCache.ejectOld();
    dbunlocking_read();
}
+
+} /*namespace*/
diff --git a/db/reci.h b/db/reci.h
new file mode 100644
index 0000000..295388c
--- /dev/null
+++ b/db/reci.h
@@ -0,0 +1,45 @@
+// reci.h
+
+#pragma once
+
+#include "storage.h"
+
+namespace mongo {
+
+/* Subclass this and implement your real storage interface.
+*/
/* Subclass this and implement your real storage interface.
   Abstract record-storage backend: records are opaque byte buffers
   addressed by DiskLoc, grouped by namespace (collection).
*/
class RecStoreInterface {
public:
    virtual ~RecStoreInterface() {}

    /* Get a pointer to the data at diskloc d. Pointer guaranteed to stay in
       scope through the current database operation's life.
    */
    virtual char* get(DiskLoc d, unsigned len) = 0;

    /* indicate that the diskloc specified has been updated. note that as-is today, the modification may come AFTER this
       call -- we handle that currently -- until the dblock finishes.
    */
    virtual void modified(DiskLoc d) = 0;

    /* insert specified data as a record */
    virtual DiskLoc insert(const char *ns, const void *obuf, int len, bool god) = 0;

    // optional: default implementation asserts (not all backends support deletion yet)
    virtual void deleteRecord(const char *ns, DiskLoc d) { massert( 10379 , "not implemented RecStoreInterface::deleteRecord", false); }

    /* drop the collection */
    virtual void drop(const char *ns) = 0;

    /* rename collection */
    virtual void rename(const char *fromNs, const char *toNs) = 0;

    /* close datafiles associated with the db specified. */
    virtual void closeFiles(string dbname, string path) = 0;

    /* todo add:
       closeFiles(dbname)
       eraseFiles(dbname)
    */
};
+
+}
diff --git a/db/recstore.h b/db/recstore.h
new file mode 100644
index 0000000..2e6a90a
--- /dev/null
+++ b/db/recstore.h
@@ -0,0 +1,108 @@
+// recstore.h
+
+#pragma once
+
+#include "../util/file.h"
+
+namespace mongo {
+
+using boost::uint32_t;
+using boost::uint64_t;
+
+/* Current version supports only consistent record sizes within a store. */
+
/* BasicRecStore: a flat file of fixed-size records.  An 8KB header
   (RecStoreHeader) is followed by records of 'recsize' bytes each, so
   records stay page-aligned when recsize is a multiple of the page size. */
class BasicRecStore {
    struct RecStoreHeader {
        uint32_t version;       // file format version (currently 65)
        uint32_t recsize;       // fixed record size for this file
        uint64_t leof; // logical eof, actual file might be prealloc'd further
        uint64_t firstDeleted; // 0 = no deleted recs
        uint32_t cleanShutdown; // 0 = clean (set to 1 while open; dtor writes 0 on clean close)
        char reserved[8192-8-8-4-4-4]; // we want our records page-aligned in the file if they are a multiple of a page's size -- so we make this 8KB with that goal
        RecStoreHeader() {
            version = 65;
            recsize = 0;
            leof = sizeof(RecStoreHeader); // records start right after the header
            firstDeleted = 0;
            cleanShutdown = 1;
            memset(reserved, 0, sizeof(reserved));
        }
    };

public:
    BasicRecStore(int _fileNumber) : fileNumber(_fileNumber) { }
    ~BasicRecStore();
    // open/create the file and set up the header; defined elsewhere
    void init(const char *fn, unsigned recsize);
    fileofs insert(const char *buf, unsigned len);
    void update(fileofs o, const char *buf, unsigned len);
    void remove(fileofs o, unsigned len);
    void get(fileofs o, char *buf, unsigned len);

    int fileNumber; // this goes in DiskLoc::a

    string filename;

private:

    void writeHeader();
    File f;
    fileofs len;      // physical file length; may exceed h.leof due to preallocation
    RecStoreHeader h; // h.reserved is wasteful here; fix later.
    // all file writes funnel through here so I/O errors are always checked
    void write(fileofs ofs, const char *data, unsigned len) {
        f.write(ofs, data, len);
        massert( 10380 , "basicrecstore write io error", !f.bad());
    }
};
+
+/* --- implementation --- */
+
/* Destructor: mark the file as cleanly shut down (cleanShutdown = 0, see
   RecStoreHeader) and flush header + data to disk -- but only if the file
   was actually opened. */
inline BasicRecStore::~BasicRecStore() {
    h.cleanShutdown = 0;
    if( f.is_open() ) {
        writeHeader();
        f.fsync();
    }
}
+
+inline void BasicRecStore::writeHeader() {
+ write(0, (const char *) &h, 28); // update header in file for new leof
+ uassert( 10115 , "file io error in BasicRecStore [1]", !f.bad());
+}
+
/* Append one record of exactly h.recsize bytes and return its file offset.
   Grows the physical file geometrically (len/8 + one record) when the
   logical eof passes the current physical length, and persists the header
   (new leof) before writing the record itself. */
inline fileofs BasicRecStore::insert(const char *buf, unsigned reclen) {
    if( h.firstDeleted ) {
        // free-list reuse of deleted records is not implemented yet
        uasserted(11500, "deleted not yet implemented recstoreinsert");
    }
    massert( 10381 , "bad len", reclen == h.recsize);
    fileofs ofs = h.leof;
    h.leof += reclen;
    if( h.leof > len ) {
        // grow the file. we grow quite a bit to avoid excessive file system fragmentations
        len += (len / 8) + h.recsize;
        uassert( 10116 , "recstore file too big for 32 bit", len <= 0x7fffffff || sizeof(std::streamoff) > 4 );
        // zero-length write at the new end -- presumably extends the file to
        // 'len'; TODO confirm against File::write semantics
        write(len, "", 0);
    }
    writeHeader();
    write(ofs, buf, reclen);
    uassert( 10117 , "file io error in BasicRecStore [2]", !f.bad());
    return ofs;
}
+
/* so far, it's ok to read or update a subset of a record */

/* Overwrite len bytes at offset o; the offset must lie within the logical
   file and past the header. */
inline void BasicRecStore::update(fileofs o, const char *buf, unsigned len) {
    assert(o <= h.leof && o >= sizeof(RecStoreHeader));
    write(o, buf, len);
}
+
+inline void BasicRecStore::get(fileofs o, char *buf, unsigned len) {
+ assert(o <= h.leof && o >= sizeof(RecStoreHeader));
+ f.read(o, buf, len);
+ massert( 10382 , "basicrestore::get I/O error", !f.bad());
+}
+
/* Delete the record at offset o (would place it on the free list headed by
   h.firstDeleted).  Not implemented yet -- always asserts. */
inline void BasicRecStore::remove(fileofs o, unsigned len) {
    uasserted(11501, "not yet implemented recstoreremove");
}
+
+}
diff --git a/db/repl.cpp b/db/repl.cpp
new file mode 100644
index 0000000..04c8d73
--- /dev/null
+++ b/db/repl.cpp
@@ -0,0 +1,1769 @@
+// repl.cpp
+
+/* TODO
+
+ PAIRING
+ _ on a syncexception, don't allow going back to master state?
+
+*/
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/* Collections we use:
+
+ local.sources - indicates what sources we pull from as a "slave", and the last update of each
+ local.oplog.$main - our op log as "master"
+ local.dbinfo.<dbname>
+ local.pair.startup - can contain a special value indicating for a pair that we have the master copy.
+ used when replacing other half of the pair which has permanently failed.
+ local.pair.sync - { initialsynccomplete: 1 }
+*/
+
+#include "stdafx.h"
+#include "jsobj.h"
+#include "../util/goodies.h"
+#include "repl.h"
+#include "../util/message.h"
+#include "../client/dbclient.h"
+#include "pdfile.h"
+#include "query.h"
+#include "db.h"
+#include "commands.h"
+#include "security.h"
+#include "cmdline.h"
+
namespace mongo {

    void ensureHaveIdIndex(const char *ns); // defined elsewhere

    /* if 1 sync() is running
       (set to -1 by the replacepeer command to permanently block syncing) */
    int syncing = 0;

    /* if true replace our peer in a replication pair -- don't worry about if his
       local.oplog.$main is empty.
    */
    bool replacePeer = false;

    /* "dead" means something really bad happened like replication falling completely out of sync.
       when non-null, we are dead and the string is informational
    */
    const char *replAllDead = 0;

    extern bool autoresync;
    time_t lastForcedResync = 0; // throttles forced resyncs (see throttledForceResyncDead)

    // heap-allocated, never freed -- presumably to avoid static destruction
    // order issues; confirm before changing
    IdTracker &idTracker = *( new IdTracker() );

} // namespace mongo
+
+#include "replset.h"
+
+namespace mongo {
+
    // tracks whether this half of a replica pair has finished its initial
    // sync; heap-allocated and never freed
    PairSync *pairSync = new PairSync();
    bool getInitialSyncCompleted() {
        return pairSync->initialSyncCompleted();
    }
+
    /* --- ReplPair -------------------------------- */

    ReplPair *replPair = 0; // nonzero when running as one half of a replica pair

    /* output by the web console */
    const char *replInfo = "";
    /* RAII helper: publishes a status message via replInfo for the duration
       of a scope, resetting it to "?" on exit. */
    struct ReplInfo {
        ReplInfo(const char *msg) {
            replInfo = msg;
        }
        ~ReplInfo() {
            replInfo = "?";
        }
    };
+
    /* Change this node's pair state.  Refuses to become master before the
       initial sync has completed; _comment is surfaced through the 'info'
       field reported by ismaster. */
    void ReplPair::setMaster(int n, const char *_comment ) {
        if ( n == State_Master && !getInitialSyncCompleted() )
            return; // not safe to act as master without a full copy of the data
        info = _comment;
        if ( n != state && !cmdLine.quiet )
            log() << "pair: setting master=" << n << " was " << state << '\n';
        state = n;
    }
+
    /* peer unreachable, try our arbiter */
    void ReplPair::arbitrate() {
        ReplInfo r("arbitrate");

        if ( arbHost == "-" ) {
            // no arbiter. we are up, let's assume partner is down and network is not partitioned.
            setMasterLocked(State_Master, "remote unreachable");
            return;
        }

        auto_ptr<DBClientConnection> conn( newClientConnection() );
        string errmsg;
        if ( !conn->connect(arbHost.c_str(), errmsg) ) {
            // can't reach the arbiter either: do not claim mastership
            log() << "repl: cantconn arbiter " << errmsg << endl;
            setMasterLocked(State_CantArb, "can't connect to arb");
            return;
        }

        // arbiter reachable: let the normal negotiation protocol decide
        negotiate( conn.get(), "arbiter" );
    }
+
+ /* --------------------------------------------- */
+
    /* { replacepeer : 1 } -- admin command run on the surviving member of a
       replica pair whose peer has permanently failed.  Waits for in-flight
       sync to finish, records that we hold the master copy, and marks
       replication dead so the operator can repoint local.sources at the
       replacement host and restart. */
    class CmdReplacePeer : public Command {
    public:
        virtual bool slaveOk() {
            return true;
        }
        virtual bool adminOnly() {
            return true;
        }
        virtual bool logTheOp() {
            return false;
        }
        CmdReplacePeer() : Command("replacepeer") { }
        virtual bool run(const char *ns, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
            if ( replPair == 0 ) {
                errmsg = "not paired";
                return false;
            }
            if ( !getInitialSyncCompleted() ) {
                errmsg = "not caught up cannot replace peer";
                return false;
            }
            if ( syncing < 0 ) {
                errmsg = "replacepeer already invoked";
                return false;
            }
            // spin (yielding the db lock) until the sync thread goes idle, up to ~20s
            Timer t;
            while ( 1 ) {
                if ( syncing == 0 || t.millis() > 20000 )
                    break;
                {
                    dbtemprelease t; // NB: shadows the Timer; releases the db lock while sleeping
                    sleepmillis(10);
                }
            }
            if ( syncing ) {
                assert( syncing > 0 );
                errmsg = "timeout waiting for sync() to finish";
                return false;
            }
            {
                ReplSource::SourceVector sources;
                ReplSource::loadAll(sources);
                if ( sources.size() != 1 ) {
                    errmsg = "local.sources.count() != 1, cannot replace peer";
                    return false;
                }
            }
            {
                // forget the failed peer; flag (read at startup) that we hold the master copy
                Helpers::emptyCollection("local.sources");
                BSONObj o = fromjson("{\"replacepeer\":1}");
                Helpers::putSingleton("local.pair.startup", o);
            }
            syncing = -1; // permanently block further syncing until restart
            replAllDead = "replacepeer invoked -- adjust local.sources hostname then restart this db process";
            result.append("info", "adjust local.sources hostname; db restart now required");
            return true;
        }
    } cmdReplacePeer;
+
+ class CmdForceDead : public Command {
+ public:
+ virtual bool slaveOk() {
+ return true;
+ }
+ virtual bool adminOnly() {
+ return true;
+ }
+ virtual bool logTheOp() {
+ return false;
+ }
+ CmdForceDead() : Command("forcedead") { }
+ virtual bool run(const char *ns, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ replAllDead = "forced by command";
+ return true;
+ }
+ } cmdForceDead;
+
+ /* operator requested resynchronization of replication (on the slave). { resync : 1 } */
+ class CmdResync : public Command {
+ public:
+ virtual bool slaveOk() {
+ return true;
+ }
+ virtual bool adminOnly() {
+ return true;
+ }
+ virtual bool logTheOp() {
+ return false;
+ }
+ CmdResync() : Command("resync") { }
+ virtual bool run(const char *ns, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ if ( cmdObj.getBoolField( "force" ) ) {
+ if ( !waitForSyncToFinish( errmsg ) )
+ return false;
+ replAllDead = "resync forced";
+ }
+ if ( !replAllDead ) {
+ errmsg = "not dead, no need to resync";
+ return false;
+ }
+ if ( !waitForSyncToFinish( errmsg ) )
+ return false;
+
+ ReplSource::forceResyncDead( "client" );
+ result.append( "info", "triggered resync for all sources" );
+ return true;
+ }
+ bool waitForSyncToFinish( string &errmsg ) const {
+ // Wait for slave thread to finish syncing, so sources will be be
+ // reloaded with new saved state on next pass.
+ Timer t;
+ while ( 1 ) {
+ if ( syncing == 0 || t.millis() > 20000 )
+ break;
+ {
+ dbtemprelease t;
+ sleepmillis(10);
+ }
+ }
+ if ( syncing ) {
+ errmsg = "timeout waiting for sync() to finish";
+ return false;
+ }
+ return true;
+ }
+ } cmdResync;
+
    /* { ismaster : 1 } -- reports this node's master/slave status.  Usable
       without authentication (arbiters probe with it), but detailed pair
       info is only returned to admin-authenticated callers. */
    class CmdIsMaster : public Command {
    public:
        virtual bool requiresAuth() { return false; }
        virtual bool slaveOk() {
            return true;
        }
        CmdIsMaster() : Command("ismaster") { }
        virtual bool run(const char *ns, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool /*fromRepl*/) {
            /* currently request to arbiter is (somewhat arbitrarily) an ismaster request that is not
               authenticated.
               we allow unauthenticated ismaster but we aren't as verbose informationally if
               one is not authenticated for admin db to be safe.
            */
            AuthenticationInfo *ai = currentClient.get()->ai;
            bool authed = ai->isAuthorized("admin");

            if ( replAllDead ) {
                // replication dead: never report ourselves master
                result.append("ismaster", 0.0);
                if( authed ) {
                    if ( replPair )
                        result.append("remote", replPair->remote);
                    result.append("info", replAllDead);
                }
            }
            else if ( replPair ) {
                // paired: report the pair state as-is
                result.append("ismaster", replPair->state);
                if( authed ) {
                    result.append("remote", replPair->remote);
                    if ( !replPair->info.empty() )
                        result.append("info", replPair->info);
                }
            }
            else {
                // unpaired: master unless started as a plain slave
                result.append("ismaster", slave ? 0 : 1);
                result.append("msg", "not paired");
            }

            return true;
        }
    } cmdismaster;
+
+ class CmdIsInitialSyncComplete : public Command {
+ public:
+ virtual bool requiresAuth() { return false; }
+ virtual bool slaveOk() {
+ return true;
+ }
+ CmdIsInitialSyncComplete() : Command( "isinitialsynccomplete" ) {}
+ virtual bool run(const char *ns, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool /*fromRepl*/) {
+ result.appendBool( "initialsynccomplete", getInitialSyncCompleted() );
+ return true;
+ }
+ } cmdisinitialsynccomplete;
+
+ /* negotiate who is master
+
+ -1=not set (probably means we just booted)
+ 0=was slave
+ 1=was master
+
+ remote,local -> new remote,local
+ !1,1 -> 0,1
+ 1,!1 -> 1,0
+ -1,-1 -> dominant->1, nondom->0
+ 0,0 -> dominant->1, nondom->0
+ 1,1 -> dominant->1, nondom->0
+
+ { negotiatemaster:1, i_was:<state>, your_name:<hostname> }
+ returns:
+ { ok:1, you_are:..., i_am:... }
+ */
    /* { negotiatemaster : 1, i_was : <state>, your_name : <host>, your_port : <port> }
       Decides which member of a pair is master (see the state table in the
       comment above).  When this node is not itself paired it acts as an
       arbiter and forwards the request to the named peer. */
    class CmdNegotiateMaster : public Command {
    public:
        CmdNegotiateMaster() : Command("negotiatemaster") { }
        virtual bool slaveOk() {
            return true;
        }
        virtual bool adminOnly() {
            return true;
        }

        virtual bool run(const char *ns, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool) {
            if ( replPair == 0 ) {
                // a paired node must never be asked to arbitrate
                massert( 10383 , "Another mongod instance believes incorrectly that this node is its peer", !cmdObj.getBoolField( "fromArbiter" ) );
                // assume that we are an arbiter and should forward the request
                string host = cmdObj.getStringField("your_name");
                int port = cmdObj.getIntField( "your_port" );
                if ( port == INT_MIN ) {
                    errmsg = "no port specified";
                    problem() << errmsg << endl;
                    return false;
                }
                stringstream ss;
                ss << host << ":" << port;
                string remote = ss.str();
                BSONObj ret;
                {
                    dbtemprelease t; // release the db lock across the remote call
                    auto_ptr<DBClientConnection> conn( new DBClientConnection() );
                    if ( !conn->connect( remote.c_str(), errmsg ) ) {
                        // requester's peer unreachable from here too: requester may be master
                        result.append( "you_are", ReplPair::State_Master );
                        return true;
                    }
                    BSONObjBuilder forwardCommand;
                    forwardCommand.appendElements( cmdObj );
                    forwardCommand.appendBool( "fromArbiter", true );
                    ret = conn->findOne( "admin.$cmd", forwardCommand.done() );
                }
                // relay the peer's answer (minus its "ok" field) back to the requester
                BSONObjIterator i( ret );
                while( i.moreWithEOO() ) {
                    BSONElement e = i.next();
                    if ( e.eoo() )
                        break;
                    if ( e.fieldName() != string( "ok" ) )
                        result.append( e );
                }
                return ( ret.getIntField("ok") == 1 );
            }

            int was = cmdObj.getIntField("i_was");
            string myname = cmdObj.getStringField("your_name");
            if ( myname.empty() || was < -3 ) {
                errmsg = "your_name/i_was not specified";
                return false;
            }

            int N = ReplPair::State_Negotiating;
            int M = ReplPair::State_Master;
            int S = ReplPair::State_Slave;

            if ( !replPair->dominant( myname ) ) {
                // only the dominant node may choose; tell the caller we can't decide
                result.append( "you_are", N );
                result.append( "i_am", replPair->state );
                return true;
            }

            // dominant node decides: the caller stays master only if it already
            // was (and we weren't) -- see the state table above
            int me, you;
            if ( !getInitialSyncCompleted() || ( replPair->state != M && was == M ) ) {
                me=S;
                you=M;
            }
            else {
                me=M;
                you=S;
            }
            replPair->setMaster( me, "CmdNegotiateMaster::run()" );

            result.append("you_are", you);
            result.append("i_am", me);

            return true;
        }
    } cmdnegotiatemaster;
+
    /* Ask the remote node (peer or arbiter) to negotiate mastership.
       'method' labels log messages.  Applies the state we are told to take
       and returns the remote's reported state. */
    int ReplPair::negotiate(DBClientConnection *conn, string method) {
        BSONObjBuilder b;
        b.append("negotiatemaster",1);
        b.append("i_was", state);
        b.append("your_name", remoteHost);
        b.append("your_port", remotePort);
        BSONObj cmd = b.done();
        BSONObj res = conn->findOne("admin.$cmd", cmd);
        if ( res.getIntField("ok") != 1 ) {
            // command failed outright: mark ourselves confused
            string message = method + " negotiate failed";
            problem() << message << ": " << res.toString() << '\n';
            setMasterLocked(State_Confused, message.c_str());
            return State_Confused;
        }
        int x = res.getIntField("you_are");
        int remote = res.getIntField("i_am");
        // State_Negotiating means the remote node is not dominant and cannot
        // choose who is master.
        if ( x != State_Slave && x != State_Master && x != State_Negotiating ) {
            problem() << method << " negotiate: bad you_are value " << res.toString() << endl;
        } else if ( x != State_Negotiating ) {
            // adopt the state the dominant node assigned us
            string message = method + " negotiation";
            setMasterLocked(x, message.c_str());
        }
        return remote;
    }
+
+ struct TestOpTime {
+ TestOpTime() {
+ OpTime t;
+ for ( int i = 0; i < 10; i++ ) {
+ OpTime s = OpTime::now();
+ assert( s != t );
+ t = s;
+ }
+ OpTime q = t;
+ assert( q == t );
+ assert( !(q != t) );
+ }
+ } testoptime;
+
+ /* --------------------------------------------------------------*/
+
+ ReplSource::ReplSource() {
+ replacing = false;
+ nClonedThisPass = 0;
+ paired = false;
+ }
+
    /* Construct a source from its persisted form in local.sources.
       Recognized fields: host (required), source (must be "main"), only,
       syncedTo, dbsNextPass, incompleteCloneDbs, localLogTs. */
    ReplSource::ReplSource(BSONObj o) : nClonedThisPass(0) {
        replacing = false;
        paired = false;
        only = o.getStringField("only");
        hostName = o.getStringField("host");
        _sourceName = o.getStringField("source");
        uassert( 10118 , "'host' field not set in sources collection object", !hostName.empty() );
        uassert( 10119 , "only source='main' allowed for now with replication", sourceName() == "main" );
        BSONElement e = o.getField("syncedTo");
        if ( !e.eoo() ) {
            uassert( 10120 , "bad sources 'syncedTo' field value", e.type() == Date || e.type() == Timestamp );
            OpTime tmp( e.date() );
            syncedTo = tmp;
        }

        // databases queued for addition on the next sync pass
        BSONObj dbsObj = o.getObjectField("dbsNextPass");
        if ( !dbsObj.isEmpty() ) {
            BSONObjIterator i(dbsObj);
            while ( 1 ) {
                BSONElement e = i.next();
                if ( e.eoo() )
                    break;
                addDbNextPass.insert( e.fieldName() );
            }
        }

        // databases whose clone was interrupted and must be finished
        dbsObj = o.getObjectField("incompleteCloneDbs");
        if ( !dbsObj.isEmpty() ) {
            BSONObjIterator i(dbsObj);
            while ( 1 ) {
                BSONElement e = i.next();
                if ( e.eoo() )
                    break;
                incompleteCloneDbs.insert( e.fieldName() );
            }
        }

        _lastSavedLocalTs = OpTime( o.getField( "localLogTs" ).date() );
    }
+
+ /* Turn our C++ Source object into a BSONObj */
+ BSONObj ReplSource::jsobj() {
+ BSONObjBuilder b;
+ b.append("host", hostName);
+ b.append("source", sourceName());
+ if ( !only.empty() )
+ b.append("only", only);
+ if ( !syncedTo.isNull() )
+ b.appendTimestamp("syncedTo", syncedTo.asDate());
+
+ b.appendTimestamp("localLogTs", _lastSavedLocalTs.asDate());
+
+ BSONObjBuilder dbsNextPassBuilder;
+ int n = 0;
+ for ( set<string>::iterator i = addDbNextPass.begin(); i != addDbNextPass.end(); i++ ) {
+ n++;
+ dbsNextPassBuilder.appendBool(i->c_str(), 1);
+ }
+ if ( n )
+ b.append("dbsNextPass", dbsNextPassBuilder.done());
+
+ BSONObjBuilder incompleteCloneDbsBuilder;
+ n = 0;
+ for ( set<string>::iterator i = incompleteCloneDbs.begin(); i != incompleteCloneDbs.end(); i++ ) {
+ n++;
+ incompleteCloneDbsBuilder.appendBool(i->c_str(), 1);
+ }
+ if ( n )
+ b.append("incompleteCloneDbs", incompleteCloneDbsBuilder.done());
+
+ return b.obj();
+ }
+
    /* Persist this source's state to local.sources (upsert keyed on host),
       and finish any pending peer replacement once we've synced up. */
    void ReplSource::save() {
        BSONObjBuilder b;
        assert( !hostName.empty() );
        b.append("host", hostName);
        // todo: finish allowing multiple source configs.
        // this line doesn't work right when source is null, if that is allowed as it is now:
        //b.append("source", _sourceName);
        BSONObj pattern = b.done();

        BSONObj o = jsobj();
        log( 1 ) << "Saving repl source: " << o << endl;

        OpDebug debug;
        setClient("local.sources");
        UpdateResult res = updateObjects("local.sources", o, pattern, true/*upsert for pair feature*/, false,false,debug);
        assert( ! res.mod );     // full-object replace, never a modifier update
        assert( res.num == 1 );  // exactly one document touched
        cc().clearns();

        if ( replacing ) {
            /* if we were in "replace" mode, we now have synced up with the replacement,
               so turn that off.
            */
            replacing = false;
            wassert( replacePeer );
            replacePeer = false;
            Helpers::emptyCollection("local.pair.startup");
        }
    }
+
    /* Append source s to v, reusing a matching ReplSource object from 'old'
       when possible so its existing connection and cursor stay in effect.
       A source with a null syncedTo (forced resync) is never reused. */
    static void addSourceToList(ReplSource::SourceVector &v, ReplSource& s, const BSONObj &spec, ReplSource::SourceVector &old) {
        if ( !s.syncedTo.isNull() ) { // Don't reuse old ReplSource if there was a forced resync.
            for ( ReplSource::SourceVector::iterator i = old.begin(); i != old.end(); ) {
                if ( s == **i ) {
                    v.push_back(*i);
                    old.erase(i); // safe: we return immediately, iterator unused afterwards
                    return;
                }
                i++;
            }
        }

        v.push_back( shared_ptr< ReplSource >( new ReplSource( s ) ) );
    }
+
+ /* we reuse our existing objects so that we can keep our existing connection
+ and cursor in effect.
+ */
+ void ReplSource::loadAll(SourceVector &v) {
+ SourceVector old = v;
+ v.clear();
+
+ bool gotPairWith = false;
+
+ if ( !cmdLine.source.empty() ) {
+ setClient("local.sources");
+ // --source <host> specified.
+ // check that no items are in sources other than that
+ // add if missing
+ auto_ptr<Cursor> c = findTableScan("local.sources", BSONObj());
+ int n = 0;
+ while ( c->ok() ) {
+ n++;
+ ReplSource tmp(c->current());
+ if ( tmp.hostName != cmdLine.source ) {
+ log() << "--source " << cmdLine.source << " != " << tmp.hostName << " from local.sources collection" << endl;
+ log() << "terminating after 30 seconds" << endl;
+ sleepsecs(30);
+ dbexit( EXIT_REPLICATION_ERROR );
+ }
+ if ( tmp.only != cmdLine.only ) {
+ log() << "--only " << cmdLine.only << " != " << tmp.only << " from local.sources collection" << endl;
+ log() << "terminating after 30 seconds" << endl;
+ sleepsecs(30);
+ dbexit( EXIT_REPLICATION_ERROR );
+ }
+ c->advance();
+ }
+ uassert( 10002 , "local.sources collection corrupt?", n<2 );
+ if ( n == 0 ) {
+ // source missing. add.
+ ReplSource s;
+ s.hostName = cmdLine.source;
+ s.only = cmdLine.only;
+ s.save();
+ }
+ }
+ else {
+ try {
+ massert( 10384 , "--only requires use of --source", cmdLine.only.empty());
+ } catch ( ... ) {
+ dbexit( EXIT_BADOPTIONS );
+ }
+ }
+
+ if ( replPair ) {
+ const string &remote = replPair->remote;
+ setClient( "local.sources" );
+ // --pairwith host specified.
+ // check that no items are in sources other than that
+ // add if missing
+ auto_ptr<Cursor> c = findTableScan("local.sources", BSONObj());
+ int n = 0;
+ while ( c->ok() ) {
+ n++;
+ ReplSource tmp(c->current());
+ if ( tmp.hostName != remote ) {
+ log() << "pairwith " << remote << " != " << tmp.hostName << " from local.sources collection" << endl;
+ log() << "terminating after 30 seconds" << endl;
+ sleepsecs(30);
+ dbexit( EXIT_REPLICATION_ERROR );
+ }
+ c->advance();
+ }
+ uassert( 10122 , "local.sources collection corrupt?", n<2 );
+ if ( n == 0 ) {
+ // source missing. add.
+ ReplSource s;
+ s.hostName = remote;
+ s.save();
+ }
+ }
+
+ setClient("local.sources");
+ auto_ptr<Cursor> c = findTableScan("local.sources", BSONObj());
+ while ( c->ok() ) {
+ ReplSource tmp(c->current());
+ if ( replPair && tmp.hostName == replPair->remote && tmp.sourceName() == "main" ) {
+ gotPairWith = true;
+ tmp.paired = true;
+ if ( replacePeer ) {
+ // peer was replaced -- start back at the beginning.
+ tmp.syncedTo = OpTime();
+ tmp.replacing = true;
+ }
+ }
+ addSourceToList(v, tmp, c->current(), old);
+ c->advance();
+ }
+ cc().clearns();
+
+ if ( !gotPairWith && replPair ) {
+ /* add the --pairwith server */
+ shared_ptr< ReplSource > s( new ReplSource() );
+ s->paired = true;
+ s->hostName = replPair->remote;
+ s->replacing = replacePeer;
+ v.push_back(s);
+ }
+ }
+
    // canned command object used to ask a master for its current optime
    BSONObj opTimeQuery = fromjson("{\"getoptime\":1}");
+
+ bool ReplSource::throttledForceResyncDead( const char *requester ) {
+ if ( time( 0 ) - lastForcedResync > 600 ) {
+ forceResyncDead( requester );
+ lastForcedResync = time( 0 );
+ return true;
+ }
+ return false;
+ }
+
    /* If replication is marked dead (replAllDead), force a full resync of
       every configured source and then clear the dead flag. */
    void ReplSource::forceResyncDead( const char *requester ) {
        if ( !replAllDead )
            return;
        SourceVector sources;
        ReplSource::loadAll(sources);
        for( SourceVector::iterator i = sources.begin(); i != sources.end(); ++i ) {
            (*i)->forceResync( requester );
        }
        replAllDead = 0;
    }
+
    /* Drop every database the master reports as non-empty (except "local",
       and respecting --only), reset our sync position to null, and persist
       the new state -- the next sync pass then re-clones everything. */
    void ReplSource::forceResync( const char *requester ) {
        BSONObj info;
        {
            dbtemprelease t; // release the db lock across the remote call
            connect();
            bool ok = conn->runCommand( "admin", BSON( "listDatabases" << 1 ), info );
            massert( 10385 , "Unable to get database list", ok );
        }
        BSONObjIterator i( info.getField( "databases" ).embeddedObject() );
        while( i.moreWithEOO() ) {
            BSONElement e = i.next();
            if ( e.eoo() )
                break;
            string name = e.embeddedObject().getField( "name" ).valuestr();
            if ( !e.embeddedObject().getBoolField( "empty" ) ) {
                if ( name != "local" ) {
                    if ( only.empty() || only == name ) {
                        resyncDrop( name.c_str(), requester );
                    }
                }
            }
        }
        syncedTo = OpTime(); // null optime => restart replication from scratch
        addDbNextPass.clear();
        save();
    }
+
    /* Drop database 'db' in preparation for a resync.  Returns the dummy
       namespace ("<db>.") callers use to re-select the database. */
    string ReplSource::resyncDrop( const char *db, const char *requester ) {
        log() << "resync: dropping database " << db << endl;
        string dummyns = string( db ) + ".";
        setClient(dummyns.c_str());
        assert( cc().database()->name == db );
        dropDatabase(dummyns.c_str());
        return dummyns;
    }
+
    /* grab initial copy of a database from the master
       (drops the local copy first; throws SyncException on clone failure) */
    bool ReplSource::resync(string db) {
        string dummyNs = resyncDrop( db.c_str(), "internal" );
        setClient( dummyNs.c_str() );
        {
            log() << "resync: cloning database " << db << endl;
            ReplInfo r("resync: cloning a database");
            string errmsg;
            // slaveok/replauth: we are cloning from our own master; no snapshot
            bool ok = cloneFrom(hostName.c_str(), errmsg, cc().database()->name, false, /*slaveok*/ true, /*replauth*/ true, /*snapshot*/false);
            if ( !ok ) {
                problem() << "resync of " << db << " from " << hostName << " failed " << errmsg << endl;
                throw SyncException(); // aborts this sync pass; caller handles
            }
        }

        log() << "resync: done " << db << endl;

        return true;
    }
+
    /* Apply one oplog entry locally.  op types (see logOp()): 'i' insert,
       'u' update, 'd' delete, 'c' command, 'n' no-op, "db" advertisement.
       User/db exceptions are logged and swallowed so one bad op does not
       halt the sync pass. */
    void ReplSource::applyOperation(const BSONObj& op) {
        log( 6 ) << "applying op: " << op << endl;
        OpDebug debug;
        BSONObj o = op.getObjectField("o");
        const char *ns = op.getStringField("ns");
        // operation type -- see logOp() comments for types
        const char *opType = op.getStringField("op");
        try {
            if ( *opType == 'i' ) {
                const char *p = strchr(ns, '.');
                if ( p && strcmp(p, ".system.indexes") == 0 ) {
                    // updates aren't allowed for indexes -- so we will do a regular insert. if index already
                    // exists, that is ok.
                    theDataFileMgr.insert(ns, (void*) o.objdata(), o.objsize());
                }
                else {
                    // do upserts for inserts as we might get replayed more than once
                    BSONElement _id;
                    if( !o.getObjectID(_id) ) {
                        /* No _id. This will be very slow. */
                        Timer t;
                        updateObjects(ns, o, o, true, false, false , debug );
                        if( t.millis() >= 2 ) {
                            RARELY OCCASIONALLY log() << "warning, repl doing slow updates (no _id field) for " << ns << endl;
                        }
                    }
                    else {
                        // upsert keyed on _id so replay is idempotent
                        BSONObjBuilder b;
                        b.append(_id);

                        /* erh 10/16/2009 - this is probably not relevant any more since its auto-created, but not worth removing */
                        RARELY ensureHaveIdIndex(ns); // otherwise updates will be slow

                        updateObjects(ns, o, b.done(), true, false, false , debug );
                    }
                }
            }
            else if ( *opType == 'u' ) {
                RARELY ensureHaveIdIndex(ns); // otherwise updates will be super slow
                updateObjects(ns, o, op.getObjectField("o2"), op.getBoolField("b"), false, false , debug );
            }
            else if ( *opType == 'd' ) {
                if ( opType[1] == 0 )
                    deleteObjects(ns, o, op.getBoolField("b"));
                else
                    assert( opType[1] == 'b' ); // "db" advertisement
            }
            else if ( *opType == 'n' ) {
                // no op
            }
            else {
                // the only remaining type is a command
                BufBuilder bb;
                BSONObjBuilder ob;
                assert( *opType == 'c' );
                _runCommands(ns, o, bb, ob, true, 0);
            }
        }
        catch ( UserException& e ) {
            log() << "sync: caught user assertion " << e << " while applying op: " << op << endl;;
        }
        catch ( DBException& e ) {
            log() << "sync: caught db exception " << e << " while applying op: " << op << endl;;
        }
    }
+
+ /* local.$oplog.main is of the form:
+ { ts: ..., op: <optype>, ns: ..., o: <obj> , o2: <extraobj>, b: <boolflag> }
+ ...
+ see logOp() comments.
+ */
+ // Process one remote oplog entry: acquire the db lock, maintain the
+ // repl-pair id sets, clone the target database first if it has never been
+ // copied, and finally apply the op via applyOperation().
+ // localLogTail, when non-null, is the repl-pair local-oplog watermark.
+ void ReplSource::sync_pullOpLog_applyOperation(BSONObj& op, OpTime *localLogTail) {
+ log( 6 ) << "processing op: " << op << endl;
+ // skip no-op
+ if ( op.getStringField( "op" )[ 0 ] == 'n' )
+ return;
+
+ char clientName[MaxDatabaseLen];
+ const char *ns = op.getStringField("ns");
+ nsToDatabase(ns, clientName);
+
+ if ( *ns == '.' ) {
+ problem() << "skipping bad op in oplog: " << op.toString() << endl;
+ return;
+ }
+ else if ( *ns == 0 ) {
+ // an empty ns means the oplog itself is corrupt; stop replicating entirely
+ problem() << "halting replication, bad op in oplog:\n " << op.toString() << endl;
+ replAllDead = "bad object in oplog";
+ throw SyncException();
+ }
+
+ if ( !only.empty() && only != clientName )
+ return;
+
+ dblock lk;
+
+ if ( localLogTail && replPair && replPair->state == ReplPair::State_Master ) {
+ updateSetsWithLocalOps( *localLogTail, true ); // allow unlocking
+ updateSetsWithLocalOps( *localLogTail, false ); // don't allow unlocking or conversion to db backed storage
+ }
+
+ if ( replAllDead ) {
+ // hmmm why is this check here and not at top of this function? does it get set between top and here?
+ log() << "replAllDead, throwing SyncException: " << replAllDead << endl;
+ throw SyncException();
+ }
+
+ bool justCreated;
+ try {
+ justCreated = setClient(ns);
+ } catch ( AssertionException& ) {
+ problem() << "skipping bad(?) op in oplog, setClient() failed, ns: '" << ns << "'\n";
+ addDbNextPass.erase(clientName);
+ return;
+ }
+
+ bool empty = cc().database()->isEmpty();
+ bool incompleteClone = incompleteCloneDbs.count( clientName ) != 0;
+
+ log( 6 ) << "ns: " << ns << ", justCreated: " << justCreated << ", empty: " << empty << ", incompleteClone: " << incompleteClone << endl;
+
+ // always apply admin command command
+ // this is a bit hacky -- the semantics of replication/commands aren't well specified
+ if ( strcmp( clientName, "admin" ) == 0 && *op.getStringField( "op" ) == 'c' ) {
+ applyOperation( op );
+ cc().clearns();
+ return;
+ }
+
+ if ( justCreated || empty || incompleteClone ) {
+ // we must add to incomplete list now that setClient has been called
+ incompleteCloneDbs.insert( clientName );
+ if ( nClonedThisPass ) {
+ /* we only clone one database per pass, even if a lot need done. This helps us
+ avoid overflowing the master's transaction log by doing too much work before going
+ back to read more transactions. (Imagine a scenario of slave startup where we try to
+ clone 100 databases in one pass.)
+ */
+ addDbNextPass.insert( clientName );
+ } else {
+ if ( incompleteClone ) {
+ log() << "An earlier initial clone of '" << clientName << "' did not complete, now resyncing." << endl;
+ }
+ save();
+ setClient( ns );
+ nClonedThisPass++;
+ resync(cc().database()->name);
+ addDbNextPass.erase(clientName);
+ incompleteCloneDbs.erase( clientName );
+ }
+ save();
+ } else {
+ // database already cloned: apply the op, unless (repl pair master) the
+ // id sets show a local op already superseded it
+ bool mod;
+ if ( replPair && replPair->state == ReplPair::State_Master ) {
+ BSONObj id = idForOp( op, mod );
+ if ( !idTracker.haveId( ns, id ) ) {
+ applyOperation( op );
+ } else if ( idTracker.haveModId( ns, id ) ) {
+ // re-log our current version of the doc so the peer converges
+ log( 6 ) << "skipping operation matching mod id object " << op << endl;
+ BSONObj existing;
+ if ( Helpers::findOne( ns, id, existing ) )
+ logOp( "i", ns, existing );
+ } else {
+ log( 6 ) << "skipping operation matching changed id object " << op << endl;
+ }
+ } else {
+ applyOperation( op );
+ }
+ addDbNextPass.erase( clientName );
+ }
+ cc().clearns();
+ }
+
+ /* Extract, as a one-field object, the _id affected by oplog entry 'op'.
+    Sets 'mod' true for $-operator updates (which modify rather than replace
+    the document).  Returns an empty BSONObj when no single _id applies
+    (commands, "db" advertisements, no-ops, inserts without an _id, or
+    updates whose criteria is not keyed on _id). */
+ BSONObj ReplSource::idForOp( const BSONObj &op, bool &mod ) {
+     mod = false;
+     const char *opType = op.getStringField( "op" );
+     BSONObj obj = op.getObjectField( "o" );
+
+     if ( opType[ 0 ] == 'i' ) {
+         // insert: pull the _id out of the inserted document, if present
+         BSONElement idElt;
+         if ( !obj.getObjectID( idElt ) )
+             return BSONObj();
+         BSONObjBuilder bob;
+         bob.append( idElt );
+         return bob.obj();
+     }
+
+     if ( opType[ 0 ] == 'u' ) {
+         // update: only track updates keyed exactly on _id
+         BSONObj criteria = op.getObjectField( "o2" );
+         if ( strcmp( criteria.firstElement().fieldName(), "_id" ) != 0 )
+             return BSONObj();
+         if ( obj.firstElement().fieldName()[ 0 ] == '$' )
+             mod = true; // $-operator update
+         return criteria;
+     }
+
+     if ( opType[ 0 ] == 'd' && opType[ 1 ] == '\0' )
+         return obj; // plain delete: criteria object is the id spec
+
+     // 'db' op type, commands, and everything else: nothing to track
+     return BSONObj();
+ }
+
+ /* Record the _id touched by a local op in idTracker, so that a repl-pair
+    master can later detect conflicting remote ops.  mayUnlock permits
+    idTracker to upgrade its storage (which may release the db lock). */
+ void ReplSource::updateSetsWithOp( const BSONObj &op, bool mayUnlock ) {
+     if ( mayUnlock ) {
+         idTracker.mayUpgradeStorage();
+     }
+     bool isMod;
+     BSONObj idObj = idForOp( op, isMod );
+     if ( idObj.isEmpty() )
+         return; // op has no trackable _id
+     const char *ns = op.getStringField( "ns" );
+     // Since our range of local ops may not be the same as our peer's
+     // range of unapplied ops, it is always necessary to rewrite objects
+     // to the oplog after a mod update.
+     if ( isMod )
+         idTracker.haveModId( ns, idObj, true );
+     idTracker.haveId( ns, idObj, true );
+ }
+
+ /* Point syncedTo at the most recent op in the remote oplog (found via a
+ reverse $natural scan), effectively skipping any intermediate ops.
+ Leaves syncedTo unchanged if the remote oplog is empty. */
+ void ReplSource::syncToTailOfRemoteLog() {
+ string _ns = ns();
+ BSONObj last = conn->findOne( _ns.c_str(), Query().sort( BSON( "$natural" << -1 ) ) );
+ if ( !last.isEmpty() ) {
+ BSONElement ts = last.findElement( "ts" );
+ massert( 10386 , "non Date ts found", ts.type() == Date || ts.type() == Timestamp );
+ syncedTo = OpTime( ts.date() );
+ }
+ }
+
+ /* Return the timestamp of the newest op in the *local* oplog, or a null
+ OpTime if the local oplog is empty. NOTE(review): reads the local db via
+ setClient/findTableScan, so presumably the db lock must be held -- callers
+ here all hold it; confirm before adding new callers. */
+ OpTime ReplSource::nextLastSavedLocalTs() const {
+ setClient( "local.oplog.$main" );
+ auto_ptr< Cursor > c = findTableScan( "local.oplog.$main", BSON( "$natural" << -1 ) );
+ if ( c->ok() )
+ return OpTime( c->current().getField( "ts" ).date() );
+ return OpTime();
+ }
+
+ /* Record the local-oplog high-water mark used by the repl-pair sync-point
+ logic; persisted to local.sources by a later save(). */
+ void ReplSource::setLastSavedLocalTs( const OpTime &nextLocalTs ) {
+ _lastSavedLocalTs = nextLocalTs;
+ log( 3 ) << "updated _lastSavedLocalTs to: " << _lastSavedLocalTs << endl;
+ }
+
+ /* Force a full slave resync: tell the peer (via the "forcedead" admin
+    command) to stop its replication, reposition our syncedTo to the tail of
+    the remote oplog, persist that state, and drop the tailing cursor so the
+    next pass starts fresh.  Used when an oplog has rolled over and no sync
+    point can be established.
+    (Fix: corrected "falied" -> "failed" in the massert message.) */
+ void ReplSource::resetSlave() {
+     massert( 10387 , "request to kill slave replication failed",
+              conn->simpleCommand( "admin", 0, "forcedead" ) );
+     syncToTailOfRemoteLog();
+     {
+         dblock lk;
+         setLastSavedLocalTs( nextLastSavedLocalTs() );
+         save();
+         cursor.reset();
+     }
+ }
+
+ /* Walk the local oplog backwards from its tail down to localLogTail,
+ recording each op's _id in idTracker (see updateSetsWithOp). On return
+ localLogTail is advanced to the newest local op seen. If the scan reaches
+ the end of the (capped) local log without finding localLogTail, the log has
+ wrapped and no sync point can be established: the slave is reset and a
+ massert raised. mayUnlock permits periodic temporary release of the db
+ lock while scanning. */
+ bool ReplSource::updateSetsWithLocalOps( OpTime &localLogTail, bool mayUnlock ) {
+ setClient( "local.oplog.$main" );
+ auto_ptr< Cursor > localLog = findTableScan( "local.oplog.$main", BSON( "$natural" << -1 ) );
+ OpTime newTail;
+ for( ; localLog->ok(); localLog->advance() ) {
+ BSONObj op = localLog->current();
+ OpTime ts( localLog->current().getField( "ts" ).date() );
+ if ( newTail.isNull() ) {
+ // first (newest) op seen becomes the new tail value
+ newTail = ts;
+ }
+ if ( !( localLogTail < ts ) )
+ break;
+ updateSetsWithOp( op, mayUnlock );
+ if ( mayUnlock ) {
+ RARELY {
+ dbtemprelease t;
+ }
+ }
+ }
+ if ( !localLogTail.isNull() && !localLog->ok() ) {
+ // local log filled up
+ idTracker.reset();
+ dbtemprelease t;
+ resetSlave();
+ massert( 10388 , "local master log filled, forcing slave resync", false );
+ }
+ if ( !newTail.isNull() )
+ localLogTail = newTail;
+ return true;
+ }
+
+ /* slave: pull some data from the master's oplog
+ note: not yet in db mutex at this point.
+ */
+ // Core slave pass: (re)establish a tailing cursor on the master's oplog,
+ // enumerate databases on an initial sync, then apply ops one at a time via
+ // sync_pullOpLog_applyOperation(), checkpointing syncedTo periodically.
+ // Returns false to request a reconnect; nApplied is set to ops applied.
+ bool ReplSource::sync_pullOpLog(int& nApplied) {
+ string ns = string("local.oplog.$") + sourceName();
+ log(2) << "repl: sync_pullOpLog " << ns << " syncedTo:" << syncedTo.toStringLong() << '\n';
+
+ bool tailing = true;
+ DBClientCursor *c = cursor.get();
+ if ( c && c->isDead() ) {
+ log() << "repl: old cursor isDead, initiating a new one\n";
+ c = 0;
+ }
+
+ if ( replPair && replPair->state == ReplPair::State_Master ) {
+ dblock lk;
+ idTracker.reset();
+ }
+ OpTime localLogTail = _lastSavedLocalTs;
+
+ // a null syncedTo means we have never synced: take the initial-clone path
+ bool initial = syncedTo.isNull();
+
+ if ( c == 0 || initial ) {
+ if ( initial ) {
+ // Important to grab last oplog timestamp before listing databases.
+ syncToTailOfRemoteLog();
+ BSONObj info;
+ bool ok = conn->runCommand( "admin", BSON( "listDatabases" << 1 ), info );
+ massert( 10389 , "Unable to get database list", ok );
+ BSONObjIterator i( info.getField( "databases" ).embeddedObject() );
+ while( i.moreWithEOO() ) {
+ BSONElement e = i.next();
+ if ( e.eoo() )
+ break;
+ string name = e.embeddedObject().getField( "name" ).valuestr();
+ if ( !e.embeddedObject().getBoolField( "empty" ) ) {
+ if ( name != "local" ) {
+ if ( only.empty() || only == name ) {
+ log( 2 ) << "adding to 'addDbNextPass': " << name << endl;
+ addDbNextPass.insert( name );
+ }
+ }
+ }
+ }
+ dblock lk;
+ save();
+ }
+
+ BSONObjBuilder q;
+ q.appendDate("$gte", syncedTo.asDate());
+ BSONObjBuilder query;
+ query.append("ts", q.done());
+ if ( !only.empty() ) {
+ // note we may here skip a LOT of data table scanning, a lot of work for the master.
+ query.appendRegex("ns", string("^") + only);
+ }
+ BSONObj queryObj = query.done();
+ // queryObj = { ts: { $gte: syncedTo } }
+
+ log(2) << "repl: " << ns << ".find(" << queryObj.toString() << ')' << '\n';
+ cursor = conn->query( ns.c_str(), queryObj, 0, 0, 0,
+ QueryOption_CursorTailable | QueryOption_SlaveOk | QueryOption_OplogReplay |
+ QueryOption_AwaitData
+ );
+ c = cursor.get();
+ tailing = false;
+ }
+ else {
+ log(2) << "repl: tailing=true\n";
+ }
+
+ if ( c == 0 ) {
+ problem() << "repl: dbclient::query returns null (conn closed?)" << endl;
+ resetConnection();
+ return false;
+ }
+
+ // show any deferred database creates from a previous pass
+ {
+ set<string>::iterator i = addDbNextPass.begin();
+ if ( i != addDbNextPass.end() ) {
+ // synthesize a "db" advertisement op to trigger the clone
+ BSONObjBuilder b;
+ b.append("ns", *i + '.');
+ b.append("op", "db");
+ BSONObj op = b.done();
+ sync_pullOpLog_applyOperation(op, 0);
+ }
+ }
+
+ if ( !c->more() ) {
+ if ( tailing ) {
+ log(2) << "repl: tailing & no new activity\n";
+ } else {
+ log() << "repl: " << ns << " oplog is empty\n";
+ }
+ {
+ // nothing pending from the peer: safe to advance the local watermark
+ dblock lk;
+ OpTime nextLastSaved = nextLastSavedLocalTs();
+ {
+ dbtemprelease t;
+ if ( !c->more() ) {
+ setLastSavedLocalTs( nextLastSaved );
+ }
+ }
+ save();
+ }
+ return true;
+ }
+
+ int n = 0;
+ BSONObj op = c->next();
+ BSONElement ts = op.findElement("ts");
+ if ( ts.type() != Date && ts.type() != Timestamp ) {
+ string err = op.getStringField("$err");
+ if ( !err.empty() ) {
+ problem() << "repl: $err reading remote oplog: " + err << '\n';
+ massert( 10390 , "got $err reading remote oplog", false );
+ }
+ else {
+ problem() << "repl: bad object read from remote oplog: " << op.toString() << '\n';
+ massert( 10391 , "repl: bad object read from remote oplog", false);
+ }
+ }
+
+ if ( replPair && replPair->state == ReplPair::State_Master ) {
+
+ OpTime nextOpTime( ts.date() );
+ if ( !tailing && !initial && nextOpTime != syncedTo ) {
+ // we asked for ts >= syncedTo but the first op is newer: the remote
+ // (capped) log has rolled past our sync point
+ log() << "remote slave log filled, forcing slave resync" << endl;
+ resetSlave();
+ return true;
+ }
+
+ dblock lk;
+ updateSetsWithLocalOps( localLogTail, true );
+ }
+
+ OpTime nextOpTime( ts.date() );
+ log(2) << "repl: first op time received: " << nextOpTime.toString() << '\n';
+ if ( tailing || initial ) {
+ if ( initial )
+ log(1) << "repl: initial run\n";
+ else
+ assert( syncedTo < nextOpTime );
+ sync_pullOpLog_applyOperation(op, &localLogTail);
+ n++;
+ }
+ else if ( nextOpTime != syncedTo ) {
+ // first op should equal syncedTo (already applied); anything else means
+ // we fell too far behind the master's capped oplog
+ Nullstream& l = log();
+ l << "repl: nextOpTime " << nextOpTime.toStringLong() << ' ';
+ if ( nextOpTime < syncedTo )
+ l << "<??";
+ else
+ l << ">";
+
+ l << " syncedTo " << syncedTo.toStringLong() << '\n';
+ log() << "repl: time diff: " << (nextOpTime.getSecs() - syncedTo.getSecs()) << "sec\n";
+ log() << "repl: tailing: " << tailing << '\n';
+ log() << "repl: data too stale, halting replication" << endl;
+ replInfo = replAllDead = "data too stale halted replication";
+ assert( syncedTo < nextOpTime );
+ throw SyncException();
+ }
+ else {
+ /* t == syncedTo, so the first op was applied previously. */
+ }
+
+ // apply operations
+ {
+ time_t saveLast = time(0);
+ while ( 1 ) {
+ /* from a.s.:
+ I think the idea here is that we can establish a sync point between the local op log and the remote log with the following steps:
+
+ 1) identify most recent op in local log -- call it O
+ 2) ask "does nextOpTime reflect the tail of the remote op log?" (in other words, is more() false?) - If yes, all subsequent ops after nextOpTime in the remote log must have occurred after O. If no, we can't establish a sync point.
+
+ Note that we can't do step (2) followed by step (1) because if we do so ops may be added to both machines between steps (2) and (1) and we can't establish a sync point. (In particular, between (2) and (1) an op may be added to the remote log before a different op is added to the local log. In this case, the newest remote op will have occurred after nextOpTime but before O.)
+
+ Now, for performance reasons we don't want to have to identify the most recent op in the local log every time we call c->more() because in performance sensitive situations more() will be true most of the time. So we do:
+
+ 0) more()?
+ 1) find most recent op in local log
+ 2) more()?
+ */
+ if ( !c->more() ) {
+ dblock lk;
+ OpTime nextLastSaved = nextLastSavedLocalTs(); // this may make c->more() become true
+ {
+ dbtemprelease t;
+ if ( c->more() ) {
+ continue;
+ } else {
+ setLastSavedLocalTs( nextLastSaved );
+ }
+ }
+ syncedTo = nextOpTime;
+ save(); // note how far we are synced up to now
+ log() << "repl: applied " << n << " operations" << endl;
+ nApplied = n;
+ log() << "repl: end sync_pullOpLog syncedTo: " << syncedTo.toStringLong() << endl;
+ break;
+ }
+
+ OCCASIONALLY if( n > 100000 || time(0) - saveLast > 60 ) {
+ // periodically note our progress, in case we are doing a lot of work and crash
+ dblock lk;
+ syncedTo = nextOpTime;
+ // can't update local log ts since there are pending operations from our peer
+ save();
+ log() << "repl: checkpoint applied " << n << " operations" << endl;
+ log() << "repl: syncedTo: " << syncedTo.toStringLong() << endl;
+ saveLast = time(0);
+ n = 0;
+ }
+
+ BSONObj op = c->next();
+ ts = op.findElement("ts");
+ assert( ts.type() == Date || ts.type() == Timestamp );
+ OpTime last = nextOpTime;
+ OpTime tmp( ts.date() );
+ nextOpTime = tmp;
+ if ( !( last < nextOpTime ) ) {
+ // oplog timestamps must be strictly increasing
+ problem() << "sync error: last " << last.toString() << " >= nextOpTime " << nextOpTime.toString() << endl;
+ uassert( 10123 , "bad 'ts' value in sources", false);
+ }
+
+ sync_pullOpLog_applyOperation(op, &localLogTail);
+ n++;
+ }
+ }
+
+ return true;
+ }
+
+ BSONObj userReplQuery = fromjson("{\"user\":\"repl\"}");
+
+ /* Authenticate 'conn' to the master for replication. Uses the local "repl"
+ user from local.system.users, falling back to the first user found there.
+ If no users exist at all and --noauth is set, authentication is skipped
+ (returns true). Requires the current client to already hold admin
+ authorization. */
+ bool replAuthenticate(DBClientConnection *conn) {
+ AuthenticationInfo *ai = currentClient.get()->ai;
+ if( !ai->isAuthorized("admin") ) {
+ log() << "replauthenticate: requires admin permissions, failing\n";
+ return false;
+ }
+
+ BSONObj user;
+ {
+ dblock lk;
+ Client::Context ctxt("local.");
+ if( !Helpers::findOne("local.system.users", userReplQuery, user) ) {
+ // try the first user is local
+ if( !Helpers::getSingleton("local.system.users", user) ) {
+ if( noauth )
+ return true; // presumably we are running a --noauth setup all around.
+
+ log() << "replauthenticate: no user in local.system.users to use for authentication\n";
+ return false;
+ }
+ }
+ }
+
+ string u = user.getStringField("user");
+ string p = user.getStringField("pwd");
+ massert( 10392 , "bad user object? [1]", !u.empty());
+ massert( 10393 , "bad user object? [2]", !p.empty());
+ string err;
+ // note: password field is the stored digest, hence digestPassword=false
+ if( !conn->auth("local", u.c_str(), p.c_str(), err, false) ) {
+ log() << "replauthenticate: can't authenticate to master server, user:" << u << endl;
+ return false;
+ }
+ return true;
+ }
+
+ /* Lazily establish (and authenticate) the connection to the sync source.
+    Returns true when an authenticated connection is available; on failure
+    the connection is reset so the next call retries from scratch. */
+ bool ReplSource::connect() {
+     if ( conn.get() != 0 )
+         return true; // already connected
+
+     conn = auto_ptr<DBClientConnection>(new DBClientConnection());
+     string errmsg;
+     ReplInfo r("trying to connect to sync source");
+     bool ok = conn->connect(hostName.c_str(), errmsg) && replAuthenticate(conn.get());
+     if ( !ok ) {
+         resetConnection();
+         log() << "repl: " << errmsg << endl;
+         return false;
+     }
+     return true;
+ }
+
+ /* note: not yet in mutex at this point.
+ returns true if everything happy. return false if you want to reconnect.
+ */
+ // One replication pass against this source: refuse to sync from self,
+ // connect/authenticate (arbitrating if paired and unreachable), negotiate
+ // pair state, then delegate to sync_pullOpLog().
+ bool ReplSource::sync(int& nApplied) {
+ ReplInfo r("sync");
+ if ( !cmdLine.quiet )
+ log() << "repl: " << sourceName() << '@' << hostName << endl;
+ nClonedThisPass = 0;
+
+ // FIXME Handle cases where this db isn't on default port, or default port is spec'd in hostName.
+ if ( (string("localhost") == hostName || string("127.0.0.1") == hostName) && cmdLine.port == CmdLine::DefaultDBPort ) {
+ log() << "repl: can't sync from self (localhost). sources configuration may be wrong." << endl;
+ sleepsecs(5);
+ return false;
+ }
+
+ if ( !connect() ) {
+ if ( replPair && paired ) {
+ // peer unreachable: let the arbiter decide who is master
+ assert( startsWith(hostName.c_str(), replPair->remoteHost.c_str()) );
+ replPair->arbitrate();
+ }
+ {
+ ReplInfo r("can't connect to sync source");
+ }
+ return false;
+ }
+
+ if ( paired ) {
+ int remote = replPair->negotiate(conn.get(), "direct");
+ int nMasters = ( remote == ReplPair::State_Master ) + ( replPair->state == ReplPair::State_Master );
+ if ( getInitialSyncCompleted() && nMasters != 1 ) {
+ // exactly one master is required to pull safely; wait for resolution
+ log() << ( nMasters == 0 ? "no master" : "two masters" ) << ", deferring oplog pull" << endl;
+ return true;
+ }
+ }
+
+ /*
+ // get current mtime at the server.
+ BSONObj o = conn->findOne("admin.$cmd", opTimeQuery);
+ BSONElement e = o.findElement("optime");
+ if( e.eoo() ) {
+ log() << "repl: failed to get cur optime from master" << endl;
+ log() << " " << o.toString() << endl;
+ return false;
+ }
+ uassert( 10124 , e.type() == Date );
+ OpTime serverCurTime;
+ serverCurTime.asDate() = e.date();
+ */
+ return sync_pullOpLog(nApplied);
+ }
+
+ /* -- Logging of operations -------------------------------------*/
+
+// cached copies of these...so don't rename them
+// (set lazily in _logOp() the first time the main oplog is written)
+ NamespaceDetails *localOplogMainDetails = 0;
+ Database *localOplogClient = 0;
+
+ /* Log an operation: to the master oplog (local.oplog.$main) when we are
+    master, and to any collection-level op log enabled via the logCollection
+    command.
+    opstr -- op type code ("i","u","d","c","n","db"); see _logOp() comments
+    patt  -- optional criteria object (logged as "o2")
+    b     -- optional bool flag (logged as "b")
+    (Fix: removed dead locals 'cl'/nsToDatabase() whose result was never
+    read -- confirm nsToDatabase carries no required side effect.) */
+ void logOp(const char *opstr, const char *ns, const BSONObj& obj, BSONObj *patt, bool *b) {
+     if ( master ) {
+         _logOp(opstr, ns, "local.oplog.$main", obj, patt, b, OpTime::now());
+     }
+     NamespaceDetailsTransient &t = NamespaceDetailsTransient::get_w( ns );
+     if ( t.cllEnabled() ) {
+         try {
+             _logOp(opstr, ns, t.cllNS().c_str(), obj, patt, b, OpTime::now());
+         } catch ( const DBException & ) {
+             // collection-level logging is best effort; disable it on failure
+             t.cllInvalidate();
+         }
+     }
+ }
+
+ /* we write to local.oplog.$main:
+ { ts : ..., op: ..., ns: ..., o: ... }
+ ts: an OpTime timestamp
+ op:
+ "i" insert
+ "u" update
+ "d" delete
+ "c" db cmd
+ "db" declares presence of a database (ns is set to the db name + '.')
+ "n" no op
+ bb:
+ if not null, specifies a boolean to pass along to the other side as b: param.
+ used for "justOne" or "upsert" flags on 'd', 'u'
+ first: true
+ when set, indicates this is the first thing we have logged for this database.
+ thus, the slave does not need to copy down all the data when it sees this.
+ */
+ /* Low-level oplog append. Builds the entry in two pieces to avoid copying
+ the (possibly large) 'obj' buffer twice: first a partial BSON object with
+ everything except the "o" field, then the "o" field is spliced directly
+ into the memory-mapped record. Ops on the "local" database are never
+ logged. Caller must hold the write lock. */
+ void _logOp(const char *opstr, const char *ns, const char *logNS, const BSONObj& obj, BSONObj *o2, bool *bb, const OpTime &ts ) {
+ if ( strncmp(ns, "local.", 6) == 0 )
+ return;
+
+ DEV assertInWriteLock();
+
+ Client::Context context;
+
+ /* we jump through a bunch of hoops here to avoid copying the obj buffer twice --
+ instead we do a single copy to the destination position in the memory mapped file.
+ */
+
+ BSONObjBuilder b;
+ b.appendTimestamp("ts", ts.asDate());
+ b.append("op", opstr);
+ b.append("ns", ns);
+ if ( bb )
+ b.appendBool("b", *bb);
+ if ( o2 )
+ b.append("o2", *o2);
+ BSONObj partial = b.done();
+ int posz = partial.objsize();
+ // total record size: partial + "o" element header (type byte + "o\0") + obj
+ int len = posz + obj.objsize() + 1 + 2 /*o:*/;
+
+ Record *r;
+ if ( strncmp( logNS, "local.", 6 ) == 0 ) { // For now, assume this is olog main
+ if ( localOplogMainDetails == 0 ) {
+ // first oplog write: cache the database / namespace details
+ setClient("local.");
+ localOplogClient = cc().database();
+ localOplogMainDetails = nsdetails(logNS);
+ }
+ cc().setns("", localOplogClient); // database = localOplogClient;
+ r = theDataFileMgr.fast_oplog_insert(localOplogMainDetails, logNS, len);
+ } else {
+ setClient( logNS );
+ assert( nsdetails( logNS ) );
+ r = theDataFileMgr.fast_oplog_insert( nsdetails( logNS ), logNS, len);
+ }
+
+ char *p = r->data;
+ memcpy(p, partial.objdata(), posz);
+ // patch the total-length prefix to account for the appended "o" field
+ *((unsigned *)p) += obj.objsize() + 1 + 2;
+ // step back over partial's EOO terminator and write the "o" element in place
+ p += posz - 1;
+ *p++ = (char) Object;
+ *p++ = 'o';
+ *p++ = 0;
+ memcpy(p, obj.objdata(), obj.objsize());
+ p += obj.objsize();
+ *p = EOO;
+
+ if ( logLevel >= 6 ) {
+ BSONObj temp(r);
+ log( 6 ) << "logging op:" << temp << endl;
+ }
+ }
+
+ /* --------------------------------------------------------------*/
+
+ /*
+ TODO:
+ _ source has autoptr to the cursor
+ _ reuse that cursor when we can
+ */
+
+ /* returns: # of seconds to sleep before next pass
+ 0 = no sleep recommended
+ 1 = special sentinel indicating adaptive sleep recommended
+ */
+ // Run one sync pass over every configured source, translating each outcome
+ // (or exception) into a sleep recommendation for replMain().
+ int _replMain(ReplSource::SourceVector& sources, int& nApplied) {
+ {
+ ReplInfo r("replMain load sources");
+ dblock lk;
+ ReplSource::loadAll(sources);
+ }
+
+ if ( sources.empty() ) {
+ /* replication is not configured yet (for --slave) in local.sources. Poll for
+ config every 20 seconds.
+ */
+ return 20;
+ }
+
+ int sleepAdvice = 1;
+ for ( ReplSource::SourceVector::iterator i = sources.begin(); i != sources.end(); i++ ) {
+ ReplSource *s = i->get();
+ bool ok = false;
+ try {
+ ok = s->sync(nApplied);
+ bool moreToSync = s->haveMoreDbsToSync();
+ if( !ok ) {
+ sleepAdvice = 3;
+ }
+ else if( moreToSync ) {
+ // more databases queued for cloning: come straight back
+ sleepAdvice = 0;
+ }
+ if ( ok && !moreToSync /*&& !s->syncedTo.isNull()*/ ) {
+ pairSync->setInitialSyncCompletedLocking();
+ }
+ }
+ catch ( const SyncException& ) {
+ log() << "caught SyncException" << endl;
+ return 10;
+ }
+ catch ( AssertionException& e ) {
+ if ( e.severe() ) {
+ log() << "replMain AssertionException " << e.what() << endl;
+ return 60;
+ }
+ else {
+ log() << "repl: AssertionException " << e.what() << '\n';
+ }
+ replInfo = "replMain caught AssertionException";
+ }
+ catch ( const DBException& e ) {
+ log() << "repl: DBException " << e.what() << endl;
+ replInfo = "replMain caught DBException";
+ }
+ catch ( const std::exception &e ) {
+ log() << "repl: std::exception " << e.what() << endl;
+ replInfo = "replMain caught std::exception";
+ }
+ catch ( ... ) {
+ log() << "unexpected exception during replication. replication will halt" << endl;
+ replAllDead = "caught unexpected exception during replication";
+ }
+ if ( !ok )
+ s->resetConnection();
+ }
+ return sleepAdvice;
+ }
+
+ /* Slave replication driver loop: repeatedly run _replMain() passes,
+ maintaining the global 'syncing' count under the db lock and sleeping
+ between passes per _replMain()'s advice. Exits only when replication is
+ declared dead (replAllDead) and auto-resync is off or fails. */
+ void replMain() {
+ ReplSource::SourceVector sources;
+ while ( 1 ) {
+ int s = 0;
+ {
+ dblock lk;
+ if ( replAllDead ) {
+ if ( !autoresync || !ReplSource::throttledForceResyncDead( "auto" ) )
+ break;
+ }
+ assert( syncing == 0 );
+ syncing++;
+ }
+ try {
+ int nApplied = 0;
+ s = _replMain(sources, nApplied);
+ if( s == 1 ) {
+ // adaptive sleep: idle pass -> 2s, busy pass -> barely sleep
+ if( nApplied == 0 ) s = 2;
+ else if( nApplied > 100 ) {
+ // sleep very little - just enough that we aren't truly hammering master
+ sleepmillis(75);
+ s = 0;
+ }
+ }
+ } catch (...) {
+ out() << "caught exception in _replMain" << endl;
+ s = 4;
+ }
+ {
+ dblock lk;
+ assert( syncing == 1 );
+ syncing--;
+ }
+ if ( s ) {
+ stringstream ss;
+ ss << "repl: sleep " << s << "sec before next pass";
+ string msg = ss.str();
+ log() << msg << endl;
+ ReplInfo r(msg.c_str());
+ sleepsecs(s);
+ }
+ }
+ }
+
+ int debug_stop_repl = 0;
+
+ /* Entry point for the slave replication thread: initialize the thread's
+ client with admin authorization, detect a replace-peer startup marker,
+ then loop forever on replMain(), backing off 5 minutes on assertions. */
+ void replSlaveThread() {
+ sleepsecs(1);
+
+ {
+ dblock lk;
+
+ Client::initThread("replslave");
+ currentClient.get()->ai->authorize("admin");
+
+ BSONObj obj;
+ if ( Helpers::getSingleton("local.pair.startup", obj) ) {
+ // should be: {replacepeer:1}
+ replacePeer = true;
+ pairSync->setInitialSyncCompleted(); // we are the half that has all the data
+ }
+ }
+
+ while ( 1 ) {
+ try {
+ replMain();
+ if ( debug_stop_repl )
+ break;
+ sleepsecs(5);
+ }
+ catch ( AssertionException& ) {
+ ReplInfo r("Assertion in replSlaveThread(): sleeping 5 minutes before retry");
+ problem() << "Assertion in replSlaveThread(): sleeping 5 minutes before retry" << endl;
+ sleepsecs(300);
+ }
+ }
+ }
+
+ /* Debug helper thread: print the db mutex lock state ten times a second,
+    forever.  See startReplication() for its (commented-out) use. */
+ void tempThread() {
+     for ( ;; ) {
+         out() << dbMutex.info().isLocked() << endl;
+         sleepmillis(100);
+     }
+ }
+
+ /* Create the master oplog (local.oplog.$main) as a capped collection if it
+ does not already exist. Size comes from --oplogSize when given; otherwise
+ defaults to 50MB (990MB on 64-bit, or 5% of free disk if larger). */
+ void createOplog() {
+ dblock lk;
+
+ const char * ns = "local.oplog.$main";
+ setClient(ns);
+
+ if ( nsdetails( ns ) )
+ return;
+
+ /* create an oplog collection, if it doesn't yet exist. */
+ BSONObjBuilder b;
+ double sz;
+ if ( cmdLine.oplogSize != 0 )
+ sz = (double)cmdLine.oplogSize;
+ else {
+ sz = 50.0 * 1000 * 1000;
+ if ( sizeof(int *) >= 8 ) {
+ sz = 990.0 * 1000 * 1000;
+ boost::intmax_t free = freeSpace(); //-1 if call not supported.
+ double fivePct = free * 0.05;
+ if ( fivePct > sz )
+ sz = fivePct;
+ }
+ }
+
+ log() << "******\n";
+ log() << "creating replication oplog of size: " << (int)( sz / ( 1024 * 1024 ) ) << "MB (use --oplogSize to change)\n";
+ log() << "******" << endl;
+
+ b.append("size", sz);
+ b.appendBool("capped", 1);
+ b.appendBool("autoIndexId", false);
+
+ string err;
+ BSONObj o = b.done();
+ userCreateNS(ns, o, err, false);
+ // seed the log with a no-op so it is never empty
+ logOp( "n", "dummy", BSONObj() );
+ cc().clearns();
+ }
+
+ /* Start replication machinery at server startup: spawn the slave thread
+ when running as slave or in a repl pair, and create the oplog when running
+ as master or in a repl pair. No-op when replication is not configured. */
+ void startReplication() {
+ /* this was just to see if anything locks for longer than it should -- we need to be careful
+ not to be locked when trying to connect() or query() the other side.
+ */
+ //boost::thread tempt(tempThread);
+
+ if ( !slave && !master && !replPair )
+ return;
+
+ {
+ dblock lk;
+ pairSync->init();
+ }
+
+ if ( slave || replPair ) {
+ if ( slave ) {
+ assert( slave == SimpleSlave );
+ log(1) << "slave=true" << endl;
+ }
+ else
+ slave = ReplPairSlave;
+ boost::thread repl_thread(replSlaveThread);
+ }
+
+ if ( master || replPair ) {
+ if ( master )
+ log(1) << "master=true" << endl;
+ // repl pair members also act as master (maintain an oplog)
+ master = true;
+ createOplog();
+ }
+ }
+
+ /* called from main at server startup */
+ // Configure repl pairing with peer 'remoteEnd', arbitrated by 'arb'.
+ // The ReplPair object lives for the process lifetime and is never freed.
+ void pairWith(const char *remoteEnd, const char *arb) {
+ replPair = new ReplPair(remoteEnd, arb);
+ }
+
+ /* "logCollection" command: start or validate a per-collection operation log
+ (see NamespaceDetailsTransient cll* methods). start:1 begins logging for
+ the named collection (optionally sized via logSizeMb); validateComplete:1
+ checks that the existing log has not overflowed. */
+ class CmdLogCollection : public Command {
+ public:
+ virtual bool slaveOk() {
+ return false;
+ }
+ CmdLogCollection() : Command( "logCollection" ) {}
+ virtual void help( stringstream &help ) const {
+ help << "examples: { logCollection: <collection ns>, start: 1 }, "
+ << "{ logCollection: <collection ns>, validateComplete: 1 }";
+ }
+ virtual bool run(const char *ns, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ string logCollection = cmdObj.getStringField( "logCollection" );
+ if ( logCollection.empty() ) {
+ errmsg = "missing logCollection spec";
+ return false;
+ }
+ // exactly one of start / validateComplete must be present
+ bool start = !cmdObj.getField( "start" ).eoo();
+ bool validateComplete = !cmdObj.getField( "validateComplete" ).eoo();
+ if ( start ? validateComplete : !validateComplete ) {
+ errmsg = "Must specify exactly one of start:1 or validateComplete:1";
+ return false;
+ }
+ // getIntField returns INT_MIN when the field is absent
+ int logSizeMb = cmdObj.getIntField( "logSizeMb" );
+ NamespaceDetailsTransient &t = NamespaceDetailsTransient::get_w( logCollection.c_str() );
+ if ( start ) {
+ if ( t.cllNS().empty() ) {
+ if ( logSizeMb == INT_MIN ) {
+ t.cllStart();
+ } else {
+ t.cllStart( logSizeMb );
+ }
+ } else {
+ errmsg = "Log already started for ns: " + logCollection;
+ return false;
+ }
+ } else {
+ if ( t.cllNS().empty() ) {
+ errmsg = "No log to validateComplete for ns: " + logCollection;
+ return false;
+ } else {
+ if ( !t.cllValidateComplete() ) {
+ errmsg = "Oplog failure, insufficient space allocated";
+ return false;
+ }
+ }
+ }
+ log() << "started logCollection with cmd obj: " << cmdObj << endl;
+ return true;
+ }
+ } cmdlogcollection;
+
+} // namespace mongo
diff --git a/db/repl.h b/db/repl.h
new file mode 100644
index 0000000..a4c1737
--- /dev/null
+++ b/db/repl.h
@@ -0,0 +1,315 @@
+// repl.h - replication
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/* replication data overview
+
+ at the slave:
+ local.sources { host: ..., source: ..., only: ..., syncedTo: ..., localLogTs: ..., dbsNextPass: { ... }, incompleteCloneDbs: { ... } }
+
+ at the master:
+ local.oplog.$<source>
+ local.oplog.$main is the default
+*/
+
+#pragma once
+
+#include "pdfile.h"
+#include "db.h"
+#include "dbhelpers.h"
+#include "query.h"
+
+#include "../client/dbclient.h"
+
+#include "../util/optime.h"
+
+namespace mongo {
+
+ class DBClientConnection;
+ class DBClientCursor;
+
+ /* replication slave? (possibly with slave or repl pair nonmaster)
+ --slave cmd line setting -> SimpleSlave
+ */
+ typedef enum { NotSlave=0, SimpleSlave, ReplPairSlave } SlaveTypes;
+ extern SlaveTypes slave;
+
+ /* true means we are master and doing replication. if we are not writing to oplog (no --master or repl pairing),
+ this won't be true.
+ */
+ extern bool master;
+
+ extern int opIdMem;
+
+ bool cloneFrom(const char *masterHost, string& errmsg, const string& fromdb, bool logForReplication,
+ bool slaveOk, bool useReplAuth, bool snapshot);
+
+ /* A replication exception; surfaces to callers as a DBException with code 10001. */
+ class SyncException : public DBException {
+ public:
+ virtual const char* what() const throw() { return "sync exception"; }
+ virtual int getCode(){ return 10001; }
+ };
+
+ /* A Source is a source from which we can pull (replicate) data.
+ stored in collection local.sources.
+
+ Can be a group of things to replicate for several databases.
+
+ { host: ..., source: ..., only: ..., syncedTo: ..., localLogTs: ..., dbsNextPass: { ... }, incompleteCloneDbs: { ... } }
+
+ 'source' defaults to 'main'; support for multiple source names is
+ not done (always use main for now).
+ */
+ class ReplSource {
+ bool resync(string db);
+
+ /* pull some operations from the master's oplog, and apply them. */
+ bool sync_pullOpLog(int& nApplied);
+
+ void sync_pullOpLog_applyOperation(BSONObj& op, OpTime *localLogTail);
+
+ // connection/cursor to the master; dropped together by resetConnection()
+ auto_ptr<DBClientConnection> conn;
+ auto_ptr<DBClientCursor> cursor;
+
+ /* we only clone one database per pass, even if many need to be done. This helps us
+ avoid overflowing the master's transaction log by doing too much work before going
+ back to read more transactions. (Imagine a scenario of slave startup where we try to
+ clone 100 databases in one pass.)
+ */
+ set<string> addDbNextPass;
+
+ // dbs whose initial clone did not finish and must be retried
+ set<string> incompleteCloneDbs;
+
+ ReplSource();
+
+ // returns the dummy ns used to do the drop
+ string resyncDrop( const char *db, const char *requester );
+ // returns true if connected on return
+ bool connect();
+ // returns possibly unowned id spec for the operation.
+ static BSONObj idForOp( const BSONObj &op, bool &mod );
+ static void updateSetsWithOp( const BSONObj &op, bool mayUpdateStorage );
+ // call without the db mutex
+ void syncToTailOfRemoteLog();
+ // call with the db mutex
+ OpTime nextLastSavedLocalTs() const;
+ void setLastSavedLocalTs( const OpTime &nextLocalTs );
+ // call without the db mutex
+ void resetSlave();
+ // call with the db mutex
+ // returns false if the slave has been reset
+ bool updateSetsWithLocalOps( OpTime &localLogTail, bool mayUnlock );
+ // namespace of the master's oplog we tail, e.g. "local.oplog.$main"
+ string ns() const { return string( "local.oplog.$" ) + sourceName(); }
+
+ public:
+ static void applyOperation(const BSONObj& op);
+ bool replacing; // in "replace mode" -- see CmdReplacePeer
+ bool paired; // --pair in use
+ string hostName; // ip addr or hostname plus optionally, ":<port>"
+ string _sourceName; // a logical source name.
+ // logical source name; defaults to "main" when _sourceName is unset
+ string sourceName() const {
+ return _sourceName.empty() ? "main" : _sourceName;
+ }
+ string only; // only a certain db. note that in the sources collection, this may not be changed once you start replicating.
+
+ /* the last time point we have already synced up to (in the remote/master's oplog). */
+ OpTime syncedTo;
+
+ /* This is for repl pairs.
+ _lastSavedLocalTs is the most recent point in the local log that we know is consistent
+ with the remote log ( ie say the local op log has entries ABCDE and the remote op log
+ has ABCXY, then _lastSavedLocalTs won't be greater than C until we have reconciled
+ the DE-XY difference.)
+ */
+ OpTime _lastSavedLocalTs;
+
+ int nClonedThisPass;
+
+ typedef vector< shared_ptr< ReplSource > > SourceVector;
+ // load all sources from local.sources into the vector
+ static void loadAll(SourceVector&);
+ explicit ReplSource(BSONObj);
+ bool sync(int& nApplied);
+ void save(); // write ourself to local.sources
+ // drop the master connection and its open cursor; next use must reconnect
+ void resetConnection() {
+ cursor = auto_ptr<DBClientCursor>(0);
+ conn = auto_ptr<DBClientConnection>(0);
+ }
+
+ // make a jsobj from our member fields of the form
+ // { host: ..., source: ..., syncedTo: ... }
+ BSONObj jsobj();
+
+ // a source's identity is host + logical source name
+ bool operator==(const ReplSource&r) const {
+ return hostName == r.hostName && sourceName() == r.sourceName();
+ }
+ operator string() const { return sourceName() + "@" + hostName; }
+
+ bool haveMoreDbsToSync() const { return !addDbNextPass.empty(); }
+
+ static bool throttledForceResyncDead( const char *requester );
+ static void forceResyncDead( const char *requester );
+ void forceResync( const char *requester );
+ };
+
+ /* Write operation to the log (local.oplog.$main)
+ "i" insert
+ "u" update
+ "d" delete
+ "c" db cmd
+ "db" declares presence of a database (ns is set to the db name + '.')
+ */
+ void _logOp(const char *opstr, const char *ns, const char *logNs, const BSONObj& obj, BSONObj *patt, bool *b, const OpTime &ts);
+ void logOp(const char *opstr, const char *ns, const BSONObj& obj, BSONObj *patt = 0, bool *b = 0);
+
+ // class for managing a set of ids in memory
+ // Per-namespace sets of id objects; roughSize() estimates the memory
+ // footprint so IdTracker can decide when to spill to a db collection.
+ class MemIds {
+ public:
+ MemIds() : size_() {}
+ friend class IdTracker;
+ // NOTE(review): clears the sets but not size_, so roughSize() stays
+ // stale after reset -- confirm whether callers rely on this
+ void reset() { imp_.clear(); }
+ bool get( const char *ns, const BSONObj &id ) { return imp_[ ns ].count( id ); }
+ // insert (val=true) or erase (val=false) id for ns, keeping size_ in step
+ void set( const char *ns, const BSONObj &id, bool val ) {
+ if ( val ) {
+ if ( imp_[ ns ].insert( id.getOwned() ).second ) {
+ size_ += id.objsize() + sizeof( BSONObj );
+ }
+ } else {
+ if ( imp_[ ns ].erase( id ) == 1 ) {
+ size_ -= id.objsize() + sizeof( BSONObj );
+ }
+ }
+ }
+ // approximate bytes held by the stored ids
+ long long roughSize() const {
+ return size_;
+ }
+ private:
+ typedef map< string, BSONObjSetDefaultOrder > IdSets;
+ IdSets imp_;
+ long long size_; // running estimate maintained by set()
+ };
+
+ // class for managing a set of ids in a db collection
+ // All functions must be called with db mutex held
+ class DbIds {
+ public:
+ // `name` is the backing collection; the { ns: 1, id: 1 } pattern is
+ // handed to DbSet (presumably its key/index spec -- confirm in DbSet)
+ DbIds( const string & name ) : impl_( name, BSON( "ns" << 1 << "id" << 1 ) ) {}
+ void reset() {
+ impl_.reset();
+ }
+ bool get( const char *ns, const BSONObj &id ) {
+ return impl_.get( key( ns, id ) );
+ }
+ void set( const char *ns, const BSONObj &id, bool val ) {
+ impl_.set( key( ns, id ), val );
+ }
+ private:
+ // build the composite lookup key { ns: <ns>, id: <id value> }
+ static BSONObj key( const char *ns, const BSONObj &id ) {
+ BSONObjBuilder b;
+ b << "ns" << ns;
+ // rename _id to id since there may be duplicates
+ b.appendAs( id.firstElement(), "id" );
+ return b.obj();
+ }
+ DbSet impl_;
+ };
+
+ // class for tracking ids and mod ids, in memory or on disk
+ // All functions must be called with db mutex held
+ // Kind of sloppy class structure, for now just want to keep the in mem
+ // version speedy.
+ // Starts in memory; once the memory budget (maxMem_) is exceeded,
+ // mayUpgradeStorage() migrates everything to the db-backed sets.
+ // see http://www.mongodb.org/display/DOCS/Pairing+Internals
+ class IdTracker {
+ public:
+ IdTracker() :
+ dbIds_( "local.temp.replIds" ),
+ dbModIds_( "local.temp.replModIds" ),
+ inMem_( true ),
+ maxMem_( opIdMem ) {
+ }
+ // drop all tracked ids and return to the (fast) in-memory representation
+ void reset( int maxMem = opIdMem ) {
+ memIds_.reset();
+ memModIds_.reset();
+ dbIds_.reset();
+ dbModIds_.reset();
+ maxMem_ = maxMem;
+ inMem_ = true;
+ }
+ bool haveId( const char *ns, const BSONObj &id ) {
+ if ( inMem_ )
+ return get( memIds_, ns, id );
+ else
+ return get( dbIds_, ns, id );
+ }
+ bool haveModId( const char *ns, const BSONObj &id ) {
+ if ( inMem_ )
+ return get( memModIds_, ns, id );
+ else
+ return get( dbModIds_, ns, id );
+ }
+ void haveId( const char *ns, const BSONObj &id, bool val ) {
+ if ( inMem_ )
+ set( memIds_, ns, id, val );
+ else
+ set( dbIds_, ns, id, val );
+ }
+ void haveModId( const char *ns, const BSONObj &id, bool val ) {
+ if ( inMem_ )
+ set( memModIds_, ns, id, val );
+ else
+ set( dbModIds_, ns, id, val );
+ }
+ // will release the db mutex
+ void mayUpgradeStorage() {
+ // stay in memory while under the byte budget
+ if ( !inMem_ || memIds_.roughSize() + memModIds_.roughSize() <= maxMem_ )
+ return;
+ log() << "saving master modified id information to collection" << endl;
+ upgrade( memIds_, dbIds_ );
+ upgrade( memModIds_, dbModIds_ );
+ memIds_.reset();
+ memModIds_.reset();
+ inMem_ = false;
+ }
+ bool inMem() const { return inMem_; }
+ private:
+ template< class T >
+ bool get( T &ids, const char *ns, const BSONObj &id ) {
+ return ids.get( ns, id );
+ }
+ template< class T >
+ void set( T &ids, const char *ns, const BSONObj &id, bool val ) {
+ ids.set( ns, id, val );
+ }
+ // copy every id from the in-memory set `a` into the db-backed set `b`,
+ // occasionally yielding the db mutex (dbtemprelease) during the copy
+ void upgrade( MemIds &a, DbIds &b ) {
+ for( MemIds::IdSets::const_iterator i = a.imp_.begin(); i != a.imp_.end(); ++i ) {
+ for( BSONObjSetDefaultOrder::const_iterator j = i->second.begin(); j != i->second.end(); ++j ) {
+ set( b, i->first.c_str(), *j, true );
+ RARELY {
+ dbtemprelease t;
+ }
+ }
+ }
+ }
+ MemIds memIds_;
+ MemIds memModIds_;
+ DbIds dbIds_;
+ DbIds dbModIds_;
+ bool inMem_;
+ int maxMem_; // byte budget before spilling to disk
+ };
+
+} // namespace mongo
diff --git a/db/replset.h b/db/replset.h
new file mode 100644
index 0000000..98d80d6
--- /dev/null
+++ b/db/replset.h
@@ -0,0 +1,207 @@
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+#include "db.h"
+#include "dbhelpers.h"
+#include "json.h"
+#include "../client/dbclient.h"
+#include "repl.h"
+#include "cmdline.h"
+
+namespace mongo {
+
+ extern const char *replAllDead;
+
+ /* ReplPair is a pair of db servers replicating to one another and cooperating.
+
+ Only one member of the pair is active at a time; so this is a smart master/slave
+ configuration basically.
+
+ You may read from the slave at anytime though (if you don't mind the slight lag).
+
+ todo: Could be extended to be more than a pair, thus the name 'Set' -- for example,
+ a set of 3...
+ */
+
+ class ReplPair {
+ public:
+ // negotiation/role state; values < 0 mean we are not settled yet
+ enum ReplState {
+ State_CantArb = -3,
+ State_Confused = -2,
+ State_Negotiating = -1,
+ State_Slave = 0,
+ State_Master = 1
+ };
+
+ int state;
+ string info; // commentary about our current state
+ string arbHost; // "-" for no arbiter. "host[:port]"
+ int remotePort;
+ string remoteHost;
+ string remote; // host:port if port specified.
+// int date; // -1 not yet set; 0=slave; 1=master
+
+ // human-readable status dump (contains <b> markup, so presumably
+ // rendered by the web console -- confirm at call sites)
+ string getInfo() {
+ stringstream ss;
+ ss << " state: ";
+ if ( state == 1 ) ss << "1 State_Master ";
+ else if ( state == 0 ) ss << "0 State_Slave";
+ else
+ ss << "<b>" << state << "</b>";
+ ss << '\n';
+ ss << " info: " << info << '\n';
+ ss << " arbhost: " << arbHost << '\n';
+ ss << " remote: " << remoteHost << ':' << remotePort << '\n';
+// ss << " date: " << date << '\n';
+ return ss.str();
+ }
+
+ ReplPair(const char *remoteEnd, const char *arbiter);
+ virtual ~ReplPair() {}
+
+ // deterministic tie-break for who becomes master: higher hostname wins,
+ // higher port wins when both sides run on the same host
+ bool dominant(const string& myname) {
+ if ( myname == remoteHost )
+ return cmdLine.port > remotePort;
+ return myname > remoteHost;
+ }
+
+ // setMaster() with the db write lock held
+ void setMasterLocked( int n, const char *_comment = "" ) {
+ dblock p;
+ setMaster( n, _comment );
+ }
+
+ void setMaster(int n, const char *_comment = "");
+
+ /* negotiate with our peer who is master; returns state of peer */
+ int negotiate(DBClientConnection *conn, string method);
+
+ /* peer unreachable, try our arbitrator */
+ void arbitrate();
+
+ // factory hook; overridable for testing
+ virtual
+ DBClientConnection *newClientConnection() const {
+ return new DBClientConnection();
+ }
+ };
+
+ extern ReplPair *replPair;
+
+ /* note we always return true for the "local" namespace.
+
+ we should not allow most operations when not the master
+ also we report not master if we are "dead".
+
+ See also CmdIsMaster.
+
+ If 'client' is not specified, the current client is used.
+ */
+ // see the block comment above: true if writes to `client` (a db name,
+ // defaulting to the current client's db) are allowed here and now
+ inline bool isMaster( const char *client = 0 ) {
+ // a plain non-slave node accepts all writes
+ if( !slave )
+ return true;
+
+ if ( !client ) {
+ Database *database = cc().database();
+ assert( database );
+ client = database->name.c_str();
+ }
+
+ // when replication is dead only the "local" db remains writable
+ if ( replAllDead )
+ return strcmp( client, "local" ) == 0;
+
+ if ( replPair ) {
+ if( replPair->state == ReplPair::State_Master )
+ return true;
+ }
+ else {
+ if( master ) {
+ // if running with --master --slave, allow. note that master is also true
+ // for repl pairs so the check for replPair above is important.
+ return true;
+ }
+ }
+
+ // internal ("god") operations bypass the master check
+ if ( cc().isGod() )
+ return true;
+
+ return strcmp( client, "local" ) == 0;
+ }
+ // isMaster() keyed by a full namespace ("db.collection") rather than a db name
+ inline bool isMasterNs( const char *ns ) {
+ char cl[ 256 ];
+ nsToDatabase( ns, cl );
+ return isMaster( cl );
+ }
+
+ // parse "host[:port]" for the peer and record the arbiter host;
+ // starts out in State_Negotiating (-1)
+ inline ReplPair::ReplPair(const char *remoteEnd, const char *arb) {
+ state = -1;
+ remote = remoteEnd;
+ remotePort = CmdLine::DefaultDBPort;
+ remoteHost = remoteEnd;
+ const char *p = strchr(remoteEnd, ':');
+ if ( p ) {
+ remoteHost = string(remoteEnd, p-remoteEnd);
+ remotePort = atoi(p+1);
+ uassert( 10125 , "bad port #", remotePort > 0 && remotePort < 0x10000 );
+ if ( remotePort == CmdLine::DefaultDBPort )
+ remote = remoteHost; // don't include ":27017" as it is default; in case ran in diff ways over time to normalize the hostname format in sources collection
+ }
+
+ uassert( 10126 , "arbiter parm is missing, use '-' for none", arb);
+ arbHost = arb;
+ uassert( 10127 , "arbiter parm is empty", !arbHost.empty());
+ }
+
+ /* This is set to true if we have EVER been up to date -- this way a new pair member
+ which is a replacement won't go online as master until we have initially fully synced.
+ */
+ class PairSync {
+ // -1 before init(); 0 = not completed; 1 = completed
+ int initialsynccomplete;
+ public:
+ PairSync() {
+ initialsynccomplete = -1;
+ }
+
+ /* call before using the class. from dbmutex */
+ void init() {
+ BSONObj o;
+ initialsynccomplete = 0;
+ // the presence of a local.pair.sync singleton means we have synced before
+ if ( Helpers::getSingleton("local.pair.sync", o) )
+ initialsynccomplete = 1;
+ }
+
+ // NOTE(review): also true for the pre-init() value -1 -- callers
+ // presumably always init() first; confirm
+ bool initialSyncCompleted() {
+ return initialsynccomplete != 0;
+ }
+
+ // persist the completion marker (caller holds dbmutex)
+ void setInitialSyncCompleted() {
+ BSONObj o = fromjson("{\"initialsynccomplete\":1}");
+ Helpers::putSingleton("local.pair.sync", o);
+ initialsynccomplete = 1;
+ }
+
+ // as above but acquires the db lock itself; no-op if already completed
+ void setInitialSyncCompletedLocking() {
+ if ( initialsynccomplete == 1 )
+ return;
+ dblock lk;
+ BSONObj o = fromjson("{\"initialsynccomplete\":1}");
+ Helpers::putSingleton("local.pair.sync", o);
+ initialsynccomplete = 1;
+ }
+ };
+
+
+} // namespace mongo
diff --git a/db/resource.h b/db/resource.h
new file mode 100644
index 0000000..59b9f5c
--- /dev/null
+++ b/db/resource.h
@@ -0,0 +1,34 @@
+//{{NO_DEPENDENCIES}}
+// Microsoft Visual C++ generated include file.
+// Used by db.rc
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+namespace mongo {
+
+// Next default values for new objects
+//
+#ifdef APSTUDIO_INVOKED
+#ifndef APSTUDIO_READONLY_SYMBOLS
+#define _APS_NEXT_RESOURCE_VALUE 101
+#define _APS_NEXT_COMMAND_VALUE 40001
+#define _APS_NEXT_CONTROL_VALUE 1001
+#define _APS_NEXT_SYMED_VALUE 101
+#endif
+#endif
+
+} // namespace mongo
diff --git a/db/scanandorder.h b/db/scanandorder.h
new file mode 100644
index 0000000..3f41433
--- /dev/null
+++ b/db/scanandorder.h
@@ -0,0 +1,148 @@
+/* scanandorder.h
+ Order results (that aren't already indexed and in order.)
+*/
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+namespace mongo {
+
+ /* todo:
+ _ handle compound keys with differing directions. we don't handle this yet: neither here nor in indexes i think!!!
+ _ limit amount of data
+ */
+
+ /* see also IndexDetails::getKeysFromObject, which needs some merging with this. */
+
+ // wraps a sort pattern and extracts the corresponding key from documents
+ class KeyType : boost::noncopyable {
+ public:
+ BSONObj pattern; // e.g., { ts : -1 }
+ public:
+ KeyType(BSONObj _keyPattern) {
+ pattern = _keyPattern;
+ assert( !pattern.isEmpty() );
+ }
+
+ // returns the key value for o
+ BSONObj getKeyFromObject(BSONObj o) {
+ return o.extractFields(pattern);
+ }
+ };
+
+ /* todo:
+ _ respect limit
+ _ check for excess mem usage
+ _ response size limit from runquery; push it up a bit.
+ */
+
+ // append js to the reply buffer, applying the field projection `filter`
+ // if given; _id is always passed through regardless of the filter
+ inline void fillQueryResultFromObj(BufBuilder& bb, FieldMatcher *filter, BSONObj& js) {
+ if ( filter ) {
+ BSONObjBuilder b( bb );
+ BSONObjIterator i( js );
+ bool gotId = false;
+ while ( i.more() ){
+ BSONElement e = i.next();
+ const char * fname = e.fieldName();
+
+ if ( strcmp( fname , "_id" ) == 0 ){
+ b.append( e );
+ gotId = true;
+ } else {
+ filter->append( b , e );
+ }
+ }
+ b.done();
+ } else {
+ // no projection: copy the raw BSON bytes straight through
+ bb.append((void*) js.objdata(), js.objsize());
+ }
+ }
+
+ typedef multimap<BSONObj,BSONObj,BSONObjCmp> BestMap;
+ // in-memory top-N sort for queries with no usable index: keeps at most
+ // `limit` best documents ordered by the sort key, then streams them out
+ class ScanAndOrder {
+ BestMap best; // key -> full object
+ int startFrom;
+ int limit; // max to send back.
+ KeyType order;
+ unsigned approxSize; // running estimate of key bytes held, for the 1MB cap
+
+ void _add(BSONObj& k, BSONObj o) {
+ best.insert(make_pair(k,o));
+ }
+
+ // i points at the current worst kept entry; replace it if k sorts better
+ void _addIfBetter(BSONObj& k, BSONObj o, BestMap::iterator i) {
+ const BSONObj& worstBestKey = i->first;
+ int c = worstBestKey.woCompare(k, order.pattern);
+ if ( c > 0 ) {
+ // k is better, 'upgrade'
+ best.erase(i);
+ _add(k, o);
+ }
+ }
+
+ public:
+ ScanAndOrder(int _startFrom, int _limit, BSONObj _order) :
+ best( BSONObjCmp( _order ) ),
+ startFrom(_startFrom), order(_order) {
+ // keep skip+limit entries; non-positive limit means "unbounded"
+ limit = _limit > 0 ? _limit + startFrom : 0x7fffffff;
+ approxSize = 0;
+ }
+
+ int size() const {
+ return best.size();
+ }
+
+ // consider o for the result set
+ void add(BSONObj o) {
+ BSONObj k = order.getKeyFromObject(o);
+ if ( (int) best.size() < limit ) {
+ approxSize += k.objsize();
+ uassert( 10128 , "too much key data for sort() with no index. add an index or specify a smaller limit", approxSize < 1 * 1024 * 1024 );
+ _add(k, o);
+ return;
+ }
+ // full: compare against the current worst entry (last in the map)
+ BestMap::iterator i;
+ assert( best.end() != best.begin() );
+ i = best.end();
+ i--;
+ _addIfBetter(k, o, i);
+ }
+
+ // emit [begin, end) into b, honoring startFrom (skip) and limit
+ void _fill(BufBuilder& b, FieldMatcher *filter, int& nout, BestMap::iterator begin, BestMap::iterator end) {
+ int n = 0;
+ int nFilled = 0;
+ for ( BestMap::iterator i = begin; i != end; i++ ) {
+ n++;
+ if ( n <= startFrom )
+ continue;
+ BSONObj& o = i->second;
+ fillQueryResultFromObj(b, filter, o);
+ nFilled++;
+ if ( nFilled >= limit )
+ break;
+ uassert( 10129 , "too much data for sort() with no index", b.len() < 4000000 ); // appserver limit
+ }
+ nout = nFilled;
+ }
+
+ /* scanning complete. stick the query result in b for n objects. */
+ void fill(BufBuilder& b, FieldMatcher *filter, int& nout) {
+ _fill(b, filter, nout, best.begin(), best.end());
+ }
+
+ };
+
+} // namespace mongo
diff --git a/db/security.cpp b/db/security.cpp
new file mode 100644
index 0000000..747b04a
--- /dev/null
+++ b/db/security.cpp
@@ -0,0 +1,32 @@
+// security.cpp
+
+/**
+ * Copyright (C) 2009 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "stdafx.h"
+#include "security.h"
+#include "instance.h"
+#include "client.h"
+#include "curop.h"
+
+namespace mongo {
+
+ bool noauth = true;
+
+ int AuthenticationInfo::warned = 0;
+
+} // namespace mongo
+
diff --git a/db/security.h b/db/security.h
new file mode 100644
index 0000000..f61d5e1
--- /dev/null
+++ b/db/security.h
@@ -0,0 +1,77 @@
+// security.h
+
+/**
+* Copyright (C) 2009 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+#include <boost/thread/tss.hpp>
+#undef assert
+#define assert xassert
+
+#include "db.h"
+#include "dbhelpers.h"
+#include "nonce.h"
+
+namespace mongo {
+
+ // --noauth cmd line option
+ extern bool noauth;
+
+ /* auth level for a particular db; 0 = none, 2 = authorized (see AuthenticationInfo) */
+ struct Auth {
+ Auth() { level = 0; }
+ int level;
+ };
+
+ // per-connection authentication state: which dbs this client has
+ // authenticated against (level 2). Mutators require the write lock.
+ class AuthenticationInfo : boost::noncopyable {
+ map<string, Auth> m; // dbname -> auth
+ static int warned; // one-shot flag for the localhost warning below
+ public:
+ bool isLocalHost;
+ AuthenticationInfo() { isLocalHost = false; }
+ virtual ~AuthenticationInfo() {
+ }
+ void logout(const char *dbname) {
+ assertInWriteLock();
+ m.erase(dbname);
+ }
+ void authorize(const char *dbname) {
+ assertInWriteLock();
+ m[dbname].level = 2;
+ }
+ // true if this connection may act on dbname: directly authorized,
+ // auth disabled, admin/local authorized, or the localhost exception
+ // (note: operator[] default-inserts level-0 entries on these lookups)
+ virtual bool isAuthorized(const char *dbname) {
+ if( m[dbname].level == 2 ) return true;
+ if( noauth ) return true;
+ if( m["admin"].level == 2 ) return true;
+ if( m["local"].level == 2 ) return true;
+ if( isLocalHost ) {
+ readlock l("");
+ Client::Context c("admin.system.users");
+ BSONObj result;
+ // once any admin user exists, localhost gets no free pass
+ if( Helpers::getSingleton("admin.system.users", result) )
+ return false;
+ if( warned == 0 ) {
+ warned++;
+ log() << "warning: no users configured in admin.system.users, allowing localhost access" << endl;
+ }
+ return true;
+ }
+ return false;
+ }
+ };
+
+} // namespace mongo
diff --git a/db/security_commands.cpp b/db/security_commands.cpp
new file mode 100644
index 0000000..9d63744
--- /dev/null
+++ b/db/security_commands.cpp
@@ -0,0 +1,160 @@
+// security_commands.cpp
+// security.cpp links with both dbgrid and db. this file db only -- at least for now.
+
+// security.cpp
+
+#include "stdafx.h"
+#include "security.h"
+#include "../util/md5.hpp"
+#include "json.h"
+#include "pdfile.h"
+#include "db.h"
+#include "dbhelpers.h"
+#include "commands.h"
+#include "jsobj.h"
+#include "client.h"
+
+namespace mongo {
+
+/* authentication
+
+ system.users contains
+ { user : <username>, pwd : <pwd_digest>, ... }
+
+ getnonce sends nonce to client
+
+ client then sends { authenticate:1, nonce:<nonce_str>, user:<username>, key:<key> }
+
+ where <key> is md5(<nonce_str><username><pwd_digest_str>) as a string
+*/
+
+ boost::thread_specific_ptr<nonce> lastNonce;
+
+ // "getnonce": hands the client a fresh nonce (hex string) and remembers it
+ // in thread-local lastNonce for the subsequent authenticate call
+ class CmdGetNonce : public Command {
+ public:
+ virtual bool requiresAuth() { return false; }
+ virtual bool logTheOp() {
+ return false;
+ }
+ virtual bool slaveOk() {
+ return true;
+ }
+ CmdGetNonce() : Command("getnonce") {}
+ bool run(const char *ns, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ nonce *n = new nonce(security.getNonce());
+ stringstream ss;
+ ss << hex << *n;
+ result.append("nonce", ss.str() );
+ // ownership passes to the thread-specific ptr
+ lastNonce.reset(n);
+ return true;
+ }
+ } cmdGetNonce;
+
+ // "logout": drops this connection's authorization for the current db
+ class CmdLogout : public Command {
+ public:
+ virtual bool logTheOp() {
+ return false;
+ }
+ virtual bool slaveOk() {
+ return true;
+ }
+ CmdLogout() : Command("logout") {}
+ bool run(const char *ns, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ // database->name is the one we are logging out...
+ Client& client = cc();
+ AuthenticationInfo *ai = client.ai;
+ ai->logout(client.database()->name.c_str());
+ return true;
+ }
+ } cmdLogout;
+
+ // "authenticate": verifies key == md5(<nonce><user><pwd_digest>) against
+ // <db>.system.users (see the protocol comment at the top of this file)
+ class CmdAuthenticate : public Command {
+ public:
+ virtual bool requiresAuth() { return false; }
+ virtual bool logTheOp() {
+ return false;
+ }
+ virtual bool slaveOk() {
+ return true;
+ }
+ CmdAuthenticate() : Command("authenticate") {}
+ bool run(const char *ns, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ log(1) << " authenticate: " << cmdObj << endl;
+
+ string user = cmdObj.getStringField("user");
+ string key = cmdObj.getStringField("key");
+ string received_nonce = cmdObj.getStringField("nonce");
+
+ if( user.empty() || key.empty() || received_nonce.empty() ) {
+ log() << "field missing/wrong type in received authenticate command "
+ << cc().database()->name
+ << '\n';
+ errmsg = "auth fails";
+ sleepmillis(10); // small delay to throttle brute forcing
+ return false;
+ }
+
+ // the nonce hex goes into digestBuilder first; user+pwd are appended
+ // below so the same stream builds the expected-key digest input
+ stringstream digestBuilder;
+
+ {
+ bool reject = false;
+ // consume the nonce: each getnonce is good for one attempt
+ nonce *ln = lastNonce.release();
+ if ( ln == 0 ) {
+ reject = true;
+ } else {
+ digestBuilder << hex << *ln;
+ reject = digestBuilder.str() != received_nonce;
+ }
+
+ if ( reject ) {
+ log() << "auth: bad nonce received or getnonce not called. could be a driver bug or a security attack. db:" << cc().database()->name << '\n';
+ errmsg = "auth fails";
+ sleepmillis(30);
+ return false;
+ }
+ }
+
+ static BSONObj userPattern = fromjson("{\"user\":1}");
+ string systemUsers = cc().database()->name + ".system.users";
+ OCCASIONALLY Helpers::ensureIndex(systemUsers.c_str(), userPattern, false, "user_1");
+
+ // look up the user document { user: <name>, pwd: <digest>, ... }
+ BSONObj userObj;
+ {
+ BSONObjBuilder b;
+ b << "user" << user;
+ BSONObj query = b.done();
+ if( !Helpers::findOne(systemUsers.c_str(), query, userObj) ) {
+ log() << "auth: couldn't find user " << user << ", " << systemUsers << '\n';
+ errmsg = "auth fails";
+ return false;
+ }
+ }
+
+ // expected key = md5( <nonce_hex><user><pwd_digest> )
+ md5digest d;
+ {
+
+ string pwd = userObj.getStringField("pwd");
+ digestBuilder << user << pwd;
+ string done = digestBuilder.str();
+
+ md5_state_t st;
+ md5_init(&st);
+ md5_append(&st, (const md5_byte_t *) done.c_str(), done.size());
+ md5_finish(&st, d);
+ }
+
+ string computed = digestToString( d );
+
+ if ( key != computed ){
+ log() << "auth: key mismatch " << user << ", ns:" << ns << '\n';
+ errmsg = "auth fails";
+ return false;
+ }
+
+ // success: mark this connection authorized for the current db
+ AuthenticationInfo *ai = currentClient.get()->ai;
+ ai->authorize(cc().database()->name.c_str());
+ return true;
+ }
+ } cmdAuthenticate;
+
+} // namespace mongo
diff --git a/db/storage.cpp b/db/storage.cpp
new file mode 100644
index 0000000..4da2d82
--- /dev/null
+++ b/db/storage.cpp
@@ -0,0 +1,61 @@
+// storage.cpp
+
+#include "stdafx.h"
+#include "pdfile.h"
+#include "reccache.h"
+#include "rec.h"
+#include "db.h"
+
+namespace mongo {
+
+void writerThread();
+
+#if defined(_RECSTORE)
+ static int inited;
+#endif
+
+// pick your store for indexes by setting this typedef
+// this doesn't need to be an ifdef, we can make it dynamic
+#if defined(_RECSTORE)
+RecStoreInterface *btreeStore = new CachedBasicRecStore();
+#else
+RecStoreInterface *btreeStore = new MongoMemMapped_RecStore();
+#endif
+
+// open (or create) the fixed-record-size store file `fn`, validating its
+// header; marks the file "open" (cleanShutdown=2) until a clean close
+void BasicRecStore::init(const char *fn, unsigned recsize)
+{
+ massert( 10394 , "compile packing problem recstore?", sizeof(RecStoreHeader) == 8192);
+ filename = fn;
+ f.open(fn);
+ uassert( 10130 , string("couldn't open file:")+fn, f.is_open() );
+ len = f.len();
+ if( len == 0 ) {
+ // brand new file: write a fresh header
+ log() << "creating recstore file " << fn << '\n';
+ h.recsize = recsize;
+ len = sizeof(RecStoreHeader);
+ f.write(0, (const char *) &h, sizeof(RecStoreHeader));
+ }
+ else {
+ f.read(0, (char *) &h, sizeof(RecStoreHeader));
+ massert( 10395 , string("recstore was not closed cleanly: ")+fn, h.cleanShutdown==0);
+ massert( 10396 , string("recstore recsize mismatch, file:")+fn, h.recsize == recsize);
+ massert( 10397 , string("bad recstore [1], file:")+fn, (h.leof-sizeof(RecStoreHeader)) % recsize == 0);
+ if( h.leof > len ) {
+ stringstream ss;
+ ss << "bad recstore, file:" << fn << " leof:" << h.leof << " len:" << len;
+ massert( 10398 , ss.str(), false);
+ }
+ // NOTE(review): unreachable -- massert 10395 above already guarantees
+ // h.cleanShutdown==0 at this point, so this warning can never fire
+ if( h.cleanShutdown )
+ log() << "warning: non-clean shutdown for file " << fn << '\n';
+ h.cleanShutdown = 2;
+ writeHeader();
+ f.fsync();
+ }
+#if defined(_RECSTORE)
+ if( inited++ == 0 ) {
+ boost::thread t(writerThread);
+ }
+#endif
+}
+
+}
diff --git a/db/storage.h b/db/storage.h
new file mode 100644
index 0000000..cc29e60
--- /dev/null
+++ b/db/storage.h
@@ -0,0 +1,155 @@
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/* storage.h
+
+ Storage subsystem management.
+ Lays out our datafiles on disk, manages disk space.
+*/
+
+#pragma once
+
+namespace mongo {
+
+#pragma pack(1)
+
+ class Record;
+ class DeletedRecord;
+ class Extent;
+ class BtreeBucket;
+ class BSONObj;
+ class MongoDataFile;
+
+ // on-disk location of a record: (data file number, byte offset within it).
+ // Packed (#pragma pack(1) above) -- this layout is persisted to disk.
+ class DiskLoc {
+ int fileNo; /* this will be volume, file #, etc. */
+ int ofs;
+ public:
+ // Note: MaxFiles imposes a limit of about 32TB of data per process
+ enum SentinelValues { MaxFiles=16000, NullOfs = -1 };
+
+ // file number accessor
+ int a() const {
+ return fileNo;
+ }
+
+ DiskLoc(int a, int b) : fileNo(a), ofs(b) {
+ //assert(ofs!=0);
+ }
+ DiskLoc() { Null(); }
+ DiskLoc(const DiskLoc& l) {
+ fileNo=l.fileNo;
+ ofs=l.ofs;
+ }
+
+ // sanity check: true if the values look corrupt
+ bool questionable() {
+ return ofs < -1 ||
+ fileNo < -1 ||
+ fileNo > 524288;
+ }
+
+ // fileNo == -1 is the null sentinel
+ bool isNull() const {
+ return fileNo == -1;
+ // return ofs == NullOfs;
+ }
+ void Null() {
+ fileNo = -1;
+ ofs = 0;
+ }
+ void assertOk() {
+ assert(!isNull());
+ }
+ // fileNo == -2 marks an explicitly-invalidated loc (distinct from null)
+ void setInvalid() {
+ fileNo = -2;
+ ofs = 0;
+ }
+ bool isValid() const {
+ return fileNo != -2;
+ }
+
+ string toString() const {
+ if ( isNull() )
+ return "null";
+ stringstream ss;
+ ss << hex << fileNo << ':' << ofs;
+ return ss.str();
+ }
+ operator string() const { return toString(); }
+
+ // mutable reference to the offset (callers patch it in place)
+ int& GETOFS() {
+ return ofs;
+ }
+ int getOfs() const {
+ return ofs;
+ }
+ void set(int a, int b) {
+ fileNo=a;
+ ofs=b;
+ }
+ void setOfs(int _fileNo, int _ofs) {
+ fileNo = _fileNo;
+ ofs = _ofs;
+ }
+
+ // advance the offset within the same file
+ void inc(int amt) {
+ assert( !isNull() );
+ ofs += amt;
+ }
+
+ bool sameFile(DiskLoc b) {
+ return fileNo == b.fileNo;
+ }
+
+ bool operator==(const DiskLoc& b) const {
+ return fileNo==b.fileNo && ofs == b.ofs;
+ }
+ bool operator!=(const DiskLoc& b) const {
+ return !(*this==b);
+ }
+ const DiskLoc& operator=(const DiskLoc& b) {
+ fileNo=b.fileNo;
+ ofs = b.ofs;
+ //assert(ofs!=0);
+ return *this;
+ }
+ // ordering: by file number, then by offset
+ int compare(const DiskLoc& b) const {
+ int x = fileNo - b.fileNo;
+ if ( x )
+ return x;
+ return ofs - b.ofs;
+ }
+ bool operator<(const DiskLoc& b) const {
+ return compare(b) < 0;
+ }
+
+ /* get the "thing" associated with this disk location.
+ it is assumed the object is what it is -- you must asure that:
+ think of this as an unchecked type cast.
+ */
+ BSONObj obj() const;
+ Record* rec() const;
+ DeletedRecord* drec() const;
+ Extent* ext() const;
+ BtreeBucket* btree() const;
+ BtreeBucket* btreemod() const; // marks modified / dirty
+
+ MongoDataFile& pdf() const;
+ };
+
+#pragma pack()
+
+ const DiskLoc minDiskLoc(0, 1);
+ const DiskLoc maxDiskLoc(0x7fffffff, 0x7fffffff);
+
+} // namespace mongo
diff --git a/db/tests.cpp b/db/tests.cpp
new file mode 100644
index 0000000..81cc363
--- /dev/null
+++ b/db/tests.cpp
@@ -0,0 +1,68 @@
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/* tests.cpp
+
+ unit test & such
+*/
+
+#include "stdafx.h"
+#include "../util/mmap.h"
+
+namespace mongo {
+
+ int test2_old9() {
+ out() << "test2" << endl;
+ printStackTrace();
+ if ( 1 )
+ return 1;
+
+ MemoryMappedFile f;
+
+ long len = 64*1024*1024;
+ char *p = (char *) f.map("/tmp/test.dat", len);
+ char *start = p;
+ char *end = p + 64*1024*1024-2;
+ end[1] = 'z';
+ int i;
+ while ( p < end ) {
+ *p++ = ' ';
+ if ( ++i%64 == 0 ) {
+ *p++ = '\n';
+ *p++ = 'x';
+ }
+ }
+ *p = 'a';
+
+ f.flush(true);
+ out() << "done" << endl;
+
+ char *x = start + 32 * 1024 * 1024;
+ char *y = start + 48 * 1024 * 1024;
+ char *z = start + 62 * 1024 * 1024;
+
+ strcpy(z, "zfoo");
+ out() << "y" << endl;
+ strcpy(y, "yfoo");
+ strcpy(x, "xfoo");
+ strcpy(start, "xfoo");
+
+ dbexit( EXIT_TEST );
+
+ return 1;
+ }
+
+} // namespace mongo
diff --git a/db/update.cpp b/db/update.cpp
new file mode 100644
index 0000000..0639a99
--- /dev/null
+++ b/db/update.cpp
@@ -0,0 +1,736 @@
+// update.cpp
+
+/**
+ * Copyright (C) 2008 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "stdafx.h"
+#include "query.h"
+#include "pdfile.h"
+#include "jsobjmanipulator.h"
+#include "queryoptimizer.h"
+#include "repl.h"
+#include "update.h"
+
+namespace mongo {
+
    // Operator spellings indexed by Mod::Op -- order must match the Op enum in update.h.
    const char* Mod::modNames[] = { "$inc", "$set", "$push", "$pushAll", "$pull", "$pullAll" , "$pop", "$unset" ,
                                    "$bitand" , "$bitor" , "$bit" };
    unsigned Mod::modNamesNum = sizeof(Mod::modNames)/sizeof(char*);
+
+ bool Mod::_pullElementMatch( BSONElement& toMatch ) const {
+
+ if ( elt.type() != Object ){
+ // if elt isn't an object, then comparison will work
+ return toMatch.woCompare( elt , false ) == 0;
+ }
+
+ if ( toMatch.type() != Object ){
+ // looking for an object, so this can't match
+ return false;
+ }
+
+ // now we have an object on both sides
+ return matcher->matches( toMatch.embeddedObject() );
+ }
+
    /* Append to b the post-modification value of this mod's field.
       'in' is the field's current value from the existing document.
       Array ops record pushStartSize (the pre-modification length) for the
       oplog $size rewrite in appendSizeSpecForArrayDepMods. */
    void Mod::apply( BSONObjBuilder& b , BSONElement in ){
        switch ( op ){

        case INC: {
            // TODO: this is horrible
            // inc() mutates the numeric storage shared with elt; the updated
            // value is then appended under the leaf field name.
            inc( in );
            b.appendAs( elt , shortFieldName );
            break;
        }

        case SET: {
            _checkForAppending( elt );
            b.appendAs( elt , shortFieldName );
            break;
        }

        case UNSET: {
            //Explicit NOOP -- appending nothing removes the field.
            break;
        }

        case PUSH: {
            // Copy the existing array, then append elt at the next index.
            uassert( 10131 , "$push can only be applied to an array" , in.type() == Array );
            BSONObjBuilder bb( b.subarrayStart( shortFieldName ) );
            BSONObjIterator i( in.embeddedObject() );
            int n=0;
            while ( i.more() ){
                bb.append( i.next() );
                n++;
            }

            pushStartSize = n; // original array length

            bb.appendAs( elt , bb.numStr( n ) );
            bb.done();
            break;
        }

        case PUSH_ALL: {
            uassert( 10132 , "$pushAll can only be applied to an array" , in.type() == Array );
            uassert( 10133 , "$pushAll has to be passed an array" , elt.type() );

            BSONObjBuilder bb( b.subarrayStart( shortFieldName ) );

            // Copy the existing elements first...
            BSONObjIterator i( in.embeddedObject() );
            int n=0;
            while ( i.more() ){
                bb.append( i.next() );
                n++;
            }

            pushStartSize = n;

            // ...then append every element from the supplied array.
            i = BSONObjIterator( elt.embeddedObject() );
            while ( i.more() ){
                bb.appendAs( i.next() , bb.numStr( n++ ) );
            }

            bb.done();
            break;
        }

        case PULL:
        case PULL_ALL: {
            // Rebuild the array keeping only elements that do NOT match the
            // criterion (PULL) / are not listed (PULL_ALL); survivors are
            // renumbered densely from 0.
            uassert( 10134 , "$pull/$pullAll can only be applied to an array" , in.type() == Array );
            BSONObjBuilder bb( b.subarrayStart( shortFieldName ) );

            int n = 0;

            BSONObjIterator i( in.embeddedObject() );
            while ( i.more() ){
                BSONElement e = i.next();
                bool allowed = true;

                if ( op == PULL ){
                    allowed = ! _pullElementMatch( e );
                }
                else {
                    BSONObjIterator j( elt.embeddedObject() );
                    while( j.more() ) {
                        BSONElement arrJ = j.next();
                        if ( e.woCompare( arrJ, false ) == 0 ){
                            allowed = false;
                            break;
                        }
                    }
                }

                if ( allowed )
                    bb.appendAs( e , bb.numStr( n++ ) );
            }

            bb.done();
            break;
        }

        case POP: {
            uassert( 10135 , "$pop can only be applied to an array" , in.type() == Array );
            BSONObjBuilder bb( b.subarrayStart( shortFieldName ) );

            int n = 0;

            BSONObjIterator i( in.embeddedObject() );
            if ( elt.isNumber() && elt.number() < 0 ){
                // pop from front: skip element 0, shift the rest down one index
                if ( i.more() ){
                    i.next();
                    n++;
                }

                while( i.more() ) {
                    bb.appendAs( i.next() , bb.numStr( n - 1 ).c_str() );
                    n++;
                }
            }
            else {
                // pop from back: copy all but the final element
                while( i.more() ) {
                    n++;
                    BSONElement arrI = i.next();
                    if ( i.more() ){
                        bb.append( arrI );
                    }
                }
            }

            // Both branches leave n == the original array length.
            pushStartSize = n;
            assert( pushStartSize == in.embeddedObject().nFields() );
            bb.done();
            break;
        }

        case BIT: {
            // NOTE(review): the message says "array" but the check requires an
            // Object ({and:...,or:...}); the message looks stale -- confirm.
            uassert( 10136 , "$bit needs an array" , elt.type() == Object );
            uassert( 10137 , "$bit can only be applied to numbers" , in.isNumber() );
            uassert( 10138 , "$bit can't use a double" , in.type() != NumberDouble );

            // Accumulate into the int or long long lane matching in.type().
            int x = in.numberInt();
            long long y = in.numberLong();

            BSONObjIterator it( elt.embeddedObject() );
            while ( it.more() ){
                BSONElement e = it.next();
                uassert( 10139 , "$bit field must be number" , e.isNumber() );
                if ( strcmp( e.fieldName() , "and" ) == 0 ){
                    switch( in.type() ){
                    case NumberInt: x = x&e.numberInt(); break;
                    case NumberLong: y = y&e.numberLong(); break;
                    default: assert( 0 );
                    }
                }
                else if ( strcmp( e.fieldName() , "or" ) == 0 ){
                    switch( in.type() ){
                    case NumberInt: x = x|e.numberInt(); break;
                    case NumberLong: y = y|e.numberLong(); break;
                    default: assert( 0 );
                    }
                }

                else {
                    throw UserException( 9016, (string)"unknown bit mod:" + e.fieldName() );
                }
            }

            // Emit the lane that matches the stored field's type.
            switch( in.type() ){
            case NumberInt: b.append( shortFieldName , x ); break;
            case NumberLong: b.append( shortFieldName , y ); break;
            default: assert( 0 );
            }

            break;
        }

        default:
            stringstream ss;
            ss << "Mod::apply can't handle type: " << op;
            throw UserException( 9017, ss.str() );
        }
    }
+
    /* Decide whether every mod can be applied by overwriting bytes inside the
       existing document (no resize).  Also performs all type uasserts up
       front so a failing update never leaves a partially modified object. */
    bool ModSet::canApplyInPlaceAndVerify(const BSONObj &obj) const {
        bool inPlacePossible = true;

        // Perform this check first, so that we don't leave a partially modified object on uassert.
        for ( ModHolder::const_iterator i = _mods.begin(); i != _mods.end(); ++i ) {
            const Mod& m = i->second;
            BSONElement e = obj.getFieldDotted(m.fieldName);

            if ( e.eoo() ) {
                // Field missing: only $unset is an in-place no-op.
                inPlacePossible = (m.op == Mod::UNSET);
            }
            else {
                switch( m.op ) {
                case Mod::INC:
                    // NOTE(review): e.eoo() is false on this branch, so the
                    // uassert effectively requires a number; the if below
                    // looks unreachable.
                    uassert( 10140 , "Cannot apply $inc modifier to non-number", e.isNumber() || e.eoo() );
                    if ( !e.isNumber() )
                        inPlacePossible = false;
                    break;
                case Mod::SET:
                    // In place only when type and encoded size are unchanged.
                    inPlacePossible =
                        m.elt.type() == e.type() &&
                        m.elt.valuesize() == e.valuesize();
                    break;
                case Mod::PUSH:
                case Mod::PUSH_ALL:
                    // Pushing always grows the array -- never in place.
                    uassert( 10141 , "Cannot apply $push/$pushAll modifier to non-array", e.type() == Array || e.eoo() );
                    inPlacePossible = false;
                    break;
                case Mod::PULL:
                case Mod::PULL_ALL: {
                    // In place only if nothing would actually be removed.
                    uassert( 10142 , "Cannot apply $pull/$pullAll modifier to non-array", e.type() == Array || e.eoo() );
                    BSONObjIterator i( e.embeddedObject() );
                    while( inPlacePossible && i.more() ) {
                        BSONElement arrI = i.next();
                        if ( m.op == Mod::PULL ) {
                            if ( m._pullElementMatch( arrI ) )
                                inPlacePossible = false;
                        }
                        else if ( m.op == Mod::PULL_ALL ) {
                            BSONObjIterator j( m.elt.embeddedObject() );
                            while( inPlacePossible && j.moreWithEOO() ) {
                                BSONElement arrJ = j.next();
                                if ( arrJ.eoo() )
                                    break;
                                if ( arrI.woCompare( arrJ, false ) == 0 ) {
                                    inPlacePossible = false;
                                }
                            }
                        }
                    }
                    break;
                }
                case Mod::POP: {
                    // In place only when popping from an already-empty array.
                    uassert( 10143 , "Cannot apply $pop modifier to non-array", e.type() == Array || e.eoo() );
                    if ( ! e.embeddedObject().isEmpty() )
                        inPlacePossible = false;
                    break;
                }
                default:
                    // mods we don't know about shouldn't be done in place
                    inPlacePossible = false;
                }
            }
        }
        return inPlacePossible;
    }
+
    /* Apply the mods by mutating obj's buffer directly.
       Only valid after canApplyInPlaceAndVerify() returned true. */
    void ModSet::applyModsInPlace(const BSONObj &obj) const {
        for ( ModHolder::const_iterator i = _mods.begin(); i != _mods.end(); ++i ) {
            const Mod& m = i->second;
            BSONElement e = obj.getFieldDotted(m.fieldName);

            switch ( m.op ){
            case Mod::UNSET:
            case Mod::PULL:
            case Mod::PULL_ALL:
                // Verified to be no-ops by canApplyInPlaceAndVerify.
                break;

            // [dm] the BSONElementManipulator statements below are for replication (correct?)
            case Mod::INC:
                m.inc(e);
                m.setElementToOurNumericValue(e);
                break;
            case Mod::SET:
                if ( e.isNumber() && m.elt.isNumber() ) {
                    // todo: handle NumberLong:
                    m.setElementToOurNumericValue(e);
                }
                else {
                    BSONElementManipulator( e ).replaceTypeAndValue( m.elt );
                }
                break;
            default:
                uassert( 10144 , "can't apply mod in place - shouldn't have gotten here" , 0 );
            }
        }
    }
+
+ void ModSet::extractFields( map< string, BSONElement > &fields, const BSONElement &top, const string &base ) {
+ if ( top.type() != Object ) {
+ fields[ base + top.fieldName() ] = top;
+ return;
+ }
+ BSONObjIterator i( top.embeddedObject() );
+ bool empty = true;
+ while( i.moreWithEOO() ) {
+ BSONElement e = i.next();
+ if ( e.eoo() )
+ break;
+ extractFields( fields, e, base + top.fieldName() + "." );
+ empty = false;
+ }
+ if ( empty )
+ fields[ base + top.fieldName() ] = top;
+ }
+
    /* Emit the value for a mod whose field does not exist in the document.
       'root' is the dotted prefix already emitted by the caller;
       'onedownseen' prevents opening the same intermediate subobject twice. */
    void ModSet::_appendNewFromMods( const string& root , Mod& m , BSONObjBuilder& b , set<string>& onedownseen ){
        // Strip the already-emitted root prefix from the mod's field name.
        const char * temp = m.fieldName;
        temp += root.size();
        const char * dot = strchr( temp , '.' );
        if ( dot ){
            // Mod targets a subfield: open the intermediate object named by
            // the first remaining path component (at most once) and recurse.
            string nr( m.fieldName , 0 , 1 + ( dot - m.fieldName ) ); // new root, keeps the trailing '.'
            string nf( temp , 0 , dot - temp ); // first path component under root
            if ( onedownseen.count( nf ) )
                return;
            onedownseen.insert( nf );
            BSONObjBuilder bb ( b.subobjStart( nf.c_str() ) );
            createNewFromMods( nr , bb , BSONObj() );
            bb.done();
        }
        else {
            // Leaf: emit the mod's value directly.
            appendNewFromMod( m , b );
        }

    }
+
    /* Merge obj's elements (iterated in sorted order) with the mods whose
       field names fall under 'root', writing the merged document into b.
       '{' is the ASCII character after 'z', so lower_bound(root + "{")
       bounds all mod keys prefixed by root. */
    void ModSet::createNewFromMods( const string& root , BSONObjBuilder& b , const BSONObj &obj ){
        BSONObjIteratorSorted es( obj );
        BSONElement e = es.next();

        ModHolder::iterator m = _mods.lower_bound( root );
        ModHolder::iterator mend = _mods.lower_bound( root + "{" );

        set<string> onedownseen;

        // Merge-walk: both sequences are sorted by field name.
        while ( e.type() && m != mend ){
            string field = root + e.fieldName();
            FieldCompareResult cmp = compareDottedFieldNames( m->second.fieldName , field );

            switch ( cmp ){

            case LEFT_SUBFIELD: { // Mod is embedded under this element
                uassert( 10145 , "LEFT_SUBFIELD only supports Object" , e.type() == Object || e.type() == Array );
                if ( onedownseen.count( e.fieldName() ) == 0 ){
                    onedownseen.insert( e.fieldName() );
                    BSONObjBuilder bb ( e.type() == Object ? b.subobjStart( e.fieldName() ) : b.subarrayStart( e.fieldName() ) );
                    stringstream nr; nr << root << e.fieldName() << ".";
                    createNewFromMods( nr.str() , bb , e.embeddedObject() );
                    bb.done();
                    // inc both as we handled both
                    e = es.next();
                    m++;
                }
                continue;
            }
            case LEFT_BEFORE: // Mod on a field that doesn't exist
                _appendNewFromMods( root , m->second , b , onedownseen );
                m++;
                continue;
            case SAME:
                // Field exists and has a mod: let the mod produce the value.
                m->second.apply( b , e );
                e = es.next();
                m++;
                continue;
            case RIGHT_BEFORE: // field that doesn't have a MOD
                b.append( e );
                e = es.next();
                continue;
            case RIGHT_SUBFIELD:
                massert( 10399 , "ModSet::createNewFromMods - RIGHT_SUBFIELD should be impossible" , 0 );
                break;
            default:
                massert( 10400 , "unhandled case" , 0 );
            }
        }

        // finished looping the mods, just adding the rest of the elements
        while ( e.type() ){
            b.append( e );
            e = es.next();
        }

        // do mods that don't have fields already
        for ( ; m != mend; m++ ){
            _appendNewFromMods( root , m->second , b , onedownseen );
        }
    }
+
+ BSONObj ModSet::createNewFromMods( const BSONObj &obj ) {
+ BSONObjBuilder b( (int)(obj.objsize() * 1.1) );
+ createNewFromMods( "" , b , obj );
+ return b.obj();
+ }
+
    /* Build the document to insert on an upsert driven by $ operators:
       start from the plain (non-$) fields of the query, then apply mods. */
    BSONObj ModSet::createNewFromQuery( const BSONObj& query ){
        BSONObj newObj;

        {
            BSONObjBuilder bb;
            EmbeddedBuilder eb( &bb );
            BSONObjIteratorSorted i( query );
            while ( i.more() ){
                BSONElement e = i.next();

                if ( e.type() == Object && e.embeddedObject().firstElement().fieldName()[0] == '$' ){
                    // this means this is a $gt type filter, so don't make part of the new object
                    continue;
                }

                eb.appendAs( e , e.fieldName() );
            }
            eb.done();
            newObj = bb.obj();
        }

        // Apply the mods to the seed object, in place when possible.
        if ( canApplyInPlaceAndVerify( newObj ) )
            applyModsInPlace( newObj );
        else
            newObj = createNewFromMods( newObj );

        return newObj;
    }
+
+ /* get special operations like $inc
+ { $inc: { a:1, b:1 } }
+ { $set: { a:77 } }
+ { $push: { a:55 } }
+ { $pushAll: { a:[77,88] } }
+ { $pull: { a:66 } }
+ { $pullAll : { a:[99,1010] } }
+ NOTE: MODIFIES source from object!
+ */
+ void ModSet::getMods(const BSONObj &from) {
+ BSONObjIterator it(from);
+ while ( it.more() ) {
+ BSONElement e = it.next();
+ const char *fn = e.fieldName();
+ uassert( 10147 , "Invalid modifier specified" + string( fn ), e.type() == Object );
+ BSONObj j = e.embeddedObject();
+ BSONObjIterator jt(j);
+ Mod::Op op = opFromStr( fn );
+ if ( op == Mod::INC )
+ strcpy((char *) fn, "$set"); // rewrite for op log
+ while ( jt.more() ) {
+ BSONElement f = jt.next(); // x:44
+
+ const char * fieldName = f.fieldName();
+
+ uassert( 10148 , "Mod on _id not allowed", strcmp( fieldName, "_id" ) != 0 );
+ uassert( 10149 , "Invalid mod field name, may not end in a period", fieldName[ strlen( fieldName ) - 1 ] != '.' );
+ uassert( 10150 , "Field name duplication not allowed with modifiers", ! haveModForField( fieldName ) );
+ uassert( 10151 , "have conflict mod" , ! haveConflictingMod( fieldName ) );
+ uassert( 10152 , "Modifier $inc allowed for numbers only", f.isNumber() || op != Mod::INC );
+ uassert( 10153 , "Modifier $pushAll/pullAll allowed for arrays only", f.type() == Array || ( op != Mod::PUSH_ALL && op != Mod::PULL_ALL ) );
+
+ Mod m;
+ m.init( op , f );
+ m.setFieldName( f.fieldName() );
+
+ // horrible - to be cleaned up
+ if ( f.type() == NumberDouble ) {
+ m.ndouble = (double *) f.value();
+ m.nint = 0;
+ } else if ( f.type() == NumberInt ) {
+ m.ndouble = 0;
+ m.nint = (int *) f.value();
+ }
+ else if( f.type() == NumberLong ) {
+ m.ndouble = 0;
+ m.nint = 0;
+ m.nlong = (long long *) f.value();
+ }
+
+ _mods[m.fieldName] = m;
+ }
+ }
+ }
+
+ void checkNoMods( BSONObj o ) {
+ BSONObjIterator i( o );
+ while( i.moreWithEOO() ) {
+ BSONElement e = i.next();
+ if ( e.eoo() )
+ break;
+ uassert( 10154 , "Modifiers and non-modifiers cannot be mixed", e.fieldName()[ 0 ] != '$' );
+ }
+ }
+
    /* QueryOp that scans for the first document matching the update's query
       pattern; run through QueryPlanSet by updateObjects below. */
    class UpdateOp : public QueryOp {
    public:
        UpdateOp() : nscanned_() {}
        virtual void init() {
            BSONObj pattern = qp().query();
            c_.reset( qp().newCursor().release() );
            if ( !c_->ok() )
                setComplete();
            else
                matcher_.reset( new CoveredIndexMatcher( pattern, qp().indexKey() ) );
        }
        // Advance until a match is found or the cursor is exhausted; the op
        // is "complete" when positioned on a match (or there is none).
        virtual void next() {
            if ( !c_->ok() ) {
                setComplete();
                return;
            }
            nscanned_++;
            if ( matcher_->matches(c_->currKey(), c_->currLoc()) ) {
                setComplete();
                return;
            }
            c_->advance();
        }
        // Re-test the cursor's current position against the matcher.
        bool curMatches(){
            return matcher_->matches(c_->currKey(), c_->currLoc() );
        }
        virtual bool mayRecordPlan() const { return false; }
        // NOTE(review): clone() returns a fresh op with no state copied --
        // assumes QueryPlanSet re-init()s clones before use; confirm.
        virtual QueryOp *clone() const {
            return new UpdateOp();
        }
        shared_ptr< Cursor > c() { return c_; }
        long long nscanned() const { return nscanned_; }
    private:
        shared_ptr< Cursor > c_;                  // cursor over candidate documents
        long long nscanned_;                      // documents examined so far
        auto_ptr< CoveredIndexMatcher > matcher_;
    };
+
+
+ UpdateResult updateObjects(const char *ns, BSONObj updateobjOrig, BSONObj patternOrig, bool upsert, bool multi, bool logop , OpDebug& debug ) {
+ int profile = cc().database()->profile;
+ StringBuilder& ss = debug.str;
+
+ uassert( 10155 , "cannot update reserved $ collection", strchr(ns, '$') == 0 );
+ if ( strstr(ns, ".system.") ) {
+ /* dm: it's very important that system.indexes is never updated as IndexDetails has pointers into it */
+ uassert( 10156 , "cannot update system collection", legalClientSystemNS( ns , true ) );
+ }
+
+ set<DiskLoc> seenObjects;
+
+ QueryPlanSet qps( ns, patternOrig, BSONObj() );
+ UpdateOp original;
+ shared_ptr< UpdateOp > u = qps.runOp( original );
+ massert( 10401 , u->exceptionMessage(), u->complete() );
+ shared_ptr< Cursor > c = u->c();
+ int numModded = 0;
+ while ( c->ok() ) {
+ if ( numModded > 0 && ! u->curMatches() ){
+ c->advance();
+ continue;
+ }
+ Record *r = c->_current();
+ DiskLoc loc = c->currLoc();
+
+ if ( c->getsetdup( loc ) ){
+ c->advance();
+ continue;
+ }
+
+ BSONObj js(r);
+
+ BSONObj pattern = patternOrig;
+ BSONObj updateobj = updateobjOrig;
+
+ if ( logop ) {
+ BSONObjBuilder idPattern;
+ BSONElement id;
+ // NOTE: If the matching object lacks an id, we'll log
+ // with the original pattern. This isn't replay-safe.
+ // It might make sense to suppress the log instead
+ // if there's no id.
+ if ( js.getObjectID( id ) ) {
+ idPattern.append( id );
+ pattern = idPattern.obj();
+ }
+ else {
+ uassert( 10157 , "multi-update requires all modified objects to have an _id" , ! multi );
+ }
+ }
+
+ if ( profile )
+ ss << " nscanned:" << u->nscanned();
+
+ /* look for $inc etc. note as listed here, all fields to inc must be this type, you can't set some
+ regular ones at the moment. */
+
+ const char *firstField = updateobj.firstElement().fieldName();
+
+ if ( firstField[0] == '$' ) {
+
+ if ( multi ){
+ c->advance(); // go to next record in case this one moves
+ if ( seenObjects.count( loc ) )
+ continue;
+ updateobj = updateobj.copy();
+ }
+
+ ModSet mods;
+ mods.getMods(updateobj);
+ NamespaceDetailsTransient& ndt = NamespaceDetailsTransient::get_w(ns);
+ set<string>& idxKeys = ndt.indexKeys();
+ int isIndexed = mods.isIndexed( idxKeys );
+
+ if ( isIndexed && multi ){
+ c->noteLocation();
+ }
+
+ if ( isIndexed <= 0 && mods.canApplyInPlaceAndVerify( loc.obj() ) ) {
+ mods.applyModsInPlace( loc.obj() );
+ //seenObjects.insert( loc );
+ if ( profile )
+ ss << " fastmod ";
+
+ if ( isIndexed ){
+ seenObjects.insert( loc );
+ }
+ }
+ else {
+ BSONObj newObj = mods.createNewFromMods( loc.obj() );
+ uassert( 12522 , "$ operator made objcet too large" , newObj.isValid() );
+ DiskLoc newLoc = theDataFileMgr.update(ns, r, loc , newObj.objdata(), newObj.objsize(), debug);
+ if ( newLoc != loc || isIndexed ){
+ // object moved, need to make sure we don' get again
+ seenObjects.insert( newLoc );
+ }
+
+ }
+
+ if ( logop ) {
+
+ assert( mods.size() );
+
+ if ( mods.haveArrayDepMod() ) {
+ BSONObjBuilder patternBuilder;
+ patternBuilder.appendElements( pattern );
+ mods.appendSizeSpecForArrayDepMods( patternBuilder );
+ pattern = patternBuilder.obj();
+ }
+
+ if ( mods.needOpLogRewrite() )
+ updateobj = mods.getOpLogRewrite();
+
+ logOp("u", ns, updateobj, &pattern );
+ }
+ numModded++;
+ if ( ! multi )
+ break;
+ if ( multi && isIndexed )
+ c->checkLocation();
+ continue;
+ }
+
+ uassert( 10158 , "multi update only works with $ operators" , ! multi );
+
+ BSONElementManipulator::lookForTimestamps( updateobj );
+ checkNoMods( updateobj );
+ theDataFileMgr.update(ns, r, loc , updateobj.objdata(), updateobj.objsize(), debug);
+ if ( logop )
+ logOp("u", ns, updateobj, &pattern );
+ return UpdateResult( 1 , 0 , 1 );
+ }
+
+ if ( numModded )
+ return UpdateResult( 1 , 1 , numModded );
+
+
+ if ( profile )
+ ss << " nscanned:" << u->nscanned();
+
+ if ( upsert ) {
+ if ( updateobjOrig.firstElement().fieldName()[0] == '$' ) {
+ /* upsert of an $inc. build a default */
+ ModSet mods;
+ mods.getMods(updateobjOrig);
+
+ BSONObj newObj = mods.createNewFromQuery( patternOrig );
+
+ if ( profile )
+ ss << " fastmodinsert ";
+ theDataFileMgr.insert(ns, newObj);
+ if ( profile )
+ ss << " fastmodinsert ";
+ if ( logop )
+ logOp( "i", ns, newObj );
+ return UpdateResult( 0 , 1 , 1 );
+ }
+ uassert( 10159 , "multi update only works with $ operators" , ! multi );
+ checkNoMods( updateobjOrig );
+ if ( profile )
+ ss << " upsert ";
+ theDataFileMgr.insert(ns, updateobjOrig);
+ if ( logop )
+ logOp( "i", ns, updateobjOrig );
+ return UpdateResult( 0 , 0 , 1 );
+ }
+ return UpdateResult( 0 , 0 , 0 );
+ }
+
+}
diff --git a/db/update.h b/db/update.h
new file mode 100644
index 0000000..26a8a8d
--- /dev/null
+++ b/db/update.h
@@ -0,0 +1,382 @@
+// update.h
+
+/**
+ * Copyright (C) 2008 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "../stdafx.h"
+#include "jsobj.h"
+#include "../util/embedded_builder.h"
+#include "matcher.h"
+
+namespace mongo {
+
+ /* Used for modifiers such as $inc, $set, $push, ... */
    /* Used for modifiers such as $inc, $set, $push, ...
       One Mod describes a single operator applied to one (dotted) field. */
    struct Mod {
        // See opFromStr below
        //        0     1     2      3        4      5         6     7      8       9       10
        enum Op { INC, SET, PUSH, PUSH_ALL, PULL, PULL_ALL , POP, UNSET, BITAND, BITOR , BIT } op;

        static const char* modNames[];   // operator spellings, indexed by Op
        static unsigned modNamesNum;

        const char *fieldName;           // full dotted path, e.g. "a.b.c"
        const char *shortFieldName;      // last path component, e.g. "c"

        // kind of lame; fix one day?
        // At most one of these points into the update object's numeric
        // storage, chosen by the element's type (see getMods in update.cpp).
        double *ndouble;
        int *nint;
        long long *nlong;

        BSONElement elt; // x:5 note: this is the actual element from the updateobj
        int pushStartSize;                  // array length before push/pop; -1 when the field was created
        boost::shared_ptr<Matcher> matcher; // set only for $pull with an object criterion

        void init( Op o , BSONElement& e ){
            op = o;
            elt = e;
            if ( op == PULL && e.type() == Object )
                matcher.reset( new Matcher( e.embeddedObject() ) );
        }

        // Store the dotted name and derive the leaf component after the last '.'.
        void setFieldName( const char * s ){
            fieldName = s;
            shortFieldName = strrchr( fieldName , '.' );
            if ( shortFieldName )
                shortFieldName++;
            else
                shortFieldName = fieldName;
        }

        /* [dm] why is this const? (or rather, why was setn const?) i see why but think maybe clearer if were not. */
        // Add n to the number this mod's pointer aims at (in the update object).
        void inc(BSONElement& n) const {
            uassert( 10160 , "$inc value is not a number", n.isNumber() );
            if( ndouble )
                *ndouble += n.numberDouble();
            else if( nint )
                *nint += n.numberInt();
            else
                *nlong += n.numberLong();
        }

        // Write our numeric value into e's storage, preserving e's type.
        void setElementToOurNumericValue(BSONElement& e) const {
            BSONElementManipulator manip(e);
            if( e.type() == NumberLong )
                manip.setLong(_getlong());
            else
                manip.setNumber(_getn());
        }

        // Current numeric value as a double (whichever pointer is set).
        double _getn() const {
            if( ndouble ) return *ndouble;
            if( nint ) return *nint;
            return (double) *nlong;
        }
        // Current numeric value as a long long.
        long long _getlong() const {
            if( nlong ) return *nlong;
            if( ndouble ) return (long long) *ndouble;
            return *nint;
        }
        bool operator<( const Mod &other ) const {
            return strcmp( fieldName, other.fieldName ) < 0;
        }

        // True for ops whose oplog entry depends on the array's prior size.
        bool arrayDep() const {
            switch (op){
            case PUSH:
            case PUSH_ALL:
            case POP:
                return true;
            default:
                return false;
            }
        }

        // Does any index key touch this mod's field (as parent, equal, or child)?
        bool isIndexed( const set<string>& idxKeys ) const {
            // check if there is an index key that is a parent of mod
            for( const char *dot = strchr( fieldName, '.' ); dot; dot = strchr( dot + 1, '.' ) )
                if ( idxKeys.count( string( fieldName, dot - fieldName ) ) )
                    return true;
            string fullName = fieldName;
            // check if there is an index key equal to mod
            if ( idxKeys.count(fullName) )
                return true;
            // check if there is an index key that is a child of mod
            set< string >::const_iterator j = idxKeys.upper_bound( fullName );
            if ( j != idxKeys.end() && j->find( fullName ) == 0 && (*j)[fullName.size()] == '.' )
                return true;
            return false;
        }

        void apply( BSONObjBuilder& b , BSONElement in );

        /**
         * @return true iff toMatch should be removed from the array
         */
        bool _pullElementMatch( BSONElement& toMatch ) const;

        // NOTE(review): every branch returns false, so no op currently
        // triggers an oplog rewrite -- confirm this is intentional.
        bool needOpLogRewrite() const {
            switch( op ){
            case BIT:
            case BITAND:
            case BITOR:
                // TODO: should we convert this to $set?
                return false;
            default:
                return false;
            }
        }

        // Emit { "$op" : { <elt> } } for the oplog.
        void appendForOpLog( BSONObjBuilder& b ) const {
            const char * name = modNames[op];

            BSONObjBuilder bb( b.subobjStart( name ) );
            bb.append( elt );
            bb.done();
        }

        void _checkForAppending( BSONElement& e ){
            if ( e.type() == Object ){
                // this is a tiny bit slow, but rare and important
                // only when setting something TO an object, not setting something in an object
                // and it checks for { $set : { x : { 'a.b' : 1 } } }
                // which I feel has been common
                uassert( 12527 , "not okForStorage" , e.embeddedObject().okForStorage() );
            }
        }

    };
+
    /* The parsed set of modifiers for one update operation, keyed by dotted
       field name (map is sorted, so parent paths precede their children). */
    class ModSet {
        typedef map<string,Mod> ModHolder;
        ModHolder _mods;

        static void extractFields( map< string, BSONElement > &fields, const BSONElement &top, const string &base );

        // NOTE(review): the asserts make the two "Done" branches below
        // unreachable -- dead code kept as belt-and-braces.
        FieldCompareResult compare( const ModHolder::iterator &m, map< string, BSONElement >::iterator &p, const map< string, BSONElement >::iterator &pEnd ) const {
            bool mDone = ( m == _mods.end() );
            bool pDone = ( p == pEnd );
            assert( ! mDone );
            assert( ! pDone );
            if ( mDone && pDone )
                return SAME;
            // If one iterator is done we want to read from the other one, so say the other one is lower.
            if ( mDone )
                return RIGHT_BEFORE;
            if ( pDone )
                return LEFT_BEFORE;

            return compareDottedFieldNames( m->first, p->first.c_str() );
        }

        void _appendNewFromMods( const string& root , Mod& m , BSONObjBuilder& b , set<string>& onedownseen );

        // Emit the value a mod produces for a field that did not exist.
        void appendNewFromMod( Mod& m , BSONObjBuilder& b ){
            switch ( m.op ){

            case Mod::PUSH: {
                // pushing onto a missing field creates a one-element array
                BSONObjBuilder arr( b.subarrayStart( m.shortFieldName ) );
                arr.appendAs( m.elt, "0" );
                arr.done();
                m.pushStartSize = -1;
                break;
            }

            case Mod::PUSH_ALL: {
                b.appendAs( m.elt, m.shortFieldName );
                m.pushStartSize = -1;
                break;
            }

            case Mod::UNSET:
            case Mod::PULL:
            case Mod::PULL_ALL:
                // no-op b/c unset/pull of nothing does nothing
                break;

            case Mod::INC:
            case Mod::SET: {
                m._checkForAppending( m.elt );
                b.appendAs( m.elt, m.shortFieldName );
                break;
            }
            default:
                stringstream ss;
                ss << "unknown mod in appendNewFromMod: " << m.op;
                throw UserException( 9015, ss.str() );
            }

        }

        // Can intermediate objects be created along 'right' without
        // clobbering existing non-object fields or colliding with a mod?
        bool mayAddEmbedded( map< string, BSONElement > &existing, string right ) {
            for( string left = EmbeddedBuilder::splitDot( right );
                    left.length() > 0 && left[ left.length() - 1 ] != '.';
                    left += "." + EmbeddedBuilder::splitDot( right ) ) {
                if ( existing.count( left ) > 0 && existing[ left ].type() != Object )
                    return false;
                if ( haveModForField( left.c_str() ) )
                    return false;
            }
            return true;
        }
        // Map an operator spelling ("$inc", ...) to its Op; uasserts on
        // anything unrecognized.  Hand-rolled character trie for speed.
        static Mod::Op opFromStr( const char *fn ) {
            assert( fn[0] == '$' );
            switch( fn[1] ){
            case 'i': {
                if ( fn[2] == 'n' && fn[3] == 'c' && fn[4] == 0 )
                    return Mod::INC;
                break;
            }
            case 's': {
                if ( fn[2] == 'e' && fn[3] == 't' && fn[4] == 0 )
                    return Mod::SET;
                break;
            }
            case 'p': {
                if ( fn[2] == 'u' ){
                    if ( fn[3] == 's' && fn[4] == 'h' ){
                        if ( fn[5] == 0 )
                            return Mod::PUSH;
                        if ( fn[5] == 'A' && fn[6] == 'l' && fn[7] == 'l' && fn[8] == 0 )
                            return Mod::PUSH_ALL;
                    }
                    else if ( fn[3] == 'l' && fn[4] == 'l' ){
                        if ( fn[5] == 0 )
                            return Mod::PULL;
                        if ( fn[5] == 'A' && fn[6] == 'l' && fn[7] == 'l' && fn[8] == 0 )
                            return Mod::PULL_ALL;
                    }
                }
                else if ( fn[2] == 'o' && fn[3] == 'p' && fn[4] == 0 )
                    return Mod::POP;
                break;
            }
            case 'u': {
                if ( fn[2] == 'n' && fn[3] == 's' && fn[4] == 'e' && fn[5] == 't' && fn[6] == 0 )
                    return Mod::UNSET;
                break;
            }
            case 'b': {
                if ( fn[2] == 'i' && fn[3] == 't' ){
                    if ( fn[4] == 0 )
                        return Mod::BIT;
                    if ( fn[4] == 'a' && fn[5] == 'n' && fn[6] == 'd' && fn[7] == 0 )
                        return Mod::BITAND;
                    if ( fn[4] == 'o' && fn[5] == 'r' && fn[6] == 0 )
                        return Mod::BITOR;
                }
                break;
            }
            default: break;
            }
            uassert( 10161 , "Invalid modifier specified " + string( fn ), false );
            return Mod::INC;
        }

    public:

        void getMods( const BSONObj &from );
        /**
           will return if can be done in place, or uassert if there is an error
           @return whether or not the mods can be done in place
        */
        bool canApplyInPlaceAndVerify( const BSONObj &obj ) const;
        void applyModsInPlace( const BSONObj &obj ) const;

        // new recursive version, will replace at some point
        void createNewFromMods( const string& root , BSONObjBuilder& b , const BSONObj &obj );

        BSONObj createNewFromMods( const BSONObj &obj );

        BSONObj createNewFromQuery( const BSONObj& query );

        /**
         * @return the number of mods whose field intersects an index key
         *         (0 means the update cannot change any index entry)
         */
        int isIndexed( const set<string>& idxKeys ) const {
            int numIndexes = 0;
            for ( ModHolder::const_iterator i = _mods.begin(); i != _mods.end(); i++ ){
                if ( i->second.isIndexed( idxKeys ) )
                    numIndexes++;
            }
            return numIndexes;
        }

        unsigned size() const { return _mods.size(); }

        bool haveModForField( const char *fieldName ) const {
            return _mods.find( fieldName ) != _mods.end();
        }

        // Does any existing mod conflict with fieldName (same path, or one a
        // dotted prefix of the other)?
        bool haveConflictingMod( const string& fieldName ){
            size_t idx = fieldName.find( '.' );
            if ( idx == string::npos )
                idx = fieldName.size();

            ModHolder::const_iterator start = _mods.lower_bound(fieldName.substr(0,idx));
            for ( ; start != _mods.end(); start++ ){
                FieldCompareResult r = compareDottedFieldNames( fieldName , start->first );
                switch ( r ){
                case LEFT_SUBFIELD: return true;
                case LEFT_BEFORE: return false;
                case SAME: return true;
                case RIGHT_BEFORE: return false;
                case RIGHT_SUBFIELD: return true;
                }
            }
            return false;


        }

        // re-writing for oplog

        bool needOpLogRewrite() const {
            for ( ModHolder::const_iterator i = _mods.begin(); i != _mods.end(); i++ )
                if ( i->second.needOpLogRewrite() )
                    return true;
            return false;
        }

        BSONObj getOpLogRewrite() const {
            BSONObjBuilder b;
            for ( ModHolder::const_iterator i = _mods.begin(); i != _mods.end(); i++ )
                i->second.appendForOpLog( b );
            return b.obj();
        }

        bool haveArrayDepMod() const {
            for ( ModHolder::const_iterator i = _mods.begin(); i != _mods.end(); i++ )
                if ( i->second.arrayDep() )
                    return true;
            return false;
        }

        // For each push/pop mod, append a { $size : n } (or null) spec so the
        // logged pattern only replays against arrays of the original length.
        void appendSizeSpecForArrayDepMods( BSONObjBuilder &b ) const {
            for ( ModHolder::const_iterator i = _mods.begin(); i != _mods.end(); i++ ) {
                const Mod& m = i->second;
                if ( m.arrayDep() ){
                    if ( m.pushStartSize == -1 )
                        b.appendNull( m.fieldName );
                    else
                        b << m.fieldName << BSON( "$size" << m.pushStartSize );
                }
            }
        }
    };
+
+
+}
+