author    | Antonin Kral <a.kral@bobek.cz> | 2010-03-25 19:21:32 +0100
committer | Antonin Kral <a.kral@bobek.cz> | 2010-03-25 19:21:32 +0100
commit    | 0ca01a91ae0a3562e54c226e7b9512feb2ea83d0 (patch)
tree      | 2b3886e435b0217d6afd63a213b04d32bb4b4f6f /db
parent    | a696359b248adef0cc8576fce3f473535e995136 (diff)
Imported Upstream version 1.4.0
Diffstat (limited to 'db')
83 files changed, 9695 insertions, 3499 deletions
diff --git a/db/background.h b/db/background.h
new file mode 100644
index 0000000..24ea1cb
--- /dev/null
+++ b/db/background.h
@@ -0,0 +1,56 @@
+/**
+* Copyright (C) 2010 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/* background.h
+
+   Concurrency coordination for administrative operations.
+*/
+
+#pragma once
+
+namespace mongo {
+
+    /* these are administrative operations / jobs
+       for a namespace running in the background, and that only one
+       at a time per namespace is permitted, and that if in progress,
+       you aren't allowed to do other NamespaceDetails major manipulations
+       (such as dropping ns or db) even in the foreground and must
+       instead uassert.
+
+       It's assumed this is not for super-high RPS things, so we don't do
+       anything special in the implementation here to be fast.
+    */
+    class BackgroundOperation : public boost::noncopyable {
+    public:
+        static bool inProgForDb(const char *db);
+        static bool inProgForNs(const char *ns);
+        static void assertNoBgOpInProgForDb(const char *db);
+        static void assertNoBgOpInProgForNs(const char *ns);
+        static void dump(stringstream&);
+
+        /* check for in progress before instantiating */
+        BackgroundOperation(const char *ns);
+
+        virtual ~BackgroundOperation();
+
+    private:
+        NamespaceString _ns;
+        static map<string, unsigned> dbsInProg;
+        static set<string> nsInProg;
+    };
+
+} // namespace mongo
+
diff --git a/db/btree.cpp b/db/btree.cpp
index 8b910f5..18f9e76 100644
--- a/db/btree.cpp
+++ b/db/btree.cpp
@@ -25,6 +25,7 @@
 #include "client.h"
 #include "dbhelpers.h"
 #include "curop.h"
+#include "stats/counters.h"
 
 namespace mongo {
 
@@ -41,6 +42,11 @@
     const int split_debug = 0;
     const int insert_debug = 0;
 
+    static void alreadyInIndex() {
+        // we don't use massert() here as that does logging and this is 'benign' - see catches in _indexRecord()
+        throw MsgAssertionException(10287, "btree: key+recloc already in index");
+    }
+
     /* BucketBasics --------------------------------------------------- */
 
     inline void BucketBasics::modified(const DiskLoc& thisLoc) {
@@ -356,9 +362,36 @@
         return false;
     }
 
+    /* @param self - don't complain about ourself already being in the index case.
+       @return true = there is a duplicate.
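To picture the contract stated above — a key only counts as a duplicate when it is already indexed under a *different* record location — here is a self-contained sketch using standard containers. A std::multimap stands in for the btree and a long long for DiskLoc, so none of this is the actual index code:

    #include <map>
    #include <string>
    #include <utility>
    #include <cassert>

    typedef long long RecordLoc;   // hypothetical stand-in for DiskLoc

    bool wouldCreateDup(const std::multimap<std::string, RecordLoc>& index,
                        const std::string& key, RecordLoc self) {
        typedef std::multimap<std::string, RecordLoc>::const_iterator It;
        std::pair<It, It> range = index.equal_range(key);
        for (It i = range.first; i != range.second; ++i)
            if (i->second != self)
                return true;       // same key under another record: real dup
        return false;              // absent, or only our own entry: benign
    }

    int main() {
        std::multimap<std::string, RecordLoc> index;
        index.insert(std::make_pair("a", 1LL));
        assert(!wouldCreateDup(index, "a", 1));  // re-inserting ourselves
        assert(wouldCreateDup(index, "a", 2));   // would be an E11000 dup
        assert(!wouldCreateDup(index, "b", 2));  // key not present at all
    }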
+    */
+    bool BtreeBucket::wouldCreateDup(
+        const IndexDetails& idx, DiskLoc thisLoc,
+        const BSONObj& key, BSONObj order,
+        DiskLoc self)
+    {
+        int pos;
+        bool found;
+        DiskLoc b = locate(idx, thisLoc, key, order, pos, found, minDiskLoc);
+
+        while ( !b.isNull() ) {
+            // we skip unused keys
+            BtreeBucket *bucket = b.btree();
+            _KeyNode& kn = bucket->k(pos);
+            if ( kn.isUsed() ) {
+                if( bucket->keyAt(pos).woEqual(key) )
+                    return kn.recordLoc != self;
+                break;
+            }
+            b = bucket->advance(b, pos, 1, "BtreeBucket::dupCheck");
+        }
+
+        return false;
+    }
+
     string BtreeBucket::dupKeyError( const IndexDetails& idx , const BSONObj& key ){
         stringstream ss;
-        ss << "E11000 duplicate key error";
+        ss << "E11000 duplicate key error ";
         ss << "index: " << idx.indexNamespace() << " ";
         ss << "dup key: " << key;
         return ss.str();
@@ -391,6 +424,9 @@
             }
         }
 #endif
+
+        globalIndexCounters.btree( (char*)this );
+
         /* binary search for this key */
         bool dupsChecked = false;
         int l=0;
@@ -407,12 +443,19 @@
                     // coding effort in here to make this particularly fast
                     if( !dupsChecked ) {
                         dupsChecked = true;
-                        if( idx.head.btree()->exists(idx, idx.head, key, order) )
-                            uasserted( ASSERT_ID_DUPKEY , dupKeyError( idx , key ) );
+                        if( idx.head.btree()->exists(idx, idx.head, key, order) ) {
+                            if( idx.head.btree()->wouldCreateDup(idx, idx.head, key, order, recordLoc) )
+                                uasserted( ASSERT_ID_DUPKEY , dupKeyError( idx , key ) );
+                            else
+                                alreadyInIndex();
+                        }
                     }
                 }
-                else
+                else {
+                    if( M.recordLoc == recordLoc )
+                        alreadyInIndex();
                     uasserted( ASSERT_ID_DUPKEY , dupKeyError( idx , key ) );
+                }
             }
 
         // dup keys allowed. use recordLoc as if it is part of the key
@@ -444,7 +487,7 @@
     }
 
     void BtreeBucket::delBucket(const DiskLoc& thisLoc, IndexDetails& id) {
-        ClientCursor::informAboutToDeleteBucket(thisLoc);
+        ClientCursor::informAboutToDeleteBucket(thisLoc); // slow...
        assert( !isHead() );
 
         BtreeBucket *p = parent.btreemod();
@@ -466,6 +509,10 @@
             assert(false);
         }
 found:
+        deallocBucket( thisLoc );
+    }
+
+    void BtreeBucket::deallocBucket(const DiskLoc &thisLoc) {
 #if 1
         /* as a temporary defensive measure, we zap the whole bucket, AND don't truly delete
           it (meaning it is ineligible for reuse).
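The "zap the whole bucket, and don't truly delete it" tactic this comment describes can be pictured in miniature: poison the freed block and keep it allocated, so any cursor still holding a pointer reads an obvious pattern instead of someone else's newly allocated data. The bucket size and poison byte below are invented for illustration:

    #include <cstring>

    const int BucketSize = 8192;              // hypothetical size
    struct Bucket { char data[BucketSize]; };

    // Poison the bucket and deliberately do NOT put it on a freelist,
    // so the slot can never be handed out again while stale references
    // to it might still exist.
    void deallocBucket(Bucket* b) {
        std::memset(b->data, 0xdb, sizeof(b->data));
    }

    int main() {
        static Bucket b;
        deallocBucket(&b);   // b stays allocated-but-dead from here on
    }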
@@ -807,13 +854,15 @@ found: return 0; } - out() << "_insert(): key already exists in index\n"; - out() << " " << idx.indexNamespace().c_str() << " thisLoc:" << thisLoc.toString() << '\n'; - out() << " " << key.toString() << '\n'; - out() << " " << "recordLoc:" << recordLoc.toString() << " pos:" << pos << endl; - out() << " old l r: " << childForPos(pos).toString() << ' ' << childForPos(pos+1).toString() << endl; - out() << " new l r: " << lChild.toString() << ' ' << rChild.toString() << endl; - massert( 10287 , "btree: key+recloc already in index", false); + DEV { + out() << "_insert(): key already exists in index (ok for background:true)\n"; + out() << " " << idx.indexNamespace().c_str() << " thisLoc:" << thisLoc.toString() << '\n'; + out() << " " << key.toString() << '\n'; + out() << " " << "recordLoc:" << recordLoc.toString() << " pos:" << pos << endl; + out() << " old l r: " << childForPos(pos).toString() << ' ' << childForPos(pos+1).toString() << endl; + out() << " new l r: " << lChild.toString() << ' ' << rChild.toString() << endl; + } + alreadyInIndex(); } DEBUGGING out() << "TEMP: key: " << key.toString() << endl; @@ -926,12 +975,11 @@ namespace mongo { b->k(1).setUnused(); b->dumpTree(id.head, order); - cout << "---\n"; b->bt_insert(id.head, A, key, order, false, id); b->dumpTree(id.head, order); - cout << "---\n";*/ + */ // this should assert. does it? (it might "accidentally" though, not asserting proves a problem, asserting proves nothing) b->bt_insert(id.head, C, key, order, false, id); @@ -1004,20 +1052,27 @@ namespace mongo { BSONObj k; DiskLoc r; x->popBack(r,k); - if( x->n == 0 ) - log() << "warning: empty bucket on BtreeBuild " << k.toString() << endl; + bool keepX = ( x->n != 0 ); + DiskLoc keepLoc = keepX ? xloc : x->nextChild; - if ( ! up->_pushBack(r, k, order, xloc) ){ + if ( ! 
up->_pushBack(r, k, order, keepLoc) ){ // current bucket full DiskLoc n = BtreeBucket::addBucket(idx); up->tempNext() = n; upLoc = n; up = upLoc.btreemod(); - up->pushBack(r, k, order, xloc); + up->pushBack(r, k, order, keepLoc); } - xloc = x->tempNext(); /* get next in chain at current level */ - x->parent = upLoc; + DiskLoc nextLoc = x->tempNext(); /* get next in chain at current level */ + if ( keepX ) { + x->parent = upLoc; + } else { + if ( !x->nextChild.isNull() ) + x->nextChild.btreemod()->parent = upLoc; + x->deallocBucket( xloc ); + } + xloc = nextLoc; } loc = upStart; @@ -20,7 +20,7 @@ #include "../stdafx.h" #include "jsobj.h" -#include "storage.h" +#include "diskloc.h" #include "pdfile.h" namespace mongo { @@ -28,8 +28,8 @@ namespace mongo { #pragma pack(1) struct _KeyNode { - DiskLoc prevChildBucket; - DiskLoc recordLoc; + DiskLoc prevChildBucket; // the lchild + DiskLoc recordLoc; // location of the record associated with the key short keyDataOfs() const { return (short) _kdo; } @@ -53,10 +53,10 @@ namespace mongo { */ recordLoc.GETOFS() |= 1; } - int isUnused() { + int isUnused() const { return recordLoc.getOfs() & 1; } - int isUsed() { + int isUsed() const { return !isUnused(); } }; @@ -85,13 +85,18 @@ namespace mongo { bool isHead() { return parent.isNull(); } void assertValid(const BSONObj &order, bool force = false); int fullValidate(const DiskLoc& thisLoc, const BSONObj &order); /* traverses everything */ - protected: - void modified(const DiskLoc& thisLoc); + KeyNode keyNode(int i) const { - assert( i < n ); + if ( i >= n ){ + massert( 13000 , (string)"invalid keyNode: " + BSON( "i" << i << "n" << n ).jsonString() , i < n ); + } return KeyNode(*this, k(i)); } + protected: + + void modified(const DiskLoc& thisLoc); + char * dataAt(short ofs) { return data + ofs; } @@ -151,6 +156,10 @@ namespace mongo { ss << " emptySize: " << emptySize << " topSize: " << topSize << endl; return ss.str(); } + + bool isUsed( int i ) const { + return k(i).isUsed(); + } protected: void _shape(int level, stringstream&); @@ -184,7 +193,13 @@ namespace mongo { */ bool exists(const IndexDetails& idx, DiskLoc thisLoc, const BSONObj& key, BSONObj order); + bool wouldCreateDup( + const IndexDetails& idx, DiskLoc thisLoc, + const BSONObj& key, BSONObj order, + DiskLoc self); + static DiskLoc addBucket(IndexDetails&); /* start a new index off, empty */ + void deallocBucket(const DiskLoc &thisLoc); // clear bucket memory, placeholder for deallocation static void renameIndexNamespace(const char *oldNs, const char *newNs); @@ -256,6 +271,7 @@ namespace mongo { virtual void noteLocation(); // updates keyAtKeyOfs... virtual void checkLocation(); + virtual bool supportGetMore() { return true; } /* used for multikey index traversal to avoid sending back dups. see Matcher::matches(). 
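A sketch of the dedup problem mentioned here: a multikey index can return the same record once per matching array element, so the cursor remembers which record locations it has already surrendered, in the spirit of getsetdup(DiskLoc). This is a toy model, not the real cursor code:

    #include <set>
    #include <iostream>

    typedef long long DiskLocT;   // hypothetical stand-in for DiskLoc

    struct DedupCursor {
        std::set<DiskLocT> seen;
        // returns true if loc was already returned; otherwise records it
        bool getsetdup(DiskLocT loc) { return !seen.insert(loc).second; }
    };

    int main() {
        DedupCursor c;
        DiskLocT hits[] = { 7, 9, 7, 7, 12 };  // record 7 matches 3 array keys
        for (int i = 0; i < 5; i++)
            if (!c.getsetdup(hits[i]))
                std::cout << "return record " << hits[i] << "\n"; // 7, 9, 12 once each
    }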
if a multikey index traversal: @@ -318,15 +334,20 @@ namespace mongo { return key.replaceFieldNames( indexDetails.keyPattern() ).clientReadable(); } - virtual BSONObj prettyStartKey() const { - return prettyKey( startKey ); - } - virtual BSONObj prettyEndKey() const { - return prettyKey( endKey ); + virtual BSONObj prettyIndexBounds() const { + BSONArrayBuilder ba; + if ( bounds_.size() == 0 ) { + ba << BSON_ARRAY( prettyKey( startKey ) << prettyKey( endKey ) ); + } else { + for( BoundList::const_iterator i = bounds_.begin(); i != bounds_.end(); ++i ) { + ba << BSON_ARRAY( prettyKey( i->first ) << prettyKey( i->second ) ); + } + } + return ba.arr(); } void forgetEndKey() { endKey = BSONObj(); } - + private: /* Our btrees may (rarely) have "unused" keys when items are deleted. Skip past them. @@ -362,6 +383,7 @@ namespace mongo { DiskLoc locAtKeyOfs; BoundList bounds_; unsigned boundIndex_; + const IndexSpec& _spec; }; #pragma pack() @@ -369,6 +391,9 @@ namespace mongo { inline bool IndexDetails::hasKey(const BSONObj& key) { return head.btree()->exists(*this, head, key, keyPattern()); } + inline bool IndexDetails::wouldCreateDup(const BSONObj& key, DiskLoc self) { + return head.btree()->wouldCreateDup(*this, head, key, keyPattern(), self); + } /* build btree from the bottom up */ /* _ TODO dropDups */ diff --git a/db/btreecursor.cpp b/db/btreecursor.cpp index bb477d6..ab15c44 100644 --- a/db/btreecursor.cpp +++ b/db/btreecursor.cpp @@ -36,7 +36,8 @@ namespace mongo { indexDetails( _id ), order( _id.keyPattern() ), direction( _direction ), - boundIndex_() + boundIndex_(), + _spec( _id.getSpec() ) { audit(); init(); @@ -51,7 +52,8 @@ namespace mongo { order( _id.keyPattern() ), direction( _direction ), bounds_( _bounds ), - boundIndex_() + boundIndex_(), + _spec( _id.getSpec() ) { assert( !bounds_.empty() ); audit(); @@ -74,6 +76,10 @@ namespace mongo { } void BtreeCursor::init() { + if ( _spec.getType() ){ + startKey = _spec.getType()->fixKey( startKey ); + endKey = _spec.getType()->fixKey( endKey ); + } bool found; bucket = indexDetails.head.btree()-> locate(indexDetails, indexDetails.head, startKey, order, keyOfs, found, direction > 0 ? minDiskLoc : maxDiskLoc, direction); @@ -88,7 +94,7 @@ namespace mongo { init(); } while ( !ok() && ++boundIndex_ < bounds_.size() ); } - + /* skip unused keys. */ void BtreeCursor::skipUnusedKeys() { int u = 0; diff --git a/db/client.cpp b/db/client.cpp index 68a0c9e..dc82a25 100644 --- a/db/client.cpp +++ b/db/client.cpp @@ -1,5 +1,5 @@ -// client.cpp
-
+// client.cpp + /** * Copyright (C) 2009 10gen Inc. * @@ -25,40 +25,41 @@ #include "client.h" #include "curop.h" #include "json.h" - +#include "security.h" + namespace mongo { - boost::mutex Client::clientsMutex; + mongo::mutex Client::clientsMutex; set<Client*> Client::clients; // always be in clientsMutex when manipulating this boost::thread_specific_ptr<Client> currentClient; Client::Client(const char *desc) : - _curOp(new CurOp()), - _database(0), _ns("")/*, _nsstr("")*/ - ,_shutdown(false), + _context(0), + _shutdown(false), _desc(desc), _god(0) - { - ai = new AuthenticationInfo(); - boostlock bl(clientsMutex); + { + _curOp = new CurOp( this ); + scoped_lock bl(clientsMutex); clients.insert(this); } Client::~Client() { delete _curOp; - delete ai; - ai = 0; _god = 0; - if ( !_shutdown ) { - cout << "ERROR: Client::shutdown not called!" << endl; - } + + if ( _context ) + cout << "ERROR: Client::~Client _context should be NULL: " << _desc << endl; + if ( !_shutdown ) + cout << "ERROR: Client::shutdown not called: " << _desc << endl; } bool Client::shutdown(){ _shutdown = true; - + if ( inShutdown() ) + return false; { - boostlock bl(clientsMutex); + scoped_lock bl(clientsMutex); clients.erase(this); } @@ -68,8 +69,10 @@ namespace mongo { didAnything = true; for ( list<string>::iterator i = _tempCollections.begin(); i!=_tempCollections.end(); i++ ){ string ns = *i; + Top::global.collectionDropped( ns ); + dblock l; - setClient( ns.c_str() ); + Client::Context ctx( ns ); if ( ! nsdetails( ns.c_str() ) ) continue; try { @@ -88,12 +91,158 @@ namespace mongo { } BSONObj CurOp::_tooBig = fromjson("{\"$msg\":\"query not recording (too large)\"}"); - WrappingInt CurOp::_nextOpNum; + AtomicUInt CurOp::_nextOpNum; - Client::Context::Context( string ns , Database * db ) - : _client( currentClient.get() ) { + Client::Context::Context( string ns , Database * db, bool doauth ) + : _client( currentClient.get() ) , _oldContext( _client->_context ) , + _path( dbpath ) , _lock(0) , _justCreated(false) { assert( db && db->isOk() ); - _client->setns( ns.c_str() , db ); + _ns = ns; + _db = db; + _client->_context = this; + if ( doauth ) + _auth(); + } + + void Client::Context::_finishInit( bool doauth ){ + int lockState = dbMutex.getState(); + assert( lockState ); + + _db = dbHolder.get( _ns , _path ); + if ( _db ){ + _justCreated = false; + } + else if ( dbMutex.getState() > 0 ){ + // already in a write lock + _db = dbHolder.getOrCreate( _ns , _path , _justCreated ); + assert( _db ); + } + else if ( dbMutex.getState() < -1 ){ + // nested read lock :( + assert( _lock ); + _lock->releaseAndWriteLock(); + _db = dbHolder.getOrCreate( _ns , _path , _justCreated ); + assert( _db ); + } + else { + // we have a read lock, but need to get a write lock for a bit + // we need to be in a write lock since we're going to create the DB object + // to do that, we're going to unlock, then get a write lock + // this is so that if this is the first query and its long doesn't block db + // we just have to check that the db wasn't closed in the interim where we unlock + for ( int x=0; x<2; x++ ){ + { + dbtemprelease unlock; + writelock lk( _ns ); + dbHolder.getOrCreate( _ns , _path , _justCreated ); + } + + _db = dbHolder.get( _ns , _path ); + + if ( _db ) + break; + + log() << "db was closed on us right after we opened it: " << _ns << endl; + } + + uassert( 13005 , "can't create db, keeps getting closed" , _db ); + } + + _client->_context = this; + _client->_curOp->enter( this ); + if ( doauth ) + _auth( lockState ); + } + + 
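The locking in _finishInit above deserves a gloss: a database can only be created under the write lock, so a reading thread drops its shared lock, creates the database under an exclusive lock, re-takes the shared lock, and then re-checks that the database wasn't closed during the unlocked window — hence the two-attempt loop and uassert 13005. A rough equivalent with standard primitives, where std::shared_mutex and a plain map stand in for dbMutex and dbHolder:

    #include <shared_mutex>
    #include <map>
    #include <string>
    #include <iostream>

    std::shared_mutex dbLock;           // stand-in for dbMutex
    std::map<std::string, int> dbs;     // stand-in for dbHolder

    // Caller holds dbLock shared. Returns the db, creating it if needed.
    int* getOrCreateDb(const std::string& name) {
        for (int attempt = 0; attempt < 2; ++attempt) {
            {   // give up the read lock, create under the write lock
                dbLock.unlock_shared();
                std::unique_lock<std::shared_mutex> w(dbLock);
                dbs.insert(std::make_pair(name, 0));  // no-op if present
            }   // write lock released here
            dbLock.lock_shared();                     // back to read lock
            std::map<std::string, int>::iterator it = dbs.find(name);
            if (it != dbs.end())
                return &it->second;                   // still there: done
            // closed between our unlock and relock; retry once
            std::cerr << "db was closed on us right after we opened it\n";
        }
        return 0;   // caller treats as an error (cf. uassert 13005)
    }

    int main() {
        dbLock.lock_shared();           // callers arrive read-locked
        int* db = getOrCreateDb("test");
        dbLock.unlock_shared();
        return db ? 0 : 1;
    }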
void Client::Context::_auth( int lockState ){ + if ( _client->_ai.isAuthorizedForLock( _db->name , lockState ) ) + return; + + // before we assert, do a little cleanup + _client->_context = _oldContext; // note: _oldContext may be null + + stringstream ss; + ss << "unauthorized for db [" << _db->name << "] lock type: " << lockState << endl; + massert( 10057 , ss.str() , 0 ); + } + + Client::Context::~Context() { + DEV assert( _client == currentClient.get() ); + _client->_curOp->leave( this ); + _client->_context = _oldContext; // note: _oldContext may be null + } + + string Client::toString() const { + stringstream ss; + if ( _curOp ) + ss << _curOp->infoNoauth().jsonString(); + return ss.str(); + } + + string sayClientState(){ + Client* c = currentClient.get(); + if ( ! c ) + return "no client"; + return c->toString(); + } + + void curopWaitingForLock( int type ){ + Client * c = currentClient.get(); + assert( c ); + CurOp * co = c->curop(); + if ( co ){ + co->waitingForLock( type ); + } + } + void curopGotLock(){ + Client * c = currentClient.get(); + assert(c); + CurOp * co = c->curop(); + if ( co ){ + co->gotLock(); + } + } + + BSONObj CurOp::infoNoauth() { + BSONObjBuilder b; + b.append("opid", _opNum); + bool a = _active && _start; + b.append("active", a); + if ( _lockType ) + b.append("lockType" , _lockType > 0 ? "write" : "read" ); + b.append("waitingForLock" , _waitingForLock ); + + if( a ){ + b.append("secs_running", elapsedSeconds() ); + } + + b.append( "op" , opToString( _op ) ); + + b.append("ns", _ns); + + if( haveQuery() ) { + b.append("query", query()); + } + // b.append("inLock", ?? + stringstream clientStr; + clientStr << inet_ntoa( _remote.sin_addr ) << ":" << ntohs( _remote.sin_port ); + b.append("client", clientStr.str()); + + if ( _client ) + b.append( "desc" , _client->desc() ); + + if ( ! _message.empty() ){ + if ( _progressMeter.isActive() ){ + StringBuilder buf(128); + buf << _message << " " << _progressMeter.toString(); + b.append( "msg" , buf.str() ); + } + else { + b.append( "msg" , _message ); + } + } + + return b.obj(); } } diff --git a/db/client.h b/db/client.h index 99092ca..ab43509 100644 --- a/db/client.h +++ b/db/client.h @@ -1,5 +1,5 @@ -// client.h
-
+// client.h + /** * Copyright (C) 2008 10gen Inc. * @@ -25,9 +25,10 @@ #pragma once #include "../stdafx.h" +#include "security.h" #include "namespace.h" #include "lasterror.h" -#include "../util/top.h" +#include "stats/top.h" namespace mongo { @@ -39,12 +40,9 @@ namespace mongo { extern boost::thread_specific_ptr<Client> currentClient; - bool setClient(const char *ns, const string& path=dbpath, mongolock *lock = 0); - - class Client : boost::noncopyable { public: - static boost::mutex clientsMutex; + static mongo::mutex clientsMutex; static set<Client*> clients; // always be in clientsMutex when manipulating this class GodScope { @@ -57,71 +55,125 @@ namespace mongo { /* Set database we want to use, then, restores when we finish (are out of scope) Note this is also helpful if an exception happens as the state if fixed up. */ - class Context { + class Context : boost::noncopyable{ Client * _client; - Database * _olddb; - string _oldns; + Context * _oldContext; + + string _path; + mongolock * _lock; + bool _justCreated; + + string _ns; + Database * _db; + + /** + * at this point _client, _oldContext and _ns have to be set + * _db should not have been touched + * this will set _db and create if needed + * will also set _client->_context to this + */ + void _finishInit( bool doauth=true); + + void _auth( int lockState = dbMutex.getState() ); public: - Context(const char *ns) - : _client( currentClient.get() ) { - _olddb = _client->_database; - _oldns = _client->_ns; - setClient(ns); - } - Context(string ns) - : _client( currentClient.get() ){ - _olddb = _client->_database; - _oldns = _client->_ns; - setClient(ns.c_str()); + Context(const string& ns, string path=dbpath, mongolock * lock = 0 , bool doauth=true ) + : _client( currentClient.get() ) , _oldContext( _client->_context ) , + _path( path ) , _lock( lock ) , + _ns( ns ){ + _finishInit( doauth ); } /* this version saves the context but doesn't yet set the new one: */ - Context() - : _client( currentClient.get() ) { - _olddb = _client->database(); - _oldns = _client->ns(); + Context() + : _client( currentClient.get() ) , _oldContext( _client->_context ), + _path( dbpath ) , _lock(0) , _justCreated(false){ + _client->_context = this; + clear(); } /** * if you are doing this after allowing a write there could be a race condition * if someone closes that db. 
this checks that the DB is still valid */ - Context( string ns , Database * db ); + Context( string ns , Database * db, bool doauth=true ); + + ~Context(); + + Client* getClient() const { return _client; } + + Database* db() const { + return _db; + } - ~Context() { - DEV assert( _client == currentClient.get() ); - _client->setns( _oldns.c_str(), _olddb ); + const char * ns() const { + return _ns.c_str(); + } + + bool justCreated() const { + return _justCreated; } - }; + bool equals( const string& ns , const string& path=dbpath ) const { + return _ns == ns && _path == path; + } + + bool inDB( const string& db , const string& path=dbpath ) const { + if ( _path != path ) + return false; + + if ( db == _ns ) + return true; + + string::size_type idx = _ns.find( db ); + if ( idx != 0 ) + return false; + + return _ns[db.size()] == '.'; + } + void clear(){ + _ns = ""; + _db = 0; + } + + /** + * call before unlocking, so clear any non-thread safe state + */ + void unlocked(){ + _db = 0; + } + + /** + * call after going back into the lock, will re-establish non-thread safe stuff + */ + void relocked(){ + _finishInit(); + } + + friend class CurOp; + }; + private: - CurOp * const _curOp; - Database *_database; - Namespace _ns; - //NamespaceString _nsstr; + CurOp * _curOp; + Context * _context; bool _shutdown; list<string> _tempCollections; const char *_desc; bool _god; + AuthenticationInfo _ai; + public: - AuthenticationInfo *ai; - Top top; + + AuthenticationInfo * getAuthenticationInfo(){ return &_ai; } + bool isAdmin() { return _ai.isAuthorized( "admin" ); } CurOp* curop() { return _curOp; } - Database* database() { - return _database; - } - const char *ns() { return _ns.buf; } - - void setns(const char *ns, Database *db) { - _database = db; - _ns = ns; - //_nsstr = ns; - } - void clearns() { setns("", 0); } - + + Context* getContext(){ return _context; } + Database* database() { return _context ? _context->db() : 0; } + const char *ns() { return _context->ns(); } + Client(const char *desc); ~Client(); @@ -143,6 +195,10 @@ namespace mongo { bool shutdown(); bool isGod() const { return _god; } + + friend class CurOp; + + string toString() const; }; inline Client& cc() { @@ -182,12 +238,15 @@ namespace mongo { dbMutex.unlock_shared(); dbMutex.lock(); - /* this is defensive; as we were unlocked for a moment above, - the Database object we reference could have been deleted: - */ - cc().clearns(); + if ( cc().getContext() ) + cc().getContext()->unlocked(); } } - + + string sayClientState(); + + inline bool haveClient(){ + return currentClient.get() > 0; + } }; diff --git a/db/clientcursor.cpp b/db/clientcursor.cpp index 0de0b2e..be0bd2f 100644 --- a/db/clientcursor.cpp +++ b/db/clientcursor.cpp @@ -36,7 +36,7 @@ namespace mongo { boost::recursive_mutex ClientCursor::ccmutex; unsigned ClientCursor::byLocSize() { - recursive_boostlock lock(ccmutex); + recursive_scoped_lock lock(ccmutex); return byLoc.size(); } @@ -63,7 +63,7 @@ namespace mongo { /* todo: this implementation is incomplete. we use it as a prefix for dropDatabase, which works fine as the prefix will end with '.'. however, when used with drop and - deleteIndexes, this could take out cursors that belong to something else -- if you + dropIndexes, this could take out cursors that belong to something else -- if you drop "foo", currently, this will kill cursors for "foobar". 
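The caveat in this comment — dropping "foo" would also kill cursors for "foobar" — is exactly what the dot-boundary test in the new Client::Context::inDB (client.h hunk above) avoids. As a standalone sketch (a hypothetical helper, not part of this patch):

    #include <string>
    #include <cassert>

    bool inDB(const std::string& ns, const std::string& db) {
        if (ns.compare(0, db.size(), db) != 0)
            return false;                  // not even a string prefix
        return ns.size() == db.size()      // exactly the db name, or
            || ns[db.size()] == '.';       // followed by the namespace dot
    }

    int main() {
        assert(inDB("foo.bar", "foo"));
        assert(!inDB("foobar.baz", "foo")); // the case the comment warns about
        assert(inDB("foo", "foo"));
    }

The same boundary logic is why dropDatabase is safe with the current prefix-based invalidate: its prefix already ends with '.'.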
*/ void ClientCursor::invalidate(const char *nsPrefix) { @@ -73,7 +73,7 @@ namespace mongo { assert( len > 0 && strchr(nsPrefix, '.') ); { - recursive_boostlock lock(ccmutex); + recursive_scoped_lock lock(ccmutex); for ( CCByLoc::iterator i = byLoc.begin(); i != byLoc.end(); ++i ) { ClientCursor *cc = i->second; @@ -88,7 +88,7 @@ namespace mongo { /* called every 4 seconds. millis is amount of idle time passed since the last call -- could be zero */ void ClientCursor::idleTimeReport(unsigned millis) { - recursive_boostlock lock(ccmutex); + recursive_scoped_lock lock(ccmutex); for ( CCByLoc::iterator i = byLoc.begin(); i != byLoc.end(); ) { CCByLoc::iterator j = i; i++; @@ -104,7 +104,7 @@ namespace mongo { note this is potentially slow */ void ClientCursor::informAboutToDeleteBucket(const DiskLoc& b) { - recursive_boostlock lock(ccmutex); + recursive_scoped_lock lock(ccmutex); RARELY if ( byLoc.size() > 70 ) { log() << "perf warning: byLoc.size=" << byLoc.size() << " in aboutToDeleteBucket\n"; } @@ -117,7 +117,7 @@ namespace mongo { /* must call this on a delete so we clean up the cursors. */ void ClientCursor::aboutToDelete(const DiskLoc& dl) { - recursive_boostlock lock(ccmutex); + recursive_scoped_lock lock(ccmutex); CCByLoc::iterator j = byLoc.lower_bound(dl); CCByLoc::iterator stop = byLoc.upper_bound(dl); @@ -170,7 +170,7 @@ namespace mongo { assert( pos != -2 ); { - recursive_boostlock lock(ccmutex); + recursive_scoped_lock lock(ccmutex); setLastLoc_inlock( DiskLoc() ); // removes us from bylocation multimap clientCursorsById.erase(cursorid); @@ -193,7 +193,7 @@ namespace mongo { return; } { - recursive_boostlock lock(ccmutex); + recursive_scoped_lock lock(ccmutex); setLastLoc_inlock(cl); c->noteLocation(); } @@ -217,7 +217,7 @@ namespace mongo { static bool inEmpty = false; if( test && !inEmpty ) { inEmpty = true; - log() << "TEST: manipulate collection during remove" << endl; + log() << "TEST: manipulate collection during cc:yield" << endl; if( test == 1 ) Helpers::emptyCollection(ns.c_str()); else if( test == 2 ) { @@ -267,8 +267,9 @@ namespace mongo { virtual void help( stringstream& help ) const { help << " example: { cursorInfo : 1 }"; } + virtual LockType locktype(){ return NONE; } bool run(const char *dbname, BSONObj& jsobj, string& errmsg, BSONObjBuilder& result, bool fromRepl ){ - recursive_boostlock lock(ClientCursor::ccmutex); + recursive_scoped_lock lock(ClientCursor::ccmutex); result.append("byLocation_size", unsigned( ClientCursor::byLoc.size() ) ); result.append("clientCursors_size", unsigned( ClientCursor::clientCursorsById.size() ) ); return true; diff --git a/db/clientcursor.h b/db/clientcursor.h index 03f20e9..42919e3 100644 --- a/db/clientcursor.h +++ b/db/clientcursor.h @@ -28,7 +28,7 @@ #include "cursor.h" #include "jsobj.h" #include "../util/message.h" -#include "storage.h" +#include "diskloc.h" #include "dbhelpers.h" #include "matcher.h" @@ -83,7 +83,7 @@ namespace mongo { _c = 0; } Pointer(long long cursorid) { - recursive_boostlock lock(ccmutex); + recursive_scoped_lock lock(ccmutex); _c = ClientCursor::find_inlock(cursorid, true); if( _c ) { if( _c->_pinValue >= 100 ) { @@ -105,8 +105,15 @@ namespace mongo { int pos; // # objects into the cursor so far BSONObj query; - ClientCursor() : _idleAgeMillis(0), _pinValue(0), _doingDeletes(false), pos(0) { - recursive_boostlock lock(ccmutex); + ClientCursor(auto_ptr<Cursor>& _c, const char *_ns, bool okToTimeout) : + _idleAgeMillis(0), _pinValue(0), + _doingDeletes(false), + ns(_ns), c(_c), + pos(0) + { + if( 
!okToTimeout ) + noTimeout(); + recursive_scoped_lock lock(ccmutex); cursorid = allocCursorId_inlock(); clientCursorsById.insert( make_pair(cursorid, this) ); } @@ -116,11 +123,11 @@ namespace mongo { return _lastLoc; } - auto_ptr< FieldMatcher > filter; // which fields query wants returned + shared_ptr< FieldMatcher > fields; // which fields query wants returned Message originalMessage; // this is effectively an auto ptr for data the matcher points to /* Get rid of cursors for namespaces that begin with nsprefix. - Used by drop, deleteIndexes, dropDatabase. + Used by drop, dropIndexes, dropDatabase. */ static void invalidate(const char *nsPrefix); @@ -130,7 +137,8 @@ namespace mongo { * we don't do herein as this->matcher (above) is only initialized for true queries/getmore. * (ie not set for remote/update) * @return if the cursor is still valid. - * if false is returned, then this ClientCursor should be considered deleted + * if false is returned, then this ClientCursor should be considered deleted - + * in fact, the whole database could be gone. */ bool yield(); private: @@ -147,16 +155,16 @@ namespace mongo { } public: static ClientCursor* find(CursorId id, bool warn = true) { - recursive_boostlock lock(ccmutex); + recursive_scoped_lock lock(ccmutex); ClientCursor *c = find_inlock(id, warn); // if this asserts, your code was not thread safe - you either need to set no timeout // for the cursor or keep a ClientCursor::Pointer in scope for it. - massert( 12521, "internal error: use of an unlocked ClientCursor", c->_pinValue ); + massert( 12521, "internal error: use of an unlocked ClientCursor", c == 0 || c->_pinValue ); return c; } static bool erase(CursorId id) { - recursive_boostlock lock(ccmutex); + recursive_scoped_lock lock(ccmutex); ClientCursor *cc = find_inlock(id); if ( cc ) { assert( cc->_pinValue < 100 ); // you can't still have an active ClientCursor::Pointer @@ -195,13 +203,13 @@ namespace mongo { } static void idleTimeReport(unsigned millis); - +private: // cursors normally timeout after an inactivy period to prevent excess memory use // setting this prevents timeout of the cursor in question. void noTimeout() { _pinValue++; } - +public: void setDoingDeletes( bool doingDeletes ){ _doingDeletes = doingDeletes; } diff --git a/db/cloner.cpp b/db/cloner.cpp index 862f37c..d300721 100644 --- a/db/cloner.cpp +++ b/db/cloner.cpp @@ -46,6 +46,7 @@ namespace mongo { snapshot - use $snapshot mode for copying collections. note this should not be used when it isn't required, as it will be slower. for example repairDatabase need not use it. */ + void setConnection( DBClientWithCommands *c ) { conn.reset( c ); } bool go(const char *masterHost, string& errmsg, const string& fromdb, bool logForRepl, bool slaveOk, bool useReplAuth, bool snapshot); bool startCloneCollection( const char *fromhost, const char *ns, const BSONObj &query, string& errmsg, bool logForRepl, bool copyIndexes, int logSizeMb, long long &cursorId ); bool finishCloneCollection( const char *fromhost, const char *ns, const BSONObj &query, long long cursorId, string &errmsg ); @@ -97,11 +98,11 @@ namespace mongo { list<BSONObj> storedForLater; - assert( c.get() ); + massert( 13055 , "socket error in Cloner:copy" , c.get() ); long long n = 0; time_t saveLast = time( 0 ); while ( 1 ) { - { + if( !c->moreInCurrentBatch() || n % 128 == 127 /*yield some*/ ) { dbtemprelease r; if ( !c->more() ) break; @@ -111,7 +112,7 @@ namespace mongo { /* assure object is valid. note this will slow us down a little. 
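Note the yield cadence introduced in the hunk above: Cloner::copy now holds the lock while draining the current batch and releases it between batches or every 128th document (n % 128 == 127), rather than on every document. The pattern, with a plain std::mutex standing in for the global db lock:

    #include <mutex>
    #include <vector>

    std::mutex dbLock;                   // stand-in for the global db lock

    void copyAll(const std::vector<int>& docs) {
        std::unique_lock<std::mutex> lk(dbLock);
        long long n = 0;
        for (size_t i = 0; i < docs.size(); ++i, ++n) {
            if (n % 128 == 127) {        // yield some: let other ops interleave
                lk.unlock();             // like dbtemprelease in the hunk
                lk.lock();
            }
            // ... insert docs[i] into the target collection here ...
        }
    }

    int main() {
        copyAll(std::vector<int>(1000, 42));
    }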
*/ if ( !tmp.valid() ) { stringstream ss; - ss << "skipping corrupt object from " << from_collection; + ss << "Cloner: skipping corrupt object from " << from_collection; BSONElement e = tmp.firstElement(); try { e.validate(); @@ -191,7 +192,9 @@ namespace mongo { auto_ptr<DBClientCursor> c; { - if ( !masterSameProcess ) { + if ( conn.get() ) { + // nothing to do + } else if ( !masterSameProcess ) { auto_ptr< DBClientConnection > c( new DBClientConnection() ); if ( !c->connect( masterHost, errmsg ) ) return false; @@ -215,7 +218,7 @@ namespace mongo { log(2) << "\t cloner got " << collection << endl; - BSONElement e = collection.findElement("name"); + BSONElement e = collection.getField("name"); if ( e.eoo() ) { string s = "bad system.namespaces object " + collection.toString(); massert( 10290 , s.c_str(), false); @@ -231,12 +234,11 @@ namespace mongo { continue; } } - else if( strchr(from_name, '$') ) { + if( strchr(from_name, '$') ) { // don't clone index namespaces -- we take care of those separately below. log(2) << "\t\t not cloning because has $ " << endl; continue; } - toClone.push_back( collection.getOwned() ); } } @@ -414,6 +416,7 @@ namespace mongo { virtual bool slaveOk() { return false; } + virtual LockType locktype(){ return WRITE; } virtual void help( stringstream &help ) const { help << "clone this database from an instance of the db on another host\n"; help << "example: { clone : \"host13\" }"; @@ -436,6 +439,7 @@ namespace mongo { virtual bool slaveOk() { return false; } + virtual LockType locktype(){ return WRITE; } CmdCloneCollection() : Command("cloneCollection") { } virtual void help( stringstream &help ) const { help << " example: { cloneCollection: <collection ns>, from: <hostname>, query: <query> }"; @@ -462,7 +466,7 @@ namespace mongo { /* replication note: we must logOp() not the command, but the cloned data -- if the slave were to clone it would get a different point-in-time and not match. */ - setClient( collection.c_str() ); + Client::Context ctx( collection ); log() << "cloneCollection. db:" << ns << " collection:" << collection << " from: " << fromhost << " query: " << query << " logSizeMb: " << logSizeMb << ( copyIndexes ? "" : ", not copying indexes" ) << endl; @@ -479,6 +483,7 @@ namespace mongo { virtual bool slaveOk() { return false; } + virtual LockType locktype(){ return WRITE; } CmdStartCloneCollection() : Command("startCloneCollection") { } virtual void help( stringstream &help ) const { help << " example: { startCloneCollection: <collection ns>, from: <hostname>, query: <query> }"; @@ -506,7 +511,7 @@ namespace mongo { /* replication note: we must logOp() not the command, but the cloned data -- if the slave were to clone it would get a different point-in-time and not match. */ - setClient( collection.c_str() ); + Client::Context ctx(collection); log() << "startCloneCollection. db:" << ns << " collection:" << collection << " from: " << fromhost << " query: " << query << endl; @@ -532,6 +537,7 @@ namespace mongo { virtual bool slaveOk() { return false; } + virtual LockType locktype(){ return WRITE; } CmdFinishCloneCollection() : Command("finishCloneCollection") { } virtual void help( stringstream &help ) const { help << " example: { finishCloneCollection: <finishToken> }"; @@ -562,7 +568,7 @@ namespace mongo { cursorId = cursorIdToken._numberLong(); } - setClient( collection.c_str() ); + Client::Context ctx( collection ); log() << "finishCloneCollection. 
db:" << ns << " collection:" << collection << " from: " << fromhost << " query: " << query << endl; @@ -571,8 +577,50 @@ namespace mongo { } } cmdfinishclonecollection; + thread_specific_ptr< DBClientConnection > authConn_; + /* Usage: + admindb.$cmd.findOne( { copydbgetnonce: 1, fromhost: <hostname> } ); + */ + class CmdCopyDbGetNonce : public Command { + public: + CmdCopyDbGetNonce() : Command("copydbgetnonce") { } + virtual bool adminOnly() { + return true; + } + virtual bool slaveOk() { + return false; + } + virtual LockType locktype(){ return WRITE; } + virtual void help( stringstream &help ) const { + help << "get a nonce for subsequent copy db request from secure server\n"; + help << "usage: {copydbgetnonce: 1, fromhost: <hostname>}"; + } + virtual bool run(const char *ns, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) { + string fromhost = cmdObj.getStringField("fromhost"); + if ( fromhost.empty() ) { + /* copy from self */ + stringstream ss; + ss << "localhost:" << cmdLine.port; + fromhost = ss.str(); + } + authConn_.reset( new DBClientConnection() ); + BSONObj ret; + { + dbtemprelease t; + if ( !authConn_->connect( fromhost, errmsg ) ) + return false; + if( !authConn_->runCommand( "admin", BSON( "getnonce" << 1 ), ret ) ) { + errmsg = "couldn't get nonce " + string( ret ); + return false; + } + } + result.appendElements( ret ); + return true; + } + } cmdcopydbgetnonce; + /* Usage: - admindb.$cmd.findOne( { copydb: 1, fromhost: <hostname>, fromdb: <db>, todb: <db> } ); + admindb.$cmd.findOne( { copydb: 1, fromhost: <hostname>, fromdb: <db>, todb: <db>[, username: <username>, nonce: <nonce>, key: <key>] } ); */ class CmdCopyDb : public Command { public: @@ -583,9 +631,10 @@ namespace mongo { virtual bool slaveOk() { return false; } + virtual LockType locktype(){ return WRITE; } virtual void help( stringstream &help ) const { - help << "copy a database from antoher host to this host\n"; - help << "usage: {copydb: 1, fromhost: <hostname>, fromdb: <db>, todb: <db>}"; + help << "copy a database from another host to this host\n"; + help << "usage: {copydb: 1, fromhost: <hostname>, fromdb: <db>, todb: <db>[, username: <username>, nonce: <nonce>, key: <key>]}"; } virtual bool run(const char *ns, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) { string fromhost = cmdObj.getStringField("fromhost"); @@ -601,9 +650,24 @@ namespace mongo { errmsg = "parms missing - {copydb: 1, fromhost: <hostname>, fromdb: <db>, todb: <db>}"; return false; } - setClient(todb.c_str()); - bool res = cloneFrom(fromhost.c_str(), errmsg, fromdb, /*logForReplication=*/!fromRepl, /*slaveok*/false, /*replauth*/false, /*snapshot*/true); - cc().clearns(); + Cloner c; + string username = cmdObj.getStringField( "username" ); + string nonce = cmdObj.getStringField( "nonce" ); + string key = cmdObj.getStringField( "key" ); + if ( !username.empty() && !nonce.empty() && !key.empty() ) { + uassert( 13008, "must call copydbgetnonce first", authConn_.get() ); + BSONObj ret; + { + dbtemprelease t; + if ( !authConn_->runCommand( fromdb, BSON( "authenticate" << 1 << "user" << username << "nonce" << nonce << "key" << key ), ret ) ) { + errmsg = "unable to login " + string( ret ); + return false; + } + } + c.setConnection( authConn_.release() ); + } + Client::Context ctx(todb); + bool res = c.go(fromhost.c_str(), errmsg, fromdb, /*logForReplication=*/!fromRepl, /*slaveok*/false, /*replauth*/false, /*snapshot*/true); return res; } } cmdcopydb; @@ -617,6 +681,7 @@ namespace mongo { 
virtual bool slaveOk() { return false; } + virtual LockType locktype(){ return WRITE; } virtual bool logTheOp() { return true; // can't log steps when doing fast rename within a db, so always log the op rather than individual steps comprising it. } @@ -631,16 +696,19 @@ namespace mongo { return false; } - setClient( source.c_str() ); - NamespaceDetails *nsd = nsdetails( source.c_str() ); - uassert( 10026 , "source namespace does not exist", nsd ); - bool capped = nsd->capped; + bool capped = false; long long size = 0; - if ( capped ) - for( DiskLoc i = nsd->firstExtent; !i.isNull(); i = i.ext()->xnext ) - size += i.ext()->length; + { + Client::Context ctx( source ); + NamespaceDetails *nsd = nsdetails( source.c_str() ); + uassert( 10026 , "source namespace does not exist", nsd ); + capped = nsd->capped; + if ( capped ) + for( DiskLoc i = nsd->firstExtent; !i.isNull(); i = i.ext()->xnext ) + size += i.ext()->length; + } - setClient( target.c_str() ); + Client::Context ctx( target ); if ( nsdetails( target.c_str() ) ){ uassert( 10027 , "target namespace exists", cmdObj["dropTarget"].trueValue() ); @@ -715,8 +783,10 @@ namespace mongo { theDataFileMgr.insert( targetIndexes.c_str(), n ); } - setClient( source.c_str() ); - dropCollection( source, errmsg, result ); + { + Client::Context ctx( source ); + dropCollection( source, errmsg, result ); + } return true; } } cmdrenamecollection; diff --git a/db/cmdline.cpp b/db/cmdline.cpp new file mode 100644 index 0000000..59eafdd --- /dev/null +++ b/db/cmdline.cpp @@ -0,0 +1,162 @@ +// cmdline.cpp + +/** +* Copyright (C) 2008 10gen Inc. +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +#include "stdafx.h" +#include "cmdline.h" +#include "commands.h" + +namespace po = boost::program_options; + +namespace mongo { + CmdLine cmdLine; + + void setupSignals(); + BSONArray argvArray; + + void CmdLine::addGlobalOptions( boost::program_options::options_description& general , + boost::program_options::options_description& hidden ){ + /* support for -vv -vvvv etc. */ + for (string s = "vv"; s.length() <= 12; s.append("v")) { + hidden.add_options()(s.c_str(), "verbose"); + } + + general.add_options() + ("help,h", "show this usage information") + ("version", "show version information") + ("config,f", po::value<string>(), "configuration file specifying additional options") + ("verbose,v", "be more verbose (include multiple times for more verbosity e.g. 
-vvvvv)") + ("quiet", "quieter output") + ("port", po::value<int>(&cmdLine.port), "specify port number") + ("logpath", po::value<string>() , "file to send all output to instead of stdout" ) + ("logappend" , "append to logpath instead of over-writing" ) +#ifndef _WIN32 + ("fork" , "fork server process" ) +#endif + ; + + } + + + bool CmdLine::store( int argc , char ** argv , + boost::program_options::options_description& visible, + boost::program_options::options_description& hidden, + boost::program_options::positional_options_description& positional, + boost::program_options::variables_map ¶ms ){ + + + /* don't allow guessing - creates ambiguities when some options are + * prefixes of others. allow long disguises and don't allow guessing + * to get away with our vvvvvvv trick. */ + int style = (((po::command_line_style::unix_style ^ + po::command_line_style::allow_guessing) | + po::command_line_style::allow_long_disguise) ^ + po::command_line_style::allow_sticky); + + + try { + + po::options_description all; + all.add( visible ); + all.add( hidden ); + + po::store( po::command_line_parser(argc, argv) + .options( all ) + .positional( positional ) + .style( style ) + .run(), + params ); + + if ( params.count("config") ){ + ifstream f( params["config"].as<string>().c_str() ); + if ( ! f.is_open() ){ + cout << "ERROR: could not read from config file" << endl << endl; + cout << visible << endl; + return false; + } + + po::store( po::parse_config_file( f , all ) , params ); + f.close(); + } + + po::notify(params); + } + catch (po::error &e) { + cout << "ERROR: " << e.what() << endl << endl; + cout << visible << endl; + return false; + } + + if (params.count("verbose")) { + logLevel = 1; + } + + for (string s = "vv"; s.length() <= 12; s.append("v")) { + if (params.count(s)) { + logLevel = s.length(); + } + } + + if (params.count("quiet")) { + cmdLine.quiet = true; + } + +#ifndef _WIN32 + if (params.count("fork")) { + if ( ! 
params.count( "logpath" ) ){ + cout << "--fork has to be used with --logpath" << endl; + ::exit(-1); + } + pid_t c = fork(); + if ( c ){ + cout << "forked process: " << c << endl; + ::exit(0); + } + setsid(); + setupSignals(); + } +#endif + if (params.count("logpath")) { + string lp = params["logpath"].as<string>(); + uassert( 10033 , "logpath has to be non-zero" , lp.size() ); + initLogging( lp , params.count( "logappend" ) ); + } + + { + BSONArrayBuilder b; + for (int i=0; i < argc; i++) + b << argv[i]; + argvArray = b.arr(); + } + + return true; + } + + class CmdGetCmdLineOpts : Command{ + public: + CmdGetCmdLineOpts(): Command("getCmdLineOpts") {} + virtual LockType locktype() { return NONE; } + virtual bool adminOnly() { return true; } + virtual bool slaveOk() { return true; } + + virtual bool run(const char *ns, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl){ + result.append("argv", argvArray); + return true; + } + + } cmdGetCmdLineOpts; +} diff --git a/db/cmdline.h b/db/cmdline.h index b071259..3e46c5e 100644 --- a/db/cmdline.h +++ b/db/cmdline.h @@ -16,6 +16,8 @@ #pragma once +#include "../stdafx.h" + namespace mongo { /* command line options @@ -23,6 +25,7 @@ namespace mongo { /* concurrency: OK/READ */ struct CmdLine { int port; // --port + bool rest; // --rest string source; // --source string only; // --only @@ -47,11 +50,25 @@ namespace mongo { }; CmdLine() : - port(DefaultDBPort), quiet(false), notablescan(false), prealloc(true), smallfiles(false), + port(DefaultDBPort), rest(false), quiet(false), notablescan(false), prealloc(true), smallfiles(false), quota(false), quotaFiles(8), cpu(false), oplogSize(0), defaultProfile(0), slowMS(100) { } + - }; + static void addGlobalOptions( boost::program_options::options_description& general , + boost::program_options::options_description& hidden ); + + /** + * @return true if should run program, false if should exit + */ + static bool store( int argc , char ** argv , + boost::program_options::options_description& visible, + boost::program_options::options_description& hidden, + boost::program_options::positional_options_description& positional, + boost::program_options::variables_map &output ); + }; + extern CmdLine cmdLine; + } diff --git a/db/commands.cpp b/db/commands.cpp index 3078ea1..83d7219 100644 --- a/db/commands.cpp +++ b/db/commands.cpp @@ -20,6 +20,8 @@ #include "stdafx.h" #include "jsobj.h" #include "commands.h" +#include "client.h" +#include "replset.h" namespace mongo { @@ -72,9 +74,14 @@ namespace mongo { ok = c->run(ns, jsobj, errmsg, anObjBuilder, false); } - anObjBuilder.append( "ok" , ok ? 1.0 : 0.0 ); + BSONObj tmp = anObjBuilder.asTempObj(); + bool have_ok = tmp.hasField("ok"); + bool have_errmsg = tmp.hasField("errmsg"); + + if (!have_ok) + anObjBuilder.append( "ok" , ok ? 1.0 : 0.0 ); - if ( !ok ) { + if ( !ok && !have_errmsg) { anObjBuilder.append("errmsg", errmsg); uassert_nothrow(errmsg.c_str()); } @@ -92,11 +99,12 @@ namespace mongo { } - bool Command::readOnly( const string& name ){ + Command::LockType Command::locktype( const string& name ){ Command * c = findCommand( name ); if ( ! 
c ) - return false; - return c->readOnly(); + return WRITE; + return c->locktype(); } + } // namespace mongo diff --git a/db/commands.h b/db/commands.h index 20fb98c..518dcb7 100644 --- a/db/commands.h +++ b/db/commands.h @@ -25,11 +25,15 @@ namespace mongo { class BSONObj; class BSONObjBuilder; class BufBuilder; - + class Client; + // db "commands" (sent via db.$cmd.findOne(...)) // subclass to make a command. class Command { public: + + enum LockType { READ = -1 , NONE = 0 , WRITE = 1 }; + string name; /* run the given command @@ -42,12 +46,12 @@ namespace mongo { */ virtual bool run(const char *ns, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) = 0; - /* true if a read lock is sufficient - note: logTheTop() MUST be false if readOnly + /* + note: logTheTop() MUST be false if READ + if NONE, can't use Client::Context setup + use with caution */ - virtual bool readOnly() { - return false; - } + virtual LockType locktype() = 0; /* Return true if only the admin ns has privileges to run this command. */ virtual bool adminOnly() { @@ -105,10 +109,11 @@ namespace mongo { public: static bool runAgainstRegistered(const char *ns, BSONObj& jsobj, BSONObjBuilder& anObjBuilder); - static bool readOnly( const string& name ); + static LockType locktype( const string& name ); static Command * findCommand( const string& name ); }; bool _runCommands(const char *ns, BSONObj& jsobj, BufBuilder &b, BSONObjBuilder& anObjBuilder, bool fromRepl, int queryOptions); + } // namespace mongo diff --git a/db/common.cpp b/db/common.cpp new file mode 100644 index 0000000..a199bd1 --- /dev/null +++ b/db/common.cpp @@ -0,0 +1,14 @@ +// common.cpp + +#include "stdafx.h" +#include "concurrency.h" + +/** + * this just has globals + */ +namespace mongo { + + /* we use new here so we don't have to worry about destructor orders at program shutdown */ + MongoMutex &dbMutex( *(new MongoMutex) ); + +} diff --git a/db/concurrency.h b/db/concurrency.h index daf09b6..de8f242 100644 --- a/db/concurrency.h +++ b/db/concurrency.h @@ -1,3 +1,19 @@ +/* + * Copyright (C) 2010 10gen Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License, version 3, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + /* concurrency.h mongod concurrency rules & notes will be placed here. 
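The LockType that commands now declare (READ = -1, NONE = 0, WRITE = 1, with unknown commands treated as WRITE) lets the dispatcher pick its lock before running anything. A sketch of that dispatch — the Command here is a toy, and a std::shared_mutex stands in for mongolock/readlock:

    #include <shared_mutex>

    enum LockType { READ = -1, NONE = 0, WRITE = 1 };
    std::shared_mutex dbLock;

    struct Command {
        virtual LockType locktype() = 0;
        virtual bool run() = 0;
        virtual ~Command() {}
    };

    bool execCommand(Command& c) {
        switch (c.locktype()) {
        case NONE:                      // e.g. cursorInfo: no lock at all
            return c.run();
        case READ: {                    // shared access is enough
            std::shared_lock<std::shared_mutex> r(dbLock);
            return c.run();
        }
        default: {                      // WRITE: exclusive access
            std::unique_lock<std::shared_mutex> w(dbLock);
            return c.run();
        }
        }
    }

    struct CursorInfoCmd : Command {    // toy command declaring NONE
        LockType locktype() { return NONE; }
        bool run() { return true; }
    };

    int main() {
        CursorInfoCmd c;
        return execCommand(c) ? 0 : 1;
    }

Commands like cursorInfo in the clientcursor.cpp hunk declare NONE precisely so they can run without touching the db lock.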
@@ -17,19 +33,34 @@ #include <boost/thread/shared_mutex.hpp> #undef assert #define assert xassert +#define HAVE_READLOCK #else -#warning built with boost version 1.34 or older limited concurrency +#warning built with boost version 1.34 or older - limited concurrency #endif namespace mongo { + inline bool readLockSupported(){ +#ifdef HAVE_READLOCK + return true; +#else + return false; +#endif + } + + string sayClientState(); + bool haveClient(); + + void curopWaitingForLock( int type ); + void curopGotLock(); + /* mutex time stats */ class MutexInfo { unsigned long long start, enter, timeLocked; // all in microseconds int locked; public: - MutexInfo() : locked(0) { + MutexInfo() : timeLocked(0) , locked(0) { start = curTimeMicros64(); } void entered() { @@ -51,9 +82,12 @@ namespace mongo { s = start; tl = timeLocked; } + unsigned long long getTimeLocked() const { + return timeLocked; + } }; -#if BOOST_VERSION >= 103500 +#ifdef HAVE_READLOCK //#if 0 class MongoMutex { MutexInfo _minfo; @@ -80,19 +114,25 @@ namespace mongo { void assertAtLeastReadLocked() { assert(atLeastReadLocked()); } void lock() { - DEV cout << "LOCK" << endl; + //DEV cout << "LOCK" << endl; + DEV assert( haveClient() ); + int s = _state.get(); if( s > 0 ) { _state.set(s+1); return; } - massert( 10293 , "internal error: locks are not upgradeable", s == 0 ); + massert( 10293 , (string)"internal error: locks are not upgradeable: " + sayClientState() , s == 0 ); _state.set(1); + + curopWaitingForLock( 1 ); _m.lock(); + curopGotLock(); + _minfo.entered(); } void unlock() { - DEV cout << "UNLOCK" << endl; + //DEV cout << "UNLOCK" << endl; int s = _state.get(); if( s > 1 ) { _state.set(s-1); @@ -103,7 +143,7 @@ namespace mongo { _releasedEarly.set(false); return; } - assert(false); // attempt to unlock when wasn't in a write lock + massert( 12599, "internal error: attempt to unlock when wasn't in a write lock", false); } _state.set(0); _minfo.leaving(); @@ -121,7 +161,7 @@ namespace mongo { } void lock_shared() { - DEV cout << " LOCKSHARED" << endl; + //DEV cout << " LOCKSHARED" << endl; int s = _state.get(); if( s ) { if( s > 0 ) { @@ -136,10 +176,29 @@ namespace mongo { } } _state.set(-1); + curopWaitingForLock( -1 ); _m.lock_shared(); + curopGotLock(); + } + + bool lock_shared_try( int millis ) { + int s = _state.get(); + if ( s ){ + // we already have a lock, so no need to try + lock_shared(); + return true; + } + + boost::system_time until = get_system_time(); + until += boost::posix_time::milliseconds(2); + bool got = _m.timed_lock_shared( until ); + if ( got ) + _state.set(-1); + return got; } + void unlock_shared() { - DEV cout << " UNLOCKSHARED" << endl; + //DEV cout << " UNLOCKSHARED" << endl; int s = _state.get(); if( s > 0 ) { assert( s > 1 ); /* we must have done a lock write first to have s > 1 */ @@ -154,6 +213,7 @@ namespace mongo { _state.set(0); _m.unlock_shared(); } + MutexInfo& info() { return _minfo; } }; #else @@ -165,7 +225,7 @@ namespace mongo { public: MongoMutex() { } void lock() { -#if BOOST_VERSION >= 103500 +#ifdef HAVE_READLOCK m.lock(); #else boost::detail::thread::lock_ops<boost::recursive_mutex>::lock(m); @@ -182,7 +242,7 @@ namespace mongo { void _unlock() { _minfo.leaving(); -#if BOOST_VERSION >= 103500 +#ifdef HAVE_READLOCK m.unlock(); #else boost::detail::thread::lock_ops<boost::recursive_mutex>::unlock(m); @@ -197,6 +257,18 @@ namespace mongo { } void lock_shared() { lock(); } + bool lock_shared_try( int millis ) { + while ( millis-- ){ + if ( getState() ){ + sleepmillis(1); + continue; + } 
+ lock_shared(); + return true; + } + return false; + } + void unlock_shared() { unlock(); } MutexInfo& info() { return _minfo; } void assertWriteLocked() { @@ -220,8 +292,10 @@ namespace mongo { dbMutex.lock(); } ~writelock() { - dbunlocking_write(); - dbMutex.unlock(); + DESTRUCTOR_GUARD( + dbunlocking_write(); + dbMutex.unlock(); + ); } }; @@ -230,11 +304,43 @@ namespace mongo { dbMutex.lock_shared(); } ~readlock() { - dbunlocking_read(); - dbMutex.unlock_shared(); + DESTRUCTOR_GUARD( + dbunlocking_read(); + dbMutex.unlock_shared(); + ); } + }; + + struct readlocktry { + readlocktry( const string&ns , int tryms ){ + _got = dbMutex.lock_shared_try( tryms ); + } + ~readlocktry() { + if ( _got ){ + dbunlocking_read(); + dbMutex.unlock_shared(); + } + } + bool got(){ + return _got; + } + bool _got; }; + struct atleastreadlock { + atleastreadlock( const string& ns ){ + _prev = dbMutex.getState(); + if ( _prev == 0 ) + dbMutex.lock_shared(); + } + ~atleastreadlock(){ + if ( _prev == 0 ) + dbMutex.unlock_shared(); + } + + int _prev; + }; + class mongolock { bool _writelock; public: @@ -246,14 +352,15 @@ namespace mongo { dbMutex.lock_shared(); } ~mongolock() { - if( _writelock ) { - dbunlocking_write(); - dbMutex.unlock(); - } - else { - dbunlocking_read(); - dbMutex.unlock_shared(); - } + DESTRUCTOR_GUARD( + if( _writelock ) { + dbunlocking_write(); + dbMutex.unlock(); + } else { + dbunlocking_read(); + dbMutex.unlock_shared(); + } + ); } /* this unlocks, does NOT upgrade. that works for our current usage */ void releaseAndWriteLock(); @@ -1,10 +1,27 @@ // curop.h +/* + * Copyright (C) 2010 10gen Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License, version 3, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + #pragma once #include "namespace.h" -#include "security.h" #include "client.h" +#include "../util/atomic_int.h" +#include "db.h" namespace mongo { @@ -20,19 +37,53 @@ namespace mongo { /* Current operation (for the current Client). an embedded member of Client class, and typically used from within the mutex there. 
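The MongoMutex machinery above is easier to follow as a pattern: the lock itself is global, but each thread tracks its own recursion depth — positive while write-locked, negative while read-locked, zero otherwise — which is also how the "locks are not upgradeable" massert (10293) catches a read-to-write upgrade attempt. A condensed sketch, with thread_local int standing in for ThreadLocalValue<int>:

    #include <shared_mutex>
    #include <cassert>

    std::shared_mutex m;
    thread_local int state = 0;  // >0 write depth, <0 read depth, 0 unlocked

    void lock() {                             // write lock, recursive
        if (state > 0) { ++state; return; }   // already write-locked: nest
        assert(state == 0 && "locks are not upgradeable"); // cf. massert 10293
        state = 1;
        m.lock();
    }

    void unlock() {
        if (state > 1) { --state; return; }   // leaving a nested level
        assert(state == 1);
        state = 0;
        m.unlock();
    }

    void lock_shared() {
        if (state != 0) {                     // nested under either mode
            state > 0 ? ++state : --state;
            return;
        }
        state = -1;
        m.lock_shared();
    }

    void unlock_shared() {
        if (state > 0) { assert(state > 1); --state; return; } // read inside write
        if (state < -1) { ++state; return; }  // leaving a nested read
        assert(state == -1);
        state = 0;
        m.unlock_shared();
    }

    int main() {
        lock_shared(); lock_shared();         // recursion is fine
        unlock_shared(); unlock_shared();
        lock(); unlock();
    }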
*/ class CurOp : boost::noncopyable { - static WrappingInt _nextOpNum; + static AtomicUInt _nextOpNum; static BSONObj _tooBig; // { $msg : "query not recording (too large)" } + + Client * _client; + CurOp * _wrapped; + + unsigned long long _start; + unsigned long long _checkpoint; + unsigned long long _end; bool _active; - Timer _timer; int _op; - WrappingInt _opNum; + bool _command; + int _lockType; // see concurrency.h for values + bool _waitingForLock; + int _dbprofile; // 0=off, 1=slow, 2=all + AtomicUInt _opNum; char _ns[Namespace::MaxNsLen+2]; - struct sockaddr_in client; - + struct sockaddr_in _remote; + char _queryBuf[256]; - bool haveQuery() const { return *((int *) _queryBuf) != 0; } + void resetQuery(int x=0) { *((int *)_queryBuf) = x; } + + OpDebug _debug; + + ThreadSafeString _message; + ProgressMeter _progressMeter; + + void _reset(){ + _command = false; + _lockType = 0; + _dbprofile = 0; + _end = 0; + _waitingForLock = false; + _message = ""; + _progressMeter.finished(); + } + + void setNS(const char *ns) { + strncpy(_ns, ns, Namespace::MaxNsLen); + } + + public: + + bool haveQuery() const { return *((int *) _queryBuf) != 0; } + BSONObj query() { if( *((int *) _queryBuf) == 1 ) { return _tooBig; @@ -41,37 +92,108 @@ namespace mongo { return o; } - OpDebug _debug; - public: - void reset( const sockaddr_in &_client) { + void ensureStarted(){ + if ( _start == 0 ) + _start = _checkpoint = curTimeMicros64(); + } + void enter( Client::Context * context ){ + ensureStarted(); + setNS( context->ns() ); + if ( context->_db && context->_db->profile > _dbprofile ) + _dbprofile = context->_db->profile; + } + + void leave( Client::Context * context ){ + unsigned long long now = curTimeMicros64(); + Top::global.record( _ns , _op , _lockType , now - _checkpoint , _command ); + _checkpoint = now; + } + + void reset( const sockaddr_in & remote, int op ) { + _reset(); + _start = _checkpoint = 0; _active = true; - _opNum = _nextOpNum.atomicIncrement(); - _timer.reset(); + _opNum = _nextOpNum++; _ns[0] = '?'; // just in case not set later _debug.reset(); resetQuery(); - client = _client; + _remote = remote; + _op = op; + } + + void markCommand(){ + _command = true; + } + + void waitingForLock( int type ){ + _waitingForLock = true; + if ( type > 0 ) + _lockType = 1; + else + _lockType = -1; + } + void gotLock(){ + _waitingForLock = false; } OpDebug& debug(){ return _debug; } + + int profileLevel() const { + return _dbprofile; + } - WrappingInt opNum() const { return _opNum; } - bool active() const { return _active; } + const char * getNS() const { + return _ns; + } - int elapsedMillis(){ return _timer.millis(); } + bool shouldDBProfile( int ms ) const { + if ( _dbprofile <= 0 ) + return false; + + return _dbprofile >= 2 || ms >= cmdLine.slowMS; + } + + AtomicUInt opNum() const { return _opNum; } + + /** if this op is running */ + bool active() const { return _active; } + + int getLockType() const { return _lockType; } + bool isWaitingForLock() const { return _waitingForLock; } + int getOp() const { return _op; } + /** micros */ - unsigned long long startTime(){ - return _timer.startTime(); + unsigned long long startTime() { + ensureStarted(); + return _start; } - void setActive(bool active) { _active = active; } - void setNS(const char *ns) { - strncpy(_ns, ns, Namespace::MaxNsLen); + void done() { + _active = false; + _end = curTimeMicros64(); + } + + unsigned long long totalTimeMicros() { + massert( 12601 , "CurOp not marked done yet" , ! 
_active ); + return _end - startTime(); + } + + int totalTimeMillis() { + return (int) (totalTimeMicros() / 1000); } - void setOp(int op) { _op = op; } + + int elapsedMillis() { + unsigned long long total = curTimeMicros64() - startTime(); + return (int) (total / 1000); + } + + int elapsedSeconds() { + return elapsedMillis() / 1000; + } + void setQuery(const BSONObj& query) { if( query.objsize() > (int) sizeof(_queryBuf) ) { resetQuery(1); // flag as too big and return @@ -80,9 +202,15 @@ namespace mongo { memcpy(_queryBuf, query.objdata(), query.objsize()); } - CurOp() { + CurOp( Client * client , CurOp * wrapped = 0 ) { + _client = client; + _wrapped = wrapped; + if ( _wrapped ){ + _client->_curOp = this; + } + _start = _checkpoint = 0; _active = false; -// opNum = 0; + _reset(); _op = 0; // These addresses should never be written to again. The zeroes are // placed here as a precaution because currentOp may be accessed @@ -90,10 +218,14 @@ namespace mongo { memset(_ns, 0, sizeof(_ns)); memset(_queryBuf, 0, sizeof(_queryBuf)); } + + ~CurOp(){ + if ( _wrapped ) + _client->_curOp = _wrapped; + } BSONObj info() { - AuthenticationInfo *ai = currentClient.get()->ai; - if( !ai->isAuthorized("admin") ) { + if( ! cc().getAuthenticationInfo()->isAuthorized("admin") ) { BSONObjBuilder b; b.append("err", "unauthorized"); return b.obj(); @@ -101,35 +233,30 @@ namespace mongo { return infoNoauth(); } - BSONObj infoNoauth() { - BSONObjBuilder b; - b.append("opid", _opNum); - b.append("active", _active); - if( _active ) - b.append("secs_running", _timer.seconds() ); - if( _op == 2004 ) - b.append("op", "query"); - else if( _op == 2005 ) - b.append("op", "getMore"); - else if( _op == 2001 ) - b.append("op", "update"); - else if( _op == 2002 ) - b.append("op", "insert"); - else if( _op == 2006 ) - b.append("op", "delete"); - else - b.append("op", _op); - b.append("ns", _ns); + BSONObj infoNoauth(); + + string getRemoteString(){ + stringstream ss; + ss << inet_ntoa( _remote.sin_addr ) << ":" << ntohs( _remote.sin_port ); + return ss.str(); + } - if( haveQuery() ) { - b.append("query", query()); + ProgressMeter& setMessage( const char * msg , long long progressMeterTotal = 0 , int secondsBetween = 3 ){ + _message = msg; + if ( progressMeterTotal ){ + assert( ! _progressMeter.isActive() ); + _progressMeter.reset( progressMeterTotal , secondsBetween ); + } + else { + _progressMeter.finished(); } - // b.append("inLock", ?? 
- stringstream clientStr; - clientStr << inet_ntoa( client.sin_addr ) << ":" << ntohs( client.sin_port ); - b.append("client", clientStr.str()); - return b.obj(); + return _progressMeter; } + + string getMessage() const { return _message; } + ProgressMeter getProgressMeter() { return _progressMeter; } + + friend class Client; }; /* 0 = ok @@ -137,12 +264,12 @@ namespace mongo { future: maybe use this as a "going away" thing on process termination with a higher flag value */ extern class KillCurrentOp { - enum { Off, On, All } state; - WrappingInt toKill; + enum { Off, On, All } state; + AtomicUInt toKill; public: void killAll() { state = All; } - void kill(WrappingInt i) { toKill = i; state = On; } - + void kill(AtomicUInt i) { toKill = i; state = On; } + void checkForInterrupt() { if( state != Off ) { if( state == All ) diff --git a/db/cursor.h b/db/cursor.h index 3868cca..69e5d67 100644 --- a/db/cursor.h +++ b/db/cursor.h @@ -19,7 +19,7 @@ #include "../stdafx.h" #include "jsobj.h" -#include "storage.h" +#include "diskloc.h" namespace mongo { @@ -76,6 +76,8 @@ namespace mongo { /* called before query getmore block is iterated */ virtual void checkLocation() { } + + virtual bool supportGetMore() = 0; virtual string toString() { return "abstract?"; @@ -91,10 +93,10 @@ namespace mongo { */ virtual bool getsetdup(DiskLoc loc) = 0; - virtual BSONObj prettyStartKey() const { return BSONObj(); } - virtual BSONObj prettyEndKey() const { return BSONObj(); } + virtual BSONObj prettyIndexBounds() const { return BSONObj(); } virtual bool capped() const { return false; } + }; // strategy object implementing direction of traversal. @@ -157,6 +159,8 @@ namespace mongo { return tailable_; } virtual bool getsetdup(DiskLoc loc) { return false; } + + virtual bool supportGetMore() { return true; } }; /* used for order { $natural: -1 } */ diff --git a/db/database.h b/db/database.h index 0fcf386..868af0b 100644 --- a/db/database.h +++ b/db/database.h @@ -36,7 +36,7 @@ namespace mongo { : name(nm), path(_path), namespaceIndex( path, name ) { { // check db name is valid - int L = strlen(nm); + size_t L = strlen(nm); uassert( 10028 , "db name is empty", L > 0 ); uassert( 10029 , "bad db name [1]", *nm != '.' ); uassert( 10030 , "bad db name [2]", nm[L-1] != '.' ); @@ -63,8 +63,8 @@ namespace mongo { ~Database() { magic = 0; btreeStore->closeFiles(name, path); - int n = files.size(); - for ( int i = 0; i < n; i++ ) + size_t n = files.size(); + for ( size_t i = 0; i < n; i++ ) delete files[i]; } @@ -79,12 +79,19 @@ namespace mongo { return ! namespaceIndex.allocated(); } - bool exists(int n) { + boost::filesystem::path fileName( int n ) { stringstream ss; ss << name << '.' << n; boost::filesystem::path fullName; - fullName = boost::filesystem::path(path) / ss.str(); - return boost::filesystem::exists(fullName); + fullName = boost::filesystem::path(path); + if ( directoryperdb ) + fullName /= name; + fullName /= ss.str(); + return fullName; + } + + bool exists(int n) { + return boost::filesystem::exists( fileName( n ) ); } void openAllFiles() { @@ -124,10 +131,7 @@ namespace mongo { p = files[n]; } if ( p == 0 ) { - stringstream ss; - ss << name << '.' 
<< n; - boost::filesystem::path fullName; - fullName = boost::filesystem::path(path) / ss.str(); + boost::filesystem::path fullName = fileName( n ); string fullNameString = fullName.string(); p = new MongoDataFile(n); int minSize = 0; @@ -29,6 +29,7 @@ #include "instance.h" #include "clientcursor.h" #include "pdfile.h" +#include "stats/counters.h" #if !defined(_WIN32) #include <sys/file.h> #endif @@ -40,6 +41,7 @@ #include "../scripting/engine.h" #include "module.h" #include "cmdline.h" +#include "stats/snapshots.h" namespace mongo { @@ -54,10 +56,11 @@ namespace mongo { extern string bind_ip; extern char *appsrvPath; - extern bool autoresync; extern int diagLogging; extern int lenForNewNsFiles; extern int lockFile; + + extern string repairpath; void setupSignals(); void closeAllSockets(); @@ -65,9 +68,14 @@ namespace mongo { void pairWith(const char *remoteEnd, const char *arb); void setRecCacheSize(unsigned MB); + void exitCleanly( ExitCode code ); + const char *ourgetns() { Client *c = currentClient.get(); - return c ? c->ns() : ""; + if ( ! c ) + return ""; + Client::Context* cc = c->getContext(); + return cc ? cc->ns() : ""; } struct MyStartupTests { @@ -80,7 +88,7 @@ namespace mongo { void testTheDb() { OpDebug debug; - setClient("sys.unittest.pdfile"); + Client::Context ctx("sys.unittest.pdfile"); /* this is not validly formatted, if you query this namespace bad things will happen */ theDataFileMgr.insert("sys.unittest.pdfile", (void *) "hello worldx", 13); @@ -99,8 +107,6 @@ namespace mongo { c->advance(); } out() << endl; - - cc().clearns(); } MessagingPort *connGrab = 0; @@ -137,13 +143,11 @@ namespace mongo { }; void webServerThread(); - void pdfileInit(); void listen(int port) { log() << mongodVersion() << endl; printGitVersion(); printSysInfo(); - pdfileInit(); //testTheDb(); log() << "waiting for connections on port " << port << endl; OurListener l(bind_ip, port); @@ -193,7 +197,7 @@ namespace mongo { try { - c.ai->isLocalHost = dbMsgPort.farEnd.isLocalHost(); + c.getAuthenticationInfo()->isLocalHost = dbMsgPort.farEnd.isLocalHost(); Message m; while ( 1 ) { @@ -206,6 +210,11 @@ namespace mongo { break; } + if ( inShutdown() ) { + log() << "got request after shutdown()" << endl; + break; + } + lastError.startRequest( m , le ); DbResponse dbresponse; @@ -236,6 +245,9 @@ namespace mongo { problem() << "SocketException in connThread, closing client connection" << endl; dbMsgPort.shutdown(); } + catch ( const ClockSkewException & ) { + exitCleanly( EXIT_CLOCK_SKEW ); + } catch ( std::exception &e ) { problem() << "Uncaught std::exception: " << e.what() << ", terminating" << endl; dbexit( EXIT_UNCAUGHT ); @@ -263,8 +275,10 @@ namespace mongo { // SockAddr db("172.16.0.179", MessagingPort::DBPort); MessagingPort p; - if ( !p.connect(db) ) + if ( !p.connect(db) ){ + out() << "msg couldn't connect" << endl; return; + } const int Loops = 1; for ( int q = 0; q < Loops; q++ ) { @@ -280,8 +294,9 @@ namespace mongo { Timer t; bool ok = p.call(send, response); double tm = ((double) t.micros()) + 1; - out() << " ****ok. response.data:" << ok << " time:" << tm / 1000.0 << "ms " << - ((double) len) * 8 / 1000000 / (tm/1000000) << "Mbps" << endl; + out() << " ****ok. 
response.data:" << ok << " time:" << tm / 1000.0 << "ms " + << "len: " << len << " data: " << response.data->_data << endl; + if ( q+1 < Loops ) { out() << "\t\tSLEEP 8 then sending again as a test" << endl; sleepsecs(8); @@ -327,15 +342,22 @@ namespace mongo { return repairDatabase( dbName.c_str(), errmsg ); } + extern bool checkNsFilesOnLoad; + void repairDatabases() { + Client::GodScope gs; log(1) << "enter repairDatabases" << endl; + + assert(checkNsFilesOnLoad); + checkNsFilesOnLoad = false; // we are mainly just checking the header - don't scan the whole .ns file for every db here. + dblock lk; vector< string > dbNames; getDatabaseNames( dbNames ); for ( vector< string >::iterator i = dbNames.begin(); i != dbNames.end(); ++i ) { string dbName = *i; log(1) << "\t" << dbName << endl; - assert( !setClient( dbName.c_str() ) ); + Client::Context ctx( dbName ); MongoDataFile *p = cc().database()->getFile( 0 ); MDFHeader *h = p->getHeader(); if ( !h->currentVersion() || forceRepair ) { @@ -369,6 +391,8 @@ namespace mongo { cc().shutdown(); dbexit( EXIT_CLEAN ); } + + checkNsFilesOnLoad = true; } void clearTmpFiles() { @@ -377,12 +401,13 @@ namespace mongo { i != boost::filesystem::directory_iterator(); ++i ) { string fileName = boost::filesystem::path(*i).leaf(); if ( boost::filesystem::is_directory( *i ) && - fileName.length() > 2 && fileName.substr( 0, 3 ) == "tmp" ) + fileName.length() && fileName[ 0 ] == '$' ) boost::filesystem::remove_all( *i ); } } - + void clearTmpCollections() { + Client::GodScope gs; vector< string > toDelete; DBDirectClient cli; auto_ptr< DBClientCursor > c = cli.query( "local.system.namespaces", Query( fromjson( "{name:/^local.temp./}" ) ) ); @@ -395,7 +420,7 @@ namespace mongo { cli.dropCollection( *i ); } } - + /** * does background async flushes of mmapped files */ @@ -403,15 +428,23 @@ namespace mongo { public: void run(){ log(1) << "will flush memory every: " << _sleepsecs << " seconds" << endl; + int time_flushing = 0; while ( ! inShutdown() ){ if ( _sleepsecs == 0 ){ // in case at some point we add an option to change at runtime sleepsecs(5); continue; } - sleepmillis( (int)(_sleepsecs * 1000) ); - MemoryMappedFile::flushAll( false ); - log(1) << "flushing mmmap" << endl; + + sleepmillis( (int)(std::max(0.0, (_sleepsecs * 1000) - time_flushing)) ); + + Date_t start = jsTime(); + MemoryMappedFile::flushAll( true ); + time_flushing = (int) (jsTime() - start); + + globalFlushCounters.flushed(time_flushing); + + log(1) << "flushing mmap took " << time_flushing << "ms" << endl; } } @@ -445,14 +478,21 @@ namespace mongo { bool is32bit = sizeof(int*) == 4; log() << "Mongo DB : starting : pid = " << pid << " port = " << cmdLine.port << " dbpath = " << dbpath - << " master = " << master << " slave = " << (int) slave << " " << ( is32bit ? "32" : "64" ) << "-bit " << endl; - + << " master = " << replSettings.master << " slave = " << (int) replSettings.slave << " " << ( is32bit ? 
"32" : "64" ) << "-bit " << endl; + DEV log() << " FULL DEBUG ENABLED " << endl; show_32_warning(); - stringstream ss; - ss << "dbpath (" << dbpath << ") does not exist"; - massert( 10296 , ss.str().c_str(), boost::filesystem::exists( dbpath ) ); - + { + stringstream ss; + ss << "dbpath (" << dbpath << ") does not exist"; + massert( 10296 , ss.str().c_str(), boost::filesystem::exists( dbpath ) ); + } + { + stringstream ss; + ss << "repairpath (" << repairpath << ") does not exist"; + massert( 12590 , ss.str().c_str(), boost::filesystem::exists( repairpath ) ); + } + acquirePathLock(); remove_all( dbpath + "/_tmp/" ); @@ -461,11 +501,10 @@ namespace mongo { BOOST_CHECK_EXCEPTION( clearTmpFiles() ); Client::initThread("initandlisten"); + _diaglog.init(); clearTmpCollections(); - _diaglog.init(); - Module::initAll(); #if 0 @@ -493,6 +532,7 @@ namespace mongo { /* this is for security on certain platforms (nonce generation) */ srand((unsigned) (curTimeMicros() ^ startupSrandTimer.micros())); + snapshotThread.go(); listen(listenPort); // listen() will return when exit code closes its socket. @@ -557,6 +597,7 @@ string arg_error_check(int argc, char* argv[]) { int main(int argc, char* argv[], char *envp[] ) { + static StaticObserver staticObserver; getcurns = ourgetns; po::options_description general_options("General options"); @@ -564,25 +605,17 @@ int main(int argc, char* argv[], char *envp[] ) po::options_description sharding_options("Sharding options"); po::options_description visible_options("Allowed options"); po::options_description hidden_options("Hidden options"); - po::options_description cmdline_options("Command line options"); po::positional_options_description positional_options; + CmdLine::addGlobalOptions( general_options , hidden_options ); + general_options.add_options() - ("help,h", "show this usage information") - ("version", "show version information") - ("config,f", po::value<string>(), "configuration file specifying additional options") - ("port", po::value<int>(&cmdLine.port)/*->default_value(CmdLine::DefaultDBPort)*/, "specify port number") ("bind_ip", po::value<string>(&bind_ip), "local ip address to bind listener - all local ips bound by default") - ("verbose,v", "be more verbose (include multiple times for more verbosity e.g. 
-vvvvv)") ("dbpath", po::value<string>()->default_value("/data/db/"), "directory for datafiles") - ("quiet", "quieter output") - ("logpath", po::value<string>() , "file to send all output to instead of stdout" ) - ("logappend" , "appnd to logpath instead of over-writing" ) -#ifndef _WIN32 - ("fork" , "fork server process" ) -#endif + ("directoryperdb", "each database will be stored in a separate directory") + ("repairpath", po::value<string>() , "root directory for repair files - defaults to dbpath" ) ("cpu", "periodically show cpu and iowait utilization") ("noauth", "run without security") ("auth", "run with security") @@ -593,6 +626,7 @@ int main(int argc, char* argv[], char *envp[] ) ("nocursors", "diagnostic/debugging option") ("nohints", "ignore query hints") ("nohttpinterface", "disable http interface") + ("rest","turn on simple rest api") ("noscripting", "disable scripting engine") ("noprealloc", "disable data file preallocation") ("smallfiles", "use a smaller default file size") @@ -620,8 +654,10 @@ int main(int argc, char* argv[], char *envp[] ) ("only", po::value<string>(), "when slave: specify a single database to replicate") ("pairwith", po::value<string>(), "address of server to pair with") ("arbiter", po::value<string>(), "address of arbiter server") + ("slavedelay", po::value<int>(), "specify delay (in seconds) to be used when applying master ops to slave") + ("fastsync", "indicate that this instance is starting from a dbpath snapshot of the repl peer") ("autoresync", "automatically resync if slave data is stale") - ("oplogSize", po::value<long>(), "size limit (in MB) for op log") + ("oplogSize", po::value<int>(), "size limit (in MB) for op log") ("opIdMem", po::value<long>(), "size limit (in bytes) for in memory storage of op ids") ; @@ -635,18 +671,12 @@ int main(int argc, char* argv[], char *envp[] ) ("cacheSize", po::value<long>(), "cache size (in MB) for rec store") ; - /* support for -vv -vvvv etc. */ - for (string s = "vv"; s.length() <= 10; s.append("v")) { - hidden_options.add_options()(s.c_str(), "verbose"); - } positional_options.add("command", 3); visible_options.add(general_options); visible_options.add(replication_options); visible_options.add(sharding_options); Module::addOptions( visible_options ); - cmdline_options.add(visible_options); - cmdline_options.add(hidden_options); setupSignals(); @@ -677,7 +707,7 @@ int main(int argc, char* argv[], char *envp[] ) bool removeService = false; bool startService = false; po::variables_map params; - + string error_message = arg_error_check(argc, argv); if (error_message != "") { cout << error_message << endl << endl; @@ -685,37 +715,9 @@ int main(int argc, char* argv[], char *envp[] ) return 0; } - /* don't allow guessing - creates ambiguities when some options are - * prefixes of others. allow long disguises and don't allow guessing - * to get away with our vvvvvvv trick. */ - int command_line_style = (((po::command_line_style::unix_style ^ - po::command_line_style::allow_guessing) | - po::command_line_style::allow_long_disguise) ^ - po::command_line_style::allow_sticky); - try { - po::store(po::command_line_parser(argc, argv).options(cmdline_options). - positional(positional_options). 
- style(command_line_style).run(), params); - - if (params.count("config")) { - ifstream config_file (params["config"].as<string>().c_str()); - if (config_file.is_open()) { - po::store(po::parse_config_file(config_file, cmdline_options), params); - config_file.close(); - } else { - cout << "ERROR: could not read from config file" << endl << endl; - cout << visible_options << endl; - return 0; - } - } - - po::notify(params); - } catch (po::error &e) { - cout << "ERROR: " << e.what() << endl << endl; - cout << visible_options << endl; + if ( ! CmdLine::store( argc , argv , visible_options , hidden_options , positional_options , params ) ) return 0; - } if (params.count("help")) { show_help_text(visible_options); @@ -727,16 +729,8 @@ int main(int argc, char* argv[], char *envp[] ) return 0; } dbpath = params["dbpath"].as<string>(); - if (params.count("quiet")) { - cmdLine.quiet = true; - } - if (params.count("verbose")) { - logLevel = 1; - } - for (string s = "vv"; s.length() <= 10; s.append("v")) { - if (params.count(s)) { - logLevel = s.length(); - } + if ( params.count("directoryperdb")) { + directoryperdb = true; } if (params.count("cpu")) { cmdLine.cpu = true; @@ -761,25 +755,11 @@ int main(int argc, char* argv[], char *envp[] ) /* casting away the const-ness here */ appsrvPath = (char*)(params["appsrvpath"].as<string>().c_str()); } -#ifndef _WIN32 - if (params.count("fork")) { - if ( ! params.count( "logpath" ) ){ - cout << "--fork has to be used with --logpath" << endl; - return -1; - } - pid_t c = fork(); - if ( c ){ - cout << "forked process: " << c << endl; - ::exit(0); - } - setsid(); - setupSignals(); - } -#endif - if (params.count("logpath")) { - string lp = params["logpath"].as<string>(); - uassert( 10033 , "logpath has to be non-zero" , lp.size() ); - initLogging( lp , params.count( "logappend" ) ); + if (params.count("repairpath")) { + repairpath = params["repairpath"].as<string>(); + uassert( 12589, "repairpath has to be non-zero", repairpath.size() ); + } else { + repairpath = dbpath; } if (params.count("nocursors")) { useCursors = false; @@ -790,6 +770,9 @@ int main(int argc, char* argv[], char *envp[] ) if (params.count("nohttpinterface")) { noHttpInterface = true; } + if (params.count("rest")) { + cmdLine.rest = true; + } if (params.count("noscripting")) { useJNI = false; } @@ -831,13 +814,19 @@ int main(int argc, char* argv[], char *envp[] ) startService = true; } if (params.count("master")) { - master = true; + replSettings.master = true; } if (params.count("slave")) { - slave = SimpleSlave; + replSettings.slave = SimpleSlave; + } + if (params.count("slavedelay")) { + replSettings.slavedelay = params["slavedelay"].as<int>(); + } + if (params.count("fastsync")) { + replSettings.fastsync = true; } if (params.count("autoresync")) { - autoresync = true; + replSettings.autoresync = true; } if (params.count("source")) { /* specifies what the source in local.sources should be */ @@ -864,7 +853,7 @@ int main(int argc, char* argv[], char *envp[] ) assert(lenForNewNsFiles > 0); } if (params.count("oplogSize")) { - long x = params["oplogSize"].as<long>(); + long x = params["oplogSize"].as<int>(); uassert( 10035 , "bad --oplogSize arg", x > 0); cmdLine.oplogSize = x * 1024 * 1024; assert(cmdLine.oplogSize > 0); @@ -872,8 +861,8 @@ int main(int argc, char* argv[], char *envp[] ) if (params.count("opIdMem")) { long x = params["opIdMem"].as<long>(); uassert( 10036 , "bad --opIdMem arg", x > 0); - opIdMem = x; - assert(opIdMem > 0); + replSettings.opIdMem = x; + 
assert(replSettings.opIdMem > 0); } if (params.count("cacheSize")) { long x = params["cacheSize"].as<long>(); @@ -974,13 +963,13 @@ namespace mongo { #undef out - void exitCleanly() { + void exitCleanly( ExitCode code ) { goingAway = true; killCurrentOp.killAll(); { dblock lk; log() << "now exiting" << endl; - dbexit( EXIT_KILL ); + dbexit( code ); } } @@ -1026,9 +1015,18 @@ namespace mongo { int x; sigwait( &asyncSignals, &x ); log() << "got kill or ctrl c signal " << x << " (" << strsignal( x ) << "), will terminate after current cmd ends" << endl; - exitCleanly(); + Client::initThread( "interruptThread" ); + exitCleanly( EXIT_KILL ); } + // this will be called in certain c++ error cases, for example if there are two active + // exceptions + void myterminate() { + rawOut( "terminate() called, printing stack:\n" ); + printStackTrace(); + abort(); + } + void setupSignals() { assert( signal(SIGSEGV, abruptQuit) != SIG_ERR ); assert( signal(SIGFPE, abruptQuit) != SIG_ERR ); @@ -1044,12 +1042,15 @@ namespace mongo { sigaddset( &asyncSignals, SIGTERM ); assert( pthread_sigmask( SIG_SETMASK, &asyncSignals, 0 ) == 0 ); boost::thread it( interruptThread ); + + set_terminate( myterminate ); } #else void ctrlCTerminate() { - log() << "got kill or ctrl c signal, will terminate after current cmd ends" << endl; - exitCleanly(); + log() << "got kill or ctrl-c signal, will terminate after current cmd ends" << endl; + Client::initThread( "ctrlCTerminate" ); + exitCleanly( EXIT_KILL ); } BOOL CtrlHandler( DWORD fdwCtrlType ) { @@ -1086,14 +1087,6 @@ BOOL CtrlHandler( DWORD fdwCtrlType ) } #endif -void temptestfoo() { - MongoMutex m; - m.lock(); -// m.lock_upgrade(); - m.lock_shared(); -} - - } // namespace mongo #include "recstore.h" @@ -18,7 +18,6 @@ #include "../stdafx.h" #include "../util/message.h" -#include "../util/top.h" #include "boost/version.hpp" #include "concurrency.h" #include "pdfile.h" @@ -47,16 +46,36 @@ namespace mongo { */ class DatabaseHolder { public: + typedef map<string,Database*> DBs; + typedef map<string,DBs> Paths; + DatabaseHolder() : _size(0){ } - Database * get( const string& ns , const string& path ){ + bool isLoaded( const string& ns , const string& path ) const { dbMutex.assertAtLeastReadLocked(); - map<string,Database*>& m = _paths[path]; + Paths::const_iterator x = _paths.find( path ); + if ( x == _paths.end() ) + return false; + const DBs& m = x->second; string db = _todb( ns ); - map<string,Database*>::iterator it = m.find(db); + DBs::const_iterator it = m.find(db); + return it != m.end(); + } + + + Database * get( const string& ns , const string& path ) const { + dbMutex.assertAtLeastReadLocked(); + Paths::const_iterator x = _paths.find( path ); + if ( x == _paths.end() ) + return 0; + const DBs& m = x->second; + + string db = _todb( ns ); + + DBs::const_iterator it = m.find(db); if ( it != m.end() ) return it->second; return 0; @@ -64,20 +83,42 @@ namespace mongo { void put( const string& ns , const string& path , Database * db ){ dbMutex.assertWriteLocked(); - map<string,Database*>& m = _paths[path]; + DBs& m = _paths[path]; Database*& d = m[_todb(ns)]; if ( ! 
d ) _size++; d = db; } + Database* getOrCreate( const string& ns , const string& path , bool& justCreated ){ + dbMutex.assertWriteLocked(); + DBs& m = _paths[path]; + + string dbname = _todb( ns ); + + Database* & db = m[dbname]; + if ( db ){ + justCreated = false; + return db; + } + + log(1) << "Accessing: " << dbname << " for the first time" << endl; + db = new Database( dbname.c_str() , justCreated , path ); + _size++; + return db; + } + + + + void erase( const string& ns , const string& path ){ dbMutex.assertWriteLocked(); - map<string,Database*>& m = _paths[path]; - _size -= m.erase( _todb( ns ) ); + DBs& m = _paths[path]; + _size -= (int)m.erase( _todb( ns ) ); } - bool closeAll( const string& path , BSONObjBuilder& result ); + /* force - force close even if something underway - use at shutdown */ + bool closeAll( const string& path , BSONObjBuilder& result, bool force ); int size(){ return _size; @@ -86,107 +127,68 @@ namespace mongo { /** * gets all unique db names, ignoring paths */ - void getAllShortNames( set<string>& all ) const{ + void getAllShortNames( set<string>& all ) const { dbMutex.assertAtLeastReadLocked(); - for ( map<string, map<string,Database*> >::const_iterator i=_paths.begin(); i!=_paths.end(); i++ ){ - map<string,Database*> m = i->second; - for( map<string,Database*>::const_iterator j=m.begin(); j!=m.end(); j++ ){ + for ( Paths::const_iterator i=_paths.begin(); i!=_paths.end(); i++ ){ + DBs m = i->second; + for( DBs::const_iterator j=m.begin(); j!=m.end(); j++ ){ all.insert( j->first ); } } } - + private: - string _todb( const string& ns ){ + string _todb( const string& ns ) const { size_t i = ns.find( '.' ); if ( i == string::npos ) return ns; return ns.substr( 0 , i ); } - map<string, map<string,Database*> > _paths; + Paths _paths; int _size; }; extern DatabaseHolder dbHolder; - /* returns true if the database ("database") did not exist, and it was created on this call - path - datafiles directory, if not the default, so we can differentiate between db's of the same - name in different places (for example temp ones on repair). 
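The DatabaseHolder above keys open Database objects first by dbpath and then by database name (Paths is a map of maps), which is what lets two databases of the same name coexist under different paths, e.g. a temp copy during repair. A toy sketch of the two-level lookup and the default-insert trick getOrCreate relies on, with a string* standing in for Database* (sketch only; intentionally leaks):

    #include <iostream>
    #include <map>
    #include <string>

    typedef std::map<std::string, std::string*> DBs;  // dbname -> Database* (stand-in)
    typedef std::map<std::string, DBs> Paths;         // dbpath -> DBs

    std::string* getOrCreate(Paths& paths, const std::string& path,
                             const std::string& dbname, bool& justCreated) {
        // operator[] default-inserts a null pointer on first access
        std::string*& db = paths[path][dbname];
        justCreated = (db == 0);
        if (justCreated)
            db = new std::string(dbname);             // stand-in for new Database(...)
        return db;
    }

    int main() {
        Paths paths;
        bool created;
        getOrCreate(paths, "/data/db", "test", created);
        std::cout << "first access created=" << created << std::endl;          // 1
        getOrCreate(paths, "/data/db/repair", "test", created);
        std::cout << "same name, other path created=" << created << std::endl; // 1
        getOrCreate(paths, "/data/db", "test", created);
        std::cout << "second access created=" << created << std::endl;         // 0
        return 0;
    }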
- */ - inline bool setClient(const char *ns, const string& path , mongolock *lock ) { - if( logLevel > 5 ) - log() << "setClient: " << ns << endl; - - dbMutex.assertAtLeastReadLocked(); - - Client& c = cc(); - c.top.clientStart( ns ); - - Database * db = dbHolder.get( ns , path ); - if ( db ){ - c.setns(ns, db ); - return false; - } - - if( lock ) - lock->releaseAndWriteLock(); - - assertInWriteLock(); - - char cl[256]; - nsToDatabase(ns, cl); - bool justCreated; - Database *newdb = new Database(cl, justCreated, path); - dbHolder.put(ns,path,newdb); - c.setns(ns, newdb); - - newdb->finishInit(); - - return justCreated; - } - // shared functionality for removing references to a database from this program instance // does not delete the files on disk void closeDatabase( const char *cl, const string& path = dbpath ); - + struct dbtemprelease { - string clientname; - string clientpath; - int locktype; + Client::Context * _context; + int _locktype; + dbtemprelease() { - Client& client = cc(); - Database *database = client.database(); - if ( database ) { - clientname = database->name; - clientpath = database->path; - } - client.top.clientStop(); - locktype = dbMutex.getState(); - assert( locktype ); - if ( locktype > 0 ) { - massert( 10298 , "can't temprelease nested write lock", locktype == 1); + _context = cc().getContext(); + _locktype = dbMutex.getState(); + assert( _locktype ); + + if ( _locktype > 0 ) { + massert( 10298 , "can't temprelease nested write lock", _locktype == 1); + if ( _context ) _context->unlocked(); dbMutex.unlock(); } else { - massert( 10299 , "can't temprelease nested read lock", locktype == -1); + massert( 10299 , "can't temprelease nested read lock", _locktype == -1); + if ( _context ) _context->unlocked(); dbMutex.unlock_shared(); } + } ~dbtemprelease() { - if ( locktype > 0 ) + if ( _locktype > 0 ) dbMutex.lock(); else dbMutex.lock_shared(); - if ( clientname.empty() ) - cc().setns("", 0); - else - setClient(clientname.c_str(), clientpath.c_str()); + + if ( _context ) _context->relocked(); } }; + /** only does a temp release if we're not nested and have a lock */ @@ -212,7 +214,6 @@ namespace mongo { extern TicketHolder connTicketHolder; - } // namespace mongo //#include "dbinfo.h" @@ -15,10 +15,7 @@ EndProject Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "tools", "tools", "{2B262D59-9DC7-4BF1-A431-1BD4966899A5}"
ProjectSection(SolutionItems) = preProject
..\tools\bridge.cpp = ..\tools\bridge.cpp
- ..\tools\export.cpp = ..\tools\export.cpp
- ..\tools\files.cpp = ..\tools\files.cpp
..\tools\sniffer.cpp = ..\tools\sniffer.cpp
- ..\tools\tool.cpp = ..\tools\tool.cpp
EndProjectSection
EndProject
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "mongos", "..\s\dbgrid.vcproj", "{E03717ED-69B4-4D21-BC55-DF6690B585C6}"
diff --git a/db/db.vcproj b/db/db.vcproj
index 6dc0aae..3ea7506 100644
--- a/db/db.vcproj
+++ b/db/db.vcproj
@@ -144,7 +144,7 @@ />
<Tool
Name="VCLinkerTool"
- AdditionalDependencies="ws2_32.lib"
+ AdditionalDependencies="ws2_32.lib psapi.lib"
LinkIncremental="1"
AdditionalLibraryDirectories=""c:\program files\boost\boost_1_35_0\lib""
GenerateDebugInformation="true"
@@ -350,48 +350,8 @@ >
</File>
<File
- RelativePath="..\..\js\js\Debug\js.lib"
+ RelativePath="..\..\js\src\js.lib"
>
- <FileConfiguration
- Name="Release|Win32"
- ExcludedFromBuild="true"
- >
- <Tool
- Name="VCCustomBuildTool"
- />
- </FileConfiguration>
- </File>
- <File
- RelativePath="..\..\js\js\Release\js.lib"
- >
- <FileConfiguration
- Name="Debug|Win32"
- ExcludedFromBuild="true"
- >
- <Tool
- Name="VCCustomBuildTool"
- />
- </FileConfiguration>
- <FileConfiguration
- Name="Debug Recstore|Win32"
- ExcludedFromBuild="true"
- >
- <Tool
- Name="VCCustomBuildTool"
- />
- </FileConfiguration>
- </File>
- <File
- RelativePath="C:\Program Files\Java\jdk\lib\jvm.lib"
- >
- <FileConfiguration
- Name="release_nojni|Win32"
- ExcludedFromBuild="true"
- >
- <Tool
- Name="VCCustomBuildTool"
- />
- </FileConfiguration>
</File>
<File
RelativePath="..\pcre-7.4\pcrecpp.cc"
@@ -1342,30 +1302,18 @@ >
</File>
<File
- RelativePath="..\client\quorum.cpp"
+ RelativePath="..\client\syncclusterconnection.cpp"
>
</File>
- <Filter
- Name="btree related"
- >
- <File
- RelativePath=".\btree.cpp"
- >
- </File>
- <File
- RelativePath=".\btree.h"
- >
- </File>
- <File
- RelativePath=".\btreecursor.cpp"
- >
- </File>
- </Filter>
</Filter>
<Filter
Name="db"
>
<File
+ RelativePath=".\background.h"
+ >
+ </File>
+ <File
RelativePath=".\client.h"
>
</File>
@@ -1374,6 +1322,10 @@ >
</File>
<File
+ RelativePath=".\cmdline.cpp"
+ >
+ </File>
+ <File
RelativePath=".\cmdline.h"
>
</File>
@@ -1414,6 +1366,14 @@ >
</File>
<File
+ RelativePath=".\diskloc.h"
+ >
+ </File>
+ <File
+ RelativePath=".\index.h"
+ >
+ </File>
+ <File
RelativePath=".\introspect.h"
>
</File>
@@ -1485,6 +1445,10 @@ RelativePath="..\stdafx.h"
>
</File>
+ <File
+ RelativePath=".\update.h"
+ >
+ </File>
<Filter
Name="cpp"
Filter="cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx"
@@ -1507,6 +1471,10 @@ >
</File>
<File
+ RelativePath=".\common.cpp"
+ >
+ </File>
+ <File
RelativePath=".\cursor.cpp"
>
</File>
@@ -1539,10 +1507,6 @@ >
</File>
<File
- RelativePath=".\dbstats.cpp"
- >
- </File>
- <File
RelativePath=".\dbwebserver.cpp"
>
</File>
@@ -1555,6 +1519,10 @@ >
</File>
<File
+ RelativePath=".\index_geo2d.cpp"
+ >
+ </File>
+ <File
RelativePath=".\instance.cpp"
>
</File>
@@ -1671,10 +1639,6 @@ >
</File>
<File
- RelativePath="..\util\top.cpp"
- >
- </File>
- <File
RelativePath=".\update.cpp"
>
</File>
@@ -1884,6 +1848,42 @@ />
</FileConfiguration>
</File>
+ <File
+ RelativePath="..\scripting\utils.cpp"
+ >
+ </File>
+ </Filter>
+ <Filter
+ Name="stats"
+ >
+ <File
+ RelativePath=".\stats\counters.cpp"
+ >
+ </File>
+ <File
+ RelativePath=".\stats\snapshots.cpp"
+ >
+ </File>
+ <File
+ RelativePath=".\stats\top.cpp"
+ >
+ </File>
+ </Filter>
+ <Filter
+ Name="btree related"
+ >
+ <File
+ RelativePath=".\btree.cpp"
+ >
+ </File>
+ <File
+ RelativePath=".\btree.h"
+ >
+ </File>
+ <File
+ RelativePath=".\btreecursor.cpp"
+ >
+ </File>
</Filter>
</Files>
<Globals>
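The project-file hunks above add the new db/stats sources (counters.cpp, snapshots.cpp, top.cpp) to the Windows build. These back the counters used elsewhere in this patch, e.g. globalFlushCounters.flushed(time_flushing) in the background flush thread and the indexCounters/backgroundFlushing sections of serverStatus below. A guessed sketch of the shape such a counter could take; the field names are illustrative, not the actual contents of stats/counters.h:

    // Illustrative only; the builder is abstracted behind a template so the
    // sketch stands alone instead of depending on the real BSONObjBuilder.
    class FlushCounters {
        long long _flushes;   // number of background flushes so far
        long long _totalMs;   // cumulative time spent flushing
        int _lastMs;          // duration of the most recent flush
    public:
        FlushCounters() : _flushes(0), _totalMs(0), _lastMs(0) {}

        void flushed(int ms) {          // called once per flush by the flush thread
            _flushes++;
            _totalMs += ms;
            _lastMs = ms;
        }

        template <class Builder>        // stand-in for BSONObjBuilder
        void append(Builder& b) const { // used when building serverStatus output
            b.appendNumber("flushes", _flushes);
            b.appendNumber("total_ms", _totalMs);
            b.appendNumber("average_ms",
                           _flushes ? (double)_totalMs / _flushes : 0.0);
            b.appendNumber("last_ms", _lastMs);
        }
    };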
diff --git a/db/dbcommands.cpp b/db/dbcommands.cpp index ff072a1..6d1aa5a 100644 --- a/db/dbcommands.cpp +++ b/db/dbcommands.cpp @@ -36,7 +36,8 @@ #include "security.h" #include "queryoptimizer.h" #include "../scripting/engine.h" -#include "dbstats.h" +#include "stats/counters.h" +#include "background.h" namespace mongo { @@ -56,13 +57,15 @@ namespace mongo { virtual bool slaveOk() { return true; } + virtual LockType locktype(){ return WRITE; } virtual void help( stringstream& help ) const { help << "shutdown the database. must be ran against admin db and either (1) ran from localhost or (2) authenticated.\n"; } CmdShutdown() : Command("shutdown") {} bool run(const char *ns, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) { + cc().shutdown(); log() << "terminating, shutdown command received" << endl; - dbexit( EXIT_CLEAN ); + dbexit( EXIT_CLEAN ); // this never returns return true; } } cmdShutdown; @@ -75,7 +78,7 @@ namespace mongo { */ class CmdResetError : public Command { public: - virtual bool readOnly() { return true; } + virtual LockType locktype(){ return NONE; } virtual bool requiresAuth() { return false; } virtual bool logTheOp() { return false; @@ -98,7 +101,7 @@ namespace mongo { /* for diagnostic / testing purposes. */ class CmdSleep : public Command { public: - virtual bool readOnly() { return true; } + virtual LockType locktype(){ return READ; } virtual bool adminOnly() { return true; } virtual bool logTheOp() { return false; @@ -118,7 +121,7 @@ namespace mongo { class CmdGetLastError : public Command { public: - virtual bool readOnly() { return true; } + virtual LockType locktype(){ return NONE; } virtual bool requiresAuth() { return false; } virtual bool logTheOp() { return false; @@ -155,6 +158,7 @@ namespace mongo { virtual bool slaveOk() { return true; } + virtual LockType locktype(){ return NONE; } CmdForceError() : Command("forceerror") {} bool run(const char *ns, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) { uassert( 10038 , "forced error", false); @@ -164,7 +168,7 @@ namespace mongo { class CmdGetPrevError : public Command { public: - virtual bool readOnly() { return true; } + virtual LockType locktype(){ return NONE; } virtual bool requiresAuth() { return false; } virtual bool logTheOp() { return false; @@ -199,6 +203,7 @@ namespace mongo { virtual bool slaveOk() { return true; } + virtual LockType locktype(){ return NONE; } CmdSwitchToClientErrors() : Command("switchtoclienterrors") {} bool run(const char *ns, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) { if ( lastError.getID() ){ @@ -223,9 +228,10 @@ namespace mongo { virtual bool slaveOk() { return false; } + virtual LockType locktype(){ return WRITE; } CmdDropDatabase() : Command("dropDatabase") {} bool run(const char *ns, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) { - BSONElement e = cmdObj.findElement(name); + BSONElement e = cmdObj.getField(name); log() << "dropDatabase " << ns << endl; int p = (int) e.number(); if ( p != 1 ) @@ -247,16 +253,17 @@ namespace mongo { virtual void help( stringstream& help ) const { help << "repair database. also compacts. 
note: slow."; } + virtual LockType locktype(){ return WRITE; } CmdRepairDatabase() : Command("repairDatabase") {} bool run(const char *ns, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) { - BSONElement e = cmdObj.findElement(name); + BSONElement e = cmdObj.getField(name); log() << "repairDatabase " << ns << endl; int p = (int) e.number(); if ( p != 1 ) return false; - e = cmdObj.findElement( "preserveClonedFilesOnFailure" ); + e = cmdObj.getField( "preserveClonedFilesOnFailure" ); bool preserveClonedFilesOnFailure = e.isBoolean() && e.boolean(); - e = cmdObj.findElement( "backupOriginalFiles" ); + e = cmdObj.getField( "backupOriginalFiles" ); bool backupOriginalFiles = e.isBoolean() && e.boolean(); return repairDatabase( ns, errmsg, preserveClonedFilesOnFailure, backupOriginalFiles ); } @@ -274,9 +281,10 @@ namespace mongo { virtual void help( stringstream& help ) const { help << "enable or disable performance profiling"; } + virtual LockType locktype(){ return WRITE; } CmdProfile() : Command("profile") {} bool run(const char *ns, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) { - BSONElement e = cmdObj.findElement(name); + BSONElement e = cmdObj.getField(name); result.append("was", (double) cc().database()->profile); int p = (int) e.number(); bool ok = false; @@ -302,9 +310,15 @@ namespace mongo { CmdServerStatus() : Command("serverStatus") { started = time(0); } + + virtual LockType locktype(){ return NONE; } + bool run(const char *ns, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) { + + bool authed = cc().getAuthenticationInfo()->isAuthorizedReads("admin"); result.append("uptime",(double) (time(0)-started)); + result.appendDate( "localTime" , jsTime() ); { BSONObjBuilder t; @@ -316,19 +330,19 @@ namespace mongo { double tl = (double) timeLocked; t.append("totalTime", tt); t.append("lockTime", tl); - t.append("ratio", tl/tt); + t.append("ratio", (tt ? 
tl/tt : 0)); result.append( "globalLock" , t.obj() ); } - { - + if ( authed ){ + BSONObjBuilder t( result.subobjStart( "mem" ) ); ProcessInfo p; if ( p.supported() ){ - t.append( "resident" , p.getResidentSize() ); - t.append( "virtual" , p.getVirtualMemorySize() ); + t.appendNumber( "resident" , p.getResidentSize() ); + t.appendNumber( "virtual" , p.getVirtualMemorySize() ); t.appendBool( "supported" , true ); } else { @@ -336,7 +350,7 @@ namespace mongo { t.appendBool( "supported" , false ); } - t.append( "mapped" , MemoryMappedFile::totalMappedLength() / ( 1024 * 1024 ) ); + t.appendNumber( "mapped" , MemoryMappedFile::totalMappedLength() / ( 1024 * 1024 ) ); t.done(); @@ -348,7 +362,8 @@ namespace mongo { bb.append( "available" , connTicketHolder.available() ); bb.done(); } - { + + if ( authed ){ BSONObjBuilder bb( result.subobjStart( "extra_info" ) ); bb.append("note", "fields vary by platform"); ProcessInfo p; @@ -356,8 +371,40 @@ namespace mongo { bb.done(); } + + { + BSONObjBuilder bb( result.subobjStart( "indexCounters" ) ); + globalIndexCounters.append( bb ); + bb.done(); + } + + { + BSONObjBuilder bb( result.subobjStart( "backgroundFlushing" ) ); + globalFlushCounters.append( bb ); + bb.done(); + } + + if ( anyReplEnabled() ){ + BSONObjBuilder bb( result.subobjStart( "repl" ) ); + appendReplicationInfo( bb , authed , cmdObj["repl"].numberInt() ); + bb.done(); + } + result.append( "opcounters" , globalOpCounters.getObj() ); + { + BSONObjBuilder asserts( result.subobjStart( "asserts" ) ); + asserts.append( "regular" , assertionCount.regular ); + asserts.append( "warning" , assertionCount.warning ); + asserts.append( "msg" , assertionCount.msg ); + asserts.append( "user" , assertionCount.user ); + asserts.append( "rollovers" , assertionCount.rollovers ); + asserts.done(); + } + + if ( ! 
authed ) + result.append( "note" , "run against admin for more info" ); + return true; } time_t started; @@ -372,6 +419,7 @@ namespace mongo { virtual void help( stringstream& help ) const { help << "check if any asserts have occurred on the server"; } + virtual LockType locktype(){ return WRITE; } CmdAssertInfo() : Command("assertinfo") {} bool run(const char *ns, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) { result.appendBool("dbasserted", lastAssert[0].isSet() || lastAssert[1].isSet() || lastAssert[2].isSet()); @@ -389,8 +437,10 @@ namespace mongo { virtual bool slaveOk() { return true; } + virtual LockType locktype(){ return NONE; } CmdGetOpTime() : Command("getoptime") { } bool run(const char *ns, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) { + writelock l( "" ); result.appendDate("optime", OpTime::now().asDate()); return true; } @@ -416,6 +466,7 @@ namespace mongo { bool adminOnly() { return true; } + virtual LockType locktype(){ return WRITE; } bool run(const char *ns, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool) { int was = _diaglog.setLevel( cmdObj.firstElement().numberInt() ); stringstream ss; @@ -451,7 +502,12 @@ namespace mongo { } } dbc_unittest; - bool deleteIndexes( NamespaceDetails *d, const char *ns, const char *name, string &errmsg, BSONObjBuilder &anObjBuilder, bool mayDeleteIdIndex ) { + void assureSysIndexesEmptied(const char *ns, IndexDetails *exceptForIdIndex); + int removeFromSysIndexes(const char *ns, const char *idxName); + + bool dropIndexes( NamespaceDetails *d, const char *ns, const char *name, string &errmsg, BSONObjBuilder &anObjBuilder, bool mayDeleteIdIndex ) { + + BackgroundOperation::assertNoBgOpInProgForNs(ns); d->aboutToDeleteAnIndex(); @@ -479,7 +535,10 @@ namespace mongo { } /* assuming here that id index is not multikey: */ d->multiKeyIndexBits = 0; - anObjBuilder.append("msg", "all indexes deleted for collection"); + assureSysIndexesEmptied(ns, idIndex); + anObjBuilder.append("msg", mayDeleteIdIndex ? + "indexes dropped for collection" : + "non-_id indexes dropped for collection"); } else { // delete just one index @@ -503,7 +562,11 @@ namespace mongo { for ( int i = x; i < d->nIndexes; i++ ) d->idx(i) = d->idx(i+1); } else { - log() << "deleteIndexes: " << name << " not found" << endl; + int n = removeFromSysIndexes(ns, name); // just in case an orphaned listing there - i.e. should have been repaired but wasn't + if( n ) { + log() << "info: removeFromSysIndexes cleaned up " << n << " entries" << endl; + } + log() << "dropIndexes: " << name << " not found" << endl; errmsg = "index not found"; return false; } @@ -524,8 +587,9 @@ namespace mongo { virtual bool adminOnly() { return false; } + virtual LockType locktype(){ return WRITE; } virtual bool run(const char *ns, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool) { - string nsToDrop = cc().database()->name + '.' + cmdObj.findElement(name).valuestr(); + string nsToDrop = cc().database()->name + '.' 
+ cmdObj.getField(name).valuestr(); NamespaceDetails *d = nsdetails(nsToDrop.c_str()); if ( !cmdLine.quiet ) log() << "CMD: drop " << nsToDrop << endl; @@ -542,14 +606,14 @@ namespace mongo { /* select count(*) */ class CmdCount : public Command { public: - virtual bool readOnly() { return true; } + virtual LockType locktype(){ return READ; } CmdCount() : Command("count") { } virtual bool logTheOp() { return false; } virtual bool slaveOk() { // ok on --slave setups, not ok for nonmaster of a repl pair (unless override) - return slave == SimpleSlave; + return replSettings.slave == SimpleSlave; } virtual bool slaveOverrideOk() { return true; @@ -558,7 +622,7 @@ namespace mongo { return false; } virtual bool run(const char *_ns, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool) { - string ns = cc().database()->name + '.' + cmdObj.findElement(name).valuestr(); + string ns = cc().database()->name + '.' + cmdObj.getField(name).valuestr(); string err; long long n = runCount(ns.c_str(), cmdObj, err); long long nn = n; @@ -591,11 +655,12 @@ namespace mongo { virtual bool adminOnly() { return false; } + virtual LockType locktype(){ return WRITE; } virtual void help( stringstream& help ) const { help << "create a collection"; } virtual bool run(const char *_ns, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool) { - string ns = cc().database()->name + '.' + cmdObj.findElement(name).valuestr(); + string ns = cc().database()->name + '.' + cmdObj.getField(name).valuestr(); string err; bool ok = userCreateNS(ns.c_str(), cmdObj, err, true); if ( !ok && !err.empty() ) @@ -604,7 +669,8 @@ namespace mongo { } } cmdCreate; - class CmdDeleteIndexes : public Command { + /* "dropIndexes" is now the preferred form - "deleteIndexes" deprecated */ + class CmdDropIndexes : public Command { public: virtual bool logTheOp() { return true; @@ -612,21 +678,34 @@ namespace mongo { virtual bool slaveOk() { return false; } + virtual LockType locktype(){ return WRITE; } virtual void help( stringstream& help ) const { - help << "delete indexes for a collection"; + help << "drop indexes for a collection"; } - CmdDeleteIndexes() : Command("deleteIndexes") { } + CmdDropIndexes(const char *cmdname = "dropIndexes") : Command(cmdname) { } bool run(const char *ns, BSONObj& jsobj, string& errmsg, BSONObjBuilder& anObjBuilder, bool /*fromRepl*/) { - /* note: temp implementation. space not reclaimed! */ - BSONElement e = jsobj.findElement(name.c_str()); + BSONElement e = jsobj.getField(name.c_str()); string toDeleteNs = cc().database()->name + '.' 
+ e.valuestr(); NamespaceDetails *d = nsdetails(toDeleteNs.c_str()); if ( !cmdLine.quiet ) - log() << "CMD: deleteIndexes " << toDeleteNs << endl; + log() << "CMD: dropIndexes " << toDeleteNs << endl; if ( d ) { - BSONElement f = jsobj.findElement("index"); + BSONElement f = jsobj.getField("index"); if ( f.type() == String ) { - return deleteIndexes( d, toDeleteNs.c_str(), f.valuestr(), errmsg, anObjBuilder, false ); + return dropIndexes( d, toDeleteNs.c_str(), f.valuestr(), errmsg, anObjBuilder, false ); + } + else if ( f.type() == Object ){ + int idxId = d->findIndexByKeyPattern( f.embeddedObject() ); + if ( idxId < 0 ){ + errmsg = "can't find index with key:"; + errmsg += f.embeddedObject(); + return false; + } + else { + IndexDetails& ii = d->idx( idxId ); + string iName = ii.indexName(); + return dropIndexes( d, toDeleteNs.c_str(), iName.c_str() , errmsg, anObjBuilder, false ); + } } else { errmsg = "invalid index name spec"; @@ -638,6 +717,10 @@ namespace mongo { return false; } } + } cmdDropIndexes; + class CmdDeleteIndexes : public CmdDropIndexes { + public: + CmdDeleteIndexes() : CmdDropIndexes("deleteIndexes") { } } cmdDeleteIndexes; class CmdReIndex : public Command { @@ -648,14 +731,17 @@ namespace mongo { virtual bool slaveOk() { return false; } + virtual LockType locktype(){ return WRITE; } virtual void help( stringstream& help ) const { help << "re-index a collection"; } CmdReIndex() : Command("reIndex") { } bool run(const char *ns, BSONObj& jsobj, string& errmsg, BSONObjBuilder& result, bool /*fromRepl*/) { + BackgroundOperation::assertNoBgOpInProgForNs(ns); + static DBDirectClient db; - BSONElement e = jsobj.findElement(name.c_str()); + BSONElement e = jsobj.getField(name.c_str()); string toDeleteNs = cc().database()->name + '.' + e.valuestr(); NamespaceDetails *d = nsdetails(toDeleteNs.c_str()); log() << "CMD: reIndex " << toDeleteNs << endl; @@ -675,9 +761,9 @@ namespace mongo { } - bool ok = deleteIndexes( d, toDeleteNs.c_str(), "*" , errmsg, result, true ); + bool ok = dropIndexes( d, toDeleteNs.c_str(), "*" , errmsg, result, true ); if ( ! ok ){ - errmsg = "deleteIndexes failed"; + errmsg = "dropIndexes failed"; return false; } @@ -693,8 +779,6 @@ namespace mongo { } } cmdReIndex; - - class CmdListDatabases : public Command { public: virtual bool logTheOp() { @@ -709,6 +793,7 @@ namespace mongo { virtual bool adminOnly() { return true; } + virtual LockType locktype(){ return WRITE; } CmdListDatabases() : Command("listDatabases") {} bool run(const char *ns, BSONObj& jsobj, string& errmsg, BSONObjBuilder& result, bool /*fromRepl*/) { vector< string > dbNames; @@ -722,8 +807,8 @@ namespace mongo { b.append( "name", i->c_str() ); boost::intmax_t size = dbSize( i->c_str() ); b.append( "sizeOnDisk", (double) size ); - setClient( i->c_str() ); - b.appendBool( "empty", cc().database()->isEmpty() ); + Client::Context ctx( *i ); + b.appendBool( "empty", ctx.db()->isEmpty() ); totalSize += size; dbInfos.push_back( b.obj() ); @@ -741,8 +826,8 @@ namespace mongo { BSONObjBuilder b; b << "name" << name << "sizeOnDisk" << double( 1 ); - setClient( name.c_str() ); - b.appendBool( "empty", cc().database()->isEmpty() ); + Client::Context ctx( name ); + b.appendBool( "empty", ctx.db()->isEmpty() ); dbInfos.push_back( b.obj() ); } @@ -753,13 +838,17 @@ namespace mongo { } } cmdListDatabases; + /* note an access to a database right after this will open it back up - so this is mainly + for diagnostic purposes. 
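Assuming the C++ driver of this era (the header path and connect() signature are assumptions), the reopen-on-next-access behavior described above looks like this from a client:

    #include "client/dbclient.h"   // assumed driver header location
    using namespace mongo;

    int main() {
        DBClientConnection conn;
        conn.connect("localhost");                                     // throws on failure
        BSONObj res;
        conn.runCommand("admin", BSON("closeAllDatabases" << 1), res); // every db closed
        conn.findOne("test.foo", Query());  // first access afterwards reopens "test"
        return 0;
    }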
+ */ class CmdCloseAllDatabases : public Command { public: virtual bool adminOnly() { return true; } virtual bool slaveOk() { return false; } + virtual LockType locktype(){ return WRITE; } CmdCloseAllDatabases() : Command( "closeAllDatabases" ) {} bool run(const char *ns, BSONObj& jsobj, string& errmsg, BSONObjBuilder& result, bool /*fromRepl*/) { - return dbHolder.closeAll( dbpath , result ); + return dbHolder.closeAll( dbpath , result, false ); } } cmdCloseAllDatabases; @@ -772,6 +861,7 @@ namespace mongo { virtual void help( stringstream& help ) const { help << " example: { filemd5 : ObjectId(aaaaaaa) , key : { ts : 1 } }"; } + virtual LockType locktype(){ return READ; } bool run(const char *dbname, BSONObj& jsobj, string& errmsg, BSONObjBuilder& result, bool fromRepl ){ static DBDirectClient db; @@ -831,6 +921,7 @@ namespace mongo { public: CmdMedianKey() : Command( "medianKey" ) {} virtual bool slaveOk() { return true; } + virtual LockType locktype(){ return READ; } virtual void help( stringstream &help ) const { help << " example: { medianKey:\"blog.posts\", keyPattern:{x:1}, min:{x:10}, max:{x:55} }\n" "NOTE: This command may take awhile to run"; @@ -840,6 +931,8 @@ namespace mongo { BSONObj min = jsobj.getObjectField( "min" ); BSONObj max = jsobj.getObjectField( "max" ); BSONObj keyPattern = jsobj.getObjectField( "keyPattern" ); + + Client::Context ctx( ns ); IndexDetails *id = cmdIndexDetailsForRange( ns, errmsg, min, max, keyPattern ); if ( id == 0 ) @@ -872,6 +965,7 @@ namespace mongo { public: CmdDatasize() : Command( "datasize" ) {} virtual bool slaveOk() { return true; } + virtual LockType locktype(){ return READ; } virtual void help( stringstream &help ) const { help << "\ndetermine data size for a set of data in a certain range" @@ -885,9 +979,10 @@ namespace mongo { BSONObj max = jsobj.getObjectField( "max" ); BSONObj keyPattern = jsobj.getObjectField( "keyPattern" ); + Client::Context ctx( ns ); + auto_ptr< Cursor > c; if ( min.isEmpty() && max.isEmpty() ) { - setClient( ns ); c = theDataFileMgr.findAll( ns ); } else if ( min.isEmpty() || max.isEmpty() ) { errmsg = "only one of min or max specified"; @@ -923,19 +1018,40 @@ namespace mongo { } } cmdDatasize; + namespace { + long long getIndexSizeForCollection(string db, string ns, BSONObjBuilder* details=NULL, int scale = 1 ){ + DBDirectClient client; + auto_ptr<DBClientCursor> indexes = + client.query(db + ".system.indexes", QUERY( "ns" << ns)); + + long long totalSize = 0; + while (indexes->more()){ + BSONObj index = indexes->nextSafe(); + NamespaceDetails * nsd = nsdetails( (ns + ".$" + index["name"].valuestrsafe()).c_str() ); + if (!nsd) + continue; // nothing to do here + totalSize += nsd->datasize; + if (details) + details->appendNumber(index["name"].valuestrsafe(), nsd->datasize / scale ); + } + return totalSize; + } + } + class CollectionStats : public Command { public: CollectionStats() : Command( "collstats" ) {} virtual bool slaveOk() { return true; } + virtual LockType locktype(){ return READ; } virtual void help( stringstream &help ) const { help << " example: { collstats:\"blog.posts\" } "; } - bool run(const char *dbname, BSONObj& jsobj, string& errmsg, BSONObjBuilder& result, bool fromRepl ){ - string ns = dbname; - if ( ns.find( "." ) != string::npos ) - ns = ns.substr( 0 , ns.find( "." ) ); - ns += "."; - ns += jsobj.firstElement().valuestr(); + bool run(const char *dbname_c, BSONObj& jsobj, string& errmsg, BSONObjBuilder& result, bool fromRepl ){ + string dbname = dbname_c; + if ( dbname.find( "." 
) != string::npos ) + dbname = dbname.substr( 0 , dbname.find( "." ) ); + + string ns = dbname + "." + jsobj.firstElement().valuestr(); NamespaceDetails * nsd = nsdetails( ns.c_str() ); if ( ! nsd ){ @@ -944,12 +1060,25 @@ namespace mongo { } result.append( "ns" , ns.c_str() ); - - result.append( "count" , nsd->nrecords ); - result.append( "size" , nsd->datasize ); - result.append( "storageSize" , nsd->storageSize() ); + + int scale = 1; + if ( jsobj["scale"].isNumber() ) + scale = jsobj["scale"].numberInt(); + + result.appendNumber( "count" , nsd->nrecords ); + result.appendNumber( "size" , nsd->datasize / scale ); + int numExtents; + result.appendNumber( "storageSize" , nsd->storageSize( &numExtents ) / scale ); + result.append( "numExtents" , numExtents ); result.append( "nindexes" , nsd->nIndexes ); + result.append( "lastExtentSize" , nsd->lastExtentSize / scale ); + result.append( "paddingFactor" , nsd->paddingFactor ); + result.append( "flags" , nsd->flags ); + BSONObjBuilder indexSizes; + result.appendNumber( "totalIndexSize" , getIndexSizeForCollection(dbname, ns, &indexSizes, scale) / scale ); + result.append("indexSizes", indexSizes.obj()); + if ( nsd->capped ){ result.append( "capped" , nsd->capped ); result.append( "max" , nsd->max ); @@ -959,11 +1088,70 @@ namespace mongo { } } cmdCollectionStatis; + + class DBStats : public Command { + public: + DBStats() : Command( "dbstats" ) {} + virtual bool slaveOk() { return true; } + virtual LockType locktype(){ return READ; } + virtual void help( stringstream &help ) const { + help << " example: { dbstats:1 } "; + } + bool run(const char *dbname_c, BSONObj& jsobj, string& errmsg, BSONObjBuilder& result, bool fromRepl ){ + string dbname = dbname_c; + if ( dbname.find( "." ) != string::npos ) + dbname = dbname.substr( 0 , dbname.find( "." ) ); + + DBDirectClient client; + const list<string> collections = client.getCollectionNames(dbname); + + long long ncollections = 0; + long long objects = 0; + long long size = 0; + long long storageSize = 0; + long long numExtents = 0; + long long indexes = 0; + long long indexSize = 0; + + for (list<string>::const_iterator it = collections.begin(); it != collections.end(); ++it){ + const string ns = *it; + + NamespaceDetails * nsd = nsdetails( ns.c_str() ); + if ( ! nsd ){ + // should this assert here? 
+ continue; + } + + ncollections += 1; + objects += nsd->nrecords; + size += nsd->datasize; + + int temp; + storageSize += nsd->storageSize( &temp ); + numExtents += temp; + + indexes += nsd->nIndexes; + indexSize += getIndexSizeForCollection(dbname, ns); + } + + result.appendNumber( "collections" , ncollections ); + result.appendNumber( "objects" , objects ); + result.appendNumber( "dataSize" , size ); + result.appendNumber( "storageSize" , storageSize); + result.appendNumber( "numExtents" , numExtents ); + result.appendNumber( "indexes" , indexes ); + result.appendNumber( "indexSize" , indexSize ); + + return true; + } + } cmdDBStats; + class CmdBuildInfo : public Command { public: CmdBuildInfo() : Command( "buildinfo" ) {} virtual bool slaveOk() { return true; } virtual bool adminOnly() { return true; } + virtual LockType locktype(){ return NONE; } virtual void help( stringstream &help ) const { help << "example: { buildinfo:1 }"; } @@ -974,10 +1162,12 @@ namespace mongo { } } cmdBuildInfo; + /* convertToCapped seems to use this */ class CmdCloneCollectionAsCapped : public Command { public: CmdCloneCollectionAsCapped() : Command( "cloneCollectionAsCapped" ) {} virtual bool slaveOk() { return false; } + virtual LockType locktype(){ return WRITE; } virtual void help( stringstream &help ) const { help << "example: { cloneCollectionAsCapped:<fromName>, toCollection:<toName>, size:<sizeInBytes> }"; } @@ -996,15 +1186,13 @@ namespace mongo { string fromNs = string( realDbName ) + "." + from; string toNs = string( realDbName ) + "." + to; - massert( 10300 , "source collection " + fromNs + " does not exist", !setClient( fromNs.c_str() ) ); NamespaceDetails *nsd = nsdetails( fromNs.c_str() ); massert( 10301 , "source collection " + fromNs + " does not exist", nsd ); - long long excessSize = nsd->datasize - size * 2; + long long excessSize = nsd->datasize - size * 2; // datasize and extentSize can't be compared exactly, so add some padding to 'size' DiskLoc extent = nsd->firstExtent; - for( ; excessSize > 0 && extent != nsd->lastExtent; extent = extent.ext()->xnext ) { + for( ; excessSize > extent.ext()->length && extent != nsd->lastExtent; extent = extent.ext()->xnext ) { excessSize -= extent.ext()->length; - if ( excessSize > 0 ) - log( 2 ) << "cloneCollectionAsCapped skipping extent of size " << extent.ext()->length << endl; + log( 2 ) << "cloneCollectionAsCapped skipping extent of size " << extent.ext()->length << endl; log( 6 ) << "excessSize: " << excessSize << endl; } DiskLoc startLoc = extent.ext()->firstRecord; @@ -1012,15 +1200,13 @@ namespace mongo { CursorId id; { auto_ptr< Cursor > c = theDataFileMgr.findAll( fromNs.c_str(), startLoc ); - ClientCursor *cc = new ClientCursor(); - cc->c = c; - cc->ns = fromNs; + ClientCursor *cc = new ClientCursor(c, fromNs.c_str(), true); cc->matcher.reset( new CoveredIndexMatcher( BSONObj(), fromjson( "{$natural:1}" ) ) ); id = cc->cursorid; } DBDirectClient client; - setClient( toNs.c_str() ); + Client::Context ctx( toNs ); BSONObjBuilder spec; spec.appendBool( "capped", true ); spec.append( "size", double( size ) ); @@ -1037,14 +1223,22 @@ namespace mongo { } } cmdCloneCollectionAsCapped; + /* jan2010: + Converts the given collection to a capped collection w/ the specified size. + This command is not highly used, and is not currently supported with sharded + environments. 
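Invoked from a driver (reusing the connection from the previous sketch; the collection name and size here are illustrative), the command takes the source collection and a target size in bytes:

    BSONObj info;
    bool ok = conn.runCommand("test",
                              BSON("convertToCapped" << "events"
                                                     << "size" << 100000),
                              info);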
+ */ class CmdConvertToCapped : public Command { public: CmdConvertToCapped() : Command( "convertToCapped" ) {} virtual bool slaveOk() { return false; } + virtual LockType locktype(){ return WRITE; } virtual void help( stringstream &help ) const { help << "example: { convertToCapped:<fromCollectionName>, size:<sizeInBytes> }"; } bool run(const char *dbname, BSONObj& jsobj, string& errmsg, BSONObjBuilder& result, bool fromRepl ){ + BackgroundOperation::assertNoBgOpInProgForDb(dbname); + string from = jsobj.getStringField( "convertToCapped" ); long long size = (long long)jsobj.getField( "size" ).number(); @@ -1086,6 +1280,7 @@ namespace mongo { class GroupCommand : public Command { public: GroupCommand() : Command("group"){} + virtual LockType locktype(){ return READ; } virtual bool slaveOk() { return true; } virtual void help( stringstream &help ) const { help << "see http://www.mongodb.org/display/DOCS/Aggregation"; @@ -1260,7 +1455,7 @@ namespace mongo { public: DistinctCommand() : Command("distinct"){} virtual bool slaveOk() { return true; } - + virtual LockType locktype(){ return READ; } virtual void help( stringstream &help ) const { help << "{ distinct : 'collection name' , key : 'a.b' }"; } @@ -1268,7 +1463,7 @@ namespace mongo { bool run(const char *dbname, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl ){ static DBDirectClient db; - string ns = cc().database()->name + '.' + cmdObj.findElement(name).valuestr(); + string ns = cc().database()->name + '.' + cmdObj.getField(name).valuestr(); string key = cmdObj["key"].valuestrsafe(); BSONObj keyPattern = BSON( key << 1 ); @@ -1319,6 +1514,7 @@ namespace mongo { virtual bool slaveOk() { return false; } + virtual LockType locktype(){ return WRITE; } virtual bool run(const char *dbname, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool) { static DBDirectClient db; @@ -1355,23 +1551,232 @@ namespace mongo { } } cmdFindAndModify; - bool commandIsReadOnly(BSONObj& _cmdobj) { - BSONObj jsobj; - { - BSONElement e = _cmdobj.firstElement(); - if ( e.type() == Object && string("query") == e.fieldName() ) { - jsobj = e.embeddedObject(); + /* Returns client's uri */ + class CmdWhatsMyUri : public Command { + public: + CmdWhatsMyUri() : Command("whatsmyuri") { } + virtual bool logTheOp() { + return false; // the modification will be logged directly + } + virtual bool slaveOk() { + return true; + } + virtual LockType locktype(){ return NONE; } + virtual bool requiresAuth() { + return false; + } + virtual void help( stringstream &help ) const { + help << "{whatsmyuri:1}"; + } + virtual bool run(const char *dbname, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool) { + BSONObj info = cc().curop()->infoNoauth(); + result << "you" << info[ "client" ]; + return true; + } + } cmdWhatsMyUri; + + /* For testing only, not for general use */ + class GodInsert : public Command { + public: + GodInsert() : Command( "godinsert" ) { } + virtual bool logTheOp() { + return true; + } + virtual bool slaveOk() { + return false; + } + virtual LockType locktype() { return WRITE; } + virtual bool requiresAuth() { + return true; + } + virtual void help( stringstream &help ) const { + help << "[for testing only]"; + } + virtual bool run(const char *dbname, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool) { + string coll = cmdObj[ "godinsert" ].valuestrsafe(); + uassert( 13049, "godinsert must specify a collection", !coll.empty() ); + string ns = nsToDatabase( dbname ) + "." 
+ coll; + BSONObj obj = cmdObj[ "obj" ].embeddedObjectUserCheck(); + DiskLoc loc = theDataFileMgr.insert( ns.c_str(), obj, true ); + return true; + } + } cmdGodInsert; + + class DBHashCmd : public Command { + public: + DBHashCmd() : Command( "dbhash" ){} + virtual bool slaveOk() { return true; } + virtual LockType locktype() { return READ; } + virtual bool run(const char * badns, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool){ + string dbname = nsToDatabase( badns ); + + list<string> colls = _db.getCollectionNames( dbname ); + colls.sort(); + + result.appendNumber( "numCollections" , (long long)colls.size() ); + + md5_state_t globalState; + md5_init(&globalState); + + BSONObjBuilder bb( result.subobjStart( "collections" ) ); + for ( list<string>::iterator i=colls.begin(); i != colls.end(); i++ ){ + string c = *i; + if ( c.find( ".system.profil" ) != string::npos ) + continue; + + auto_ptr<Cursor> cursor; + + NamespaceDetails * nsd = nsdetails( c.c_str() ); + int idNum = nsd->findIdIndex(); + if ( idNum >= 0 ){ + cursor.reset( new BtreeCursor( nsd , idNum , nsd->idx( idNum ) , BSONObj() , BSONObj() , false , 1 ) ); + } + else if ( c.find( ".system." ) != string::npos ){ + continue; + } + else if ( nsd->capped ){ + cursor = findTableScan( c.c_str() , BSONObj() ); + } + else { + bb.done(); + errmsg = (string)"can't find _id index for: " + c; + return 0; + } + + md5_state_t st; + md5_init(&st); + + long long n = 0; + while ( cursor->ok() ){ + BSONObj c = cursor->current(); + md5_append( &st , (const md5_byte_t*)c.objdata() , c.objsize() ); + n++; + cursor->advance(); + } + md5digest d; + md5_finish(&st, d); + string hash = digestToString( d ); + + bb.append( c.c_str() + ( dbname.size() + 1 ) , hash ); + + md5_append( &globalState , (const md5_byte_t*)hash.c_str() , hash.size() ); } - else { - jsobj = _cmdobj; + bb.done(); + + md5digest d; + md5_finish(&globalState, d); + string hash = digestToString( d ); + + result.append( "md5" , hash ); + + return 1; + } + + DBDirectClient _db; + } dbhashCmd; + + /** + * this handles + - auth + - locking + - context + then calls run() + */ + bool execCommand( Command * c , + Client& client , int queryOptions , + const char *ns, BSONObj& cmdObj , + BSONObjBuilder& result, + bool fromRepl ){ + + string dbname = nsToDatabase( ns ); + + AuthenticationInfo *ai = client.getAuthenticationInfo(); + + if( c->adminOnly() && c->localHostOnlyIfNoAuth( cmdObj ) && noauth && !ai->isLocalHost ) { + result.append( "errmsg" , + "unauthorized: this command must run from localhost when running db without auth" ); + log() << "command denied: " << cmdObj.toString() << endl; + return false; + } + + + if ( c->adminOnly() && ! fromRepl && dbname != "admin" ) { + result.append( "errmsg" , "access denied" ); + log() << "command denied: " << cmdObj.toString() << endl; + return false; + } + + if ( cmdObj["help"].trueValue() ){ + stringstream ss; + ss << "help for: " << c->name << " "; + c->help( ss ); + result.append( "help" , ss.str() ); + result.append( "lockType" , c->locktype() ); + return true; + } + + bool canRunHere = + isMaster( dbname.c_str() ) || + c->slaveOk() || + ( c->slaveOverrideOk() && ( queryOptions & QueryOption_SlaveOk ) ) || + fromRepl; + + if ( ! canRunHere ){ + result.append( "errmsg" , "not master" ); + return false; + } + + if ( c->locktype() == Command::NONE ){ + // we also trust that this won't crash + string errmsg; + int ok = c->run( ns , cmdObj , errmsg , result , fromRepl ); + if ( ! 
ok ) + result.append( "errmsg" , errmsg ); + return ok; + } + + bool needWriteLock = c->locktype() == Command::WRITE; + + if ( ! c->requiresAuth() && + ( ai->isAuthorizedReads( dbname ) && + ! ai->isAuthorized( dbname ) ) ){ + // this means that they can read, but not write + // so only get a read lock + needWriteLock = false; + } + + if ( ! needWriteLock ){ + assert( ! c->logTheOp() ); + } + + mongolock lk( needWriteLock ); + Client::Context ctx( ns , dbpath , &lk , c->requiresAuth() ); + + if ( c->adminOnly() ) + log( 2 ) << "command: " << cmdObj << endl; + + try { + string errmsg; + if ( ! c->run(ns, cmdObj, errmsg, result, fromRepl ) ){ + result.append( "errmsg" , errmsg ); + return false; } } - BSONElement e = jsobj.firstElement(); - if ( ! e.type() ) + catch ( AssertionException& e ){ + stringstream ss; + ss << "assertion: " << e.what(); + result.append( "errmsg" , ss.str() ); return false; - return Command::readOnly( e.fieldName() ); + } + + if ( c->logTheOp() && ! fromRepl ){ + logOp("c", ns, cmdObj); + } + + return true; } + /* TODO make these all command objects -- legacy stuff here usage: @@ -1380,9 +1785,11 @@ namespace mongo { returns true if ran a cmd */ bool _runCommands(const char *ns, BSONObj& _cmdobj, BufBuilder &b, BSONObjBuilder& anObjBuilder, bool fromRepl, int queryOptions) { + string dbname = nsToDatabase( ns ); + if( logLevel >= 1 ) log() << "run command " << ns << ' ' << _cmdobj << endl; - + const char *p = strchr(ns, '.'); if ( !p ) return false; if ( strcmp(p, ".$cmd") != 0 ) return false; @@ -1398,59 +1805,14 @@ namespace mongo { } } + Client& client = cc(); bool ok = false; BSONElement e = jsobj.firstElement(); - + Command * c = e.type() ? Command::findCommand( e.fieldName() ) : 0; if ( c ){ - string errmsg; - AuthenticationInfo *ai = currentClient.get()->ai; - uassert( 10045 , "unauthorized", ai->isAuthorized(cc().database()->name.c_str()) || !c->requiresAuth()); - - bool admin = c->adminOnly(); - - if( admin && c->localHostOnlyIfNoAuth(jsobj) && noauth && !ai->isLocalHost ) { - ok = false; - errmsg = "unauthorized: this command must run from localhost when running db without auth"; - log() << "command denied: " << jsobj.toString() << endl; - } - else if ( admin && !fromRepl && strncmp(ns, "admin", 5) != 0 ) { - ok = false; - errmsg = "access denied"; - log() << "command denied: " << jsobj.toString() << endl; - } - else if ( isMaster() || - c->slaveOk() || - ( c->slaveOverrideOk() && ( queryOptions & QueryOption_SlaveOk ) ) || - fromRepl ){ - if ( jsobj.getBoolField( "help" ) ) { - stringstream help; - help << "help for: " << e.fieldName() << " "; - c->help( help ); - anObjBuilder.append( "help" , help.str() ); - } - else { - if( admin ) - log( 2 ) << "command: " << jsobj << endl; - try { - ok = c->run(ns, jsobj, errmsg, anObjBuilder, fromRepl); - } - catch ( AssertionException& e ){ - ok = false; - errmsg = "assertion: "; - errmsg += e.what(); - } - if ( ok && c->logTheOp() && !fromRepl ) - logOp("c", ns, jsobj); - } - } - else { - ok = false; - errmsg = "not master"; - } - if ( !ok ) - anObjBuilder.append("errmsg", errmsg); + ok = execCommand( c , client , queryOptions , ns , jsobj , anObjBuilder , fromRepl ); } else { anObjBuilder.append("errmsg", "no such cmd"); diff --git a/db/dbcommands_admin.cpp b/db/dbcommands_admin.cpp index 91052bf..7265002 100644 --- a/db/dbcommands_admin.cpp +++ b/db/dbcommands_admin.cpp @@ -31,15 +31,36 @@ #include "btree.h" #include "curop.h" #include "../util/background.h" +#include "../scripting/engine.h" namespace mongo { + 
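With execCommand in place, a command object only declares what it needs -- slaveOk(), adminOnly(), requiresAuth() and, newly, locktype() -- and the dispatcher handles authentication, the help:1 reply, master/slave routing, lock acquisition, Client::Context setup and, via logTheOp(), the oplog write. A minimal sketch of a command written against the refactored interface (the name "echoTest" and its behavior are illustrative only, not part of this patch):

    class EchoTestCmd : public Command {
    public:
        EchoTestCmd() : Command( "echoTest" ) {}
        virtual bool slaveOk() { return true; }      // may run on non-masters
        virtual LockType locktype(){ return NONE; }  // dispatcher takes no db lock
        virtual void help( stringstream &help ) const {
            help << "example: { echoTest:1 }";       // served by execCommand on help:1
        }
        bool run(const char *ns, BSONObj& cmdObj, string& errmsg,
                 BSONObjBuilder& result, bool fromRepl){
            result.appendAs( cmdObj.firstElement() , "echo" );
            return true; // on false, execCommand appends errmsg for us
        }
    } echoTestCmd;
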
class FeaturesCmd : public Command { + public: + FeaturesCmd() : Command( "features" ){} + + virtual bool slaveOk(){ return true; } + virtual bool readOnly(){ return true; } + virtual LockType locktype(){ return READ; } + virtual bool run(const char *ns, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl){ + result.append( "readlock" , readLockSupported() ); + if ( globalScriptEngine ){ + BSONObjBuilder bb( result.subobjStart( "js" ) ); + result.append( "utf8" , globalScriptEngine->utf8Ok() ); + bb.done(); + } + return true; + } + + } featuresCmd; + class CleanCmd : public Command { public: CleanCmd() : Command( "clean" ){} virtual bool slaveOk(){ return true; } - + virtual LockType locktype(){ return WRITE; } + bool run(const char *nsRaw, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl ){ string dropns = cc().database()->name + "." + cmdObj.firstElement().valuestrsafe(); @@ -70,6 +91,7 @@ namespace mongo { return true; } + virtual LockType locktype(){ return WRITE; } //{ validate: "collectionnamewithoutthedbpart" [, scandata: <bool>] } */ bool run(const char *nsRaw, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl ){ @@ -159,7 +181,7 @@ namespace mongo { nlen += r->netLength(); c->advance(); } - if ( d->capped ) { + if ( d->capped && !d->capLooped() ) { ss << " capped outOfOrder:" << outOfOrder; if ( outOfOrder > 1 ) { valid = false; @@ -252,7 +274,7 @@ namespace mongo { extern bool unlockRequested; extern unsigned lockedForWriting; - extern boost::mutex lockedForWritingMutex; + extern mongo::mutex lockedForWritingMutex; /* class UnlockCommand : public Command { @@ -283,8 +305,10 @@ namespace mongo { class LockDBJob : public BackgroundJob { protected: void run() { + Client::initThread("fsyncjob"); + Client& c = cc(); { - boostlock lk(lockedForWritingMutex); + scoped_lock lk(lockedForWritingMutex); lockedForWriting++; } readlock lk(""); @@ -299,9 +323,10 @@ namespace mongo { sleepmillis(20); } { - boostlock lk(lockedForWritingMutex); + scoped_lock lk(lockedForWritingMutex); lockedForWriting--; } + c.shutdown(); } public: bool& _ready; @@ -312,7 +337,7 @@ namespace mongo { }; public: FSyncCommand() : Command( "fsync" ){} - + virtual LockType locktype(){ return WRITE; } virtual bool slaveOk(){ return true; } virtual bool adminOnly(){ return true; } /*virtual bool localHostOnlyIfNoAuth(const BSONObj& cmdObj) { @@ -351,6 +376,18 @@ namespace mongo { } } fsyncCmd; - + + class LogRotateCmd : public Command { + public: + LogRotateCmd() : Command( "logRotate" ){} + virtual LockType locktype(){ return NONE; } + virtual bool slaveOk(){ return true; } + virtual bool adminOnly(){ return true; } + virtual bool run(const char *ns, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) { + rotateLogs(); + return 1; + } + + } logRotateCmd; } diff --git a/db/dbeval.cpp b/db/dbeval.cpp index e729135..a3be894 100644 --- a/db/dbeval.cpp +++ b/db/dbeval.cpp @@ -73,7 +73,7 @@ namespace mongo { BSONObj args; { - BSONElement argsElement = cmd.findElement("args"); + BSONElement argsElement = cmd.getField("args"); if ( argsElement.type() == Array ) { args = argsElement.embeddedObject(); if ( edebug ) { @@ -111,8 +111,16 @@ namespace mongo { virtual bool slaveOk() { return false; } + // We need at least read only access to run db.eval - auth for eval'd writes will be checked + // as they are requested. 
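+        // (with requiresAuth() false the generic auth assertion is skipped, so
+        //  run() re-asserts read access itself -- see uassert 12598 below)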
+ virtual bool requiresAuth() { + return false; + } + virtual LockType locktype(){ return WRITE; } CmdEval() : Command("$eval") { } bool run(const char *ns, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) { + AuthenticationInfo *ai = cc().getAuthenticationInfo(); + uassert( 12598 , "$eval reads unauthorized", ai->isAuthorizedReads(cc().database()->name.c_str())); return dbEval(ns, cmdObj, result, errmsg); } } cmdeval; diff --git a/db/dbmessage.h b/db/dbmessage.h index 54a2ac3..ba5cf94 100644 --- a/db/dbmessage.h +++ b/db/dbmessage.h @@ -16,7 +16,7 @@ #pragma once -#include "storage.h" +#include "diskloc.h" #include "jsobj.h" #include "namespace.h" #include "../util/message.h" @@ -133,8 +133,10 @@ namespace mongo { return nextjsobj != 0; } BSONObj nextJsObj() { - if ( nextjsobj == data ) + if ( nextjsobj == data ) { nextjsobj += strlen(data) + 1; // skip namespace + massert( 13066 , "Message contains no documents", theEnd > nextjsobj ); + } massert( 10304 , "Remaining data too small for BSON object", theEnd - nextjsobj > 3 ); BSONObj js(nextjsobj); massert( 10305 , "Invalid object size", js.objsize() > 3 ); @@ -180,7 +182,7 @@ namespace mongo { int ntoreturn; int queryOptions; BSONObj query; - auto_ptr< FieldMatcher > fields; + BSONObj fields; /* parses the message into the above fields */ QueryMessage(DbMessage& d) { @@ -189,11 +191,7 @@ namespace mongo { ntoreturn = d.pullInt(); query = d.nextJsObj(); if ( d.moreJSObjs() ) { - BSONObj o = d.nextJsObj(); - if (!o.isEmpty()){ - fields = auto_ptr< FieldMatcher >(new FieldMatcher() ); - fields->add( o ); - } + fields = d.nextJsObj(); } queryOptions = d.msg().data->dataAsInt(); } @@ -222,9 +220,8 @@ namespace mongo { qr->startingFrom = startingFrom; qr->nReturned = nReturned; b.decouple(); - Message *resp = new Message(); - resp->setData(qr, true); // transport will free - p->reply(requestMsg, *resp, requestMsg.data->id); + Message resp(qr, true); + p->reply(requestMsg, resp, requestMsg.data->id); } } // namespace mongo diff --git a/db/dbstats.cpp b/db/dbstats.cpp deleted file mode 100644 index 902b57b..0000000 --- a/db/dbstats.cpp +++ /dev/null @@ -1,43 +0,0 @@ -// dbstats.cpp - -#include "stdafx.h" -#include "dbstats.h" - -namespace mongo { - - OpCounters::OpCounters(){ - int zero = 0; - - BSONObjBuilder b; - b.append( "insert" , zero ); - b.append( "query" , zero ); - b.append( "update" , zero ); - b.append( "delete" , zero ); - b.append( "getmore" , zero ); - _obj = b.obj(); - - _insert = (int*)_obj["insert"].value(); - _query = (int*)_obj["query"].value(); - _update = (int*)_obj["update"].value(); - _delete = (int*)_obj["delete"].value(); - _getmore = (int*)_obj["getmore"].value(); - } - - void OpCounters::gotOp( int op ){ - switch ( op ){ - case dbInsert: gotInsert(); break; - case dbQuery: gotQuery(); break; - case dbUpdate: gotUpdate(); break; - case dbDelete: gotDelete(); break; - case dbGetMore: gotGetMore(); break; - case dbKillCursors: - case opReply: - case dbMsg: - break; - default: log() << "OpCounters::gotOp unknown op: " << op << endl; - } - } - - - OpCounters globalOpCounters; -} diff --git a/db/dbstats.h b/db/dbstats.h deleted file mode 100644 index c7d6340..0000000 --- a/db/dbstats.h +++ /dev/null @@ -1,44 +0,0 @@ -// dbstats.h - -#include "../stdafx.h" -#include "jsobj.h" -#include "../util/message.h" - -namespace mongo { - - /** - * for storing operation counters - * note: not thread safe. 
ok with that for speed - */ - class OpCounters { - public: - - OpCounters(); - - int * getInsert(){ return _insert; } - int * getQuery(){ return _query; } - int * getUpdate(){ return _update; } - int * getDelete(){ return _delete; } - int * getGetGore(){ return _getmore; } - - void gotInsert(){ _insert[0]++; } - void gotQuery(){ _query[0]++; } - void gotUpdate(){ _update[0]++; } - void gotDelete(){ _delete[0]++; } - void gotGetMore(){ _getmore[0]++; } - - void gotOp( int op ); - - BSONObj& getObj(){ return _obj; } - private: - BSONObj _obj; - int * _insert; - int * _query; - int * _update; - int * _delete; - int * _getmore; - }; - - extern OpCounters globalOpCounters; - -} diff --git a/db/dbwebserver.cpp b/db/dbwebserver.cpp index 0e1483c..75d3a92 100644 --- a/db/dbwebserver.cpp +++ b/db/dbwebserver.cpp @@ -27,6 +27,9 @@ #include "replset.h" #include "instance.h" #include "security.h" +#include "stats/snapshots.h" +#include "background.h" +#include "commands.h" #include <pcrecpp.h> #include <boost/date_time/posix_time/posix_time.hpp> @@ -61,48 +64,6 @@ namespace mongo { } unsigned long long start, timeLocked; }; - Timing tlast; - const int NStats = 32; - string lockStats[NStats]; - unsigned q = 0; - - void statsThread() { - /*cout << "TEMP disabled statsthread" << endl; - if( 1 ) - return;*/ - Client::initThread("stats"); - unsigned long long timeLastPass = 0; - while ( 1 ) { - { - /* todo: do we even need readlock here? if so for what? */ - readlock lk(""); - Top::completeSnapshot(); - q = (q+1)%NStats; - Timing timing; - dbMutex.info().getTimingInfo(timing.start, timing.timeLocked); - unsigned long long now = curTimeMicros64(); - if ( timeLastPass ) { - unsigned long long dt = now - timeLastPass; - unsigned long long dlocked = timing.timeLocked - tlast.timeLocked; - { - stringstream ss; - ss << dt / 1000 << '\t'; - ss << dlocked / 1000 << '\t'; - if ( dt ) - ss << (dlocked*100)/dt << '%'; - string s = ss.str(); - if ( cmdLine.cpu ) - log() << "cpu: " << s << endl; - lockStats[q] = s; - ClientCursor::idleTimeReport( (unsigned) ((dt - dlocked)/1000) ); - } - } - timeLastPass = now; - tlast = timing; - } - sleepsecs(4); - } - } bool _bold; string bold(bool x) { @@ -118,14 +79,11 @@ namespace mongo { // caller locks void doLockedStuff(stringstream& ss) { ss << "# databases: " << dbHolder.size() << '\n'; - if ( cc().database() ) { - ss << "curclient: " << cc().database()->name; // TODO: isn't this useless? 
- ss << '\n'; - } + ss << bold(ClientCursor::byLocSize()>10000) << "Cursors byLoc.size(): " << ClientCursor::byLocSize() << bold() << '\n'; ss << "\n<b>replication</b>\n"; - ss << "master: " << master << '\n'; - ss << "slave: " << slave << '\n'; + ss << "master: " << replSettings.master << '\n'; + ss << "slave: " << replSettings.slave << '\n'; if ( replPair ) { ss << "replpair:\n"; ss << replPair->getInfo(); @@ -135,26 +93,76 @@ namespace mongo { ss << "initialSyncCompleted: " << seemCaughtUp; if ( !seemCaughtUp ) ss << "</b>"; ss << '\n'; - - ss << "\n<b>DBTOP</b>\n"; - ss << "<table border=1><tr align='left'><th>Namespace</th><th>%</th><th>Reads</th><th>Writes</th><th>Calls</th><th>Time</th>"; - vector< Top::Usage > usage; - Top::usage( usage ); - for( vector< Top::Usage >::iterator i = usage.begin(); i != usage.end(); ++i ) - ss << setprecision( 2 ) << fixed << "<tr><td>" << i->ns << "</td><td>" << i->pct << "</td><td>" - << i->reads << "</td><td>" << i->writes << "</td><td>" << i->calls << "</td><td>" << i->time << "</td></tr>\n"; - ss << "</table>"; - ss << "\n<b>dt\ttlocked</b>\n"; - unsigned i = q; - while ( 1 ) { - ss << lockStats[i] << '\n'; - i = (i-1)%NStats; - if ( i == q ) - break; + auto_ptr<SnapshotDelta> delta = statsSnapshots.computeDelta(); + if ( delta.get() ){ + ss << "\n<b>DBTOP (occurences|percent of elapsed)</b>\n"; + ss << "<table border=1>"; + ss << "<tr align='left'>"; + ss << "<th>NS</th>" + "<th colspan=2>total</th>" + "<th colspan=2>Reads</th>" + "<th colspan=2>Writes</th>" + "<th colspan=2>Queries</th>" + "<th colspan=2>GetMores</th>" + "<th colspan=2>Inserts</th>" + "<th colspan=2>Updates</th>" + "<th colspan=2>Removes</th>"; + ss << "</tr>"; + + display( ss , (double) delta->elapsed() , "GLOBAL" , delta->globalUsageDiff() ); + + Top::UsageMap usage = delta->collectionUsageDiff(); + for ( Top::UsageMap::iterator i=usage.begin(); i != usage.end(); i++ ){ + display( ss , (double) delta->elapsed() , i->first , i->second ); + } + + ss << "</table>"; } + + statsSnapshots.outputLockInfoHTML( ss ); + + BackgroundOperation::dump(ss); } + void display( stringstream& ss , double elapsed , const Top::UsageData& usage ){ + ss << "<td>"; + ss << usage.count; + ss << "</td><td>"; + double per = 100 * ((double)usage.time)/elapsed; + ss << setprecision(2) << fixed << per << "%"; + ss << "</td>"; + } + + void display( stringstream& ss , double elapsed , const string& ns , const Top::CollectionData& data ){ + if ( ns != "GLOBAL" && data.total.count == 0 ) + return; + ss << "<tr><th>" << ns << "</th>"; + + display( ss , elapsed , data.total ); + + display( ss , elapsed , data.readLock ); + display( ss , elapsed , data.writeLock ); + + display( ss , elapsed , data.queries ); + display( ss , elapsed , data.getmore ); + display( ss , elapsed , data.insert ); + display( ss , elapsed , data.update ); + display( ss , elapsed , data.remove ); + + ss << "</tr>"; + } + + void tablecell( stringstream& ss , bool b ){ + ss << "<td>" << (b ? 
"<b>X</b>" : "") << "</td>"; + } + + + template< typename T> + void tablecell( stringstream& ss , const T& t ){ + ss << "<td>" << t << "</td>"; + } + void doUnlockedStuff(stringstream& ss) { /* this is in the header already ss << "port: " << port << '\n'; */ ss << mongodVersion() << "\n"; @@ -178,21 +186,51 @@ namespace mongo { ss << "\nreplInfo: " << replInfo << "\n\n"; ss << "Clients:\n"; - ss << "<table border=1><tr align='left'><th>Thread</th><th>Current op</th>\n"; + ss << "<table border=1>"; + ss << "<tr align='left'>" + << "<th>Thread</th>" + + << "<th>OpId</th>" + << "<th>Active</th>" + << "<th>LockType</th>" + << "<th>Waiting</th>" + << "<th>SecsRunning</th>" + << "<th>Op</th>" + << "<th>NameSpace</th>" + << "<th>Query</th>" + << "<th>client</th>" + << "<th>msg</th>" + << "<th>progress</th>" + + << "</tr>\n"; { - boostlock bl(Client::clientsMutex); + scoped_lock bl(Client::clientsMutex); for( set<Client*>::iterator i = Client::clients.begin(); i != Client::clients.end(); i++ ) { Client *c = *i; CurOp& co = *(c->curop()); - ss << "<tr><td>" << c->desc() << "</td><td"; - BSONObj info = co.infoNoauth(); - /* - if( info.getIntField("inLock") > 0 ) - ss << "style='color:red'"; - else if( info.getIntField("inLock") < 0 ) - ss << "style='color:green'"; - */ - ss << ">" << info << "</td></tr>\n"; + ss << "<tr><td>" << c->desc() << "</td>"; + + tablecell( ss , co.opNum() ); + tablecell( ss , co.active() ); + tablecell( ss , co.getLockType() ); + tablecell( ss , co.isWaitingForLock() ); + if ( co.active() ) + tablecell( ss , co.elapsedSeconds() ); + else + tablecell( ss , "" ); + tablecell( ss , co.getOp() ); + tablecell( ss , co.getNS() ); + if ( co.haveQuery() ) + tablecell( ss , co.query() ); + else + tablecell( ss , "" ); + tablecell( ss , co.getRemoteString() ); + + tablecell( ss , co.getMessage() ); + tablecell( ss , co.getProgressMeter().toString() ); + + + ss << "</tr>"; } } ss << "</table>\n"; @@ -203,7 +241,7 @@ namespace mongo { if ( from.localhost() ) return true; - if ( db.findOne( "admin.system.users" , BSONObj() ).isEmpty() ) + if ( db.findOne( "admin.system.users" , BSONObj() , 0 , QueryOption_SlaveOk ).isEmpty() ) return true; string auth = getHeader( rq , "Authorization" ); @@ -270,6 +308,23 @@ namespace mongo { //out() << "url [" << url << "]" << endl; if ( url.size() > 1 ) { + + if ( url.find( "/_status" ) == 0 ){ + if ( ! allowed( rq , headers, from ) ){ + responseCode = 401; + responseMsg = "not allowed\n"; + return; + } + generateServerStatus( url , responseMsg ); + responseCode = 200; + return; + } + + if ( ! cmdLine.rest ){ + responseCode = 403; + responseMsg = "rest is not enabled. use --rest to turn on"; + return; + } if ( ! 
allowed( rq , headers, from ) ){ responseCode = 401; responseMsg = "not allowed\n"; @@ -294,23 +349,18 @@ namespace mongo { doUnlockedStuff(ss); - int n = 2000; - Timer t; - while ( 1 ) { - if ( !dbMutex.info().isLocked() ) { - { - readlock lk(""); - ss << "time to get dblock: " << t.millis() << "ms\n"; - doLockedStuff(ss); - } - break; + { + Timer t; + readlocktry lk( "" , 2000 ); + if ( lk.got() ){ + ss << "time to get dblock: " << t.millis() << "ms\n"; + doLockedStuff(ss); } - sleepmillis(1); - if ( --n < 0 ) { + else { ss << "\n<b>timed out getting dblock</b>\n"; - break; } } + ss << "</pre></body></html>"; responseMsg = ss.str(); @@ -323,6 +373,51 @@ namespace mongo { } } + void generateServerStatus( string url , string& responseMsg ){ + static vector<string> commands; + if ( commands.size() == 0 ){ + commands.push_back( "serverStatus" ); + commands.push_back( "buildinfo" ); + } + + BSONObj params; + if ( url.find( "?" ) != string::npos ) { + parseParams( params , url.substr( url.find( "?" ) + 1 ) ); + } + + BSONObjBuilder buf(1024); + + for ( unsigned i=0; i<commands.size(); i++ ){ + string cmd = commands[i]; + + Command * c = Command::findCommand( cmd ); + assert( c ); + assert( c->locktype() == 0 ); + + BSONObj co; + { + BSONObjBuilder b; + b.append( cmd.c_str() , 1 ); + + if ( cmd == "serverStatus" && params["repl"].type() ){ + b.append( "repl" , atoi( params["repl"].valuestr() ) ); + } + + co = b.obj(); + } + + string errmsg; + + BSONObjBuilder sub; + if ( ! c->run( "admin.$cmd" , co , errmsg , sub , false ) ) + buf.append( cmd.c_str() , errmsg ); + else + buf.append( cmd.c_str() , sub.obj() ); + } + + responseMsg = buf.obj().jsonString(); + } + void handleRESTRequest( const char *rq, // the full request string url, string& responseMsg, @@ -341,7 +436,7 @@ namespace mongo { string coll = url.substr( first + 1 ); string action = ""; - map<string,string> params; + BSONObj params; if ( coll.find( "?" ) != string::npos ) { parseParams( params , coll.substr( coll.find( "?" ) + 1 ) ); coll = coll.substr( 0 , coll.find( "?" ) ); @@ -361,7 +456,7 @@ namespace mongo { if ( coll[i] == '/' ) coll[i] = '.'; - string fullns = dbname + "." + coll; + string fullns = urlDecode(dbname + "." + coll); headers.push_back( (string)"x-action: " + action ); headers.push_back( (string)"x-ns: " + fullns ); @@ -387,26 +482,29 @@ namespace mongo { responseMsg = ss.str(); } - void handleRESTQuery( string ns , string action , map<string,string> & params , int & responseCode , stringstream & out ) { + void handleRESTQuery( string ns , string action , BSONObj & params , int & responseCode , stringstream & out ) { Timer t; int skip = _getOption( params["skip"] , 0 ); int num = _getOption( params["limit"] , _getOption( params["count" ] , 1000 ) ); // count is old, limit is new int one = 0; - if ( params["one"].size() > 0 && tolower( params["one"][0] ) == 't' ) { + if ( params["one"].type() == String && tolower( params["one"].valuestr()[0] ) == 't' ) { num = 1; one = 1; } BSONObjBuilder queryBuilder; - for ( map<string,string>::iterator i = params.begin(); i != params.end(); i++ ) { - if ( ! i->first.find( "filter_" ) == 0 ) + BSONObjIterator i(params); + while ( i.more() ){ + BSONElement e = i.next(); + string name = e.fieldName(); + if ( ! 
name.find( "filter_" ) == 0 ) continue; - const char * field = i->first.substr( 7 ).c_str(); - const char * val = i->second.c_str(); + const char * field = name.substr( 7 ).c_str(); + const char * val = e.valuestr(); char * temp; @@ -454,7 +552,7 @@ namespace mongo { } // TODO Generate id and revision per couch POST spec - void handlePost( string ns, const char *body, map<string,string> & params, int & responseCode, stringstream & out ) { + void handlePost( string ns, const char *body, BSONObj& params, int & responseCode, stringstream & out ) { try { BSONObj obj = fromjson( body ); db.insert( ns.c_str(), obj ); @@ -468,10 +566,12 @@ namespace mongo { out << "{ \"ok\" : true }"; } - int _getOption( string val , int def ) { - if ( val.size() == 0 ) - return def; - return atoi( val.c_str() ); + int _getOption( BSONElement e , int def ) { + if ( e.isNumber() ) + return e.numberInt(); + if ( e.type() == String ) + return atoi( e.valuestr() ); + return def; } private: @@ -481,7 +581,6 @@ namespace mongo { DBDirectClient DbWebServer::db; void webServerThread() { - boost::thread thr(statsThread); Client::initThread("websvr"); DbWebServer mini; int p = cmdLine.port + 1000; diff --git a/db/storage.h b/db/diskloc.h index cc29e60..cc29e60 100644 --- a/db/storage.h +++ b/db/diskloc.h diff --git a/db/driverHelpers.cpp b/db/driverHelpers.cpp new file mode 100644 index 0000000..c2d1b9d --- /dev/null +++ b/db/driverHelpers.cpp @@ -0,0 +1,63 @@ +// driverHelpers.cpp + +/** + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License, version 3, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ */ + +/** + this file has dbcommands that are for drivers + mostly helpers +*/ + + +#include "stdafx.h" +#include "jsobj.h" +#include "pdfile.h" +#include "namespace.h" +#include "commands.h" +#include "cmdline.h" +#include "btree.h" +#include "curop.h" +#include "../util/background.h" +#include "../scripting/engine.h" + +namespace mongo { + + class BasicDriverHelper : public Command { + public: + BasicDriverHelper( const char * name ) : Command( name ){} + + virtual LockType locktype(){ return NONE; } + virtual bool slaveOk(){ return true; } + virtual bool slaveOverrideOk(){ return true; } + + }; + + class ObjectIdTest : public BasicDriverHelper { + public: + ObjectIdTest() : BasicDriverHelper( "driverOIDTest" ){} + virtual bool run(const char *ns, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl){ + if ( cmdObj.firstElement().type() != jstOID ){ + errmsg = "not oid"; + return false; + } + + const OID& oid = cmdObj.firstElement().__oid(); + result.append( "oid" , oid ); + result.append( "str" , oid.str() ); + + return true; + } + } driverObjectIdTest; +} diff --git a/db/extsort.cpp b/db/extsort.cpp index 08b343a..a0b9f7a 100644 --- a/db/extsort.cpp +++ b/db/extsort.cpp @@ -27,11 +27,12 @@ namespace mongo { + BSONObj BSONObjExternalSorter::extSortOrder; unsigned long long BSONObjExternalSorter::_compares = 0; BSONObjExternalSorter::BSONObjExternalSorter( const BSONObj & order , long maxFileSize ) : _order( order.getOwned() ) , _maxFilesize( maxFileSize ) , - _cur(0), _curSizeSoFar(0), _sorted(0){ + _arraySize(1000000), _cur(0), _curSizeSoFar(0), _sorted(0){ stringstream rootpath; rootpath << dbpath; @@ -56,13 +57,21 @@ namespace mongo { wassert( removed == 1 + _files.size() ); } + void BSONObjExternalSorter::_sortInMem(){ + // extSortComp needs to use glbals + // qsort_r only seems available on bsd, which is what i really want to use + dblock l; + extSortOrder = _order; + _cur->sort( BSONObjExternalSorter::extSortComp ); + } + void BSONObjExternalSorter::sort(){ uassert( 10048 , "already sorted" , ! _sorted ); - + _sorted = true; if ( _cur && _files.size() == 0 ){ - _cur->sort( MyCmp( _order ) ); + _sortInMem(); log(1) << "\t\t not using file. size:" << _curSizeSoFar << " _compares:" << _compares << endl; return; } @@ -85,16 +94,20 @@ namespace mongo { uassert( 10049 , "sorted already" , ! _sorted ); if ( ! _cur ){ - _cur = new InMemory(); + _cur = new InMemory( _arraySize ); } - _cur->push_back( pair<BSONObj,DiskLoc>( o.getOwned() , loc ) ); - + Data& d = _cur->getNext(); + d.first = o.getOwned(); + d.second = loc; + long size = o.objsize(); - _curSizeSoFar += size + sizeof( DiskLoc ); + _curSizeSoFar += size + sizeof( DiskLoc ) + sizeof( BSONObj ); - if ( _curSizeSoFar > _maxFilesize ) + if ( _cur->hasSpace() == false || _curSizeSoFar > _maxFilesize ){ finishMap(); + log(1) << "finishing map" << endl; + } } @@ -105,7 +118,7 @@ namespace mongo { if ( _cur->size() == 0 ) return; - _cur->sort( MyCmp( _order ) ); + _sortInMem(); stringstream ss; ss << _root.string() << "/file." 
<< _files.size(); @@ -113,10 +126,10 @@ namespace mongo { ofstream out; out.open( file.c_str() , ios_base::out | ios_base::binary ); - uassert( 10051 , (string)"couldn't open file: " + file , out.good() ); + ASSERT_STREAM_GOOD( 10051 , (string)"couldn't open file: " + file , out ); int num = 0; - for ( InMemory::iterator i=_cur->begin(); i != _cur->end(); i++ ){ + for ( InMemory::iterator i=_cur->begin(); i != _cur->end(); ++i ){ Data p = *i; out.write( p.first.objdata() , p.first.objsize() ); out.write( (char*)(&p.second) , sizeof( DiskLoc ) ); @@ -169,10 +182,12 @@ namespace mongo { return false; } - pair<BSONObj,DiskLoc> BSONObjExternalSorter::Iterator::next(){ + BSONObjExternalSorter::Data BSONObjExternalSorter::Iterator::next(){ if ( _in ){ - return *(_it++); + Data& d = *_it; + ++_it; + return d; } Data best; @@ -204,7 +219,7 @@ namespace mongo { BSONObjExternalSorter::FileIterator::FileIterator( string file ){ long length; - _buf = (char*)_file.map( file.c_str() , length ); + _buf = (char*)_file.map( file.c_str() , length , MemoryMappedFile::SEQUENTIAL ); massert( 10308 , "mmap failed" , _buf ); assert( (unsigned long)length == file_size( file ) ); _end = _buf + length; @@ -216,7 +231,7 @@ namespace mongo { return _buf < _end; } - pair<BSONObj,DiskLoc> BSONObjExternalSorter::FileIterator::next(){ + BSONObjExternalSorter::Data BSONObjExternalSorter::FileIterator::next(){ BSONObj o( _buf ); _buf += o.objsize(); DiskLoc * l = (DiskLoc*)_buf; diff --git a/db/extsort.h b/db/extsort.h index 5bfa86f..60ee423 100644 --- a/db/extsort.h +++ b/db/extsort.h @@ -22,9 +22,11 @@ #include "jsobj.h" #include "namespace.h" #include "curop.h" +#include "../util/array.h" namespace mongo { + /** for sorting by BSONObj and attaching a value */ @@ -32,8 +34,21 @@ namespace mongo { public: typedef pair<BSONObj,DiskLoc> Data; - + private: + static BSONObj extSortOrder; + + static int extSortComp( const void *lv, const void *rv ){ + RARELY killCurrentOp.checkForInterrupt(); + _compares++; + Data * l = (Data*)lv; + Data * r = (Data*)rv; + int cmp = l->first.woCompare( r->first , extSortOrder ); + if ( cmp ) + return cmp; + return l->second.compare( r->second ); + }; + class FileIterator : boost::noncopyable { public: FileIterator( string file ); @@ -57,13 +72,14 @@ namespace mongo { return x < 0; return l.second.compare( r.second ) < 0; }; + private: BSONObj _order; }; - - public: - typedef list<Data> InMemory; + public: + + typedef FastArray<Data> InMemory; class Iterator : boost::noncopyable { public: @@ -102,8 +118,17 @@ namespace mongo { int numFiles(){ return _files.size(); } + + long getCurSizeSoFar(){ return _curSizeSoFar; } + + void hintNumObjects( long long numObjects ){ + if ( numObjects < _arraySize ) + _arraySize = (int)(numObjects + 100); + } private: + + void _sortInMem(); void sort( string file ); void finishMap(); @@ -112,6 +137,7 @@ namespace mongo { long _maxFilesize; path _root; + int _arraySize; InMemory * _cur; long _curSizeSoFar; diff --git a/db/flushtest.cpp b/db/flushtest.cpp index a301e0e..00cebcf 100644 --- a/db/flushtest.cpp +++ b/db/flushtest.cpp @@ -1,3 +1,19 @@ +/* + * Copyright (C) 2010 10gen Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License, version 3, + * as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + #include "stdafx.h" #include <stdio.h> #include "../util/goodies.h" diff --git a/db/index.cpp b/db/index.cpp index fab6918..5ec2658 100644 --- a/db/index.cpp +++ b/db/index.cpp @@ -21,22 +21,80 @@ #include "index.h" #include "btree.h" #include "query.h" +#include "background.h" namespace mongo { + map<string,IndexPlugin*> * IndexPlugin::_plugins; + + IndexType::IndexType( const IndexPlugin * plugin , const IndexSpec * spec ) + : _plugin( plugin ) , _spec( spec ){ + + } + + IndexType::~IndexType(){ + } + + const BSONObj& IndexType::keyPattern() const { + return _spec->keyPattern; + } + + IndexPlugin::IndexPlugin( const string& name ) + : _name( name ){ + if ( ! _plugins ) + _plugins = new map<string,IndexPlugin*>(); + (*_plugins)[name] = this; + } + + int IndexType::compare( const BSONObj& l , const BSONObj& r ) const { + return l.woCompare( r , _spec->keyPattern ); + } + + + int removeFromSysIndexes(const char *ns, const char *idxName) { + string system_indexes = cc().database()->name + ".system.indexes"; + BSONObjBuilder b; + b.append("ns", ns); + b.append("name", idxName); // e.g.: { name: "ts_1", ns: "foo.coll" } + BSONObj cond = b.done(); + return (int) deleteObjects(system_indexes.c_str(), cond, false, false, true); + } + + /* this is just an attempt to clean up old orphaned stuff on a delete all indexes + call. repair database is the clean solution, but this gives one a lighter weight + partial option. see dropIndexes() + */ + void assureSysIndexesEmptied(const char *ns, IndexDetails *idIndex) { + string system_indexes = cc().database()->name + ".system.indexes"; + BSONObjBuilder b; + b.append("ns", ns); + if( idIndex ) { + b.append("name", BSON( "$ne" << idIndex->indexName().c_str() )); + } + BSONObj cond = b.done(); + int n = (int) deleteObjects(system_indexes.c_str(), cond, false, false, true); + if( n ) { + log() << "info: assureSysIndexesEmptied cleaned up " << n << " entries" << endl; + } + } + + const IndexSpec& IndexDetails::getSpec() const { + scoped_lock lk(NamespaceDetailsTransient::_qcMutex); + return NamespaceDetailsTransient::get_inlock( info.obj()["ns"].valuestr() ).getIndexSpec( this ); + } + /* delete this index. does NOT clean up the system catalog (system.indexes or system.namespaces) -- only NamespaceIndex. */ void IndexDetails::kill_idx() { string ns = indexNamespace(); // e.g. foo.coll.$ts_1 + + string pns = parentNS(); // note we need a copy, as parentNS() won't work after the drop() below // clean up parent namespace index cache - NamespaceDetailsTransient::get_w( parentNS().c_str() ).deletedIndex(); + NamespaceDetailsTransient::get_w( pns.c_str() ).deletedIndex(); - BSONObjBuilder b; - b.append("name", indexName().c_str()); - b.append("ns", parentNS().c_str()); - BSONObj cond = b.done(); // e.g.: { name: "ts_1", ns: "foo.coll" } + string name = indexName(); /* important to catch exception here so we can finish cleanup below. */ try { @@ -48,22 +106,44 @@ namespace mongo { head.setInvalid(); info.setInvalid(); - // clean up in system.indexes. we do this last on purpose. note we have - // to make the cond object before the drop() above though. 
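The inline catalog cleanup being deleted here moves into removeFromSysIndexes(), defined at the top of this hunk: it deletes the single { ns, name } document from <db>.system.indexes and returns the number removed. A hypothetical call site, mirroring the wassert kill_idx now performs:

    // after dropping the on-disk btree for foo.coll's "ts_1" index,
    // purge its catalog entry; exactly one document should match
    int n = removeFromSysIndexes( "foo.coll" , "ts_1" );
    wassert( n == 1 );
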
- string system_indexes = cc().database()->name + ".system.indexes"; - int n = deleteObjects(system_indexes.c_str(), cond, false, false, true); + // clean up in system.indexes. we do this last on purpose. + int n = removeFromSysIndexes(pns.c_str(), name.c_str()); wassert( n == 1 ); } + + void IndexSpec::reset( const IndexDetails * details ){ + _details = details; + reset( details->info ); + } + + void IndexSpec::reset( const DiskLoc& loc ){ + info = loc.obj(); + keyPattern = info["key"].embeddedObjectUserCheck(); + if ( keyPattern.objsize() == 0 ) { + out() << info.toString() << endl; + assert(false); + } + _init(); + } + void IndexSpec::_init(){ - assert( keys.objsize() ); + assert( keyPattern.objsize() ); - BSONObjIterator i( keys ); + string pluginName = ""; + + BSONObjIterator i( keyPattern ); BSONObjBuilder nullKeyB; while( i.more() ) { - _fieldNames.push_back( i.next().fieldName() ); + BSONElement e = i.next(); + _fieldNames.push_back( e.fieldName() ); _fixed.push_back( BSONElement() ); nullKeyB.appendNull( "" ); + if ( e.type() == String ){ + uassert( 13007 , "can only have 1 index plugin / bad index key pattern" , pluginName.size() == 0 ); + pluginName = e.valuestr(); + } + } _nullKey = nullKeyB.obj(); @@ -72,10 +152,25 @@ namespace mongo { b.appendNull( "" ); _nullObj = b.obj(); _nullElt = _nullObj.firstElement(); + + if ( pluginName.size() ){ + IndexPlugin * plugin = IndexPlugin::get( pluginName ); + if ( ! plugin ){ + log() << "warning: can't find plugin [" << pluginName << "]" << endl; + } + else { + _indexType.reset( plugin->generate( this ) ); + } + } + _finishedInit = true; } - + void IndexSpec::getKeys( const BSONObj &obj, BSONObjSetDefaultOrder &keys ) const { + if ( _indexType.get() ){ + _indexType->getKeys( obj , keys ); + return; + } vector<const char*> fieldNames( _fieldNames ); vector<BSONElement> fixed( _fixed ); _getKeys( fieldNames , fixed , obj, keys ); @@ -115,7 +210,7 @@ namespace mongo { if ( allFound ) { if ( arrElt.eoo() ) { // no terminal array element to expand - BSONObjBuilder b; + BSONObjBuilder b(_sizeTracker); for( vector< BSONElement >::iterator i = fixed.begin(); i != fixed.end(); ++i ) b.appendAs( *i, "" ); keys.insert( b.obj() ); @@ -125,7 +220,7 @@ namespace mongo { BSONObjIterator i( arrElt.embeddedObject() ); if ( i.more() ){ while( i.more() ) { - BSONObjBuilder b; + BSONObjBuilder b(_sizeTracker); for( unsigned j = 0; j < fixed.size(); ++j ) { if ( j == arrIdx ) b.appendAs( i.next(), "" ); @@ -137,7 +232,7 @@ namespace mongo { } else if ( fixed.size() > 1 ){ // x : [] - need to insert undefined - BSONObjBuilder b; + BSONObjBuilder b(_sizeTracker); for( unsigned j = 0; j < fixed.size(); ++j ) { if ( j == arrIdx ) b.appendUndefined( "" ); @@ -165,7 +260,7 @@ namespace mongo { Keys will be left empty if key not found in the object. 
*/ void IndexDetails::getKeysFromObject( const BSONObj& obj, BSONObjSetDefaultOrder& keys) const { - NamespaceDetailsTransient::get_w( info.obj()["ns"].valuestr() ).getIndexSpec( this ).getKeys( obj, keys ); + getSpec().getKeys( obj, keys ); } void setDifference(BSONObjSetDefaultOrder &l, BSONObjSetDefaultOrder &r, vector<BSONObj*> &diff) { @@ -185,27 +280,27 @@ namespace mongo { } void getIndexChanges(vector<IndexChanges>& v, NamespaceDetails& d, BSONObj newObj, BSONObj oldObj) { - v.resize(d.nIndexes); + int z = d.nIndexesBeingBuilt(); + v.resize(z); NamespaceDetails::IndexIterator i = d.ii(); - while( i.more() ) { - int j = i.pos(); - IndexDetails& idx = i.next(); + for( int i = 0; i < z; i++ ) { + IndexDetails& idx = d.idx(i); BSONObj idxKey = idx.info.obj().getObjectField("key"); // eg { ts : 1 } - IndexChanges& ch = v[j]; + IndexChanges& ch = v[i]; idx.getKeysFromObject(oldObj, ch.oldkeys); idx.getKeysFromObject(newObj, ch.newkeys); if( ch.newkeys.size() > 1 ) - d.setIndexIsMultikey(j); + d.setIndexIsMultikey(i); setDifference(ch.oldkeys, ch.newkeys, ch.removed); setDifference(ch.newkeys, ch.oldkeys, ch.added); } } - void dupCheck(vector<IndexChanges>& v, NamespaceDetails& d) { - NamespaceDetails::IndexIterator i = d.ii(); - while( i.more() ) { - int j = i.pos(); - v[j].dupCheck(i.next()); + void dupCheck(vector<IndexChanges>& v, NamespaceDetails& d, DiskLoc curObjLoc) { + int z = d.nIndexesBeingBuilt(); + for( int i = 0; i < z; i++ ) { + IndexDetails& idx = d.idx(i); + v[i].dupCheck(idx, curObjLoc); } } @@ -248,6 +343,12 @@ namespace mongo { uassert(10097, "bad table to index name on add index attempt", cc().database()->name == nsToDatabase(sourceNS.c_str())); + /* we can't build a new index for the ns if a build is already in progress in the background - + EVEN IF this is a foreground build. + */ + uassert(12588, "cannot add index with a background operation in progress", + !BackgroundOperation::inProgForNs(sourceNS.c_str())); + BSONObj key = io.getObjectField("key"); uassert(12524, "index key pattern too large", key.objsize() <= 2048); if( !validKeyPattern(key) ) { @@ -303,4 +404,40 @@ namespace mongo { return true; } + bool anyElementNamesMatch( const BSONObj& a , const BSONObj& b ){ + BSONObjIterator x(a); + while ( x.more() ){ + BSONElement e = x.next(); + BSONObjIterator y(b); + while ( y.more() ){ + BSONElement f = y.next(); + FieldCompareResult res = compareDottedFieldNames( e.fieldName() , f.fieldName() ); + if ( res == SAME || res == LEFT_SUBFIELD || res == RIGHT_SUBFIELD ) + return true; + } + } + return false; + } + + IndexSuitability IndexSpec::suitability( const BSONObj& query , const BSONObj& order ) const { + if ( _indexType.get() ) + return _indexType->suitability( query , order ); + return _suitability( query , order ); + } + + IndexSuitability IndexSpec::_suitability( const BSONObj& query , const BSONObj& order ) const { + // TODO: optimize + if ( anyElementNamesMatch( keyPattern , query ) == 0 && anyElementNamesMatch( keyPattern , order ) == 0 ) + return USELESS; + return HELPFUL; + } + + IndexSuitability IndexType::suitability( const BSONObj& query , const BSONObj& order ) const { + return _spec->_suitability( query , order ); + } + + bool IndexType::scanAndOrderRequired( const BSONObj& query , const BSONObj& order ) const { + return ! 
order.isEmpty(); + } + } @@ -19,46 +19,136 @@ #pragma once #include "../stdafx.h" +#include "diskloc.h" +#include "jsobj.h" +#include <map> namespace mongo { + + class IndexSpec; + class IndexType; // TODO: this name sucks + class IndexPlugin; + class IndexDetails; + + enum IndexSuitability { USELESS = 0 , HELPFUL = 1 , OPTIMAL = 2 }; + + /** + * this represents an instance of a index plugin + * done this way so parsing, etc... can be cached + * so if there is a FTS IndexPlugin, for each index using FTS + * there will be 1 of these, and it can have things pre-parsed, etc... + */ + class IndexType : boost::noncopyable { + public: + IndexType( const IndexPlugin * plugin , const IndexSpec * spec ); + virtual ~IndexType(); + + virtual void getKeys( const BSONObj &obj, BSONObjSetDefaultOrder &keys ) const = 0; + virtual auto_ptr<Cursor> newCursor( const BSONObj& query , const BSONObj& order , int numWanted ) const = 0; + + /** optional op : changes query to match what's in the index */ + virtual BSONObj fixKey( const BSONObj& in ) { return in; } + + /** optional op : compare 2 objects with regards to this index */ + virtual int compare( const BSONObj& l , const BSONObj& r ) const; + + /** @return plugin */ + const IndexPlugin * getPlugin() const { return _plugin; } + + const BSONObj& keyPattern() const; + + virtual IndexSuitability suitability( const BSONObj& query , const BSONObj& order ) const ; + + virtual bool scanAndOrderRequired( const BSONObj& query , const BSONObj& order ) const ; + + protected: + const IndexPlugin * _plugin; + const IndexSpec * _spec; + }; + /** + * this represents a plugin + * a plugin could be something like full text search, sparse index, etc... + * 1 of these exists per type of index per server + * 1 IndexType is created per index using this plugin + */ + class IndexPlugin : boost::noncopyable { + public: + IndexPlugin( const string& name ); + virtual ~IndexPlugin(){} + + virtual IndexType* generate( const IndexSpec * spec ) const = 0; + + static IndexPlugin* get( const string& name ){ + if ( ! 
_plugins ) + return 0; + map<string,IndexPlugin*>::iterator i = _plugins->find( name ); + if ( i == _plugins->end() ) + return 0; + return i->second; + } + + string getName() const { return _name; } + private: + string _name; + static map<string,IndexPlugin*> * _plugins; + }; + + /* precomputed details about an index, used for inserting keys on updates + stored/cached in NamespaceDetailsTransient, or can be used standalone + */ class IndexSpec { public: - BSONObj keys; - BSONObj meta; + BSONObj keyPattern; // e.g., { name : 1 } + BSONObj info; // this is the same as IndexDetails::info.obj() - IndexSpec(){ + IndexSpec() + : _details(0) , _finishedInit(false){ } IndexSpec( const BSONObj& k , const BSONObj& m = BSONObj() ) - : keys(k) , meta(m){ + : keyPattern(k) , info(m) , _details(0) , _finishedInit(false){ _init(); } - + /** - this is a DickLock of an IndexDetails info + this is a DiscLoc of an IndexDetails info should have a key field */ IndexSpec( const DiskLoc& loc ){ reset( loc ); } - void reset( const DiskLoc& loc ){ - meta = loc.obj(); - keys = meta["key"].embeddedObjectUserCheck(); - if ( keys.objsize() == 0 ) { - out() << meta.toString() << endl; - assert(false); - - } - _init(); - } + void reset( const DiskLoc& loc ); + void reset( const IndexDetails * details ); void getKeys( const BSONObj &obj, BSONObjSetDefaultOrder &keys ) const; - private: + BSONElement missingField() const { return _nullElt; } + + string getTypeName() const { + if ( _indexType.get() ) + return _indexType->getPlugin()->getName(); + return ""; + } + + IndexType* getType() const { + return _indexType.get(); + } + + const IndexDetails * getDetails() const { + return _details; + } + + IndexSuitability suitability( const BSONObj& query , const BSONObj& order ) const ; + + protected: + + IndexSuitability _suitability( const BSONObj& query , const BSONObj& order ) const ; void _getKeys( vector<const char*> fieldNames , vector<BSONElement> fixed , const BSONObj &obj, BSONObjSetDefaultOrder &keys ) const; + + BSONSizeTracker _sizeTracker; vector<const char*> _fieldNames; vector<BSONElement> _fixed; @@ -67,14 +157,23 @@ namespace mongo { BSONObj _nullObj; BSONElement _nullElt; + shared_ptr<IndexType> _indexType; + + const IndexDetails * _details; + void _init(); + + public: + bool _finishedInit; + + friend class IndexType; }; /* Details about a particular index. There is one of these effectively for each object in system.namespaces (although this also includes the head pointer, which is not in that collection). - ** MemoryMapped Record ** + ** MemoryMapped Record ** (i.e., this is on disk data) */ class IndexDetails { public: @@ -117,6 +216,7 @@ namespace mongo { /* true if the specified key is in the index */ bool hasKey(const BSONObj& key); + bool wouldCreateDup(const BSONObj& key, DiskLoc self); // returns name of this index's storage area // database.table.$index @@ -172,6 +272,8 @@ namespace mongo { (system.indexes or system.namespaces) -- only NamespaceIndex. */ void kill_idx(); + + const IndexSpec& getSpec() const; operator string() const { return info.obj().toString(); @@ -184,15 +286,20 @@ namespace mongo { vector<BSONObj*> removed; // these keys were removed as part of the change vector<BSONObj*> added; // these keys were added as part of the change - void dupCheck(IndexDetails& idx) { + /** @curObjLoc - the object we want to add's location. if it is already in the + index, that is allowed here (for bg indexing case). 
+ */ + void dupCheck(IndexDetails& idx, DiskLoc curObjLoc) { if( added.empty() || !idx.unique() ) return; - for( vector<BSONObj*>::iterator i = added.begin(); i != added.end(); i++ ) - uassert( 11001 , "E11001 duplicate key on update", !idx.hasKey(**i)); + for( vector<BSONObj*>::iterator i = added.begin(); i != added.end(); i++ ) { + bool dup = idx.wouldCreateDup(**i, curObjLoc); + uassert( 11001 , "E11001 duplicate key on update", !dup); + } } }; class NamespaceDetails; void getIndexChanges(vector<IndexChanges>& v, NamespaceDetails& d, BSONObj newObj, BSONObj oldObj); - void dupCheck(vector<IndexChanges>& v, NamespaceDetails& d); + void dupCheck(vector<IndexChanges>& v, NamespaceDetails& d, DiskLoc curObjLoc); } // namespace mongo diff --git a/db/index_geo2d.cpp b/db/index_geo2d.cpp new file mode 100644 index 0000000..4730c29 --- /dev/null +++ b/db/index_geo2d.cpp @@ -0,0 +1,1675 @@ +// geo2d.cpp + +/** +* Copyright (C) 2008 10gen Inc. +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +#include "stdafx.h" +#include "namespace.h" +#include "jsobj.h" +#include "index.h" +#include "../util/unittest.h" +#include "commands.h" +#include "pdfile.h" +#include "btree.h" +#include "curop.h" +#include "matcher.h" + +//#define GEODEBUG(x) cout << x << endl; +#define GEODEBUG(x) + +namespace mongo { + + const string GEO2DNAME = "2d"; + + class GeoBitSets { + public: + GeoBitSets(){ + for ( int i=0; i<32; i++ ){ + masks32[i] = ( 1 << ( 31 - i ) ); + } + for ( int i=0; i<64; i++ ){ + masks64[i] = ( 1LL << ( 63 - i ) ); + } + } + int masks32[32]; + long long masks64[64]; + } geoBitSets; + + + class GeoHash { + public: + GeoHash() + : _hash(0),_bits(0){ + } + + GeoHash( const char * hash ){ + init( hash ); + } + + GeoHash( const string& hash ){ + init( hash ); + } + + GeoHash( const BSONElement& e , unsigned bits=32 ){ + _bits = bits; + if ( e.type() == BinData ){ + int len = 0; + _copy( (char*)&_hash , e.binData( len ) ); + assert( len == 8 ); + _bits = bits; + } + else { + cout << "GeoHash cons e : " << e << endl; + uassert(13047,"wrong type for geo index. 
if you're using a pre-release version, need to rebuild index",0); + } + _fix(); + } + + GeoHash( unsigned x , unsigned y , unsigned bits=32){ + init( x , y , bits ); + } + + GeoHash( const GeoHash& old ){ + _hash = old._hash; + _bits = old._bits; + } + + GeoHash( long long hash , unsigned bits ) + : _hash( hash ) , _bits( bits ){ + _fix(); + } + + void init( unsigned x , unsigned y , unsigned bits ){ + assert( bits <= 32 ); + _hash = 0; + _bits = bits; + for ( unsigned i=0; i<bits; i++ ){ + if ( isBitSet( x , i ) ) _hash |= geoBitSets.masks64[i*2]; + if ( isBitSet( y , i ) ) _hash |= geoBitSets.masks64[(i*2)+1]; + } + } + + void unhash( unsigned& x , unsigned& y ) const { + x = 0; + y = 0; + for ( unsigned i=0; i<_bits; i++ ){ + if ( getBitX(i) ) + x |= geoBitSets.masks32[i]; + if ( getBitY(i) ) + y |= geoBitSets.masks32[i]; + } + } + + /** + * @param 0 = high + */ + static bool isBitSet( unsigned val , unsigned bit ){ + return geoBitSets.masks32[bit] & val; + } + + GeoHash up() const { + return GeoHash( _hash , _bits - 1 ); + } + + bool hasPrefix( const GeoHash& other ) const { + assert( other._bits <= _bits ); + if ( other._bits == 0 ) + return true; + long long x = other._hash ^ _hash; + x = x >> (64-(other._bits*2)); + return x == 0; + } + + + string toString() const { + StringBuilder buf( _bits * 2 ); + for ( unsigned x=0; x<_bits*2; x++ ) + buf.append( _hash & geoBitSets.masks64[x] ? "1" : "0" ); + return buf.str(); + } + + string toStringHex1() const { + stringstream ss; + ss << hex << _hash; + return ss.str(); + } + + void init( const string& s ){ + _hash = 0; + _bits = s.size() / 2; + for ( unsigned pos=0; pos<s.size(); pos++ ) + if ( s[pos] == '1' ) + setBit( pos , 1 ); + } + + void setBit( unsigned pos , bool one ){ + assert( pos < _bits * 2 ); + if ( one ) + _hash |= geoBitSets.masks64[pos]; + else if ( _hash & geoBitSets.masks64[pos] ) + _hash &= ~geoBitSets.masks64[pos]; + } + + bool getBit( unsigned pos ) const { + return _hash & geoBitSets.masks64[pos]; + } + + bool getBitX( unsigned pos ) const { + assert( pos < 32 ); + return getBit( pos * 2 ); + } + + bool getBitY( unsigned pos ) const { + assert( pos < 32 ); + return getBit( ( pos * 2 ) + 1 ); + } + + BSONObj wrap() const { + BSONObjBuilder b(20); + append( b , "" ); + BSONObj o = b.obj(); + assert( o.objsize() == 20 ); + return o; + } + + bool constrains() const { + return _bits > 0; + } + + void move( int x , int y ){ + assert( _bits ); + _move( 0 , x ); + _move( 1 , y ); + } + + void _move( unsigned offset , int d ){ + if ( d == 0 ) + return; + assert( d <= 1 && d>= -1 ); // TEMP + + bool from, to; + if ( d > 0 ){ + from = 0; + to = 1; + } + else { + from = 1; + to = 0; + } + + unsigned pos = ( _bits * 2 ) - 1; + if ( offset == 0 ) + pos--; + while ( true ){ + if ( getBit(pos) == from ){ + setBit( pos , to ); + return; + } + + if ( pos < 2 ){ + // overflow + for ( ; pos < ( _bits * 2 ) ; pos += 2 ){ + setBit( pos , from ); + } + return; + } + + setBit( pos , from ); + pos -= 2; + } + + assert(0); + } + + GeoHash& operator=(const GeoHash& h) { + _hash = h._hash; + _bits = h._bits; + return *this; + } + + bool operator==(const GeoHash& h ){ + return _hash == h._hash && _bits == h._bits; + } + + GeoHash& operator+=( const char * s ) { + unsigned pos = _bits * 2; + _bits += strlen(s) / 2; + assert( _bits <= 32 ); + while ( s[0] ){ + if ( s[0] == '1' ) + setBit( pos , 1 ); + pos++; + s++; + } + + return *this; + } + + GeoHash operator+( const char * s ) const { + GeoHash n = *this; + n+=s; + return n; + } + + void _fix(){ 
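+            // the 2*_bits significant bits (x and y interleaved, x in the even
+            // slots counting from the MSB) are kept left-aligned in the 64-bit
+            // word; any stray bits to their right would break operator== and
+            // get persisted by append(), so they are cleared here.
+            // fast path: shifting the used bits out leaves 0 iff the tail is
+            // already clean; otherwise AND with a mask of the top 2*_bits bits.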
+        void _fix(){
+            if ( ( _hash << ( _bits * 2 ) ) == 0 )
+                return;
+            long long mask = 0;
+            for ( unsigned i=0; i<_bits*2; i++ )
+                mask |= geoBitSets.masks64[i];
+            _hash &= mask;
+        }
+
+        void append( BSONObjBuilder& b , const char * name ) const {
+            char buf[8];
+            _copy( buf , (char*)&_hash );
+            b.appendBinData( name , 8 , bdtCustom , buf );
+        }
+
+        long long getHash() const {
+            return _hash;
+        }
+
+        GeoHash commonPrefix( const GeoHash& other ) const {
+            unsigned i=0;
+            for ( ; i<_bits && i<other._bits; i++ ){
+                if ( getBitX( i ) == other.getBitX( i ) &&
+                     getBitY( i ) == other.getBitY( i ) )
+                    continue;
+                break;
+            }
+            return GeoHash(_hash,i);
+        }
+    private:
+
+        void _copy( char * dst , const char * src ) const {
+            for ( unsigned a=0; a<8; a++ ){
+                dst[a] = src[7-a];
+            }
+        }
+
+        long long _hash;
+        unsigned _bits; // bits per field, so 1 to 32
+    };
+
+    ostream& operator<<( ostream &s, const GeoHash &h ){
+        s << h.toString();
+        return s;
+    } // end GeoHash
+
+    class Geo2dType : public IndexType {
+    public:
+        Geo2dType( const IndexPlugin * plugin , const IndexSpec* spec )
+            : IndexType( plugin , spec ){
+
+            BSONObjBuilder orderBuilder;
+
+            BSONObjIterator i( spec->keyPattern );
+            while ( i.more() ){
+                BSONElement e = i.next();
+                if ( e.type() == String && GEO2DNAME == e.valuestr() ){
+                    uassert( 13022 , "can't have 2 geo fields" , _geo.size() == 0 );
+                    uassert( 13023 , "2d has to be first in index" , _other.size() == 0 );
+                    _geo = e.fieldName();
+                }
+                else {
+                    _other.push_back( e.fieldName() );
+                }
+                orderBuilder.append( "" , 1 );
+            }
+
+            uassert( 13024 , "no geo field specified" , _geo.size() );
+
+            _bits = _configval( spec , "bits" , 26 ); // for lat/long, ~ 1ft
+
+            uassert( 13028 , "can't have more than 32 bits in geo index" , _bits <= 32 );
+
+            _max = _configval( spec , "max" , 180 );
+            _min = _configval( spec , "min" , -180 );
+
+            _scaling = (1024*1024*1024*4.0)/(_max-_min);
+
+            _order = orderBuilder.obj();
+        }
+
+        int _configval( const IndexSpec* spec , const string& name , int def ){
+            BSONElement e = spec->info[name];
+            if ( e.isNumber() )
+                return e.numberInt();
+            return def;
+        }
+
+        ~Geo2dType(){
+        }
+
+        virtual BSONObj fixKey( const BSONObj& in ) {
+            if ( in.firstElement().type() == BinData )
+                return in;
+
+            BSONObjBuilder b(in.objsize()+16);
+
+            if ( in.firstElement().isABSONObj() )
+                _hash( in.firstElement().embeddedObject() ).append( b , "" );
+            else if ( in.firstElement().type() == String )
+                GeoHash( in.firstElement().valuestr() ).append( b , "" );
+            else if ( in.firstElement().type() == RegEx )
+                GeoHash( in.firstElement().regex() ).append( b , "" );
+            else
+                return in;
+
+            BSONObjIterator i(in);
+            i.next();
+            while ( i.more() )
+                b.append( i.next() );
+            return b.obj();
+        }
+
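+        // Key generation sketch (document and field names are illustrative):
+        // for { loc : { x : 73.01 , y : 41.35 } , cat : "cafe" } indexed with
+        // { loc : "2d" , cat : 1 }, getKeys() below emits a single key of the
+        // form { "" : <8-byte geohash BinData> , "" : "cafe" }.
+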
+        virtual void getKeys( const BSONObj &obj, BSONObjSetDefaultOrder &keys ) const {
+            BSONElement geo = obj.getFieldDotted(_geo.c_str());
+            if ( geo.eoo() )
+                return;
+
+            BSONObjBuilder b(64);
+
+            if ( ! geo.isABSONObj() )
+                return;
+
+            BSONObj embed = geo.embeddedObject();
+            if ( embed.isEmpty() )
+                return;
+
+            _hash( embed ).append( b , "" );
+
+            for ( size_t i=0; i<_other.size(); i++ ){
+                BSONElement e = obj[_other[i]];
+                if ( e.eoo() )
+                    e = _spec->missingField();
+                b.appendAs( e , "" );
+            }
+            keys.insert( b.obj() );
+        }
+
+        GeoHash _tohash( const BSONElement& e ) const {
+            if ( e.isABSONObj() )
+                return _hash( e.embeddedObject() );
+
+            return GeoHash( e , _bits );
+        }
+
+        GeoHash _hash( const BSONObj& o ) const {
+            BSONObjIterator i(o);
+            uassert( 13067 , "geo field is empty" , i.more() );
+            BSONElement x = i.next();
+            uassert( 13068 , "geo field only has 1 element" , i.more() );
+            BSONElement y = i.next();
+
+            uassert( 13026 , "geo values have to be numbers" , x.isNumber() && y.isNumber() );
+
+            return _hash( x.number() , y.number() );
+        }
+
+        GeoHash _hash( double x , double y ) const {
+            return GeoHash( _convert(x), _convert(y) , _bits );
+        }
+
+        BSONObj _unhash( const GeoHash& h ) const {
+            unsigned x , y;
+            h.unhash( x , y );
+            BSONObjBuilder b;
+            b.append( "x" , _unconvert( x ) );
+            b.append( "y" , _unconvert( y ) );
+            return b.obj();
+        }
+
+        unsigned _convert( double in ) const {
+            uassert( 13027 , "point not in range" , in <= _max && in >= _min );
+            in -= _min;
+            assert( in >= 0 );
+            return (unsigned)(in * _scaling);
+        }
+
+        double _unconvert( unsigned in ) const {
+            double x = in;
+            x /= _scaling;
+            x += _min;
+            return x;
+        }
+
+        void _unconvert( const GeoHash& h , double& x , double& y ) const {
+            unsigned a,b;
+            h.unhash(a,b);
+            x = _unconvert( a );
+            y = _unconvert( b );
+        }
+
+        double distance( const GeoHash& a , const GeoHash& b ) const {
+            double ax,ay,bx,by;
+            _unconvert( a , ax , ay );
+            _unconvert( b , bx , by );
+
+            double dx = bx - ax;
+            double dy = by - ay;
+
+            return sqrt( ( dx * dx ) + ( dy * dy ) );
+        }
+
+        double size( const GeoHash& a ) const {
+            GeoHash b = a;
+            b.move( 1 , 1 );
+            return distance( a , b );
+        }
+
+        const IndexDetails* getDetails() const {
+            return _spec->getDetails();
+        }
+
+        virtual auto_ptr<Cursor> newCursor( const BSONObj& query , const BSONObj& order , int numWanted ) const;
+
+        virtual IndexSuitability suitability( const BSONObj& query , const BSONObj& order ) const {
+            BSONElement e = query.getFieldDotted(_geo.c_str());
+            switch ( e.type() ){
+            case Object: {
+                BSONObj sub = e.embeddedObject();
+                switch ( sub.firstElement().getGtLtOp() ){
+                case BSONObj::opNEAR:
+                case BSONObj::opWITHIN:
+                    return OPTIMAL;
+                default:;
+                }
+            }
+            case Array:
+                return HELPFUL;
+            default:
+                return USELESS;
+            }
+        }
+
+        string _geo;
+        vector<string> _other;
+
+        unsigned _bits;
+        int _max;
+        int _min;
+        double _scaling;
+
+        BSONObj _order;
+    };
+
+    class Point {
+    public:
+
+        Point( const Geo2dType * g , const GeoHash& hash ){
+            g->_unconvert( hash , _x , _y );
+        }
+
+        Point( double x , double y )
+            : _x( x ) , _y( y ){
+        }
+
+        Point() : _x(0),_y(0){
+        }
+
+        GeoHash hash( const Geo2dType * g ){
+            return g->_hash( _x , _y );
+        }
+
+        string toString() const {
+            StringBuilder buf(32);
+            buf << "(" << _x << "," << _y << ")";
+            return buf.str();
+        }
+
+        double _x;
+        double _y;
+    };
+
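+    // Conversion sketch (default bounds assumed): with min=-180 and max=180,
+    // _scaling is 2^32/360, so _convert(0) maps the origin to 0x80000000, the
+    // middle of the unsigned range; _unconvert() reverses it, losing only
+    // sub-cell precision.  A Box below is an axis-aligned rectangle; built
+    // from a GeoHash it covers that cell, e.g. Box(5,5,2) spans
+    // "(5,5) -->> (7,7)" (see the unit test further down).
+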
+    class Box {
+    public:
+
+        Box( const Geo2dType * g , const GeoHash& hash )
+            : _min( g , hash ) ,
+              _max( _min._x + g->size( hash ) , _min._y + g->size( hash ) ){
+        }
+
+        Box( double x , double y , double size )
+            : _min( x , y ) ,
+              _max( x + size , y + size ){
+        }
+
+        Box( Point min , Point max )
+            : _min( min ) , _max( max ){
+        }
+
+        Box(){}
+
+        string toString() const {
+            StringBuilder buf(64);
+            buf << _min.toString() << " -->> " << _max.toString();
+            return buf.str();
+        }
+
+        operator string() const {
+            return toString();
+        }
+
+        bool between( double min , double max , double val , double fudge=0) const {
+            return val + fudge >= min && val <= max + fudge;
+        }
+
+        bool mid( double amin , double amax , double bmin , double bmax , bool min , double& res ) const {
+            assert( amin < amax );
+            assert( bmin < bmax );
+
+            if ( amin < bmin ){
+                if ( amax < bmin )
+                    return false;
+                res = min ? bmin : amax;
+                return true;
+            }
+            if ( amin > bmax )
+                return false;
+            res = min ? amin : bmax;
+            return true;
+        }
+
+        double intersects( const Box& other ) const {
+
+            Point boundMin(0,0);
+            Point boundMax(0,0);
+
+            if ( mid( _min._x , _max._x , other._min._x , other._max._x , true , boundMin._x ) == false ||
+                 mid( _min._x , _max._x , other._min._x , other._max._x , false , boundMax._x ) == false ||
+                 mid( _min._y , _max._y , other._min._y , other._max._y , true , boundMin._y ) == false ||
+                 mid( _min._y , _max._y , other._min._y , other._max._y , false , boundMax._y ) == false )
+                return 0;
+
+            Box intersection( boundMin , boundMax );
+
+            return intersection.area() / ( ( area() + other.area() ) / 2 );
+        }
+
+        double area() const {
+            return ( _max._x - _min._x ) * ( _max._y - _min._y );
+        }
+
+        Point center() const {
+            return Point( ( _min._x + _max._x ) / 2 ,
+                          ( _min._y + _max._y ) / 2 );
+        }
+
+        bool inside( Point p , double fudge = 0 ){
+            bool res = inside( p._x , p._y , fudge );
+            //cout << "is : " << p.toString() << " in " << toString() << " = " << res << endl;
+            return res;
+        }
+
+        bool inside( double x , double y , double fudge = 0 ){
+            return
+                between( _min._x , _max._x , x , fudge ) &&
+                between( _min._y , _max._y , y , fudge );
+        }
+
+        Point _min;
+        Point _max;
+    };
+
+    class Geo2dPlugin : public IndexPlugin {
+    public:
+        Geo2dPlugin() : IndexPlugin( GEO2DNAME ){
+        }
+
+        virtual IndexType* generate( const IndexSpec* spec ) const {
+            return new Geo2dType( this , spec );
+        }
+    } geo2dplugin;
+
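+    // Usage sketch (shell syntax, illustrative): once the plugin is
+    // registered, a 2d index is declared like any other index, e.g.
+    //     db.places.ensureIndex( { loc : "2d" } )
+    // optionally passing { bits : 26 , min : -180 , max : 180 } as options.
+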
+    struct GeoUnitTest : public UnitTest {
+
+        int round( double d ){
+            return (int)(.5+(d*1000));
+        }
+
+#define GEOHEQ(a,b) if ( a.toString() != b ){ cout << "[" << a.toString() << "] != [" << b << "]" << endl; assert( a == b ); }
+
+        void run(){
+            assert( ! GeoHash::isBitSet( 0 , 0 ) );
+            assert( ! GeoHash::isBitSet( 0 , 31 ) );
+            assert( GeoHash::isBitSet( 1 , 31 ) );
+
+            IndexSpec i( BSON( "loc" << "2d" ) );
+            Geo2dType g( &geo2dplugin , &i );
+            {
+                double x = 73.01212;
+                double y = 41.352964;
+                BSONObj in = BSON( "x" << x << "y" << y );
+                GeoHash h = g._hash( in );
+                BSONObj out = g._unhash( h );
+                assert( round(x) == round( out["x"].number() ) );
+                assert( round(y) == round( out["y"].number() ) );
+                assert( round( in["x"].number() ) == round( out["x"].number() ) );
+                assert( round( in["y"].number() ) == round( out["y"].number() ) );
+            }
+
+            {
+                double x = -73.01212;
+                double y = 41.352964;
+                BSONObj in = BSON( "x" << x << "y" << y );
+                GeoHash h = g._hash( in );
+                BSONObj out = g._unhash( h );
+                assert( round(x) == round( out["x"].number() ) );
+                assert( round(y) == round( out["y"].number() ) );
+                assert( round( in["x"].number() ) == round( out["x"].number() ) );
+                assert( round( in["y"].number() ) == round( out["y"].number() ) );
+            }
+
+            {
+                GeoHash h( "0000" );
+                h.move( 0 , 1 );
+                GEOHEQ( h , "0001" );
+                h.move( 0 , -1 );
+                GEOHEQ( h , "0000" );
+
+                h.init( "0001" );
+                h.move( 0 , 1 );
+                GEOHEQ( h , "0100" );
+                h.move( 0 , -1 );
+                GEOHEQ( h , "0001" );
+
+                h.init( "0000" );
+                h.move( 1 , 0 );
+                GEOHEQ( h , "0010" );
+            }
+
+            {
+                Box b( 5 , 5 , 2 );
+                assert( "(5,5) -->> (7,7)" == b.toString() );
+            }
+
+            {
+                GeoHash a = g._hash( 1 , 1 );
+                GeoHash b = g._hash( 4 , 5 );
+                assert( 5 == (int)(g.distance( a , b ) ) );
+                a = g._hash( 50 , 50 );
+                b = g._hash( 42 , 44 );
+                assert( round(10) == round(g.distance( a , b )) );
+            }
+
+            {
+                GeoHash x("0000");
+                assert( 0 == x.getHash() );
+                x.init( 0 , 1 , 32 );
+                GEOHEQ( x , "0000000000000000000000000000000000000000000000000000000000000001" )
+
+                assert( GeoHash( "1100").hasPrefix( GeoHash( "11" ) ) );
+                assert( ! GeoHash( "1000").hasPrefix( GeoHash( "11" ) ) );
+            }
+
+            {
+                GeoHash x("1010");
+                GEOHEQ( x , "1010" );
+                GeoHash y = x + "01";
+                GEOHEQ( y , "101001" );
+            }
+
+            {
+                GeoHash a = g._hash( 5 , 5 );
+                GeoHash b = g._hash( 5 , 7 );
+                GeoHash c = g._hash( 100 , 100 );
+                /*
+                cout << "a: " << a << endl;
+                cout << "b: " << b << endl;
+                cout << "c: " << c << endl;
+
+                cout << "a: " << a.toStringHex1() << endl;
+                cout << "b: " << b.toStringHex1() << endl;
+                cout << "c: " << c.toStringHex1() << endl;
+                */
+                BSONObj oa = a.wrap();
+                BSONObj ob = b.wrap();
+                BSONObj oc = c.wrap();
+                /*
+                cout << "a: " << oa.hexDump() << endl;
+                cout << "b: " << ob.hexDump() << endl;
+                cout << "c: " << oc.hexDump() << endl;
+                */
+                assert( oa.woCompare( ob ) < 0 );
+                assert( oa.woCompare( oc ) < 0 );
+            }
+
+            {
+                GeoHash x( "000000" );
+                x.move( -1 , 0 );
+                GEOHEQ( x , "101010" );
+                x.move( 1 , -1 );
+                GEOHEQ( x , "010101" );
+                x.move( 0 , 1 );
+                GEOHEQ( x , "000000" );
+            }
+
+            {
+                GeoHash prefix( "110011000000" );
+                GeoHash entry( "1100110000011100000111000001110000011100000111000001000000000000" );
+                assert( ! entry.hasPrefix( prefix ) );
+
+                entry = "1100110000001100000111000001110000011100000111000001000000000000";
+                assert( entry.toString().find( prefix.toString() ) == 0 );
+                assert( entry.hasPrefix( GeoHash( "1100" ) ) );
+                assert( entry.hasPrefix( prefix ) );
+            }
+
+            {
+                GeoHash a = g._hash( 50 , 50 );
+                GeoHash b = g._hash( 48 , 54 );
+                assert( round( 4.47214 ) == round( g.distance( a , b ) ) );
+            }
+
+            {
+                Box b( Point( 29.762283 , -95.364271 ) , Point( 29.764283000000002 , -95.36227099999999 ) );
+                assert( b.inside( 29.763 , -95.363 ) );
+                assert( ! b.inside( 32.9570255 , -96.1082497 ) );
+                assert( ! b.inside( 32.9570255 , -96.1082497 , .01 ) );
+            }
+
+            {
+                GeoHash a( "11001111" );
+                assert( GeoHash( "11" ) == a.commonPrefix( "11" ) );
+                assert( GeoHash( "11" ) == a.commonPrefix( "11110000" ) );
+            }
+        }
+    } geoUnitTest;
+
+    class GeoPoint {
+    public:
+        GeoPoint(){
+        }
+
+        GeoPoint( const KeyNode& node , double distance )
+            : _key( node.key ) , _loc( node.recordLoc ) , _o( node.recordLoc.obj() ) , _distance( distance ){
+        }
+
+        GeoPoint( const BSONObj& key , DiskLoc loc , double distance )
+            : _key(key) , _loc(loc) , _o( loc.obj() ) , _distance( distance ){
+        }
+
+        bool operator<( const GeoPoint& other ) const {
+            return _distance < other._distance;
+        }
+
+        bool isEmpty() const {
+            return _o.isEmpty();
+        }
+
+        BSONObj _key;
+        DiskLoc _loc;
+        BSONObj _o;
+        double _distance;
+    };
+
+    class GeoAccumulator {
+    public:
+        GeoAccumulator( const Geo2dType * g , const BSONObj& filter )
+            : _g(g) , _lookedAt(0) , _objectsLoaded(0) , _found(0) {
+            if ( ! filter.isEmpty() ){
+                _matcher.reset( new CoveredIndexMatcher( filter , g->keyPattern() ) );
+            }
+        }
+
+        virtual ~GeoAccumulator(){
+        }
+
+        virtual void add( const KeyNode& node ){
+            // when looking at other boxes, don't want to look at some object twice
+            if ( _seen.count( node.recordLoc ) ){
+                GEODEBUG( "\t\t\t\t already seen : " << node.recordLoc.obj()["_id"] );
+                return;
+            }
+            _seen.insert( node.recordLoc );
+            _lookedAt++;
+
+            // distance check
+            double d = 0;
+            if ( ! checkDistance( GeoHash( node.key.firstElement() ) , d ) ){
+                GEODEBUG( "\t\t\t\t bad distance : " << node.recordLoc.obj() << "\t" << d );
+                return;
+            }
+
+            // matcher
+            MatchDetails details;
+            if ( _matcher.get() ){
+                bool good = _matcher->matches( node.key , node.recordLoc , &details );
+                if ( details.loadedObject )
+                    _objectsLoaded++;
+
+                if ( ! good ){
+                    GEODEBUG( "\t\t\t\t didn't match : " << node.recordLoc.obj()["_id"] );
+                    return;
+                }
+            }
+
+            if ( ! details.loadedObject ) // dont double count
+                _objectsLoaded++;
+
+            addSpecific( node , d );
+            _found++;
+        }
+
+        virtual void addSpecific( const KeyNode& node , double d ) = 0;
+        virtual bool checkDistance( const GeoHash& node , double& d ) = 0;
+
+        long long found() const {
+            return _found;
+        }
+
+        const Geo2dType * _g;
+        set<DiskLoc> _seen;
+        auto_ptr<CoveredIndexMatcher> _matcher;
+
+        long long _lookedAt;
+        long long _objectsLoaded;
+        long long _found;
+    };
+
+    class GeoHopper : public GeoAccumulator {
+    public:
+        typedef multiset<GeoPoint> Holder;
+
+        GeoHopper( const Geo2dType * g , unsigned max , const GeoHash& n , const BSONObj& filter = BSONObj() )
+            : GeoAccumulator( g , filter ) , _max( max ) , _near( n ) {
+        }
+
+        virtual bool checkDistance( const GeoHash& h , double& d ){
+            d = _g->distance( _near , h );
+            bool good = _points.size() < _max || d < farthest();
+            GEODEBUG( "\t\t\t\t\t\t\t checkDistance " << _near << "\t" << h << "\t" << d
+                      << " ok: " << good << " farthest: " << farthest() );
+            return good;
+        }
+
+        virtual void addSpecific( const KeyNode& node , double d ){
+            GEODEBUG( "\t\t" << GeoHash( node.key.firstElement() ) << "\t" << node.recordLoc.obj() << "\t" << d );
+            _points.insert( GeoPoint( node.key , node.recordLoc , d ) );
+            if ( _points.size() > _max ){
+                _points.erase( --_points.end() );
+            }
+        }
+
+        double farthest(){
+            if ( _points.size() == 0 )
+                return -1;
+
+            Holder::iterator i = _points.end();
+            i--;
+            return i->_distance;
+        }
+
+        unsigned _max;
+        GeoHash _near;
+        Holder _points;
+    };
+
+    struct BtreeLocation {
+        int pos;
+        bool found;
+        DiskLoc bucket;
+
+        BSONObj key(){
+            if ( bucket.isNull() )
+                return BSONObj();
+            return bucket.btree()->keyNode( pos ).key;
+        }
+
+        bool hasPrefix( const GeoHash& hash ){
+            BSONElement e = key().firstElement();
+            if ( e.eoo() )
+                return false;
+            return GeoHash( e ).hasPrefix( hash );
+        }
+
+        bool advance( int direction , int& totalFound , GeoAccumulator* all ){
+
+            if ( bucket.isNull() )
+                return false;
+            bucket = bucket.btree()->advance( bucket , pos , direction , "btreelocation" );
+
+            return checkCur( totalFound , all );
+        }
+
+        bool checkCur( int& totalFound , GeoAccumulator* all ){
+            if ( bucket.isNull() )
+                return false;
+
+            if ( bucket.btree()->isUsed(pos) ){
+                totalFound++;
+                all->add( bucket.btree()->keyNode( pos ) );
+            }
+            else {
+                GEODEBUG( "\t\t\t\t not used: " << key() );
+            }
+
+            return true;
+        }
+
+        string toString(){
+            stringstream ss;
+            ss << "bucket: " << bucket.toString() << " pos: " << pos << " found: " << found;
+            return ss.str();
+        }
+
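+        // A BtreeLocation is a lightweight cursor position in the geo index;
+        // a min/max pair is walked outward in both key directions while the
+        // keys still share the current prefix, i.e. stay in the current cell.
+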
+        static bool initial( const IndexDetails& id , const Geo2dType * spec ,
+                             BtreeLocation& min , BtreeLocation& max ,
+                             GeoHash start ,
+                             int & found , GeoAccumulator * hopper ){
+
+            min.bucket = id.head.btree()->locate( id , id.head , start.wrap() ,
+                                                  spec->_order , min.pos , min.found , minDiskLoc );
+            min.checkCur( found , hopper );
+            max = min;
+
+            if ( min.bucket.isNull() ){
+                min.bucket = id.head.btree()->locate( id , id.head , start.wrap() ,
+                                                      spec->_order , min.pos , min.found , minDiskLoc , -1 );
+                min.checkCur( found , hopper );
+            }
+
+            return ! min.bucket.isNull() || ! max.bucket.isNull();
+        }
+    };
+
+    class GeoSearch {
+    public:
+        GeoSearch( const Geo2dType * g , const GeoHash& n , int numWanted=100 , BSONObj filter=BSONObj() )
+            : _spec( g ) , _n( n ) , _start( n ) ,
+              _numWanted( numWanted ) , _filter( filter ) ,
+              _hopper( new GeoHopper( g , numWanted , n , filter ) )
+        {
+            assert( g->getDetails() );
+            _nscanned = 0;
+            _found = 0;
+        }
+
+        void exec(){
+            const IndexDetails& id = *_spec->getDetails();
+
+            BtreeBucket * head = id.head.btree();
+            assert( head );
+            /*
+             * Search algorithm
+             * 1) use geohash prefix to find X items
+             * 2) compute max distance from want to an item
+             * 3) find optimal set of boxes that complete circle
+             * 4) use regular btree cursors to scan those boxes
+             */
+
+            GeoHopper * hopper = _hopper.get();
+
+            _prefix = _start;
+            { // 1 regular geo hash algorithm
+
+                BtreeLocation min,max;
+                if ( ! BtreeLocation::initial( id , _spec , min , max , _n , _found , hopper ) )
+                    return;
+
+                while ( _hopper->found() < _numWanted ){
+                    GEODEBUG( _prefix << "\t" << _found << "\t DESC" );
+                    while ( min.hasPrefix( _prefix ) && min.advance( -1 , _found , hopper ) )
+                        _nscanned++;
+                    GEODEBUG( _prefix << "\t" << _found << "\t ASC" );
+                    while ( max.hasPrefix( _prefix ) && max.advance( 1 , _found , hopper ) )
+                        _nscanned++;
+                    if ( ! _prefix.constrains() )
+                        break;
+                    _prefix = _prefix.up();
+                }
+            }
+            GEODEBUG( "done part 1" );
+            if ( _found && _prefix.constrains() ){
+                // 2
+                Point center( _spec , _n );
+                double boxSize = _spec->size( _prefix );
+                double farthest = hopper->farthest();
+                if ( farthest > boxSize )
+                    boxSize = farthest;
+                Box want( center._x - ( boxSize / 2 ) , center._y - ( boxSize / 2 ) , boxSize );
+                while ( _spec->size( _prefix ) < boxSize )
+                    _prefix = _prefix.up();
+                log(1) << "want: " << want << " found:" << _found << " hash size:" << _spec->size( _prefix ) << endl;
+
+                for ( int x=-1; x<=1; x++ ){
+                    for ( int y=-1; y<=1; y++ ){
+                        GeoHash toscan = _prefix;
+                        toscan.move( x , y );
+
+                        // 3 & 4
+                        doBox( id , want , toscan );
+                    }
+                }
+            }
+            GEODEBUG( "done search" )
+        }
+
+        void doBox( const IndexDetails& id , const Box& want , const GeoHash& toscan , int depth = 0 ){
+            Box testBox( _spec , toscan );
+            if ( logLevel > 0 ) log(1) << "\t doBox: " << testBox << "\t" << toscan.toString() << endl;
+
+            double intPer = testBox.intersects( want );
+
+            if ( intPer <= 0 )
+                return;
+
+            if ( intPer < .5 && depth < 3 ){
+                doBox( id , want , toscan + "00" , depth + 1);
+                doBox( id , want , toscan + "01" , depth + 1);
+                doBox( id , want , toscan + "10" , depth + 1);
+                doBox( id , want , toscan + "11" , depth + 1);
+                return;
+            }
+
+            BtreeLocation loc;
+            loc.bucket = id.head.btree()->locate( id , id.head , toscan.wrap() , _spec->_order ,
+                                                  loc.pos , loc.found , minDiskLoc );
+            loc.checkCur( _found , _hopper.get() );
+            while ( loc.hasPrefix( toscan ) && loc.advance( 1 , _found , _hopper.get() ) )
+                _nscanned++;
+        }
+
+        const Geo2dType * _spec;
+
+        GeoHash _n;
+        GeoHash _start;
+        GeoHash _prefix;
+        int _numWanted;
+        BSONObj _filter;
+        shared_ptr<GeoHopper> _hopper;
+
+        long long _nscanned;
+        int _found;
+    };
+
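+    // Expansion sketch: exec() starts at the full-precision cell containing
+    // the query point, scans it in both key directions, then repeatedly calls
+    // _prefix.up() to coarsen the cell until numWanted points are in the
+    // hopper; the surrounding 3x3 block of cells is then scanned (subdividing
+    // via doBox) to pick up neighbors the prefix walk may have missed.
+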
+    class GeoCursorBase : public Cursor {
+    public:
+        GeoCursorBase( const Geo2dType * spec )
+            : _spec( spec ), _id( _spec->getDetails() ){
+        }
+
+        virtual DiskLoc refLoc(){ return DiskLoc(); }
+
+        virtual BSONObj indexKeyPattern() {
+            return _spec->keyPattern();
+        }
+
+        virtual void noteLocation() {
+            assert(0);
+        }
+
+        /* called before query getmore block is iterated */
+        virtual void checkLocation() {
+            assert(0);
+        }
+
+        virtual bool supportGetMore() { return false; }
+
+        virtual bool getsetdup(DiskLoc loc){
+            return false;
+        }
+
+        const Geo2dType * _spec;
+        const IndexDetails * _id;
+    };
+
+    class GeoSearchCursor : public GeoCursorBase {
+    public:
+        GeoSearchCursor( shared_ptr<GeoSearch> s )
+            : GeoCursorBase( s->_spec ) ,
+              _s( s ) , _cur( s->_hopper->_points.begin() ) , _end( s->_hopper->_points.end() ) {
+        }
+
+        virtual ~GeoSearchCursor() {}
+
+        virtual bool ok(){
+            return _cur != _end;
+        }
+
+        virtual Record* _current(){ assert(ok()); return _cur->_loc.rec(); }
+        virtual BSONObj current(){ assert(ok()); return _cur->_o; }
+        virtual DiskLoc currLoc(){ assert(ok()); return _cur->_loc; }
+        virtual bool advance(){ _cur++; return ok(); }
+        virtual BSONObj currKey() const { return _cur->_key; }
+
+        virtual string toString() {
+            return "GeoSearchCursor";
+        }
+
+        virtual BSONObj prettyStartKey() const {
+            return BSON( _s->_spec->_geo << _s->_prefix.toString() );
+        }
+        virtual BSONObj prettyEndKey() const {
+            GeoHash temp = _s->_prefix;
+            temp.move( 1 , 1 );
+            return BSON( _s->_spec->_geo << temp.toString() );
+        }
+
+        shared_ptr<GeoSearch> _s;
+        GeoHopper::Holder::iterator _cur;
+        GeoHopper::Holder::iterator _end;
+    };
+
+    class GeoBrowse : public GeoCursorBase , public GeoAccumulator {
+    public:
+        GeoBrowse( const Geo2dType * g , string type , BSONObj filter = BSONObj() )
+            : GeoCursorBase( g ) , GeoAccumulator( g , filter ) ,
+              _type( type ) , _filter( filter ) , _firstCall(true) {
+        }
+
+        virtual string toString() {
+            return (string)"GeoBrowse-" + _type;
+        }
+
+        virtual bool ok(){
+            if ( _firstCall ){
+                fillStack();
+                _firstCall = false;
+            }
+            if ( ! _cur.isEmpty() || _stack.size() )
+                return true;
+
+            while ( moreToDo() ){
+                fillStack();
+                if ( ! _cur.isEmpty() )
+                    return true;
+            }
+
+            return false;
+        }
+
+        virtual bool advance(){
+            _cur._o = BSONObj();
+
+            if ( _stack.size() ){
+                _cur = _stack.front();
+                _stack.pop_front();
+                return true;
+            }
+
+            if ( ! moreToDo() )
+                return false;
+
+            while ( _cur.isEmpty() && moreToDo() )
+                fillStack();
+            return ! _cur.isEmpty();
+        }
+
+        virtual Record* _current(){ assert(ok()); return _cur._loc.rec(); }
+        virtual BSONObj current(){ assert(ok()); return _cur._o; }
+        virtual DiskLoc currLoc(){ assert(ok()); return _cur._loc; }
+        virtual BSONObj currKey() const { return _cur._key; }
+
+        virtual bool moreToDo() = 0;
+        virtual void fillStack() = 0;
+
+        virtual void addSpecific( const KeyNode& node , double d ){
+            if ( _cur.isEmpty() )
+                _cur = GeoPoint( node , d );
+            else
+                _stack.push_back( GeoPoint( node , d ) );
+        }
+
+        string _type;
+        BSONObj _filter;
+        list<GeoPoint> _stack;
+
+        GeoPoint _cur;
+        bool _firstCall;
+    };
+
+    class GeoCircleBrowse : public GeoBrowse {
+    public:
+
+        enum State {
+            START ,
+            DOING_EXPAND ,
+            DOING_AROUND ,
+            DONE
+        } _state;
+
+        GeoCircleBrowse( const Geo2dType * g , const BSONObj& circle , BSONObj filter = BSONObj() )
+            : GeoBrowse( g , "circle" , filter ){
+
+            uassert( 13060 , "$center needs 2 fields (middle,max distance)" , circle.nFields() == 2 );
+            BSONObjIterator i(circle);
+            _start = g->_tohash( i.next() );
+            _prefix = _start;
+            _maxDistance = i.next().numberDouble();
+            uassert( 13061 , "need a max distance > 0 " , _maxDistance > 0 );
+
+            _state = START;
+            _found = 0;
+
+            ok();
+        }
+
+        virtual bool moreToDo(){
+            return _state != DONE;
+        }
+
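+        // Query shape handled here (illustrative):
+        //     db.places.find( { loc : { $within : { $center : [ [ 50 , 50 ] , 10 ] } } } )
+        // i.e. all points within distance 10 of (50,50), in index units.
+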
+        virtual void fillStack(){
+            if ( _state == START ){
+                if ( ! BtreeLocation::initial( *_id , _spec , _min , _max ,
+                                               _prefix , _found , this ) ){
+                    _state = DONE;
+                    return;
+                }
+                _state = DOING_EXPAND;
+            }
+
+            if ( _state == DOING_EXPAND ){
+                GEODEBUG( "circle prefix [" << _prefix << "]" );
+                while ( _min.hasPrefix( _prefix ) && _min.advance( -1 , _found , this ) );
+                while ( _max.hasPrefix( _prefix ) && _max.advance( 1 , _found , this ) );
+
+                if ( ! _prefix.constrains() ){
+                    GEODEBUG( "\t exhausted the btree" );
+                    _state = DONE;
+                    return;
+                }
+
+                if ( _g->distance( _prefix , _start ) > _maxDistance ){
+                    GEODEBUG( "\tpast circle bounds" );
+                    GeoHash tr = _prefix;
+                    tr.move( 1 , 1 );
+                    if ( _g->distance( tr , _start ) > _maxDistance )
+                        _state = DOING_AROUND;
+                    else
+                        _prefix = _prefix.up();
+                }
+                else
+                    _prefix = _prefix.up();
+                return;
+            }
+
+            if ( _state == DOING_AROUND ){
+                _state = DONE;
+                return;
+            }
+        }
+
+        virtual bool checkDistance( const GeoHash& h , double& d ){
+            d = _g->distance( _start , h );
+            GEODEBUG( "\t " << h << "\t" << d );
+            return d <= ( _maxDistance + .01 );
+        }
+
+        GeoHash _start;
+        double _maxDistance;
+
+        int _found;
+
+        GeoHash _prefix;
+        BtreeLocation _min;
+        BtreeLocation _max;
+    };
+
+    class GeoBoxBrowse : public GeoBrowse {
+    public:
+
+        enum State {
+            START ,
+            DOING_EXPAND ,
+            DONE
+        } _state;
+
+        GeoBoxBrowse( const Geo2dType * g , const BSONObj& box , BSONObj filter = BSONObj() )
+            : GeoBrowse( g , "box" , filter ){
+
+            uassert( 13063 , "$box needs 2 fields (bottomLeft,topRight)" , box.nFields() == 2 );
+            BSONObjIterator i(box);
+            _bl = g->_tohash( i.next() );
+            _tr = g->_tohash( i.next() );
+
+            _want._min = Point( _g , _bl );
+            _want._max = Point( _g , _tr );
+
+            uassert( 13064 , "need an area > 0 " , _want.area() > 0 );
+
+            _state = START;
+            _found = 0;
+
+            Point center = _want.center();
+            _prefix = _g->_hash( center._x , center._y );
+
+            GEODEBUG( "center : " << center.toString() << "\t" << _prefix );
+
+            {
+                GeoHash a(0LL,32);
+                GeoHash b(0LL,32);
+                b.move(1,1);
+                _fudge = _g->distance(a,b);
+            }
+
+            ok();
+        }
+
+        virtual bool moreToDo(){
+            return _state != DONE;
+        }
+
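+        // Query shape handled here (illustrative):
+        //     db.places.find( { loc : { $within : { $box : [ [ 0 , 0 ] , [ 10 , 10 ] ] } } } )
+        // with the corners given as bottom-left then top-right.
+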
+        virtual void fillStack(){
+            if ( _state == START ){
+
+                if ( ! BtreeLocation::initial( *_id , _spec , _min , _max ,
+                                               _prefix , _found , this ) ){
+                    _state = DONE;
+                    return;
+                }
+                _state = DOING_EXPAND;
+            }
+
+            if ( _state == DOING_EXPAND ){
+                int started = _found;
+                while ( started == _found || _state == DONE ){
+                    GEODEBUG( "box prefix [" << _prefix << "]" );
+                    while ( _min.hasPrefix( _prefix ) && _min.advance( -1 , _found , this ) );
+                    while ( _max.hasPrefix( _prefix ) && _max.advance( 1 , _found , this ) );
+
+                    if ( _state == DONE )
+                        return;
+
+                    if ( ! _prefix.constrains() ){
+                        GEODEBUG( "box exhausted" );
+                        _state = DONE;
+                        return;
+                    }
+
+                    Box cur( _g , _prefix );
+                    if ( cur._min._x + _fudge < _want._min._x &&
+                         cur._min._y + _fudge < _want._min._y &&
+                         cur._max._x - _fudge > _want._max._x &&
+                         cur._max._y - _fudge > _want._max._y ){
+
+                        _state = DONE;
+                        GeoHash temp = _prefix.commonPrefix( cur._max.hash( _g ) );
+
+                        GEODEBUG( "box done : " << cur.toString() << " prefix:" << _prefix << " common:" << temp );
+
+                        if ( temp == _prefix )
+                            return;
+                        _prefix = temp;
+                        GEODEBUG( "\t one more loop" );
+                        continue;
+                    }
+                    else {
+                        _prefix = _prefix.up();
+                    }
+                }
+                return;
+            }
+        }
+
+        virtual bool checkDistance( const GeoHash& h , double& d ){
+            bool res = _want.inside( Point( _g , h ) , _fudge );
+            GEODEBUG( "\t want : " << _want.toString()
+                      << " point: " << Point( _g , h ).toString()
+                      << " in : " << res );
+            return res;
+        }
+
+        GeoHash _bl;
+        GeoHash _tr;
+        Box _want;
+
+        int _found;
+
+        GeoHash _prefix;
+        BtreeLocation _min;
+        BtreeLocation _max;
+
+        double _fudge;
+    };
+
+    auto_ptr<Cursor> Geo2dType::newCursor( const BSONObj& query , const BSONObj& order , int numWanted ) const {
+        if ( numWanted < 0 )
+            numWanted = numWanted * -1;
+        else if ( numWanted == 0 )
+            numWanted = 100;
+
+        BSONObjIterator i(query);
+        while ( i.more() ){
+            BSONElement e = i.next();
+
+            if ( _geo != e.fieldName() )
+                continue;
+
+            if ( e.type() != Object )
+                continue;
+
+            switch ( e.embeddedObject().firstElement().getGtLtOp() ){
+            case BSONObj::opNEAR: {
+                e = e.embeddedObject().firstElement();
+                shared_ptr<GeoSearch> s( new GeoSearch( this , _tohash(e) , numWanted , query ) );
+                s->exec();
+                auto_ptr<Cursor> c;
+                c.reset( new GeoSearchCursor( s ) );
+                return c;
+            }
+            case BSONObj::opWITHIN: {
+                e = e.embeddedObject().firstElement();
+                uassert( 13057 , "$within has to take an object or array" , e.isABSONObj() );
+                e = e.embeddedObject().firstElement();
+                string type = e.fieldName();
+                if ( type == "$center" ){
+                    uassert( 13059 , "$center has to take an object or array" , e.isABSONObj() );
+                    auto_ptr<Cursor> c;
+                    c.reset( new GeoCircleBrowse( this , e.embeddedObjectUserCheck() , query ) );
+                    return c;
+                }
+                else if ( type == "$box" ){
+                    uassert( 13065 , "$box has to take an object or array" , e.isABSONObj() );
+                    auto_ptr<Cursor> c;
+                    c.reset( new GeoBoxBrowse( this , e.embeddedObjectUserCheck() , query ) );
+                    return c;
+                }
+                throw UserException( 13058 , (string)"unknown $within type: " + type );
+            }
+            default:
+                break;
+            }
+        }
+
+        throw UserException( 13042 , (string)"missing geo field (" + _geo + ") in : " + query.toString() );
+    }
+
+    // ------
+    // commands
+    // ------
+
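+    // Command usage (illustrative): the geoNear command below is invoked as
+    //     db.runCommand( { geoNear : "places" , near : [ 50 , 50 ] , num : 10 } )
+    // and returns the closest documents with per-result distances plus scan
+    // statistics; "query" and "distanceMultiplier" are optional (see below).
+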
+    class Geo2dFindNearCmd : public Command {
+    public:
+        Geo2dFindNearCmd() : Command( "geoNear" ){}
+        virtual LockType locktype(){ return READ; }
+        bool slaveOk() { return true; }
+        bool slaveOverrideOk() { return true; }
+        bool run(const char * stupidns, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl){
+            string ns = nsToDatabase( stupidns ) + "." + cmdObj.firstElement().valuestr();
+
+            NamespaceDetails * d = nsdetails( ns.c_str() );
+            if ( ! d ){
+                errmsg = "can't find ns";
+                return false;
+            }
+
+            int geoIdx = -1;
+            {
+                NamespaceDetails::IndexIterator ii = d->ii();
+                while ( ii.more() ){
+                    IndexDetails& id = ii.next();
+                    if ( id.getSpec().getTypeName() == GEO2DNAME ){
+                        if ( geoIdx >= 0 ){
+                            errmsg = "2 geo indexes :(";
+                            return false;
+                        }
+                        geoIdx = ii.pos() - 1;
+                    }
+                }
+            }
+
+            if ( geoIdx < 0 ){
+                errmsg = "no geo index :(";
+                return false;
+            }
+
+            result.append( "ns" , ns );
+
+            IndexDetails& id = d->idx( geoIdx );
+            Geo2dType * g = (Geo2dType*)id.getSpec().getType();
+            assert( &id == g->getDetails() );
+
+            int numWanted = 100;
+            if ( cmdObj["num"].isNumber() )
+                numWanted = cmdObj["num"].numberInt();
+
+            uassert(13046, "'near' param missing/invalid", !cmdObj["near"].eoo());
+            const GeoHash n = g->_tohash( cmdObj["near"] );
+            result.append( "near" , n.toString() );
+
+            BSONObj filter;
+            if ( cmdObj["query"].type() == Object )
+                filter = cmdObj["query"].embeddedObject();
+
+            GeoSearch gs( g , n , numWanted , filter );
+
+            if ( cmdObj["start"].type() == String){
+                GeoHash start = (string) cmdObj["start"].valuestr();
+                gs._start = start;
+            }
+
+            gs.exec();
+
+            double distanceMultiplier = 1;
+            if ( cmdObj["distanceMultiplier"].isNumber() )
+                distanceMultiplier = cmdObj["distanceMultiplier"].number();
+
+            double totalDistance = 0;
+
+            BSONObjBuilder arr( result.subarrayStart( "results" ) );
+            int x = 0;
+            for ( GeoHopper::Holder::iterator i=gs._hopper->_points.begin(); i!=gs._hopper->_points.end(); i++ ){
+                const GeoPoint& p = *i;
+
+                double dis = distanceMultiplier * p._distance;
+                totalDistance += dis;
+
+                BSONObjBuilder bb( arr.subobjStart( BSONObjBuilder::numStr( x++ ).c_str() ) );
+                bb.append( "dis" , dis );
+                bb.append( "obj" , p._o );
+                bb.done();
+            }
+            arr.done();
+
+            BSONObjBuilder stats( result.subobjStart( "stats" ) );
+            stats.append( "time" , cc().curop()->elapsedMillis() );
+            stats.appendNumber( "btreelocs" , gs._nscanned );
+            stats.appendNumber( "nscanned" , gs._hopper->_lookedAt );
+            stats.appendNumber( "objectsLoaded" , gs._hopper->_objectsLoaded );
+            stats.append( "avgDistance" , totalDistance / x );
+            stats.done();
+
+            return true;
+        }
+
+    } geo2dFindNearCmd;
+
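+    // geoWalk below is a diagnostic command: it walks up to 100000 entries of
+    // the collection's single 2d index and dumps each key's hash, stored
+    // point and _id to stdout, which is useful when debugging key ordering.
+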
+    class GeoWalkCmd : public Command {
+    public:
+        GeoWalkCmd() : Command( "geoWalk" ){}
+        virtual LockType locktype(){ return READ; }
+        bool slaveOk() { return true; }
+        bool slaveOverrideOk() { return true; }
+        bool run(const char * stupidns, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl){
+            string ns = nsToDatabase( stupidns ) + "." + cmdObj.firstElement().valuestr();
+
+            NamespaceDetails * d = nsdetails( ns.c_str() );
+            if ( ! d ){
+                errmsg = "can't find ns";
+                return false;
+            }
+
+            int geoIdx = -1;
+            {
+                NamespaceDetails::IndexIterator ii = d->ii();
+                while ( ii.more() ){
+                    IndexDetails& id = ii.next();
+                    if ( id.getSpec().getTypeName() == GEO2DNAME ){
+                        if ( geoIdx >= 0 ){
+                            errmsg = "2 geo indexes :(";
+                            return false;
+                        }
+                        geoIdx = ii.pos() - 1;
+                    }
+                }
+            }
+
+            if ( geoIdx < 0 ){
+                errmsg = "no geo index :(";
+                return false;
+            }
+
+            IndexDetails& id = d->idx( geoIdx );
+            Geo2dType * g = (Geo2dType*)id.getSpec().getType();
+            assert( &id == g->getDetails() );
+
+            int max = 100000;
+
+            BtreeCursor c( d , geoIdx , id , BSONObj() , BSONObj() , true , 1 );
+            while ( c.ok() && max-- ){
+                GeoHash h( c.currKey().firstElement() );
+                int len;
+                cout << "\t" << h.toString()
+                     << "\t" << c.current()[g->_geo]
+                     << "\t" << hex << h.getHash()
+                     << "\t" << hex << ((long long*)c.currKey().firstElement().binData(len))[0]
+                     << "\t" << c.current()["_id"]
+                     << endl;
+                c.advance();
+            }
+
+            return true;
+        }
+
+    } geoWalkCmd;
+
+}
diff --git a/db/instance.cpp b/db/instance.cpp
index e8515c4..909911e 100644
--- a/db/instance.cpp
+++ b/db/instance.cpp
@@ -35,7 +35,8 @@
 #if !defined(_WIN32)
 #include <sys/file.h>
 #endif
-#include "dbstats.h"
+#include "stats/counters.h"
+#include "background.h"
 
 namespace mongo {
 
@@ -45,19 +46,9 @@ namespace mongo {
     void receivedInsert(Message& m, CurOp& op);
     bool receivedGetMore(DbResponse& dbresponse, Message& m, CurOp& curop );
 
-    CmdLine cmdLine;
-
     int nloggedsome = 0;
 #define LOGSOME if( ++nloggedsome < 1000 || nloggedsome % 100 == 0 )
 
-    SlaveTypes slave = NotSlave;
-    bool master = false; // true means keep an op log
-    bool autoresync = false;
-
-    /* we use new here so we don't have to worry about destructor orders at program shutdown */
-    MongoMutex &dbMutex( *(new MongoMutex) );
-// MutexInfo dbMutexInfo;
-
     string dbExecCommand;
 
     string bind_ip = "";
@@ -66,8 +57,6 @@ namespace mongo {
 
     DiagLog _diaglog;
 
-    int opIdMem = 100000000;
-
     bool useCursors = true;
     bool useHints = true;
 
@@ -87,25 +76,30 @@ namespace mongo {
     // see FSyncCommand:
     unsigned lockedForWriting;
-    boost::mutex lockedForWritingMutex;
+    mongo::mutex lockedForWritingMutex;
     bool unlockRequested = false;
 
     void inProgCmd( Message &m, DbResponse &dbresponse ) {
         BSONObjBuilder b;
-        AuthenticationInfo *ai = cc().ai;
-        if( !ai->isAuthorized("admin") ) {
+
+        if( ! cc().isAdmin() ){
            BSONObjBuilder b;
            b.append("err", "unauthorized");
        }
        else {
+            DbMessage d(m);
+            QueryMessage q(d);
+            bool all = q.query["$all"].trueValue();
            vector<BSONObj> vals;
            {
-                boostlock bl(Client::clientsMutex);
+                Client& me = cc();
+                scoped_lock bl(Client::clientsMutex);
                for( set<Client*>::iterator i = Client::clients.begin(); i != Client::clients.end(); i++ ) {
                    Client *c = *i;
+                    if ( c == &me )
+                        continue;
                    CurOp& co = *(c->curop());
-                    if( co.active() )
+                    if( all || co.active() )
                        vals.push_back( co.infoNoauth() );
                }
            }
@@ -116,14 +110,13 @@ namespace mongo {
                b.append("info", "use command {unlock:0} to terminate the fsync write/snapshot lock");
            }
        }
-
+
        replyToQuery(0, m, dbresponse, b.obj());
    }
 
    void killOp( Message &m, DbResponse &dbresponse ) {
        BSONObj obj;
-        AuthenticationInfo *ai = currentClient.get()->ai;
-        if( !ai->isAuthorized("admin") ) {
+        if( ! cc().isAdmin() ){
            obj = fromjson("{\"err\":\"unauthorized\"}");
        }
        /*else if( !dbMutexInfo.isLocked() )
@@ -146,8 +139,7 @@ namespace mongo {
 
    void unlockFsync(const char *ns, Message& m, DbResponse &dbresponse) {
        BSONObj obj;
-        AuthenticationInfo *ai = currentClient.get()->ai;
-        if( !ai->isAuthorized("admin") || strncmp(ns, "admin.", 6) != 0 ) {
+        if( ! cc().isAdmin() || strncmp(ns, "admin.", 6) != 0 ) {
            obj = fromjson("{\"err\":\"unauthorized\"}");
        }
        else {
@@ -163,10 +155,7 @@ namespace mongo {
        replyToQuery(0, m, dbresponse, obj);
    }
 
-    static bool receivedQuery(DbResponse& dbresponse, Message& m,
-                              CurOp& op, bool logit,
-                              mongolock& lock
-                              ) {
+    static bool receivedQuery(Client& c, DbResponse& dbresponse, Message& m ){
        bool ok = true;
        MSGID responseTo = m.data->id;
 
@@ -174,26 +163,9 @@ namespace mongo {
        QueryMessage q(d);
        QueryResult* msgdata;
-        Client& c = cc();
-
+        CurOp& op = *(c.curop());
+
        try {
-            if (q.fields.get() && q.fields->errmsg)
-                uassert( 10053 , q.fields->errmsg, false);
-
-            /* note these are logged BEFORE authentication -- which is sort of ok */
-            if ( _diaglog.level && logit ) {
-                if ( strstr(q.ns, ".$cmd") ) {
-                    /* $cmd queries are "commands" and usually best treated as write operations */
-                    OPWRITE;
-                }
-                else {
-                    OPREAD;
-                }
-            }
-
-            setClient( q.ns, dbpath, &lock );
-            c.top.setRead();
-            c.curop()->setNS(q.ns);
            msgdata = runQuery(m, q, op ).release();
        }
        catch ( AssertionException& e ) {
@@ -230,32 +202,25 @@ namespace mongo {
        resp->setData(msgdata, true); // transport will free
        dbresponse.response = resp;
        dbresponse.responseTo = responseTo;
-        Database *database = c.database();
-        if ( database ) {
-            if ( database->profile )
-                op.debug().str << " bytes:" << resp->data->dataLen();
-        }
-        else {
-            if ( strstr(q.ns, "$cmd") == 0 ) // (this condition is normal for $cmd dropDatabase)
-                log() << "ERROR: receiveQuery: database is null; ns=" << q.ns << endl;
+
+        if ( op.shouldDBProfile( 0 ) ){
+            op.debug().str << " bytes:" << resp->data->dataLen();
        }
 
        return ok;
    }
 
-    bool commandIsReadOnly(BSONObj& _cmdobj);
-
    // Returns false when request includes 'end'
    bool assembleResponse( Message &m, DbResponse &dbresponse, const sockaddr_in &client ) {
-        bool writeLock = true;
-
        // before we lock...
        int op = m.data->operation();
-        globalOpCounters.gotOp( op );
+        bool isCommand = false;
        const char *ns = m.data->_data + 4;
        if ( op == dbQuery ) {
            if( strstr(ns, ".$cmd") ) {
+                isCommand = true;
+                OPWRITE;
                if( strstr(ns, ".$cmd.sys.") ) {
                    if( strstr(ns, "$cmd.sys.inprog") ) {
                        inProgCmd(m, dbresponse);
@@ -270,17 +235,21 @@ namespace mongo {
                        return true;
                    }
                }
-                DbMessage d( m );
-                QueryMessage q( d );
-                writeLock = !commandIsReadOnly(q.query);
+
+            }
+            else {
+                OPREAD;
            }
-        else
-            writeLock = false;
        }
        else if( op == dbGetMore ) {
-            writeLock = false;
+            OPREAD;
+        }
+        else {
+            OPWRITE;
        }
+
+        globalOpCounters.gotOp( op , isCommand );
+
        if ( handlePossibleShardedMessage( m , dbresponse ) ){
            /* important to do this before we lock so if a message has to be forwarded, doesn't block for that
@@ -289,161 +258,115 @@ namespace mongo {
        }
 
        Client& c = cc();
-        c.clearns();
        auto_ptr<CurOp> nestedOp;
        CurOp* currentOpP = c.curop();
        if ( currentOpP->active() ){
-            nestedOp.reset( new CurOp() );
+            nestedOp.reset( new CurOp( &c , currentOpP ) );
            currentOpP = nestedOp.get();
        }
        CurOp& currentOp = *currentOpP;
-        currentOp.reset(client);
-        currentOp.setOp(op);
+        currentOp.reset(client,op);
        OpDebug& debug = currentOp.debug();
        StringBuilder& ss = debug.str;
+        ss << opToString( op ) << " ";
 
        int logThreshold = cmdLine.slowMS;
        bool log = logLevel >= 1;
-
-        Timer t( currentOp.startTime() );
-
-        mongolock lk(writeLock);
-
-#if 0
-        /* use this if you only want to process operations for a particular namespace.
-           maybe add to cmd line parms or something fancier.
-        */
-        DbMessage ddd(m);
-        if ( strncmp(ddd.getns(), "clusterstock", 12) != 0 ) {
-            static int q;
-            if ( ++q < 20 )
-                out() << "TEMP skip " << ddd.getns() << endl;
-            goto skip;
-        }
-#endif
-
+
        if ( op == dbQuery ) {
-            // receivedQuery() does its own authorization processing.
-            if ( ! receivedQuery(dbresponse, m, currentOp, true, lk) )
+            if ( ! receivedQuery(c , dbresponse, m ) )
                log = true;
        }
        else if ( op == dbGetMore ) {
-            // does its own authorization processing.
-            OPREAD;
            DEV log = true;
-            ss << "getmore ";
            if ( ! receivedGetMore(dbresponse, m, currentOp) )
                log = true;
        }
        else if ( op == dbMsg ) {
-            /* deprecated / rarely used.  intended for connection diagnostics. */
-            ss << "msg ";
+            // deprecated - replaced by commands
            char *p = m.data->_data;
            int len = strlen(p);
            if ( len > 400 )
                out() << curTimeMillis() % 10000 <<
-                " long msg received, len:" << len <<
-                " ends with: " << p + len - 10 << endl;
-            bool end = false; //strcmp("end", p) == 0;
+                    " long msg received, len:" << len <<
+                    " ends with: " << p + len - 10 << endl;
+
            Message *resp = new Message();
-            resp->setData(opReply, "i am fine");
+            if ( strcmp( "end" , p ) == 0 )
+                resp->setData( opReply , "dbMsg end no longer supported" );
+            else
+                resp->setData( opReply , "i am fine - dbMsg deprecated");
+
            dbresponse.response = resp;
            dbresponse.responseTo = m.data->id;
-            //dbMsgPort.reply(m, resp);
-            if ( end )
-                return false;
        }
        else {
            const char *ns = m.data->_data + 4;
            char cl[256];
            nsToDatabase(ns, cl);
-            currentOp.setNS(ns);
-            AuthenticationInfo *ai = currentClient.get()->ai;
-            if( !ai->isAuthorized(cl) ) {
+            if( ! c.getAuthenticationInfo()->isAuthorized(cl) ) {
                uassert_nothrow("unauthorized");
            }
-            else if ( op == dbInsert ) {
-                OPWRITE;
-                try {
-                    ss << "insert ";
-                    receivedInsert(m, currentOp);
-                }
-                catch ( AssertionException& e ) {
-                    LOGSOME problem() << " Caught Assertion insert, continuing\n";
-                    ss << " exception " << e.toString();
-                    log = true;
-                }
-            }
-            else if ( op == dbUpdate ) {
-                OPWRITE;
-                try {
-                    ss << "update ";
-                    receivedUpdate(m, currentOp);
-                }
-                catch ( AssertionException& e ) {
-                    LOGSOME problem() << " Caught Assertion update, continuing" << endl;
-                    ss << " exception " << e.toString();
-                    log = true;
-                }
-            }
-            else if ( op == dbDelete ) {
-                OPWRITE;
-                try {
-                    ss << "remove ";
-                    receivedDelete(m, currentOp);
-                }
-                catch ( AssertionException& e ) {
-                    LOGSOME problem() << " Caught Assertion receivedDelete, continuing" << endl;
-                    ss << " exception " << e.toString();
-                    log = true;
-                }
-            }
-            else if ( op == dbKillCursors ) {
-                OPREAD;
+            else {
                try {
-                    logThreshold = 10;
-                    ss << "killcursors ";
-                    receivedKillCursors(m);
+                    if ( op == dbInsert ) {
+                        receivedInsert(m, currentOp);
+                    }
+                    else if ( op == dbUpdate ) {
+                        receivedUpdate(m, currentOp);
+                    }
+                    else if ( op == dbDelete ) {
+                        receivedDelete(m, currentOp);
+                    }
+                    else if ( op == dbKillCursors ) {
+                        currentOp.ensureStarted();
+                        logThreshold = 10;
+                        ss << "killcursors ";
+                        receivedKillCursors(m);
+                    }
+                    else {
+                        out() << "    operation isn't supported: " << op << endl;
+                        currentOp.done();
+                        log = true;
+                    }
                }
                catch ( AssertionException& e ) {
-                    problem() << " Caught Assertion in kill cursors, continuing" << endl;
+                    problem() << " Caught Assertion in " << opToString(op) << " , continuing" << endl;
                    ss << " exception " + e.toString();
                    log = true;
                }
            }
-            else {
-                out() << "    operation isn't supported: " << op << endl;
-                currentOp.setActive(false);
-                assert(false);
-            }
        }
-        int ms = t.millis();
+        currentOp.ensureStarted();
+        currentOp.done();
+        int ms = currentOp.totalTimeMillis();
+
+        log = log || (logLevel >= 2 && ++ctr % 512 == 0);
        DEV log = true;
        if ( log || ms > logThreshold ) {
            ss << ' ' << ms << "ms";
            mongo::log() << ss.str() << endl;
        }
-        Database *database = c.database();
-        if ( database && database->profile >= 1 ) {
-            if ( database->profile >= 2 || ms >= cmdLine.slowMS ) {
-                // performance profiling is on
-                if ( dbMutex.getState() > 1 || dbMutex.getState() < -1 ){
-                    out() << "warning: not profiling because recursive lock" << endl;
+
+        if ( currentOp.shouldDBProfile( ms ) ){
+            // performance profiling is on
+            if ( dbMutex.getState() < 0 ){
+                mongo::log(1) << "note: not profiling because recursive read lock" << endl;
+            }
+            else {
+                mongolock lk(true);
+                if ( dbHolder.isLoaded( nsToDatabase( currentOp.getNS() ) , dbpath ) ){
+                    Client::Context c( currentOp.getNS() );
+                    profile(ss.str().c_str(), ms);
                }
                else {
-                    string old_ns = c.ns();
-                    Database * old_db = c.database();
-                    lk.releaseAndWriteLock();
-                    Client::Context c( old_ns , old_db );
-                    profile(ss.str().c_str(), ms);
+                    mongo::log() << "note: not profiling because db went away - probably a close on: " << currentOp.getNS() << endl;
                }
            }
        }
-        currentOp.setActive(false);
 
        return true;
    } /* assembleResponse() */
 
@@ -452,7 +375,7 @@ namespace mongo {
        int *x = (int *) m.data->_data;
        x++; // reserved
        int n = *x++;
-        assert( n >= 1 );
+        uassert( 13004 , "sent 0 cursors to kill" , n >= 1 );
        if ( n > 2000 ) {
            problem() << "Assertion failure, receivedKillCursors, n=" << n << endl;
            assert( n < 30000 );
@@ -460,29 +383,34 @@ namespace mongo {
        killCursors(n, (long long *) x);
    }
 
-    /* cl - database name
+    /* db - database name
       path - db directory
    */
-    void closeDatabase( const char *cl, const string& path ) {
-        Database *database = cc().database();
-        assert( database );
-        assert( database->name == cl );
-        /*
-        if ( string("local") != cl ) {
-            DBInfo i(cl);
-            i.dbDropped();
-        }*/
+    void closeDatabase( const char *db, const string& path ) {
+        assertInWriteLock();
+
+        Client::Context * ctx = cc().getContext();
+        assert( ctx );
+        assert( ctx->inDB( db , path ) );
+        Database *database = ctx->db();
+        assert( database->name == db );
+
+        replCheckCloseDatabase( database );
+
+        if( BackgroundOperation::inProgForDb(db) ) {
+            log() << "warning: bg op in prog during close db? " << db << endl;
+        }
 
        /* important: kill all open cursors on the database */
-        string prefix(cl);
+        string prefix(db);
        prefix += '.';
        ClientCursor::invalidate(prefix.c_str());
        NamespaceDetailsTransient::clearForPrefix( prefix.c_str() );
 
-        dbHolder.erase( cl, path );
+        dbHolder.erase( db, path );
        delete database; // closes files
-        cc().clearns();
+        ctx->clear();
    }
 
    void receivedUpdate(Message& m, CurOp& op) {
@@ -490,9 +418,6 @@ namespace mongo {
        const char *ns = d.getns();
        assert(*ns);
        uassert( 10054 , "not master", isMasterNs( ns ) );
-        setClient(ns);
-        Client& client = cc();
-        client.top.setWrite();
        op.debug().str << ns << ' ';
        int flags = d.pullInt();
        BSONObj query = d.nextJsObj();
@@ -507,13 +432,18 @@ namespace mongo {
        bool multi = flags & UpdateOption_Multi;
        {
            string s = query.toString();
-            /* todo: we shouldn't do all this ss stuff when we don't need it, it will slow us down. */
+            /* todo: we shouldn't do all this ss stuff when we don't need it, it will slow us down.
+               instead, let's just store the query BSON in the debug object, and it can toString()
+               lazily
+            */
            op.debug().str << " query: " << s;
-            CurOp& currentOp = *client.curop();
-            currentOp.setQuery(query);
+            op.setQuery(query);
        }
+
+        mongolock lk(1);
+        Client::Context ctx( ns );
+
        UpdateResult res = updateObjects(ns, toupdate, query, upsert, multi, true, op.debug() );
-        /* TODO FIX: recordUpdate should take a long int for parm #2 */
        recordUpdate( res.existing , (int) res.num ); // for getlasterror
    }
 
@@ -522,9 +452,6 @@ namespace mongo {
        const char *ns = d.getns();
        assert(*ns);
        uassert( 10056 , "not master", isMasterNs( ns ) );
-        setClient(ns);
-        Client& client = cc();
-        client.top.setWrite();
        int flags = d.pullInt();
        bool justOne = flags & 1;
        assert( d.moreJSObjs() );
        BSONObj pattern = d.nextJsObj();
        {
            string s = pattern.toString();
            op.debug().str << " query: " << s;
-            CurOp& currentOp = *client.curop();
-            currentOp.setQuery(pattern);
+            op.setQuery(pattern);
        }
-        int n = deleteObjects(ns, pattern, justOne, true);
-        recordDelete( n );
+
+        writelock lk(ns);
+        Client::Context ctx(ns);
+
+        long long n = deleteObjects(ns, pattern, justOne, true);
+        recordDelete( (int) n );
    }
 
    QueryResult* emptyMoreResult(long long);
 
    bool receivedGetMore(DbResponse& dbresponse, Message& m, CurOp& curop ) {
+        StringBuilder& ss = curop.debug().str;
        bool ok = true;
+
        DbMessage d(m);
+
        const char *ns = d.getns();
-        StringBuilder& ss = curop.debug().str;
-        ss << ns;
-        setClient(ns);
-        cc().top.setRead();
        int ntoreturn = d.pullInt();
        long long cursorid = d.pullInt64();
-        ss << " cid:" << cursorid;
-        ss << " ntoreturn:" << ntoreturn;
+
+        ss << ns << " cid:" << cursorid << " ntoreturn:" << ntoreturn;
+
        QueryResult* msgdata;
        try {
-            AuthenticationInfo *ai = currentClient.get()->ai;
-            uassert( 10057 , "unauthorized", ai->isAuthorized(cc().database()->name.c_str()));
+            mongolock lk(false);
+            Client::Context ctx(ns);
            msgdata = getMore(ns, ntoreturn, cursorid, curop);
        }
        catch ( AssertionException& e ) {
-            ss << " exception " + e.toString();
+            ss << " exception " << e.toString();
            msgdata = emptyMoreResult(cursorid);
            ok = false;
        }
@@ -570,7 +500,7 @@ namespace mongo {
        ss << " nreturned:" << msgdata->nReturned;
        dbresponse.response = resp;
        dbresponse.responseTo = m.data->id;
-        //dbMsgPort.reply(m, resp);
+
        return ok;
    }
 
@@ -579,10 +509,10 @@ namespace mongo {
        const char *ns = d.getns();
        assert(*ns);
        uassert( 10058 , "not master", isMasterNs( ns ) );
-        setClient(ns);
-        cc().top.setWrite();
        op.debug().str << ns;
-
+
+        writelock lk(ns);
+        Client::Context ctx(ns);
+
        while ( d.moreJSObjs() ) {
            BSONObj js = d.nextJsObj();
            uassert( 10059 , "object to insert too large", js.objsize() <= MaxBSONObjectSize);
@@ -610,14 +540,21 @@ namespace mongo {
        boost::filesystem::path path( dbpath );
        for ( boost::filesystem::directory_iterator i( path );
              i != boost::filesystem::directory_iterator(); ++i ) {
-            string fileName = boost::filesystem::path(*i).leaf();
-            if ( fileName.length() > 3 && fileName.substr( fileName.length() - 3, 3 ) == ".ns" )
-                names.push_back( fileName.substr( 0, fileName.length() - 3 ) );
+            if ( directoryperdb ) {
+                boost::filesystem::path p = *i;
+                string dbName = p.leaf();
+                p /= ( dbName + ".ns" );
+                if ( boost::filesystem::exists( p ) )
+                    names.push_back( dbName );
+            } else {
+                string fileName = boost::filesystem::path(*i).leaf();
+                if ( fileName.length() > 3 && fileName.substr( fileName.length() - 3, 3 ) == ".ns" )
+                    names.push_back( fileName.substr( 0, fileName.length() - 3 ) );
+            }
        }
    }
 
    bool DBDirectClient::call( Message &toSend, Message &response, bool assertOk ) {
-        SavedContext c;
        if ( lastError._get() )
            lastError.startRequest( toSend, lastError._get() );
        DbResponse dbResponse;
@@ -628,7 +565,6 @@ namespace mongo {
    }
 
    void DBDirectClient::say( Message &toSend ) {
-        SavedContext c;
        if ( lastError._get() )
            lastError.startRequest( toSend, lastError._get() );
        DbResponse dbResponse;
@@ -646,15 +582,13 @@ namespace mongo {
    }
 
-    DBDirectClient::AlwaysAuthorized DBDirectClient::SavedContext::always;
-
    DBClientBase * createDirectClient(){
        return new DBDirectClient();
    }
 
    void recCacheCloseAll();
 
-    boost::mutex &exitMutex( *( new boost::mutex ) );
+    mongo::mutex exitMutex;
    int numExitCalls = 0;
    void shutdown();
 
@@ -680,8 +614,9 @@ namespace mongo {
    /* not using log() herein in case we are already locked */
    void dbexit( ExitCode rc, const char *why) {
+        Client * c = currentClient.get();
        {
-            boostlock lk( exitMutex );
+            scoped_lock lk( exitMutex );
            if ( numExitCalls++ > 0 ) {
                if ( numExitCalls > 5 ){
                    // this means something horrible has happened
@@ -690,6 +625,7 @@ namespace mongo {
                stringstream ss;
                ss << "dbexit: " << why << "; exiting immediately" << endl;
                tryToOutputFatal( ss.str() );
+                if ( c ) c->shutdown();
                ::exit( rc );
            }
        }
@@ -706,12 +642,12 @@ namespace mongo {
        }
 
        tryToOutputFatal( "dbexit: really exiting now\n" );
+        if ( c ) c->shutdown();
        ::exit(rc);
    }
 
    void shutdown() {
-
        log() << "\t shutdown: going to close listening sockets..." << endl;
 
        ListeningSockets::get()->closeAll();
 
@@ -751,10 +687,29 @@ namespace mongo {
    void acquirePathLock() {
#if !defined(_WIN32) && !defined(__sunos__)
        string name = ( boost::filesystem::path( dbpath ) / "mongod.lock" ).native_file_string();
-        lockFile = open( name.c_str(), O_RDWR | O_CREAT | O_TRUNC, S_IRWXU | S_IRWXG | S_IRWXO );
-        massert( 10309 , "Unable to create / open lock file for dbpath: " + name, lockFile > 0 );
-        massert( 10310 , "Unable to acquire lock for dbpath: " + name, flock( lockFile, LOCK_EX | LOCK_NB ) == 0 );
+
+        bool oldFile = false;
+
+        if ( boost::filesystem::exists( name ) && boost::filesystem::file_size( name ) > 0 ){
+            oldFile = true;
+        }
+
+        lockFile = open( name.c_str(), O_RDWR | O_CREAT | O_TRUNC, S_IRWXU | S_IRWXG | S_IRWXO );
+        uassert( 10309 , "Unable to create / open lock file for dbpath: " + name, lockFile > 0 );
+        uassert( 10310 , "Unable to acquire lock for dbpath: " + name, flock( lockFile, LOCK_EX | LOCK_NB ) == 0 );
+
+        if ( oldFile ){
+            // we check this here because we want to see if we can get the lock
+            // if we can't, then its probably just another mongod running
+            cout << "************** \n"
+                 << "old lock file: " << name << ".  probably means unclean shutdown\n"
+                 << "recommend removing file and running --repair\n"
+                 << "see: http://dochub.mongodb.org/core/repair for more information\n"
+                 << "*************" << endl;
+            uassert( 12596 , "old lock file" , 0 );
+        }
+
        stringstream ss;
        ss << getpid() << endl;
        string s = ss.str();
diff --git a/db/instance.h b/db/instance.h
index b2b2c94..b545a78 100644
--- a/db/instance.h
+++ b/db/instance.h
@@ -38,12 +38,14 @@ namespace mongo {
           7 = log a few reads, and all writes.
        */
        int level;
+        mongo::mutex mutex;
+
        DiagLog() : f(0) , level(0) { }
        void init() {
            if ( ! f && level ){
                log() << "diagLogging = " << level << endl;
                stringstream ss;
-                ss << "diaglog." << hex << time(0);
+                ss << dbpath << "/diaglog." << hex << time(0);
                string name = ss.str();
                f = new ofstream(name.c_str(), ios::out | ios::binary);
                if ( ! f->good() ) {
@@ -62,17 +64,26 @@ namespace mongo {
            return old;
        }
        void flush() {
-            if ( level ) f->flush();
+            if ( level ){
+                scoped_lock lk(mutex);
+                f->flush();
+            }
        }
        void write(char *data,int len) {
-            if ( level & 1 ) f->write(data,len);
+            if ( level & 1 ){
+                scoped_lock lk(mutex);
+                f->write(data,len);
+            }
        }
        void readop(char *data, int len) {
            if ( level & 2 ) {
                bool log = (level & 4) == 0;
                OCCASIONALLY log = true;
-                if ( log )
+                if ( log ){
+                    scoped_lock lk(mutex);
+                    assert( f );
                    f->write(data,len);
+                }
            }
        }
    };
@@ -124,53 +135,6 @@ namespace mongo {
            // don't need to piggy back when connected locally
            return say( toSend );
        }
-        class AlwaysAuthorized : public AuthenticationInfo {
-            virtual bool isAuthorized( const char *dbname ) {
-                return true;
-            }
-        };
-
-        /* TODO: this looks bad that auth is set to always.  is that really always safe? */
-        class SavedContext {
-        public:
-            SavedContext() {
-                _save = dbMutex.atLeastReadLocked();
-
-                Client *c = currentClient.get();
-                oldAuth = c->ai;
-                // careful, don't want to free this:
-                c->ai = &always;
-
-                /* it only makes sense to manipulate a pointer - c->database() - if locked.
-                   thus the _saved flag.
-                */
-                if( _save ) {
-                    if ( c->database() ) {
-                        dbMutex.assertAtLeastReadLocked();
-                        _oldName = c->database()->name;
-                    }
-                }
-            }
-            ~SavedContext() {
-                Client *c = currentClient.get();
-                c->ai = oldAuth;
-                if( _save ) {
-                    if ( !_oldName.empty() ) {
-                        dbMutex.assertAtLeastReadLocked();
-                        setClient( _oldName.c_str() );
-                    }
-                }
-                else {
-                    // defensive
-                    cc().clearns();
-                }
-            }
-        private:
-            bool _save;
-            static AlwaysAuthorized always;
-            AuthenticationInfo *oldAuth;
-            string _oldName;
-        };
    };
 
    extern int lockFile;
diff --git a/db/introspect.cpp b/db/introspect.cpp
index 9cb477d..a041d48 100644
--- a/db/introspect.cpp
+++ b/db/introspect.cpp
@@ -26,8 +26,7 @@
 
namespace mongo {
 
-    void profile(const char *str,
-                 int millis)
+    void profile( const char *str, int millis)
    {
        BSONObjBuilder b;
        b.appendDate("ts", jsTime());
diff --git a/db/jsobj.cpp b/db/jsobj.cpp
index 1a299a5..9f9a684 100644
--- a/db/jsobj.cpp
+++ b/db/jsobj.cpp
@@ -20,6 +20,7 @@
 #include "stdafx.h"
 #include "jsobj.h"
 #include "nonce.h"
+#include "../util/atomic_int.h"
 #include "../util/goodies.h"
 #include "../util/base64.h"
 #include "../util/md5.hpp"
@@ -30,6 +31,7 @@
 #include "jsobjmanipulator.h"
 #include "../util/optime.h"
 #include <boost/static_assert.hpp>
+#include <boost/any.hpp>
 #undef assert
 #define assert xassert
 
@@ -50,12 +52,6 @@ namespace mongo {
    }
    IDLabeler GENOID;
 
-    BSONObjBuilder& operator<<(BSONObjBuilder& b, IDLabeler& id) {
-        OID oid;
-        oid.init();
-        b.appendOID("_id", &oid);
-        return b;
-    }
 
    DateNowLabeler DATENOW;
 
@@ -156,7 +152,7 @@ namespace mongo {
        return s.str();
    }
 
-    string escape( string s ) {
+    string escape( string s , bool escape_slash=false) {
        stringstream ret;
        for ( string::iterator i = s.begin(); i != s.end(); ++i ) {
            switch ( *i ) {
@@ -167,7 +163,7 @@ namespace mongo {
                ret << "\\\\";
                break;
            case '/':
-                ret << "\\/";
+                ret << (escape_slash ? "\\/" : "/");
                break;
            case '\b':
                ret << "\\b";
                break;
@@ -306,17 +302,13 @@ namespace mongo {
            s << " )";
            break;
        case RegEx:
-            if ( format == Strict )
-                s << "{ \"$regex\" : \"";
-            else
-                s << "/";
-            s << escape( regex() );
-            if ( format == Strict )
+            if ( format == Strict ){
+                s << "{ \"$regex\" : \"" << escape( regex() );
                s << "\", \"$options\" : \"" << regexFlags() << "\" }";
-            else {
-                s << "/";
+            } else {
+                s << "/" << escape( regex() , true ) << "/";
                // FIXME Worry about alpha order?
- for ( const char *f = regexFlags(); *f; ++f ) + for ( const char *f = regexFlags(); *f; ++f ){ switch ( *f ) { case 'g': case 'i': @@ -325,6 +317,7 @@ namespace mongo { default: break; } + } } break; @@ -413,7 +406,8 @@ namespace mongo { default: { stringstream ss; ss << "BSONElement: bad type " << (int) type(); - massert( 10320 , ss.str().c_str(),false); + string msg = ss.str(); + massert( 10320 , msg.c_str(),false); } } totalSize = x + fieldNameSize() + 1; // BSONType @@ -434,8 +428,12 @@ namespace mongo { else if ( fn[3] == 'e' && fn[4] == 0 ) return BSONObj::LTE; } } - else if ( fn[1] == 'n' && fn[2] == 'e' && fn[3] == 0) - return BSONObj::NE; + else if ( fn[1] == 'n' && fn[2] == 'e' ){ + if ( fn[3] == 0 ) + return BSONObj::NE; + if ( fn[3] == 'a' && fn[4] == 'r' && fn[5] == 0 ) + return BSONObj::opNEAR; + } else if ( fn[1] == 'm' && fn[2] == 'o' && fn[3] == 'd' && fn[4] == 0 ) return BSONObj::opMOD; else if ( fn[1] == 't' && fn[2] == 'y' && fn[3] == 'p' && fn[4] == 'e' && fn[5] == 0 ) @@ -458,6 +456,8 @@ namespace mongo { return BSONObj::opREGEX; else if ( fn[1] == 'o' && fn[2] == 'p' && fn[3] == 't' && fn[4] == 'i' && fn[5] == 'o' && fn[6] == 'n' && fn[7] == 's' && fn[8] == 0 ) return BSONObj::opOPTIONS; + else if ( fn[1] == 'w' && fn[2] == 'i' && fn[3] == 't' && fn[4] == 'h' && fn[5] == 'i' && fn[6] == 'n' && fn[7] == 0 ) + return BSONObj::opWITHIN; } return def; } @@ -541,13 +541,18 @@ namespace mongo { case Object: case Array: return l.embeddedObject().woCompare( r.embeddedObject() ); - case DBRef: - case BinData: { + case DBRef: { int lsz = l.valuesize(); int rsz = r.valuesize(); if ( lsz - rsz != 0 ) return lsz - rsz; return memcmp(l.value(), r.value(), lsz); } + case BinData: { + int lsz = l.objsize(); // our bin data size in bytes, not including the subtype byte + int rsz = r.objsize(); + if ( lsz - rsz != 0 ) return lsz - rsz; + return memcmp(l.value()+4, r.value()+4, lsz+1); + } case RegEx: { int c = strcmp(l.regex(), r.regex()); @@ -576,31 +581,35 @@ namespace mongo { void BSONElement::validate() const { switch( type() ) { - case DBRef: - case Code: - case Symbol: - case String: - massert( 10321 , "Invalid dbref/code/string/symbol size", - valuestrsize() > 0 && - valuestrsize() - 1 == strnlen( valuestr(), valuestrsize() ) ); - break; - case CodeWScope: { - int totalSize = *( int * )( value() ); - massert( 10322 , "Invalid CodeWScope size", totalSize >= 8 ); - int strSizeWNull = *( int * )( value() + 4 ); - massert( 10323 , "Invalid CodeWScope string size", totalSize >= strSizeWNull + 4 + 4 ); - massert( 10324 , "Invalid CodeWScope string size", - strSizeWNull > 0 && - strSizeWNull - 1 == strnlen( codeWScopeCode(), strSizeWNull ) ); - massert( 10325 , "Invalid CodeWScope size", totalSize >= strSizeWNull + 4 + 4 + 4 ); - int objSize = *( int * )( value() + 4 + 4 + strSizeWNull ); - massert( 10326 , "Invalid CodeWScope object size", totalSize == 4 + 4 + strSizeWNull + objSize ); - // Subobject validation handled elsewhere. - } - case Object: - // We expect Object size validation to be handled elsewhere. 
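
Both the removed checks here and their replacements in the next hunk walk the same CodeWScope binary layout: an int32 total size, an int32 code-string size including the trailing NUL, the code bytes, then a scope object carrying its own leading int32 size. A standalone sketch of those invariants under that layout assumption (codeWScopeLooksValid is illustrative, not part of the tree):

    #include <cstring>

    // Assumed layout: [int32 totalSize][int32 strSizeWNull][code ... '\0'][BSON scope, objSize bytes]
    bool codeWScopeLooksValid( const char * value ){
        int totalSize; std::memcpy( &totalSize, value, 4 );
        int strSizeWNull; std::memcpy( &strSizeWNull, value + 4, 4 );
        if ( totalSize < 8 || strSizeWNull <= 0 )
            return false;
        if ( totalSize < strSizeWNull + 4 + 4 + 4 ) // must leave room for the scope's own size field
            return false;
        const char * code = value + 8;
        int n = 0;
        while ( n < strSizeWNull && code[n] != '\0' ) // bounded scan, like the strnlen above
            n++;
        if ( n != strSizeWNull - 1 ) // the NUL must sit exactly at the end of the code string
            return false;
        int objSize; std::memcpy( &objSize, value + 8 + strSizeWNull, 4 );
        return totalSize == 4 + 4 + strSizeWNull + objSize;
    }

    int main(){
        char buf[17];
        int strSize = 4; // "x=1" plus NUL
        int total = 4 + 4 + strSize + 5; // 5 = empty scope object (int32 size + EOO byte)
        std::memcpy( buf, &total, 4 );
        std::memcpy( buf + 4, &strSize, 4 );
        std::memcpy( buf + 8, "x=1", 4 );
        int objSize = 5;
        std::memcpy( buf + 8 + strSize, &objSize, 4 );
        buf[16] = 0; // EOO terminator of the empty scope
        return codeWScopeLooksValid( buf ) ? 0 : 1;
    }
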
- default: - break; + case DBRef: + case Code: + case Symbol: + case String: { + int x = valuestrsize(); + if ( x > 0 && valuestr()[x-1] == 0 ) + return; + StringBuilder buf; + buf << "Invalid dbref/code/string/symbol size: " << x << " strnlen:" << strnlen( valuestr() , x ); + massert( 10321 , buf.str() , 0 ); + break; + } + case CodeWScope: { + int totalSize = *( int * )( value() ); + massert( 10322 , "Invalid CodeWScope size", totalSize >= 8 ); + int strSizeWNull = *( int * )( value() + 4 ); + massert( 10323 , "Invalid CodeWScope string size", totalSize >= strSizeWNull + 4 + 4 ); + massert( 10324 , "Invalid CodeWScope string size", + strSizeWNull > 0 && + strSizeWNull - 1 == strnlen( codeWScopeCode(), strSizeWNull ) ); + massert( 10325 , "Invalid CodeWScope size", totalSize >= strSizeWNull + 4 + 4 + 4 ); + int objSize = *( int * )( value() + 4 + 4 + strSizeWNull ); + massert( 10326 , "Invalid CodeWScope object size", totalSize == 4 + 4 + strSizeWNull + objSize ); + // Subobject validation handled elsewhere. + } + case Object: + // We expect Object size validation to be handled elsewhere. + default: + break; } } @@ -653,7 +662,7 @@ namespace mongo { const string& c = l.substr( lstart , lend - lstart ); const string& d = r.substr( rstart , rend - rstart ); - int x = c.compare( d ); + int x = lexNumCmp( c.c_str(), d.c_str() ); if ( x < 0 ) return LEFT_BEFORE; @@ -766,9 +775,18 @@ namespace mongo { if ( r.eoo() ) return 1; - int x = l.woCompare( r, considerFieldName ); - if ( ordered && o.number() < 0 ) - x = -x; + int x; +/* + if( ordered && o.type() == String && strcmp(o.valuestr(), "ascii-proto") == 0 && + l.type() == String && r.type() == String ) { + // note: no negative support yet, as this is just sort of a POC + x = _stricmp(l.valuestr(), r.valuestr()); + } + else*/ { + x = l.woCompare( r, considerFieldName ); + if ( ordered && o.number() < 0 ) + x = -x; + } if ( x != 0 ) return x; } @@ -809,18 +827,6 @@ namespace mongo { } - BSONElement BSONObj::getField(const char *name) const { - BSONObjIterator i(*this); - while ( i.moreWithEOO() ) { - BSONElement e = i.next(); - if ( e.eoo() ) - break; - if ( strcmp(e.fieldName(), name) == 0 ) - return e; - } - return nullElement; - } - /* return has eoo() true if no match supports "." notation to reach into embedded objects */ @@ -838,49 +844,62 @@ namespace mongo { return e; } - /* jul09 : 'deep' and this function will be going away in the future - kept only for backward compatibility of datafiles for now. 
*/
- void trueDat( bool *deep ) {
- if( deep )
- *deep = true;
- }
+ void BSONObj::getFieldsDotted(const char *name, BSONElementSet &ret ) const {
+ BSONObjIterator i(*this);
+ while ( i.more() ){
+ BSONElement e = i.next();
+ FieldCompareResult cmp = compareDottedFieldNames( name , e.fieldName() );
+ switch ( cmp ){

- void BSONObj::getFieldsDotted(const char *name, BSONElementSet &ret, bool *deep ) const {
- BSONElement e = getField( name );
- if ( e.eoo() ) {
- const char *p = strchr(name, '.');
- if ( p ) {
- string left(name, p-name);
- BSONElement e = getField( left );
- if ( e.type() == Array ) {
- trueDat( deep );
- BSONObjIterator i( e.embeddedObject() );
- while( i.moreWithEOO() ) {
- BSONElement f = i.next();
- if ( f.eoo() )
- break;
+ case LEFT_BEFORE:
+ case RIGHT_BEFORE:
+ break;
+
+ case RIGHT_SUBFIELD:
+ assert(0);
+ break;
+
+ case LEFT_SUBFIELD: {
+ const char * next = name + strlen( e.fieldName() ) + 1;
+ bool allDigits = false;
+ if ( isdigit( *next ) ){
+ const char * temp = next + 1;
+ while ( isdigit( *temp ) )
+ temp++;
+ allDigits = *temp == '.';
+ }
+
+ if ( e.type() == Object || allDigits ){
+ e.embeddedObject().getFieldsDotted( next , ret );
+ }
+ else if ( e.type() == Array ){
+ BSONObjIterator j( e.embeddedObject() );
+ while ( j.more() ){
+ BSONElement f = j.next();
if ( f.type() == Object )
- f.embeddedObject().getFieldsDotted(p+1, ret);
+ f.embeddedObject().getFieldsDotted( next , ret );
}
- } else if ( e.type() == Object ) {
- e.embeddedObject().getFieldsDotted(p+1, ret);
}
+ else {
+ // intentionally left blank, this means no match
+ }
+ return;
}
- } else {
- if ( e.type() == Array ) {
- trueDat( deep );
- BSONObjIterator i( e.embeddedObject() );
- while( i.moreWithEOO() ) {
- BSONElement f = i.next();
- if ( f.eoo() )
- break;
- ret.insert( f );
+
+ case SAME: {
+ if ( e.type() == Array ){
+ BSONObjIterator j( e.embeddedObject() );
+ while ( j.more() )
+ ret.insert( j.next() );
}
- } else {
- ret.insert( e );
+ else {
+ ret.insert( e );
+ }
+ return;
+ }
+
}
}
- if ( ret.empty() && deep )
- *deep = false;
}
BSONElement BSONObj::getFieldDottedOrArray(const char *&name) const {
@@ -1141,7 +1160,10 @@ namespace mongo {
if ( strchr( name , '.' ) ||
strchr( name , '$' ) ){
- return false;
+ return
+ strcmp( name , "$ref" ) == 0 ||
+ strcmp( name , "$id" ) == 0
+ ;
}
if ( e.mayEncapsulate() ){
@@ -1410,7 +1432,7 @@ namespace mongo {
}
void OID::init() {
- static WrappingInt inc = (unsigned) security.getNonce();
+ static AtomicUInt inc = (unsigned) security.getNonce();
unsigned t = (unsigned) time(0);
char *T = (char *) &t;
data[0] = T[3];
@@ -1420,7 +1442,7 @@ namespace mongo {
(unsigned&) data[4] = _machine;
- int new_inc = inc.atomicIncrement();
+ int new_inc = inc++;
T = (char *) &new_inc;
char * raw = (char*)&b;
raw[0] = T[3];
@@ -1464,7 +1486,7 @@ namespace mongo {
Labeler::Label SIZE( "$size" );
void BSONElementManipulator::initTimestamp() {
- massert( 10332 , "Expected CurrentTime type", element_.type() == Timestamp );
+ massert( 10332 , "Expected CurrentTime type", _element.type() == Timestamp );
unsigned long long &timestamp = *( reinterpret_cast< unsigned long long* >( value() ) );
if ( timestamp == 0 )
timestamp = OpTime::now().asDate();
@@ -1610,12 +1632,23 @@ namespace mongo {
}
+ void BSONObjBuilder::appendKeys( const BSONObj& keyPattern , const BSONObj& values ){
+ BSONObjIterator i(keyPattern);
+ BSONObjIterator j(values);
+
+ while ( i.more() && j.more() ){
+ appendAs( j.next() , i.next().fieldName() );
+ }
+
+ assert( ! 
j.more() ); + } int BSONElementFieldSorter( const void * a , const void * b ){ const char * x = *((const char**)a); const char * y = *((const char**)b); x++; y++; - return strcmp( x , y ); + return lexNumCmp( x , y ); } BSONObjIteratorSorted::BSONObjIteratorSorted( const BSONObj& o ){ @@ -480,7 +480,7 @@ namespace mongo { BSONObj embeddedObject() const; /* uasserts if not an object */ - BSONObj embeddedObjectUserCheck(); + BSONObj embeddedObjectUserCheck() const; BSONObj codeWScopeObject() const; @@ -509,7 +509,7 @@ namespace mongo { BinDataType binDataType() const { // BinData: <int len> <byte subtype> <byte[len] data> assert( type() == BinData ); - char c = (value() + 4)[0]; + unsigned char c = (value() + 4)[0]; return (BinDataType)c; } @@ -574,9 +574,25 @@ namespace mongo { /** True if this element may contain subobjects. */ bool mayEncapsulate() const { - return type() == Object || - type() == Array || - type() == CodeWScope; + switch ( type() ){ + case Object: + case Array: + case CodeWScope: + return true; + default: + return false; + } + } + + /** True if this element can be a BSONObj */ + bool isABSONObj() const { + switch( type() ){ + case Object: + case Array: + return true; + default: + return false; + } } Date_t timestampTime() const{ @@ -625,7 +641,7 @@ namespace mongo { mutable int fieldNameSize_; // cached value int fieldNameSize() const { if ( fieldNameSize_ == -1 ) - fieldNameSize_ = strlen( fieldName() ) + 1; + fieldNameSize_ = (int)strlen( fieldName() ) + 1; return fieldNameSize_; } mutable int totalSize; /* caches the computed size */ @@ -635,7 +651,7 @@ namespace mongo { struct BSONElementCmpWithoutField { bool operator()( const BSONElement &l, const BSONElement &r ) const { - return l.woCompare( r, false ); + return l.woCompare( r, false ) < 0; } }; @@ -700,6 +716,11 @@ namespace mongo { if ( ! isValid() ){ stringstream ss; ss << "Invalid BSONObj spec size: " << objsize(); + try { + BSONElement e = firstElement(); + ss << " first element:" << e.toString() << " "; + } + catch ( ... ){} string s = ss.str(); massert( 10334 , s , 0 ); } @@ -759,7 +780,7 @@ namespace mongo { BSONElement getFieldDotted(const char *name) const; /** Like getFieldDotted(), but expands multikey arrays and returns all matching objects */ - void getFieldsDotted(const char *name, BSONElementSet &ret, bool *deep = 0) const; + void getFieldsDotted(const char *name, BSONElementSet &ret ) const; /** Like getFieldDotted(), but returns first array encountered while traversing the dotted fields of name. The name variable is updated to represent field names with respect to the returned element. */ @@ -768,14 +789,14 @@ namespace mongo { /** Get the field of the specified name. eoo() is true on the returned element if not found. */ - BSONElement getField(const string name) const { - return getField( name.c_str() ); - }; + BSONElement getField(const char *name) const; /** Get the field of the specified name. eoo() is true on the returned element if not found. */ - BSONElement getField(const char *name) const; /* return has eoo() true if no match */ + BSONElement getField(const string name) const { + return getField( name.c_str() ); + }; /** Get the field of the specified name. eoo() is true on the returned element if not found. @@ -902,13 +923,9 @@ namespace mongo { return BSONElement(objdata() + 4); } - /** @return element with fieldname "name". returnvalue.eoo() is true if not found */ - BSONElement findElement(const char *name) const; - - /** @return element with fieldname "name". 
returnvalue.eoo() is true if not found */
- BSONElement findElement(string name) const {
- return findElement(name.c_str());
- }
+ /** use getField() instead. */
+ //BSONElement getField(const char *name) const;
+ //BSONElement getField(string name) const {
/** @return true if field exists in the object */
bool hasElement(const char *name) const;
@@ -976,7 +993,9 @@ namespace mongo {
opTYPE = 0x0F,
opREGEX = 0x10,
opOPTIONS = 0x11,
- opELEM_MATCH = 0x12
+ opELEM_MATCH = 0x12,
+ opNEAR = 0x13,
+ opWITHIN = 0x14,
};
};
ostream& operator<<( ostream &s, const BSONObj &o );
@@ -1028,7 +1047,7 @@ namespace mongo {
BSON( "a" << GT << 23.4 << NE << 30 << "b" << 2 ) produces the object
{ a: { \$gt: 23.4, \$ne: 30 }, b: 2 }.
*/
-#define BSON(x) (( mongo::BSONObjBuilder() << x ).obj())
+#define BSON(x) (( mongo::BSONObjBuilder(64) << x ).obj())
/**
Use BSON_ARRAY macro like BSON macro, but without keys
@@ -1042,7 +1061,6 @@ namespace mongo {
cout << BSON( GENOID << "z" << 3 ); // { _id : ..., z : 3 }
*/
extern struct IDLabeler { } GENOID;
- BSONObjBuilder& operator<<(BSONObjBuilder& b, IDLabeler& id);
/* Utility class to add a Date element with the current time
Example:
@@ -1107,20 +1125,63 @@ namespace mongo {
};
/**
+ used in conjunction with BSONObjBuilder, allows for proper buffer sizing to prevent excessive memory usage
+ */
+ class BSONSizeTracker {
+ public:
+#define BSONSizeTrackerSize 10
+
+ BSONSizeTracker(){
+ _pos = 0;
+ for ( int i=0; i<BSONSizeTrackerSize; i++ )
+ _sizes[i] = 512; // this is the default, so just be consistent
+ }
+
+ ~BSONSizeTracker(){
+ }
+
+ void got( int size ){
+ _sizes[_pos++] = size;
+ if ( _pos >= BSONSizeTrackerSize )
+ _pos = 0;
+ }
+
+ /**
+ * right now choosing largest size
+ */
+ int getSize() const {
+ int x = 16; // sane min
+ for ( int i=0; i<BSONSizeTrackerSize; i++ ){
+ if ( _sizes[i] > x )
+ x = _sizes[i];
+ }
+ return x;
+ }
+
+ private:
+ int _pos;
+ int _sizes[BSONSizeTrackerSize];
+ };
+
+ /**
utility for creating a BSONObj
*/
class BSONObjBuilder : boost::noncopyable {
public:
/** @param initsize this is just a hint as to the final size of the object */
- BSONObjBuilder(int initsize=512) : b(buf_), buf_(initsize), offset_( 0 ), s_( this ) {
+ BSONObjBuilder(int initsize=512) : b(buf_), buf_(initsize), offset_( 0 ), s_( this ) , _tracker(0) {
b.skip(4); /*leave room for size field*/
}
/** @param baseBuilder construct a BSONObjBuilder using an existing BufBuilder */
- BSONObjBuilder( BufBuilder &baseBuilder ) : b( baseBuilder ), buf_( 0 ), offset_( baseBuilder.len() ), s_( this ) {
+ BSONObjBuilder( BufBuilder &baseBuilder ) : b( baseBuilder ), buf_( 0 ), offset_( baseBuilder.len() ), s_( this ) , _tracker(0) {
b.skip( 4 );
}
+ BSONObjBuilder( const BSONSizeTracker & tracker ) : b(buf_) , buf_(tracker.getSize() ), offset_(0), s_( this ) , _tracker( (BSONSizeTracker*)(&tracker) ){
+ b.skip( 4 );
+ }
+
/** add all the fields from the object specified to this object */
BSONObjBuilder& appendElements(BSONObj x);
@@ -1188,6 +1249,13 @@ namespace mongo {
b.append((char) (val?1:0));
}
+ /** Append a boolean element */
+ void append(const char *fieldName, bool val) {
+ b.append((char) Bool);
+ b.append(fieldName);
+ b.append((char) (val?1:0));
+ }
+
/** Append a 32 bit integer element */
void append(const char *fieldName, int n) {
b.append((char) NumberInt);
@@ -1214,7 +1282,42 @@ namespace mongo {
append( fieldName.c_str() , n );
}
+ /** appends a number. 
if n < max(int)/2 then uses int, otherwise long long */ + void appendIntOrLL( const string& fieldName , long long n ){ + long long x = n; + if ( x < 0 ) + x = x * -1; + if ( x < ( numeric_limits<int>::max() / 2 ) ) + append( fieldName.c_str() , (int)n ); + else + append( fieldName.c_str() , n ); + } + + + /** + * appendNumber is a series of method for appending the smallest sensible type + * mostly for JS + */ + void appendNumber( const string& fieldName , int n ){ + append( fieldName.c_str() , n ); + } + void appendNumber( const string& fieldName , double d ){ + append( fieldName.c_str() , d ); + } + + void appendNumber( const string& fieldName , long long l ){ + static long long maxInt = (int)pow( 2.0 , 30.0 ); + static long long maxDouble = (long long)pow( 2.0 , 40.0 ); + + if ( l < maxInt ) + append( fieldName.c_str() , (int)l ); + else if ( l < maxDouble ) + append( fieldName.c_str() , (double)l ); + else + append( fieldName.c_str() , l ); + } + /** Append a double element */ BSONObjBuilder& append(const char *fieldName, double n) { b.append((char) NumberDouble); @@ -1451,6 +1554,16 @@ namespace mongo { return BSONObj(_done()); } + /** Peek at what is in the builder, but leave the builder ready for more appends. + The returned object is only valid until the next modification or destruction of the builder. + Intended use case: append a field if not already there. + */ + BSONObj asTempObj() { + BSONObj temp(_done()); + b.setlen(b.len()-1); //next append should overwrite the EOO + return temp; + } + /* assume ownership of the buffer - you must then free it (with free()) */ char* decouple(int& l) { char *x = _done(); @@ -1463,6 +1576,7 @@ namespace mongo { b.decouple(); // post done() call version. be sure jsobj frees... } + void appendKeys( const BSONObj& keyPattern , const BSONObj& values ); private: static const string numStrs[100]; // cache of 0 to 99 inclusive @@ -1482,6 +1596,14 @@ namespace mongo { return s_; } + /** Stream oriented way to add field names and values. 
*/ + BSONObjBuilder& operator<<( IDLabeler ) { + OID oid; + oid.init(); + appendOID("_id", &oid); + return *this; + } + // prevent implicit string conversions which would allow bad things like BSON( BSON( "foo" << 1 ) << 2 ) struct ForceExplicitString { ForceExplicitString( const string &str ) : str_( str ) {} @@ -1509,12 +1631,15 @@ namespace mongo { b.append( fieldName ); b.append( (void *) arr.objdata(), arr.objsize() ); } - + char* _done() { s_.endField(); b.append((char) EOO); char *data = b.buf() + offset_; - *((int*)data) = b.len() - offset_; + int size = b.len() - offset_; + *((int*)data) = size; + if ( _tracker ) + _tracker->got( size ); return data; } @@ -1522,34 +1647,88 @@ namespace mongo { BufBuilder buf_; int offset_; BSONObjBuilderValueStream s_; + BSONSizeTracker * _tracker; }; class BSONArrayBuilder : boost::noncopyable{ public: - BSONArrayBuilder() :i(0), b() {} + BSONArrayBuilder() : _i(0), _b() {} + BSONArrayBuilder( BufBuilder &b ) : _i(0), _b(b) {} template <typename T> BSONArrayBuilder& append(const T& x){ - b.append(num().c_str(), x); + _b.append(num().c_str(), x); return *this; } BSONArrayBuilder& append(const BSONElement& e){ - b.appendAs(e, num().c_str()); + _b.appendAs(e, num().c_str()); return *this; } - + template <typename T> BSONArrayBuilder& operator<<(const T& x){ return append(x); } + + void appendNull() { + _b.appendNull(num().c_str()); + } - BSONArray arr(){ return BSONArray(b.obj()); } + BSONArray arr(){ return BSONArray(_b.obj()); } + + BSONObj done() { return _b.done(); } + + template <typename T> + BSONArrayBuilder& append(const char *name, const T& x){ + fill( name ); + append( x ); + return *this; + } + + BufBuilder &subobjStart( const char *name ) { + fill( name ); + return _b.subobjStart( num().c_str() ); + } + BufBuilder &subarrayStart( const char *name ) { + fill( name ); + return _b.subarrayStart( num().c_str() ); + } + + void appendArray( const char *name, BSONObj subObj ) { + fill( name ); + _b.appendArray( num().c_str(), subObj ); + } + + void appendAs( const BSONElement &e, const char *name ) { + fill( name ); + append( e ); + } + private: - string num(){ return b.numStr(i++); } - int i; - BSONObjBuilder b; + void fill( const char *name ) { + char *r; + int n = strtol( name, &r, 10 ); + uassert( 13048, "can't append to array using string field name", !*r ); + while( _i < n ) + append( nullElt() ); + } + + static BSONElement nullElt() { + static BSONObj n = nullObj(); + return n.firstElement(); + } + + static BSONObj nullObj() { + BSONObjBuilder b; + b.appendNull( "" ); + return b.obj(); + } + + string num(){ return _b.numStr(_i++); } + int _i; + BSONObjBuilder _b; }; @@ -1584,8 +1763,8 @@ namespace mongo { /** @return the next element in the object. For the final element, element.eoo() will be true. */ BSONElement next( bool checkEnd = false ) { assert( pos < theend ); - BSONElement e( pos, checkEnd ? theend - pos : -1 ); - pos += e.size( checkEnd ? theend - pos : -1 ); + BSONElement e( pos, checkEnd ? (int)(theend - pos) : -1 ); + pos += e.size( checkEnd ? 
(int)(theend - pos) : -1 ); return e; } private: @@ -1653,13 +1832,13 @@ namespace mongo { #define CHECK_OBJECT( o , msg ) #endif - inline BSONObj BSONElement::embeddedObjectUserCheck() { - uassert( 10065 , "invalid parameter: expected an object", type()==Object || type()==Array ); + inline BSONObj BSONElement::embeddedObjectUserCheck() const { + uassert( 10065 , "invalid parameter: expected an object", isABSONObj() ); return BSONObj(value()); } inline BSONObj BSONElement::embeddedObject() const { - assert( type()==Object || type()==Array ); + assert( isABSONObj() ); return BSONObj(value()); } @@ -1701,14 +1880,12 @@ namespace mongo { return false; } - inline BSONElement BSONObj::findElement(const char *name) const { - if ( !isEmpty() ) { - BSONObjIterator it(*this); - while ( it.moreWithEOO() ) { - BSONElement e = it.next(); - if ( strcmp(name, e.fieldName()) == 0 ) - return e; - } + inline BSONElement BSONObj::getField(const char *name) const { + BSONObjIterator i(*this); + while ( i.more() ) { + BSONElement e = i.next(); + if ( strcmp(e.fieldName(), name) == 0 ) + return e; } return BSONElement(); } @@ -1729,7 +1906,7 @@ namespace mongo { } inline bool BSONObj::getObjectID(BSONElement& e) const { - BSONElement f = findElement("_id"); + BSONElement f = getField("_id"); if( !f.eoo() ) { e = f; return true; @@ -1845,7 +2022,7 @@ namespace mongo { ~BSONObjIteratorSorted(){ assert( _fields ); - delete _fields; + delete[] _fields; _fields = 0; } diff --git a/db/jsobjmanipulator.h b/db/jsobjmanipulator.h index d534d08..1771bff 100644 --- a/db/jsobjmanipulator.h +++ b/db/jsobjmanipulator.h @@ -22,57 +22,63 @@ namespace mongo { -/** Manipulate the binary representation of a BSONElement in-place. - Careful, this casts away const. - */ -class BSONElementManipulator { -public: - BSONElementManipulator( const BSONElement &element ) : - element_( element ) { - assert( !element_.eoo() ); - } - /** Replace a Timestamp type with a Date type initialized to - OpTime::now().asDate() - */ - void initTimestamp(); - - /** Change the value, in place, of the number. */ - void setNumber(double d) { - if ( element_.type() == NumberDouble ) *reinterpret_cast< double * >( value() ) = d; - else if ( element_.type() == NumberInt ) *reinterpret_cast< int * >( value() ) = (int) d; - } - void setLong(long long n) { - if( element_.type() == NumberLong ) *reinterpret_cast< long long * >( value() ) = n; - } + /** Manipulate the binary representation of a BSONElement in-place. + Careful, this casts away const. + */ + class BSONElementManipulator { + public: + BSONElementManipulator( const BSONElement &element ) : + _element( element ) { + assert( !_element.eoo() ); + } + /** Replace a Timestamp type with a Date type initialized to + OpTime::now().asDate() + */ + void initTimestamp(); + + /** Change the value, in place, of the number. 
*/ + void setNumber(double d) { + if ( _element.type() == NumberDouble ) *reinterpret_cast< double * >( value() ) = d; + else if ( _element.type() == NumberInt ) *reinterpret_cast< int * >( value() ) = (int) d; + } + void setLong(long long n) { + if( _element.type() == NumberLong ) *reinterpret_cast< long long * >( value() ) = n; + } + void setInt(int n) { + assert( _element.type() == NumberInt ); + *reinterpret_cast< int * >( value() ) = n; + } - /** Replace the type and value of the element with the type and value of e, - preserving the original fieldName */ - void replaceTypeAndValue( const BSONElement &e ) { - *data() = e.type(); - memcpy( value(), e.value(), e.valuesize() ); - } - - static void lookForTimestamps( const BSONObj& obj ){ - // If have a Timestamp field as the first or second element, - // update it to a Date field set to OpTime::now().asDate(). The - // replacement policy is a work in progress. - BSONObjIterator i( obj ); - for( int j = 0; i.moreWithEOO() && j < 2; ++j ) { - BSONElement e = i.next(); - if ( e.eoo() ) - break; - if ( e.type() == Timestamp ){ - BSONElementManipulator( e ).initTimestamp(); - break; + /** Replace the type and value of the element with the type and value of e, + preserving the original fieldName */ + void replaceTypeAndValue( const BSONElement &e ) { + *data() = e.type(); + memcpy( value(), e.value(), e.valuesize() ); + } + + static void lookForTimestamps( const BSONObj& obj ){ + // If have a Timestamp field as the first or second element, + // update it to a Date field set to OpTime::now().asDate(). The + // replacement policy is a work in progress. + + BSONObjIterator i( obj ); + for( int j = 0; i.moreWithEOO() && j < 2; ++j ) { + BSONElement e = i.next(); + if ( e.eoo() ) + break; + if ( e.type() == Timestamp ){ + BSONElementManipulator( e ).initTimestamp(); + break; + } } } - } -private: - char *data() { return nonConst( element_.rawdata() ); } - char *value() { return nonConst( element_.value() ); } - static char *nonConst( const char *s ) { return const_cast< char * >( s ); } - const BSONElement element_; -}; + private: + char *data() { return nonConst( _element.rawdata() ); } + char *value() { return nonConst( _element.value() ); } + static char *nonConst( const char *s ) { return const_cast< char * >( s ); } + + const BSONElement _element; + }; } // namespace mongo diff --git a/db/json.cpp b/db/json.cpp index b55ddb1..7645b6b 100644 --- a/db/json.cpp +++ b/db/json.cpp @@ -20,6 +20,7 @@ #include "json.h" #include "../util/builder.h" #include "../util/base64.h" +#include "../util/hex.h" using namespace boost::spirit; @@ -167,27 +168,11 @@ namespace mongo { ObjectBuilder &b; }; - namespace hex { - int val( char c ) { - if ( '0' <= c && c <= '9' ) - return c - '0'; - if ( 'a' <= c && c <= 'f' ) - return c - 'a' + 10; - if ( 'A' <= c && c <= 'F' ) - return c - 'A' + 10; - assert( false ); - return 0xff; - } - char val( const char *c ) { - return ( val( c[ 0 ] ) << 4 ) | val( c[ 1 ] ); - } - } // namespace hex - struct chU { chU( ObjectBuilder &_b ) : b( _b ) {} void operator() ( const char *start, const char *end ) const { - unsigned char first = hex::val( start ); - unsigned char second = hex::val( start + 2 ); + unsigned char first = fromHex( start ); + unsigned char second = fromHex( start + 2 ); if ( first == 0 && second < 0x80 ) b.ss << second; else if ( first < 0x08 ) { @@ -315,7 +300,7 @@ namespace mongo { OID oid; char *oidP = (char *)( &oid ); for ( int i = 0; i < 12; ++i ) - oidP[ i ] = hex::val( s + ( i * 2 ) ); + oidP[ i ] = 
fromHex( s + ( i * 2 ) ); return oid; } @@ -356,7 +341,7 @@ namespace mongo { struct binDataType { binDataType( ObjectBuilder &_b ) : b( _b ) {} void operator() ( const char *start, const char *end ) const { - b.binDataType = BinDataType( hex::val( start ) ); + b.binDataType = BinDataType( fromHex( start ) ); } ObjectBuilder &b; }; diff --git a/db/lasterror.cpp b/db/lasterror.cpp index e8b1fcf..9fefcfa 100644 --- a/db/lasterror.cpp +++ b/db/lasterror.cpp @@ -28,7 +28,7 @@ namespace mongo { LastError LastError::noError; LastErrorHolder lastError; - boost::mutex LastErrorHolder::_idsmutex; + mongo::mutex LastErrorHolder::_idsmutex; void LastError::appendSelf( BSONObjBuilder &b ) { if ( !valid ) { @@ -75,7 +75,7 @@ namespace mongo { if ( id == 0 ) return _tl.get(); - boostlock lock(_idsmutex); + scoped_lock lock(_idsmutex); map<int,Status>::iterator i = _ids.find( id ); if ( i == _ids.end() ){ if ( ! create ) @@ -95,7 +95,7 @@ namespace mongo { } void LastErrorHolder::remove( int id ){ - boostlock lock(_idsmutex); + scoped_lock lock(_idsmutex); map<int,Status>::iterator i = _ids.find( id ); if ( i == _ids.end() ) return; @@ -121,7 +121,7 @@ namespace mongo { return; } - boostlock lock(_idsmutex); + scoped_lock lock(_idsmutex); Status & status = _ids[id]; status.time = time(0); status.lerr = le; diff --git a/db/lasterror.h b/db/lasterror.h index 8f687bb..78160eb 100644 --- a/db/lasterror.h +++ b/db/lasterror.h @@ -30,7 +30,7 @@ namespace mongo { string msg; enum UpdatedExistingType { NotUpdate, True, False } updatedExisting; /* todo: nObjects should be 64 bit */ - int nObjects; + long long nObjects; int nPrev; bool valid; bool overridenById; @@ -40,12 +40,12 @@ namespace mongo { code = _code; msg = _msg; } - void recordUpdate( bool _updatedExisting, int nChanged ) { + void recordUpdate( bool _updatedExisting, long long nChanged ) { reset( true ); nObjects = nChanged; updatedExisting = _updatedExisting ? 
True : False; } - void recordDelete( int nDeleted ) { + void recordDelete( long long nDeleted ) { reset( true ); nObjects = nDeleted; } @@ -100,7 +100,7 @@ namespace mongo { time_t time; LastError *lerr; }; - static boost::mutex _idsmutex; + static mongo::mutex _idsmutex; map<int,Status> _ids; } lastError; diff --git a/db/matcher.cpp b/db/matcher.cpp index d71b7ef..8c904e3 100644 --- a/db/matcher.cpp +++ b/db/matcher.cpp @@ -22,16 +22,35 @@ #include "matcher.h" #include "../util/goodies.h" #include "../util/unittest.h" -#include "storage.h" +#include "diskloc.h" #include "../scripting/engine.h" #include "db.h" #include "client.h" +#include "pdfile.h" + +namespace { + inline pcrecpp::RE_Options flags2options(const char* flags){ + pcrecpp::RE_Options options; + options.set_utf8(true); + while ( flags && *flags ) { + if ( *flags == 'i' ) + options.set_caseless(true); + else if ( *flags == 'm' ) + options.set_multiline(true); + else if ( *flags == 'x' ) + options.set_extended(true); + flags++; + } + return options; + } +} + +//#define DEBUGMATCHER(x) cout << x << endl; +#define DEBUGMATCHER(x) + namespace mongo { - //#include "minilex.h" - //MiniLex minilex; - class Where { public: Where() { @@ -66,52 +85,61 @@ namespace mongo { where = 0; } - ElementMatcher::ElementMatcher( BSONElement _e , int _op ) : toMatch( _e ) , compareOp( _op ) { + ElementMatcher::ElementMatcher( BSONElement _e , int _op, bool _isNot ) : toMatch( _e ) , compareOp( _op ), isNot( _isNot ) { if ( _op == BSONObj::opMOD ){ - BSONObj o = _e.embeddedObject().firstElement().embeddedObject(); + BSONObj o = _e.embeddedObject(); mod = o["0"].numberInt(); modm = o["1"].numberInt(); uassert( 10073 , "mod can't be 0" , mod ); } else if ( _op == BSONObj::opTYPE ){ - type = (BSONType)(_e.embeddedObject().firstElement().numberInt()); + type = (BSONType)(_e.numberInt()); } else if ( _op == BSONObj::opELEM_MATCH ){ - BSONElement m = toMatch.embeddedObjectUserCheck().firstElement(); + BSONElement m = _e; uassert( 12517 , "$elemMatch needs an Object" , m.type() == Object ); subMatcher.reset( new Matcher( m.embeddedObject() ) ); } } - - ElementMatcher::~ElementMatcher(){ - } - - - -} // namespace mongo - -#include "pdfile.h" - -namespace { - inline pcrecpp::RE_Options flags2options(const char* flags){ - pcrecpp::RE_Options options; - options.set_utf8(true); - while ( flags && *flags ) { - if ( *flags == 'i' ) - options.set_caseless(true); - else if ( *flags == 'm' ) - options.set_multiline(true); - else if ( *flags == 'x' ) - options.set_extended(true); - flags++; + ElementMatcher::ElementMatcher( BSONElement _e , int _op , const BSONObj& array, bool _isNot ) + : toMatch( _e ) , compareOp( _op ), isNot( _isNot ) { + + myset.reset( new set<BSONElement,element_lt>() ); + + BSONObjIterator i( array ); + while ( i.more() ) { + BSONElement ie = i.next(); + if ( _op == BSONObj::opALL && ie.type() == Object && ie.embeddedObject().firstElement().getGtLtOp() == BSONObj::opELEM_MATCH ){ + shared_ptr<Matcher> s; + s.reset( new Matcher( ie.embeddedObject().firstElement().embeddedObjectUserCheck() ) ); + allMatchers.push_back( s ); + } else if ( ie.type() == RegEx ) { + if ( !myregex.get() ) { + myregex.reset( new vector< RegexMatcher >() ); + } + myregex->push_back( RegexMatcher() ); + RegexMatcher &rm = myregex->back(); + rm.re.reset( new pcrecpp::RE( ie.regex(), flags2options( ie.regexFlags() ) ) ); + rm.fieldName = 0; // no need for field name + rm.regex = ie.regex(); + rm.flags = ie.regexFlags(); + rm.isNot = false; + bool purePrefix; + string 
prefix = simpleRegex(rm.regex, rm.flags, &purePrefix); + if (purePrefix) + rm.prefix = prefix; + } else { + myset->insert(ie); + } } - return options; + + if ( allMatchers.size() ){ + uassert( 13020 , "with $all, can't mix $elemMatch and others" , myset->size() == 0 && !myregex.get()); + } + } -} - -namespace mongo { CoveredIndexMatcher::CoveredIndexMatcher(const BSONObj &jsobj, const BSONObj &indexKeyPattern) : _keyMatcher(jsobj.filterFieldsUndotted(indexKeyPattern, true), @@ -120,13 +148,18 @@ namespace mongo { { _needRecord = ! ( _docMatcher.keyMatch() && - _keyMatcher.jsobj.nFields() == _docMatcher.jsobj.nFields() + _keyMatcher.jsobj.nFields() == _docMatcher.jsobj.nFields() && + ! _keyMatcher.hasType( BSONObj::opEXISTS ) ); + } - bool CoveredIndexMatcher::matches(const BSONObj &key, const DiskLoc &recLoc ) { + bool CoveredIndexMatcher::matches(const BSONObj &key, const DiskLoc &recLoc , MatchDetails * details ) { + if ( details ) + details->reset(); + if ( _keyMatcher.keyMatch() ) { - if ( !_keyMatcher.matches(key) ) { + if ( !_keyMatcher.matches(key, details ) ){ return false; } } @@ -135,14 +168,128 @@ namespace mongo { return true; } - return _docMatcher.matches(recLoc.rec()); + if ( details ) + details->loadedObject = true; + + return _docMatcher.matches(recLoc.rec() , details ); } + void Matcher::addRegex(const char *fieldName, const char *regex, const char *flags, bool isNot){ + + if ( nRegex >= 4 ) { + out() << "ERROR: too many regexes in query" << endl; + } + else { + RegexMatcher& rm = regexs[nRegex]; + rm.re.reset( new pcrecpp::RE(regex, flags2options(flags)) ); + rm.fieldName = fieldName; + rm.regex = regex; + rm.flags = flags; + rm.isNot = isNot; + nRegex++; + + if (!isNot){ //TODO something smarter + bool purePrefix; + string prefix = simpleRegex(regex, flags, &purePrefix); + if (purePrefix) + rm.prefix = prefix; + } + } + } + + bool Matcher::addOp( const BSONElement &e, const BSONElement &fe, bool isNot, const char *& regex, const char *&flags ) { + const char *fn = fe.fieldName(); + int op = fe.getGtLtOp( -1 ); + if ( op == -1 ){ + if ( !isNot && fn[1] == 'r' && fn[2] == 'e' && fn[3] == 'f' && fn[4] == 0 ){ + return false; // { $ref : xxx } - treat as normal object + } + uassert( 10068 , (string)"invalid operator: " + fn , op != -1 ); + } + + switch ( op ){ + case BSONObj::GT: + case BSONObj::GTE: + case BSONObj::LT: + case BSONObj::LTE:{ + shared_ptr< BSONObjBuilder > b( new BSONObjBuilder() ); + _builders.push_back( b ); + b->appendAs(fe, e.fieldName()); + addBasic(b->done().firstElement(), op, isNot); + break; + } + case BSONObj::NE:{ + haveNeg = true; + shared_ptr< BSONObjBuilder > b( new BSONObjBuilder() ); + _builders.push_back( b ); + b->appendAs(fe, e.fieldName()); + addBasic(b->done().firstElement(), BSONObj::NE, isNot); + break; + } + case BSONObj::opALL: + all = true; + case BSONObj::opIN: + basics.push_back( ElementMatcher( e , op , fe.embeddedObject(), isNot ) ); + break; + case BSONObj::NIN: + haveNeg = true; + basics.push_back( ElementMatcher( e , op , fe.embeddedObject(), isNot ) ); + break; + case BSONObj::opMOD: + case BSONObj::opTYPE: + case BSONObj::opELEM_MATCH: { + shared_ptr< BSONObjBuilder > b( new BSONObjBuilder() ); + _builders.push_back( b ); + b->appendAs(fe, e.fieldName()); + // these are types where ElementMatcher has all the info + basics.push_back( ElementMatcher( b->done().firstElement() , op, isNot ) ); + break; + } + case BSONObj::opSIZE:{ + shared_ptr< BSONObjBuilder > b( new BSONObjBuilder() ); + _builders.push_back( b ); + 
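
The addRegex code above caches a literal prefix whenever simpleRegex (defined elsewhere in the tree) reports a pure-prefix pattern, i.e. an anchored literal like /^foo/ with no metacharacters and no case folding; matching such a pattern then reduces to a string prefix compare with no PCRE call on the hot path. A sketch of that equivalence (prefixMatches is illustrative only):

    #include <cstring>
    #include <string>

    // /^foo/ applied to a string is the same test as "starts with foo".
    static bool prefixMatches( const std::string& prefix , const char * candidate ){
        return std::strncmp( candidate , prefix.c_str() , prefix.size() ) == 0;
    }

    int main(){
        // prefix "foo": "foobar" matches, "fob" does not - no regex engine involved
        return ( prefixMatches( "foo" , "foobar" ) && ! prefixMatches( "foo" , "fob" ) ) ? 0 : 1;
    }
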
b->appendAs(fe, e.fieldName()); + addBasic(b->done().firstElement(), BSONObj::opSIZE, isNot); + haveSize = true; + break; + } + case BSONObj::opEXISTS:{ + shared_ptr< BSONObjBuilder > b( new BSONObjBuilder() ); + _builders.push_back( b ); + b->appendAs(fe, e.fieldName()); + addBasic(b->done().firstElement(), BSONObj::opEXISTS, isNot); + break; + } + case BSONObj::opREGEX:{ + uassert( 13032, "can't use $not with $regex, use BSON regex type instead", !isNot ); + if ( fe.type() == RegEx ){ + regex = fe.regex(); + flags = fe.regexFlags(); + } + else { + regex = fe.valuestrsafe(); + } + break; + } + case BSONObj::opOPTIONS:{ + uassert( 13029, "can't use $not with $options, use BSON regex type instead", !isNot ); + flags = fe.valuestrsafe(); + break; + } + case BSONObj::opNEAR: + case BSONObj::opWITHIN: + break; + default: + uassert( 10069 , (string)"BUG - can't operator for: " + fn , 0 ); + } + return true; + } + /* _jsobj - the query pattern */ Matcher::Matcher(const BSONObj &_jsobj, const BSONObj &constrainIndexKey) : - where(0), jsobj(_jsobj), haveSize(), all(), hasArray(0), _atomic(false), nRegex(0) { + where(0), jsobj(_jsobj), haveSize(), all(), hasArray(0), haveNeg(), _atomic(false), nRegex(0) { BSONObjIterator i(jsobj); while ( i.more() ) { @@ -171,15 +318,7 @@ namespace mongo { } if ( e.type() == RegEx ) { - if ( nRegex >= 4 ) { - out() << "ERROR: too many regexes in query" << endl; - } - else { - RegexMatcher& rm = regexs[nRegex]; - rm.re = new pcrecpp::RE(e.regex(), flags2options(e.regexFlags())); - rm.fieldName = e.fieldName(); - nRegex++; - } + addRegex( e.fieldName(), e.regex(), e.regexFlags() ); continue; } @@ -200,75 +339,31 @@ namespace mongo { const char *fn = fe.fieldName(); if ( fn[0] == '$' && fn[1] ) { - int op = fe.getGtLtOp( -1 ); - - if ( op == -1 ){ - if ( fn[1] == 'r' && fn[2] == 'e' && fn[3] == 'f' && fn[4] == 0 ){ - break; // { $ref : xxx } - treat as normal object - } - uassert( 10068 , (string)"invalid operator: " + fn , op != -1 ); - } - isOperator = true; - switch ( op ){ - case BSONObj::GT: - case BSONObj::GTE: - case BSONObj::LT: - case BSONObj::LTE:{ - shared_ptr< BSONObjBuilder > b( new BSONObjBuilder() ); - _builders.push_back( b ); - b->appendAs(fe, e.fieldName()); - addBasic(b->done().firstElement(), op); - isOperator = true; - break; - } - case BSONObj::NE:{ - shared_ptr< BSONObjBuilder > b( new BSONObjBuilder() ); - _builders.push_back( b ); - b->appendAs(fe, e.fieldName()); - addBasic(b->done().firstElement(), BSONObj::NE); - break; - } - case BSONObj::opALL: - all = true; - case BSONObj::opIN: - case BSONObj::NIN: - basics.push_back( ElementMatcher( e , op , fe.embeddedObject() ) ); - break; - case BSONObj::opMOD: - case BSONObj::opTYPE: - case BSONObj::opELEM_MATCH: - // these are types where ElementMatcher has all the info - basics.push_back( ElementMatcher( e , op ) ); - break; - case BSONObj::opSIZE:{ - shared_ptr< BSONObjBuilder > b( new BSONObjBuilder() ); - _builders.push_back( b ); - b->appendAs(fe, e.fieldName()); - addBasic(b->done().firstElement(), BSONObj::opSIZE); - haveSize = true; - break; - } - case BSONObj::opEXISTS:{ - shared_ptr< BSONObjBuilder > b( new BSONObjBuilder() ); - _builders.push_back( b ); - b->appendAs(fe, e.fieldName()); - addBasic(b->done().firstElement(), BSONObj::opEXISTS); - break; - } - case BSONObj::opREGEX:{ - regex = fe.valuestrsafe(); - break; - } - case BSONObj::opOPTIONS:{ - flags = fe.valuestrsafe(); - break; - } - default: - uassert( 10069 , (string)"BUG - can't operator for: " + fn , 0 ); + if ( fn[1] == 
'n' && fn[2] == 'o' && fn[3] == 't' && fn[4] == 0 ) { + haveNeg = true; + switch( fe.type() ) { + case Object: { + BSONObjIterator k( fe.embeddedObject() ); + uassert( 13030, "$not cannot be empty", k.more() ); + while( k.more() ) { + addOp( e, k.next(), true, regex, flags ); + } + break; + } + case RegEx: + addRegex( e.fieldName(), fe.regex(), fe.regexFlags(), true ); + break; + default: + uassert( 13031, "invalid use of $not", false ); + } + } else { + if ( !addOp( e, fe, false, regex, flags ) ) { + isOperator = false; + break; + } } - } else { isOperator = false; @@ -276,14 +371,7 @@ namespace mongo { } } if (regex){ - if ( nRegex >= 4 ) { - out() << "ERROR: too many regexes in query" << endl; - } else { - RegexMatcher& rm = regexs[nRegex]; - rm.re = new pcrecpp::RE(regex, flags2options(flags)); - rm.fieldName = e.fieldName(); - nRegex++; - } + addRegex(e.fieldName(), regex, flags); } if ( isOperator ) continue; @@ -298,21 +386,46 @@ namespace mongo { } // normal, simple case e.g. { a : "foo" } - addBasic(e, BSONObj::Equality); + addBasic(e, BSONObj::Equality, false); } constrainIndexKey_ = constrainIndexKey; } - + + inline bool regexMatches(const RegexMatcher& rm, const BSONElement& e) { + switch (e.type()){ + case String: + case Symbol: + if (rm.prefix.empty()) + return rm.re->PartialMatch(e.valuestr()); + else + return !strncmp(e.valuestr(), rm.prefix.c_str(), rm.prefix.size()); + case RegEx: + return !strcmp(rm.regex, e.regex()) && !strcmp(rm.flags, e.regexFlags()); + default: + return false; + } + } + inline int Matcher::valuesMatch(const BSONElement& l, const BSONElement& r, int op, const ElementMatcher& bm) { assert( op != BSONObj::NE && op != BSONObj::NIN ); - if ( op == BSONObj::Equality ) + if ( op == BSONObj::Equality ) { return l.valuesEqual(r); + } if ( op == BSONObj::opIN ) { // { $in : [1,2,3] } - return bm.myset->count(l); + int count = bm.myset->count(l); + if ( count ) + return count; + if ( bm.myregex.get() ) { + for( vector<RegexMatcher>::const_iterator i = bm.myregex->begin(); i != bm.myregex->end(); ++i ) { + if ( regexMatches( *i, l ) ) { + return true; + } + } + } } if ( op == BSONObj::opSIZE ) { @@ -350,8 +463,8 @@ namespace mongo { return (op & z); } - int Matcher::matchesNe(const char *fieldName, const BSONElement &toMatch, const BSONObj &obj, const ElementMatcher& bm ) { - int ret = matchesDotted( fieldName, toMatch, obj, BSONObj::Equality, bm ); + int Matcher::matchesNe(const char *fieldName, const BSONElement &toMatch, const BSONObj &obj, const ElementMatcher& bm , MatchDetails * details ) { + int ret = matchesDotted( fieldName, toMatch, obj, BSONObj::Equality, bm , false , details ); if ( bm.toMatch.type() != jstNULL ) return ( ret <= 0 ) ? 1 : 0; else @@ -383,16 +496,44 @@ namespace mongo { 0 missing element 1 match */ - int Matcher::matchesDotted(const char *fieldName, const BSONElement& toMatch, const BSONObj& obj, int compareOp, const ElementMatcher& bm , bool isArr) { - + int Matcher::matchesDotted(const char *fieldName, const BSONElement& toMatch, const BSONObj& obj, int compareOp, const ElementMatcher& em , bool isArr, MatchDetails * details ) { + DEBUGMATCHER( "\t matchesDotted : " << fieldName << " hasDetails: " << ( details ? 
"yes" : "no" ) ); if ( compareOp == BSONObj::opALL ) { - if ( bm.myset->size() == 0 ) + + if ( em.allMatchers.size() ){ + BSONElement e = obj.getFieldDotted( fieldName ); + uassert( 13021 , "$all/$elemMatch needs to be applied to array" , e.type() == Array ); + + for ( unsigned i=0; i<em.allMatchers.size(); i++ ){ + bool found = false; + BSONObjIterator x( e.embeddedObject() ); + while ( x.more() ){ + BSONElement f = x.next(); + + if ( f.type() != Object ) + continue; + if ( em.allMatchers[i]->matches( f.embeddedObject() ) ){ + found = true; + break; + } + } + + if ( ! found ) + return -1; + } + + return 1; + } + + if ( em.myset->size() == 0 && !em.myregex.get() ) return -1; // is this desired? + BSONObjSetDefaultOrder actualKeys; IndexSpec( BSON( fieldName << 1 ) ).getKeys( obj, actualKeys ); if ( actualKeys.size() == 0 ) return 0; - for( set< BSONElement, element_lt >::const_iterator i = bm.myset->begin(); i != bm.myset->end(); ++i ) { + + for( set< BSONElement, element_lt >::const_iterator i = em.myset->begin(); i != em.myset->end(); ++i ) { // ignore nulls if ( i->type() == jstNULL ) continue; @@ -402,17 +543,44 @@ namespace mongo { if ( !actualKeys.count( b.done() ) ) return -1; } - return 1; - } + if ( !em.myregex.get() ) + return 1; + + for( vector< RegexMatcher >::const_iterator i = em.myregex->begin(); i != em.myregex->end(); ++i ) { + bool match = false; + for( BSONObjSetDefaultOrder::const_iterator j = actualKeys.begin(); j != actualKeys.end(); ++j ) { + if ( regexMatches( *i, j->firstElement() ) ) { + match = true; + break; + } + } + if ( !match ) + return -1; + } + + return 1; + } // end opALL + if ( compareOp == BSONObj::NE ) - return matchesNe( fieldName, toMatch, obj, bm ); + return matchesNe( fieldName, toMatch, obj, em , details ); if ( compareOp == BSONObj::NIN ) { - for( set<BSONElement,element_lt>::const_iterator i = bm.myset->begin(); i != bm.myset->end(); ++i ) { - int ret = matchesNe( fieldName, *i, obj, bm ); + for( set<BSONElement,element_lt>::const_iterator i = em.myset->begin(); i != em.myset->end(); ++i ) { + int ret = matchesNe( fieldName, *i, obj, em , details ); if ( ret != 1 ) return ret; } + if ( em.myregex.get() ) { + BSONElementSet s; + obj.getFieldsDotted( fieldName, s ); + for( vector<RegexMatcher>::const_iterator i = em.myregex->begin(); i != em.myregex->end(); ++i ) { + for( BSONElementSet::const_iterator j = s.begin(); j != s.end(); ++j ) { + if ( regexMatches( *i, *j ) ) { + return -1; + } + } + } + } return 1; } @@ -420,49 +588,73 @@ namespace mongo { bool indexed = !constrainIndexKey_.isEmpty(); if ( indexed ) { e = obj.getFieldUsingIndexNames(fieldName, constrainIndexKey_); - assert( !e.eoo() ); + if( e.eoo() ){ + cout << "obj: " << obj << endl; + cout << "fieldName: " << fieldName << endl; + cout << "constrainIndexKey_: " << constrainIndexKey_ << endl; + assert( !e.eoo() ); + } } else { + + const char *p = strchr(fieldName, '.'); + if ( p ) { + string left(fieldName, p-fieldName); + + BSONElement se = obj.getField(left.c_str()); + if ( se.eoo() ) + ; + else if ( se.type() != Object && se.type() != Array ) + ; + else { + BSONObj eo = se.embeddedObject(); + return matchesDotted(p+1, toMatch, eo, compareOp, em, se.type() == Array , details ); + } + } + if ( isArr ) { + DEBUGMATCHER( "\t\t isArr 1 : obj : " << obj ); BSONObjIterator ai(obj); bool found = false; while ( ai.moreWithEOO() ) { BSONElement z = ai.next(); + + if( strcmp(z.fieldName(),fieldName) == 0 && valuesMatch(z, toMatch, compareOp, em) ) { + // "field.<n>" array notation was used + 
if ( details ) + details->elemMatchKey = z.fieldName(); + return 1; + } + if ( z.type() == Object ) { BSONObj eo = z.embeddedObject(); - int cmp = matchesDotted(fieldName, toMatch, eo, compareOp, bm, false); + int cmp = matchesDotted(fieldName, toMatch, eo, compareOp, em, false, details ); if ( cmp > 0 ) { + if ( details ) + details->elemMatchKey = z.fieldName(); return 1; - } else if ( cmp < 0 ) { + } + else if ( cmp < 0 ) { found = true; } } } - return found ? -1 : retMissing( bm ); + return found ? -1 : retMissing( em ); } - const char *p = strchr(fieldName, '.'); - if ( p ) { - string left(fieldName, p-fieldName); - BSONElement se = obj.getField(left.c_str()); - if ( se.eoo() ) - return retMissing( bm ); - if ( se.type() != Object && se.type() != Array ) - return retMissing( bm ); - - BSONObj eo = se.embeddedObject(); - return matchesDotted(p+1, toMatch, eo, compareOp, bm, se.type() == Array); - } else { + if( p ) { + return retMissing( em ); + } + else { e = obj.getField(fieldName); } } if ( compareOp == BSONObj::opEXISTS ) { - return ( e.eoo() ^ toMatch.boolean() ) ? 1 : -1; + return ( e.eoo() ^ ( toMatch.boolean() ^ em.isNot ) ) ? 1 : -1; } else if ( ( e.type() != Array || indexed || compareOp == BSONObj::opSIZE ) && - valuesMatch(e, toMatch, compareOp, bm ) ) { + valuesMatch(e, toMatch, compareOp, em ) ) { return 1; } else if ( e.type() == Array && compareOp != BSONObj::opSIZE ) { - BSONObjIterator ai(e.embeddedObject()); while ( ai.moreWithEOO() ) { @@ -470,18 +662,23 @@ namespace mongo { if ( compareOp == BSONObj::opELEM_MATCH ){ // SERVER-377 - if ( z.type() == Object && bm.subMatcher->matches( z.embeddedObject() ) ) + if ( z.type() == Object && em.subMatcher->matches( z.embeddedObject() ) ){ + if ( details ) + details->elemMatchKey = z.fieldName(); return 1; + } } else { - if ( valuesMatch( z, toMatch, compareOp, bm) ) { + if ( valuesMatch( z, toMatch, compareOp, em) ) { + if ( details ) + details->elemMatchKey = z.fieldName(); return 1; } } } - if ( compareOp == BSONObj::Equality && e.woCompare( toMatch ) == 0 ){ + if ( compareOp == BSONObj::Equality && e.woCompare( toMatch , false ) == 0 ){ // match an entire array to itself return 1; } @@ -496,27 +693,9 @@ namespace mongo { extern int dump; - inline bool regexMatches(RegexMatcher& rm, const BSONElement& e) { - char buf[64]; - const char *p = buf; - if ( e.type() == String || e.type() == Symbol ) - p = e.valuestr(); - else if ( e.isNumber() ) { - sprintf(buf, "%f", e.number()); - } - else if ( e.type() == Date ) { - Date_t d = e.date(); - time_t t = (d.millis/1000); - time_t_to_String(t, buf); - } - else - return false; - return rm.re->PartialMatch(p); - } - /* See if an object matches the query. */ - bool Matcher::matches(const BSONObj& jsobj ) { + bool Matcher::matches(const BSONObj& jsobj , MatchDetails * details ) { /* assuming there is usually only one thing to match. if more this could be slow sometimes. */ @@ -525,17 +704,21 @@ namespace mongo { ElementMatcher& bm = basics[i]; BSONElement& m = bm.toMatch; // -1=mismatch. 0=missing element. 
1=match - int cmp = matchesDotted(m.fieldName(), m, jsobj, bm.compareOp, bm ); + int cmp = matchesDotted(m.fieldName(), m, jsobj, bm.compareOp, bm , false , details ); + if ( bm.compareOp != BSONObj::opEXISTS && bm.isNot ) + cmp = -cmp; if ( cmp < 0 ) return false; if ( cmp == 0 ) { /* missing is ok iff we were looking for null */ if ( m.type() == jstNULL || m.type() == Undefined ) { - if ( bm.compareOp == BSONObj::NE ) { + if ( ( bm.compareOp == BSONObj::NE ) ^ bm.isNot ) { return false; } } else { - return false; + if ( !bm.isNot ) { + return false; + } } } } @@ -554,7 +737,7 @@ namespace mongo { for( BSONElementSet::const_iterator i = s.begin(); i != s.end(); ++i ) if ( regexMatches(rm, *i) ) match = true; - if ( !match ) + if ( !match ^ rm.isNot ) return false; } @@ -590,6 +773,13 @@ namespace mongo { return true; } + bool Matcher::hasType( BSONObj::MatchType type ) const { + for ( unsigned i=0; i<basics.size() ; i++ ) + if ( basics[i].compareOp == type ) + return true; + return false; + } + struct JSObj1 js1; #pragma pack(1) diff --git a/db/matcher.h b/db/matcher.h index f1609f9..3839b68 100644 --- a/db/matcher.h +++ b/db/matcher.h @@ -31,13 +31,12 @@ namespace mongo { class RegexMatcher { public: const char *fieldName; - pcrecpp::RE *re; - RegexMatcher() { - re = 0; - } - ~RegexMatcher() { - delete re; - } + const char *regex; + const char *flags; + string prefix; + shared_ptr< pcrecpp::RE > re; + bool isNot; + RegexMatcher() : isNot() {} }; struct element_lt @@ -58,24 +57,17 @@ namespace mongo { ElementMatcher() { } - ElementMatcher( BSONElement _e , int _op ); + ElementMatcher( BSONElement _e , int _op, bool _isNot ); - ElementMatcher( BSONElement _e , int _op , const BSONObj& array ) : toMatch( _e ) , compareOp( _op ) { - - myset.reset( new set<BSONElement,element_lt>() ); - - BSONObjIterator i( array ); - while ( i.more() ) { - BSONElement ie = i.next(); - myset->insert(ie); - } - } + ElementMatcher( BSONElement _e , int _op , const BSONObj& array, bool _isNot ); - ~ElementMatcher(); + ~ElementMatcher() { } BSONElement toMatch; int compareOp; + bool isNot; shared_ptr< set<BSONElement,element_lt> > myset; + shared_ptr< vector<RegexMatcher> > myregex; // these are for specific operators int mod; @@ -83,12 +75,34 @@ namespace mongo { BSONType type; shared_ptr<Matcher> subMatcher; + + vector< shared_ptr<Matcher> > allMatchers; }; -// SQL where clause equivalent - class Where; + class Where; // used for $where javascript eval class DiskLoc; + struct MatchDetails { + MatchDetails(){ + reset(); + } + + void reset(){ + loadedObject = false; + elemMatchKey = 0; + } + + string toString() const { + stringstream ss; + ss << "loadedObject: " << loadedObject << " "; + ss << "elemMatchKey: " << ( elemMatchKey ? elemMatchKey : "NULL" ) << " "; + return ss.str(); + } + + bool loadedObject; + const char * elemMatchKey; // warning, this may go out of scope if matched object does + }; + /* Match BSON objects against a query pattern. e.g. 
@@ -107,12 +121,12 @@ namespace mongo { int matchesDotted( const char *fieldName, const BSONElement& toMatch, const BSONObj& obj, - int compareOp, const ElementMatcher& bm, bool isArr = false); + int compareOp, const ElementMatcher& bm, bool isArr , MatchDetails * details ); int matchesNe( const char *fieldName, const BSONElement &toMatch, const BSONObj &obj, - const ElementMatcher&bm); + const ElementMatcher&bm, MatchDetails * details ); public: static int opDirection(int op) { @@ -125,30 +139,34 @@ namespace mongo { ~Matcher(); - bool matches(const BSONObj& j); + bool matches(const BSONObj& j, MatchDetails * details = 0 ); - bool keyMatch() const { return !all && !haveSize && !hasArray; } + bool keyMatch() const { return !all && !haveSize && !hasArray && !haveNeg; } bool atomic() const { return _atomic; } + bool hasType( BSONObj::MatchType type ) const; private: - void addBasic(const BSONElement &e, int c) { + void addBasic(const BSONElement &e, int c, bool isNot) { // TODO May want to selectively ignore these element types based on op type. if ( e.type() == MinKey || e.type() == MaxKey ) return; - basics.push_back( ElementMatcher( e , c ) ); + basics.push_back( ElementMatcher( e , c, isNot ) ); } + void addRegex(const char *fieldName, const char *regex, const char *flags, bool isNot = false); + bool addOp( const BSONElement &e, const BSONElement &fe, bool isNot, const char *& regex, const char *&flags ); + int valuesMatch(const BSONElement& l, const BSONElement& r, int op, const ElementMatcher& bm); Where *where; // set if query uses $where BSONObj jsobj; // the query pattern. e.g., { name: "joe" } BSONObj constrainIndexKey_; vector<ElementMatcher> basics; -// int n; // # of basicmatcher items bool haveSize; bool all; bool hasArray; + bool haveNeg; /* $atomic - if true, a multi document operation (some removes, updates) should be done atomically. in that case, we do not yield - @@ -171,7 +189,7 @@ namespace mongo { public: CoveredIndexMatcher(const BSONObj &pattern, const BSONObj &indexKeyPattern); bool matches(const BSONObj &o){ return _docMatcher.matches( o ); } - bool matches(const BSONObj &key, const DiskLoc &recLoc); + bool matches(const BSONObj &key, const DiskLoc &recLoc , MatchDetails * details = 0 ); bool needRecord(){ return _needRecord; } Matcher& docMatcher() { return _docMatcher; } diff --git a/db/module.cpp b/db/module.cpp index d218fe6..78f8f79 100644 --- a/db/module.cpp +++ b/db/module.cpp @@ -1,4 +1,20 @@ // module.cpp +/* + * Copyright (C) 2010 10gen Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License, version 3, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + #include "stdafx.h" #include "module.h" diff --git a/db/modules/mms.cpp b/db/modules/mms.cpp index 9c00e60..248a4e4 100644 --- a/db/modules/mms.cpp +++ b/db/modules/mms.cpp @@ -1,4 +1,20 @@ // mms.cpp +/* + * Copyright (C) 2010 10gen Inc. 
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
#include "stdafx.h"
#include "../db.h"
@@ -6,6 +22,7 @@
#include "../module.h"
#include "../../util/httpclient.h"
#include "../../util/background.h"
+#include "../commands.h"
namespace po = boost::program_options;
@@ -13,24 +30,26 @@ namespace mongo {
/** Mongo Monitoring Service
if enabled, this runs in the background and pings mms
- */
+ */
class MMS : public BackgroundJob , Module {
public:
MMS()
- : Module( "mms" ) , _baseurl( "http://mms.10gen.com/ping/" ) ,
+ : Module( "mms" ) , _baseurl( "" ) ,
_secsToSleep(1) , _token( "" ) , _name( "" ) {
add_options()
+ ( "mms-url" , po::value<string>()->default_value("http://mms.10gen.com/ping") , "url for mongo monitoring server" )
( "mms-token" , po::value<string>() , "account token for mongo monitoring server" )
- ( "mms-name" , po::value<string>() , "server name mongo monitoring server" )
- ( "mms-interval" , po::value<int>()->default_value(30) , "ping interval for mongo monitoring server" )
+ ( "mms-name" , po::value<string>() , "server name for mongo monitoring server" )
+ ( "mms-interval" , po::value<int>()->default_value(30) , "ping interval (in seconds) for mongo monitoring server" )
;
}
~MMS(){}
-
+
void config( program_options::variables_map& params ){
+ _baseurl = params["mms-url"].as<string>();
if ( params.count( "mms-token" ) ){
_token = params["mms-token"].as<string>();
}
@@ -41,87 +60,94 @@ namespace mongo {
}
void run(){
- if ( _token.size() == 0 && _name.size() == 0 ){
- log(1) << "mms not configured" << endl;
- return;
- }
-
- if ( _token.size() == 0 ){
- log() << "no token for mms - not running" << endl;
- return;
- }
-
- if ( _name.size() == 0 ){
- log() << "no name for mms - not running" << endl;
- return;
- }
-
- log() << "mms monitor staring... token:" << _token << " name:" << _name << " interval: " << _secsToSleep << endl;
-
- unsigned long long lastTime = 0;
- unsigned long long lastLockTime = 0;
-
- while ( ! 
inShutdown() ){ - sleepsecs( _secsToSleep ); - - stringstream url; - url << _baseurl << _token << "?"; - url << "monitor_name=" << _name << "&"; - url << "version=" << versionString << "&"; - url << "git_hash=" << gitVersion() << "&"; + if ( _token.size() == 0 && _name.size() == 0 ){ + log(1) << "mms not configured" << endl; + return; + } - { //percent_locked - unsigned long long time = curTimeMicros64(); - unsigned long long start , lock; - dbMutex.info().getTimingInfo( start , lock ); - if ( lastTime ){ - double timeDiff = (double) (time - lastTime); - double lockDiff = (double) (lock - lastLockTime); - url << "percent_locked=" << (int)ceil( 100 * ( lockDiff / timeDiff ) ) << "&"; - } - lastTime = time; - lastLockTime = lock; + if ( _token.size() == 0 ){ + log() << "no token for mms - not running" << endl; + return; } - - vector< string > dbNames; - getDatabaseNames( dbNames ); - boost::intmax_t totalSize = 0; - for ( vector< string >::iterator i = dbNames.begin(); i != dbNames.end(); ++i ) { - boost::intmax_t size = dbSize( i->c_str() ); - totalSize += size; + + if ( _name.size() == 0 ){ + log() << "no name for mms - not running" << endl; + return; } - url << "data_size=" << totalSize / ( 1024 * 1024 ) << "&"; - + log() << "mms monitor starting... token:" << _token << " name:" << _name << " interval: " << _secsToSleep << endl; + Client::initThread( "mms" ); + Client& c = cc(); - /* TODO: - message_operations - update_operations - insert_operations - get_more_operations - delete_operations - kill_cursors_operations - */ - - log(1) << "mms url: " << url.str() << endl; + // TODO: using direct client is bad, but easy for now - try { - HttpClient c; - map<string,string> headers; - stringstream ss; - int rc = c.get( url.str() , headers , ss ); - log(1) << "\t response code: " << rc << endl; - if ( rc != 200 ){ - log() << "mms error response code:" << rc << endl; - log(1) << "mms error body:" << ss.str() << endl; + while ( ! inShutdown() ){ + sleepsecs( _secsToSleep ); + + try { + stringstream url; + url << _baseurl << "?" + << "token=" << _token << "&" + << "name=" << _name << "&" + << "ts=" << time(0) + ; + + BSONObjBuilder bb; + // duplicated so the post has everything + bb.append( "token" , _token ); + bb.append( "name" , _name ); + bb.appendDate( "ts" , jsTime() ); + + // any commands + _add( bb , "buildinfo" ); + _add( bb , "serverStatus" ); + + BSONObj postData = bb.obj(); + + log(1) << "mms url: " << url.str() << "\n\t post: " << postData << endl;; + + HttpClient c; + HttpClient::Result r; + int rc = c.post( url.str() , postData.jsonString() , &r ); + log(1) << "\t response code: " << rc << endl; + if ( rc != 200 ){ + log() << "mms error response code:" << rc << endl; + log(1) << "mms error body:" << r.getEntireResponse() << endl; + } + } + catch ( std::exception& e ){ + log() << "mms exception: " << e.what() << endl; } } - catch ( std::exception& e ){ - log() << "mms get exception: " << e.what() << endl; - } + + c.shutdown(); } + + void _add( BSONObjBuilder& postData , const char* cmd ){ + Command * c = Command::findCommand( cmd ); + if ( ! c ){ + log() << "MMS can't find command: " << cmd << endl; + postData.append( cmd , "can't find command" ); + return; + } + + if ( c->locktype() ){ + log() << "MMS can only use noLocking commands not: " << cmd << endl; + postData.append( cmd , "not noLocking" ); + return; + } + + BSONObj co = BSON( cmd << 1 ); + + string errmsg; + BSONObjBuilder sub; + if ( !
c->run( "admin.$cmd" , co , errmsg , sub , false ) ) + postData.append( cmd , errmsg ); + else + postData.append( cmd , sub.obj() ); } + void init(){ go(); } @@ -135,8 +161,8 @@ namespace mongo { string _token; string _name; - - } /* mms */; + + } /*mms*/ ; } @@ -28,6 +28,8 @@ namespace mongo { namespace mr { + typedef vector<BSONObj> BSONList; + class MyCmp { public: MyCmp(){} @@ -38,48 +40,76 @@ namespace mongo { typedef pair<BSONObj,BSONObj> Data; //typedef list< Data > InMemory; - typedef map< BSONObj,list<BSONObj>,MyCmp > InMemory; + typedef map< BSONObj,BSONList,MyCmp > InMemory; - BSONObj reduceValues( list<BSONObj>& values , Scope * s , ScriptingFunction reduce , bool final , ScriptingFunction finalize ){ + BSONObj reduceValues( BSONList& values , Scope * s , ScriptingFunction reduce , bool final , ScriptingFunction finalize ){ uassert( 10074 , "need values" , values.size() ); int sizeEstimate = ( values.size() * values.begin()->getField( "value" ).size() ) + 128; BSONObj key; BSONObjBuilder reduceArgs( sizeEstimate ); - - BSONObjBuilder valueBuilder( sizeEstimate ); - int n = 0; - for ( list<BSONObj>::iterator i=values.begin(); i!=values.end(); i++){ - BSONObj o = *i; - BSONObjIterator j(o); + BSONArrayBuilder * valueBuilder = 0; + + int sizeSoFar = 0; + unsigned n = 0; + for ( ; n<values.size(); n++ ){ + BSONObjIterator j(values[n]); BSONElement keyE = j.next(); if ( n == 0 ){ reduceArgs.append( keyE ); - BSONObjBuilder temp; - temp.append( keyE ); - key = temp.obj(); + key = keyE.wrap(); + valueBuilder = new BSONArrayBuilder( reduceArgs.subarrayStart( "values" ) ); + sizeSoFar = 5 + keyE.size(); } - valueBuilder.appendAs( j.next() , BSONObjBuilder::numStr( n++ ).c_str() ); + + BSONElement ee = j.next(); + + uassert( 13070 , "value too large to reduce" , ee.size() < ( 2 * 1024 * 1024 ) ); + + if ( sizeSoFar + ee.size() > ( 4 * 1024 * 1024 ) ){ + assert( n > 1 ); // if not, inf. loop + break; + } + + valueBuilder->append( ee ); + sizeSoFar += ee.size(); } - - reduceArgs.appendArray( "values" , valueBuilder.obj() ); + assert(valueBuilder); + valueBuilder->done(); + delete valueBuilder; BSONObj args = reduceArgs.obj(); - + s->invokeSafe( reduce , args ); if ( s->type( "return" ) == Array ){ uassert( 10075 , "reduce -> multiple not supported yet",0); return BSONObj(); } + + int endSizeEstimate = key.objsize() + ( args.objsize() / values.size() ); + + if ( n < values.size() ){ + BSONList x; + for ( ; n < values.size(); n++ ){ + x.push_back( values[n] ); + } + BSONObjBuilder temp( endSizeEstimate ); + temp.append( key.firstElement() ); + s->append( temp , "1" , "return" ); + x.push_back( temp.obj() ); + return reduceValues( x , s , reduce , final , finalize ); + } + + if ( finalize ){ - BSONObjBuilder b; + BSONObjBuilder b(endSizeEstimate); b.appendAs( key.firstElement() , "_id" ); s->append( b , "value" , "return" ); s->invokeSafe( finalize , b.obj() ); } - BSONObjBuilder b; + BSONObjBuilder b(endSizeEstimate); b.appendAs( key.firstElement() , final ? "_id" : "0" ); s->append( b , final ? "value" : "1" , "return" ); return b.obj(); @@ -108,8 +138,12 @@ namespace mongo { if ( !
keeptemp && markAsTemp ) cc().addTempCollection( tempLong ); - if ( cmdObj["out"].type() == String ) + replicate = keeptemp; + + if ( cmdObj["out"].type() == String ){ finalShort = cmdObj["out"].valuestr(); + replicate = true; + } else finalShort = tempShort; @@ -123,8 +157,10 @@ namespace mongo { if ( cmdObj["finalize"].type() ){ finalizeCode = cmdObj["finalize"].ascode(); } + checkCodeWScope( "map" , cmdObj ); + checkCodeWScope( "reduce" , cmdObj ); + checkCodeWScope( "finalize" , cmdObj ); - if ( cmdObj["mapparams"].type() == Array ){ mapparams = cmdObj["mapparams"].embeddedObjectUserCheck(); } @@ -151,6 +187,14 @@ namespace mongo { } } + void checkCodeWScope( const char * field , const BSONObj& o ){ + BSONElement e = o[field]; + if ( e.type() != CodeWScope ) + return; + BSONObj x = e.codeWScopeObject(); + uassert( 13035 , (string)"can't use CodeWScope with map/reduce function: " + field , x.isEmpty() ); + } + /** @return number objects in collection */ @@ -171,6 +215,7 @@ namespace mongo { // options bool verbose; bool keeptemp; + bool replicate; // query options @@ -224,12 +269,13 @@ namespace mongo { db.dropCollection( setup.incLong ); writelock l( setup.incLong ); + Client::Context ctx( setup.incLong ); string err; assert( userCreateNS( setup.incLong.c_str() , BSON( "autoIndexId" << 0 ) , err , false ) ); } - void finalReduce( list<BSONObj>& values ){ + void finalReduce( BSONList& values ){ if ( values.size() == 0 ) return; @@ -237,7 +283,11 @@ namespace mongo { BSONObj res = reduceValues( values , scope.get() , reduce , 1 , finalize ); writelock l( setup.tempLong ); - theDataFileMgr.insertAndLog( setup.tempLong.c_str() , res , false ); + Client::Context ctx( setup.incLong ); + if ( setup.replicate ) + theDataFileMgr.insertAndLog( setup.tempLong.c_str() , res , false ); + else + theDataFileMgr.insert( setup.tempLong.c_str() , res , false ); } @@ -272,7 +322,7 @@ namespace mongo { for ( InMemory::iterator i=old->begin(); i!=old->end(); i++ ){ BSONObj key = i->first; - list<BSONObj>& all = i->second; + BSONList& all = i->second; if ( all.size() == 1 ){ // this key has low cardinality, so just write to db @@ -291,13 +341,14 @@ namespace mongo { void dump(){ writelock l(_state.setup.incLong); + Client::Context ctx(_state.setup.incLong); for ( InMemory::iterator i=_temp->begin(); i!=_temp->end(); i++ ){ - list<BSONObj>& all = i->second; + BSONList& all = i->second; if ( all.size() < 1 ) continue; - for ( list<BSONObj>::iterator j=all.begin(); j!=all.end(); j++ ) + for ( BSONList::iterator j=all.begin(); j!=all.end(); j++ ) write( *j ); } _temp->clear(); @@ -306,7 +357,7 @@ namespace mongo { } void insert( const BSONObj& a ){ - list<BSONObj>& all = (*_temp)[a]; + BSONList& all = (*_temp)[a]; all.push_back( a ); _size += a.objsize() + 16; } @@ -343,7 +394,8 @@ namespace mongo { boost::thread_specific_ptr<MRTL> _tlmr; BSONObj fast_emit( const BSONObj& args ){ - uassert( 10077 , "fast_emit takes 2 args" , args.nFields() == 2 ); + uassert( 10077 , "fast_emit takes 2 args" , args.nFields() == 2 ); + uassert( 13069 , "an emit can't be more than 2mb" , args.objsize() < ( 2 * 1024 * 1024 ) ); _tlmr->insert( args ); _tlmr->numEmits++; return BSONObj(); @@ -357,11 +409,14 @@ namespace mongo { virtual void help( stringstream &help ) const { help << "see http://www.mongodb.org/display/DOCS/MapReduce"; } - + virtual LockType locktype(){ return WRITE; } // TODO, READ? 
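The reworked reduceValues() above never hands the JS reduce function more than roughly 4MB of values in one call (and fast_emit rejects any single emit over 2MB): it batches values up to the cap, reduces the batch, then folds the partial result back in with the unconsumed values and recurses. A minimal sketch of that pattern, assuming a hypothetical Value type and a reduceOnce() stand-in for the scripting-engine call:

#include <cassert>
#include <cstddef>
#include <vector>

struct Value { std::size_t size; };                  // stand-in for one BSON "value" element
typedef std::vector<Value> ValueList;

// stand-in for invoking the user's JS reduce on one batch; the real code
// calls s->invokeSafe( reduce , args )
static Value reduceOnce( const Value& key , const ValueList& vals ) {
    Value out; out.size = vals.empty() ? key.size : vals[0].size;
    return out;
}

static Value reduceInChunks( const Value& key , const ValueList& values ) {
    const std::size_t maxBatchBytes = 4 * 1024 * 1024;   // mirrors the 4MB cap above
    ValueList batch;
    std::size_t bytes = 0 , n = 0;
    for ( ; n < values.size(); n++ ) {
        assert( values[n].size < 2 * 1024 * 1024 );      // mirrors the per-value 2MB uassert
        if ( bytes + values[n].size > maxBatchBytes ) {
            assert( n > 1 );                             // otherwise we would recurse forever
            break;
        }
        batch.push_back( values[n] );
        bytes += values[n].size;
    }
    Value partial = reduceOnce( key , batch );
    if ( n == values.size() )
        return partial;                                  // everything fit in one pass
    ValueList rest( values.begin() + n , values.end() );
    rest.push_back( partial );                           // re-reduce remainder plus partial result
    return reduceInChunks( key , rest );
}

This batching is also why a map/reduce reduce function has to be written so it can accept its own output as one of its input values.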
bool run(const char *dbname, BSONObj& cmd, string& errmsg, BSONObjBuilder& result, bool fromRepl ){ Timer t; Client::GodScope cg; - MRSetup mr( cc().database()->name , cmd ); + Client& client = cc(); + CurOp * op = client.curop(); + + MRSetup mr( client.database()->name , cmd ); log(1) << "mr ns: " << mr.ns << endl; @@ -385,7 +440,7 @@ namespace mongo { MRTL * mrtl = new MRTL( state ); _tlmr.reset( mrtl ); - ProgressMeter pm( db.count( mr.ns , mr.filter ) ); + ProgressMeter & pm = op->setMessage( "m/r: (1/3) emit phase" , db.count( mr.ns , mr.filter ) ); auto_ptr<DBClientCursor> cursor = db.query( mr.ns , mr.q ); long long mapTime = 0; Timer mt; @@ -405,6 +460,7 @@ namespace mongo { Timer t; mrtl->checkSize(); inReduce += t.micros(); + killCurrentOp.checkForInterrupt(); dbtemprelease temprlease; } pm.hit(); @@ -412,9 +468,10 @@ namespace mongo { if ( mr.limit && num >= mr.limit ) break; } + pm.finished(); - countsBuilder.append( "input" , num ); - countsBuilder.append( "emit" , mrtl->numEmits ); + countsBuilder.appendNumber( "input" , num ); + countsBuilder.appendNumber( "emit" , mrtl->numEmits ); if ( mrtl->numEmits ) shouldHaveData = true; @@ -422,7 +479,7 @@ namespace mongo { timingBuilder.append( "emitLoop" , t.millis() ); // final reduce - + op->setMessage( "m/r: (2/3) final reduce in memory" ); mrtl->reduceInMemory(); mrtl->dump(); @@ -430,16 +487,22 @@ namespace mongo { db.ensureIndex( mr.incLong , sortKey ); BSONObj prev; - list<BSONObj> all; + BSONList all; - ProgressMeter fpm( db.count( mr.incLong ) ); + assert( userCreateNS( mr.tempLong.c_str() , BSONObj() , errmsg , mr.replicate ) ); + + pm = op->setMessage( "m/r: (3/3) final reduce to collection" , db.count( mr.incLong ) ); cursor = db.query( mr.incLong, Query().sort( sortKey ) ); while ( cursor->more() ){ BSONObj o = cursor->next().getOwned(); - + pm.hit(); + if ( o.woSortOrder( prev , sortKey ) == 0 ){ all.push_back( o ); + if ( pm.hits() % 1000 == 0 ){ + dbtemprelease tl; + } continue; } @@ -448,12 +511,11 @@ namespace mongo { all.clear(); prev = o; all.push_back( o ); - fpm.hit(); + killCurrentOp.checkForInterrupt(); dbtemprelease tl; } - state.finalReduce( all ); - + pm.finished(); _tlmr.reset( 0 ); } catch ( ... 
){ @@ -471,7 +533,7 @@ namespace mongo { result.append( "result" , mr.finalShort ); result.append( "timeMillis" , t.millis() ); - countsBuilder.append( "output" , finalCount ); + countsBuilder.appendNumber( "output" , finalCount ); if ( mr.verbose ) result.append( "timing" , timingBuilder.obj() ); result.append( "counts" , countsBuilder.obj() ); @@ -493,11 +555,12 @@ namespace mongo { public: MapReduceFinishCommand() : Command( "mapreduce.shardedfinish" ){} virtual bool slaveOk() { return true; } - + + virtual LockType locktype(){ return WRITE; } bool run(const char *ns, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool){ - dbtemprelease temprlease; // we don't touch the db directly - - string dbname = cc().database()->name; + string dbname = cc().database()->name; // this has to come before dbtemprelease + dbtemprelease temprelease; // we don't touch the db directly + string shardedOutputCollection = cmdObj["shardedOutputCollection"].valuestrsafe(); MRSetup mr( dbname , cmdObj.firstElement().embeddedObjectUserCheck() , false ); @@ -540,14 +603,14 @@ namespace mongo { if ( mr.finalizeCode.size() ) finalizeFunction = s->createFunction( mr.finalizeCode.c_str() ); - list<BSONObj> values; + BSONList values; result.append( "result" , mr.finalShort ); DBDirectClient db; while ( cursor.more() ){ - BSONObj t = cursor.next(); + BSONObj t = cursor.next().getOwned(); if ( values.size() == 0 ){ values.push_back( t ); diff --git a/db/namespace.cpp b/db/namespace.cpp index ecd5f64..210efb6 100644 --- a/db/namespace.cpp +++ b/db/namespace.cpp @@ -47,11 +47,43 @@ namespace mongo { } boost::filesystem::path NamespaceIndex::path() const { - return boost::filesystem::path( dir_ ) / ( database_ + ".ns" ); + boost::filesystem::path ret( dir_ ); + if ( directoryperdb ) + ret /= database_; + ret /= ( database_ + ".ns" ); + return ret; } + void NamespaceIndex::maybeMkdir() const { + if ( !directoryperdb ) + return; + boost::filesystem::path dir( dir_ ); + dir /= database_; + if ( !boost::filesystem::exists( dir ) ) + BOOST_CHECK_EXCEPTION( boost::filesystem::create_directory( dir ) ); + } + int lenForNewNsFiles = 16 * 1024 * 1024; + void NamespaceDetails::onLoad(const Namespace& k) { + if( k.isExtra() ) { + /* overflow storage for indexes - so don't treat as a NamespaceDetails object. */ + return; + } + + assertInWriteLock(); + if( backgroundIndexBuildInProgress ) { + log() << "backgroundIndexBuildInProgress was " << backgroundIndexBuildInProgress << " for " << k << ", indicating an abnormal db shutdown" << endl; + backgroundIndexBuildInProgress = 0; + } + } + + static void callback(const Namespace& k, NamespaceDetails& v) { + v.onLoad(k); + } + + bool checkNsFilesOnLoad = true; + void NamespaceIndex::init() { if ( ht ) return; @@ -82,6 +114,7 @@ namespace mongo { else { // use lenForNewNsFiles, we are making a new database massert( 10343 , "bad lenForNewNsFiles", lenForNewNsFiles >= 1024*1024 ); + maybeMkdir(); long l = lenForNewNsFiles; p = f.map(pathString.c_str(), l); if( p ) { @@ -95,6 +128,8 @@ namespace mongo { dbexit( EXIT_FS ); } ht = new HashTable<Namespace,NamespaceDetails>(p, len, "namespace index"); + if( checkNsFilesOnLoad ) + ht->iterAll(callback); } void NamespaceDetails::addDeletedRec(DeletedRecord *d, DiskLoc dloc) { @@ -446,9 +481,14 @@ namespace mongo { // signal done allocating new extents. 
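The NamespaceIndex::path() / maybeMkdir() change a little further up is where the new --directoryperdb layout is decided: with the option set, each database's files live in their own subdirectory under the dbpath. A small self-contained sketch of the same path computation (boost::filesystem, as in the source; nsFilePath is an illustrative name):

#include <boost/filesystem.hpp>
#include <iostream>
#include <string>

namespace fs = boost::filesystem;

// mirrors NamespaceIndex::path(); directoryPerDb plays the role of the
// global directoryperdb flag defined in pdfile.cpp
fs::path nsFilePath( const std::string& dir , const std::string& database , bool directoryPerDb ) {
    fs::path ret( dir );
    if ( directoryPerDb )
        ret /= database;          // extra per-database directory level
    ret /= database + ".ns";      // the namespace file itself
    return ret;
}

int main() {
    std::cout << nsFilePath( "/data/db" , "test" , false ).string() << std::endl; // /data/db/test.ns
    std::cout << nsFilePath( "/data/db" , "test" , true ).string() << std::endl;  // /data/db/test/test.ns
    return 0;
}

maybeMkdir() then only has to create the per-database directory before the .ns file is memory-mapped, which is why it is called just ahead of f.map() in init().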
if ( !deletedList[ 1 ].isValid() ) deletedList[ 1 ] = DiskLoc(); - + assert( len < 400000000 ); int passes = 0; + int maxPasses = ( len / 30 ) + 2; // 30 is about the smallest entry that could go in the oplog + if ( maxPasses < 5000 ){ + // this is for backwards safety since 5000 was the old value + maxPasses = 5000; + } DiskLoc loc; // delete records until we have room and the max # objects limit achieved. @@ -497,10 +537,10 @@ DiskLoc fr = theCapExtent()->firstRecord; theDataFileMgr.deleteRecord(ns, fr.rec(), fr, true); compact(); - if( ++passes >= 5000 ) { - log() << "passes ns:" << ns << " len:" << len << '\n'; + if( ++passes > maxPasses ) { + log() << "passes ns:" << ns << " len:" << len << " maxPasses: " << maxPasses << '\n'; log() << "passes max:" << max << " nrecords:" << nrecords << " datasize: " << datasize << endl; - massert( 10345 , "passes >= 5000 in capped collection alloc", false ); + massert( 10345 , "passes >= maxPasses in capped collection alloc", false ); } } @@ -512,7 +552,7 @@ } /* you MUST call when adding an index. see pdfile.cpp */ - IndexDetails& NamespaceDetails::addIndex(const char *thisns) { + IndexDetails& NamespaceDetails::addIndex(const char *thisns, bool resetTransient) { assert( nsdetails(thisns) == this ); if( nIndexes == NIndexesBase && extraOffset == 0 ) { @@ -521,7 +561,8 @@ IndexDetails& id = idx(nIndexes); nIndexes++; - NamespaceDetailsTransient::get_w(thisns).addedIndex(); + if ( resetTransient ) + NamespaceDetailsTransient::get_w(thisns).addedIndex(); return id; } @@ -543,31 +584,39 @@ for ( int i = 0; i < nIndexes; i++ ) { IndexDetails& idx = indexes[i]; BSONObj idxKey = idx.info.obj().getObjectField("key"); // e.g., { ts : -1 } - if ( !idxKey.findElement(fieldName).eoo() ) + if ( !idxKey.getField(fieldName).eoo() ) return i; }*/ return -1; } - long long NamespaceDetails::storageSize(){ + long long NamespaceDetails::storageSize( int * numExtents ){ Extent * e = firstExtent.ext(); assert( e ); long long total = 0; + int n = 0; while ( e ){ - total += e->length; - e = e->getNextExtent(); + total += e->length; + e = e->getNextExtent(); + n++; } + + if ( numExtents ) + *numExtents = n; + return total; } /* ------------------------------------------------------------------------- */ - boost::mutex NamespaceDetailsTransient::_qcMutex; + mongo::mutex NamespaceDetailsTransient::_qcMutex; + mongo::mutex NamespaceDetailsTransient::_isMutex; map< string, shared_ptr< NamespaceDetailsTransient > > NamespaceDetailsTransient::_map; typedef map< string, shared_ptr< NamespaceDetailsTransient > >::iterator ouriter; void NamespaceDetailsTransient::reset() { + DEV assertInWriteLock(); clearQueryCache(); _keysComputed = false; _indexSpecs.clear(); @@ -595,11 +644,13 @@ _keysComputed = true; _indexKeys.clear(); NamespaceDetails *d = nsdetails(_ns.c_str()); + if ( ! d ) + return; NamespaceDetails::IndexIterator i = d->ii(); while( i.more() ) i.next().keyPattern().getFieldNames(_indexKeys); } - + void NamespaceDetailsTransient::cllStart( int logSizeMb ) { assertInWriteLock(); _cll_ns = "local.temp.oplog." 
+ _ns; @@ -607,7 +658,7 @@ namespace mongo { stringstream spec; // 128MB spec << "{size:" << logSizeMb * 1024 * 1024 << ",capped:true,autoIndexId:false}"; - setClient( _cll_ns.c_str() ); + Client::Context ct( _cll_ns ); string err; massert( 10347 , "Could not create log ns", userCreateNS( _cll_ns.c_str(), fromjson( spec.str() ), err, false ) ); NamespaceDetails *d = nsdetails( _cll_ns.c_str() ); @@ -633,7 +684,7 @@ namespace mongo { assertInWriteLock(); if ( !_cll_enabled ) return; - setClient( _cll_ns.c_str() ); + Client::Context ctx( _cll_ns ); dropNS( _cll_ns ); } diff --git a/db/namespace.h b/db/namespace.h index df4c62f..1b1a954 100644 --- a/db/namespace.h +++ b/db/namespace.h @@ -21,7 +21,7 @@ #include "../stdafx.h" #include "jsobj.h" #include "queryutil.h" -#include "storage.h" +#include "diskloc.h" #include "../util/hashtab.h" #include "../util/mmap.h" @@ -75,6 +75,10 @@ namespace mongo { NamespaceString( const char * ns ) { init(ns); } NamespaceString( const string& ns ) { init(ns.c_str()); } + string ns() const { + return db + '.' + coll; + } + bool isSystem() { return strncmp(coll.c_str(), "system.", 7) == 0; } @@ -100,6 +104,10 @@ namespace mongo { massert( 10348 , "ns name too long", s.size() < MaxNsLen); return s; } + bool isExtra() const { + const char *p = strstr(buf, "$extra"); + return p && p[6] == 0; //==0 important in case an index uses name "$extra_1" for example + } void kill() { buf[0] = 0x7f; @@ -186,6 +194,9 @@ namespace mongo { BOOST_STATIC_ASSERT( NIndexesMax == NIndexesBase + NIndexesExtra ); + /* called when loaded from disk */ + void onLoad(const Namespace& k); + NamespaceDetails( const DiskLoc &loc, bool _capped ) { /* be sure to initialize new fields here -- doesn't default to zeroes the way we use it */ firstExtent = lastExtent = capExtent = loc; @@ -251,6 +262,13 @@ namespace mongo { int backgroundIndexBuildInProgress; // 1 if in prog char reserved[76]; + /* when a background index build is in progress, we don't count the index in nIndexes until + complete, yet need to still use it in _indexRecord() - thus we use this function for that. + */ + int nIndexesBeingBuilt() const { + return nIndexes + backgroundIndexBuildInProgress; + } + /* NOTE: be careful with flags. are we manipulating them in read locks? if so, this isn't thread safe. TODO */ @@ -264,6 +282,10 @@ namespace mongo { return _indexes[idxNo]; return extra()->details[idxNo-NIndexesBase]; } + IndexDetails& backgroundIdx() { + DEV assert(backgroundIndexBuildInProgress); + return idx(nIndexes); + } class IndexIterator { friend class NamespaceDetails; @@ -324,7 +346,7 @@ namespace mongo { /* add a new index. does not add to system.indexes etc. - just to NamespaceDetails. caller must populate returned object. 
*/ - IndexDetails& addIndex(const char *thisns); + IndexDetails& addIndex(const char *thisns, bool resetTransient=true); void aboutToDeleteAnIndex() { flags &= ~Flag_HaveIdIndex; @@ -410,7 +432,7 @@ namespace mongo { void checkMigrate(); - long long storageSize(); + long long storageSize( int * numExtents = 0 ); private: bool cappedMayDelete() const { @@ -450,7 +472,7 @@ namespace mongo { static std::map< string, shared_ptr< NamespaceDetailsTransient > > _map; public: NamespaceDetailsTransient(const char *ns) : _ns(ns), _keysComputed(false), _qcWriteCount(), _cll_enabled() { } - /* _get() is not threadsafe */ + /* _get() is not threadsafe -- see get_inlock() comments */ static NamespaceDetailsTransient& _get(const char *ns); /* use get_w() when doing write operations */ static NamespaceDetailsTransient& get_w(const char *ns) { @@ -484,12 +506,16 @@ namespace mongo { /* IndexSpec caching */ private: map<const IndexDetails*,IndexSpec> _indexSpecs; + static mongo::mutex _isMutex; public: const IndexSpec& getIndexSpec( const IndexDetails * details ){ - DEV assertInWriteLock(); IndexSpec& spec = _indexSpecs[details]; - if ( spec.meta.isEmpty() ){ - spec.reset( details->info ); + if ( ! spec._finishedInit ){ + scoped_lock lk(_isMutex); + if ( ! spec._finishedInit ){ + spec.reset( details ); + assert( spec._finishedInit ); + } } return spec; } @@ -499,7 +525,7 @@ namespace mongo { int _qcWriteCount; map< QueryPattern, pair< BSONObj, long long > > _qcCache; public: - static boost::mutex _qcMutex; + static mongo::mutex _qcMutex; /* you must be in the qcMutex when calling this (and using the returned val): */ static NamespaceDetailsTransient& get_inlock(const char *ns) { return _get(ns); @@ -555,9 +581,9 @@ namespace mongo { BOOST_STATIC_ASSERT( sizeof(NamespaceDetails::Extra) <= sizeof(NamespaceDetails) ); public: NamespaceIndex(const string &dir, const string &database) : - ht( 0 ), - dir_( dir ), - database_( database ) {} + ht( 0 ), + dir_( dir ), + database_( database ) {} /* returns true if new db will be created if we init lazily */ bool exists() const; @@ -637,6 +663,7 @@ namespace mongo { private: boost::filesystem::path path() const; + void maybeMkdir() const; MemoryMappedFile f; HashTable<Namespace,NamespaceDetails> *ht; @@ -644,7 +671,8 @@ namespace mongo { string database_; }; - extern string dbpath; // --dbpath parm + extern string dbpath; // --dbpath parm + extern bool directoryperdb; // Rename a namespace within current 'client' db. // (Arguments should include db name) diff --git a/db/nonce.cpp b/db/nonce.cpp index 4c677be..d8db58d 100644 --- a/db/nonce.cpp +++ b/db/nonce.cpp @@ -49,8 +49,8 @@ namespace mongo { } nonce Security::getNonce(){ - static boost::mutex m; - boostlock lk(m); + static mongo::mutex m; + scoped_lock lk(m); /* question/todo: /dev/random works on OS X. is it better to use that than random() / srandom()? diff --git a/db/pdfile.cpp b/db/pdfile.cpp index 18df5f1..1c4608c 100644 --- a/db/pdfile.cpp +++ b/db/pdfile.cpp @@ -30,6 +30,7 @@ _ disallow system* manipulations from the database. #include "../util/mmap.h" #include "../util/hashtab.h" #include "../util/file_allocator.h" +#include "../util/processinfo.h" #include "btree.h" #include <algorithm> #include <list> @@ -40,10 +41,63 @@ _ disallow system* manipulations from the database. 
#include "queryutil.h" #include "extsort.h" #include "curop.h" +#include "background.h" namespace mongo { + map<string, unsigned> BackgroundOperation::dbsInProg; + set<string> BackgroundOperation::nsInProg; + + bool BackgroundOperation::inProgForDb(const char *db) { + assertInWriteLock(); + return dbsInProg[db] != 0; + } + + bool BackgroundOperation::inProgForNs(const char *ns) { + assertInWriteLock(); + return nsInProg.count(ns) != 0; + } + + void BackgroundOperation::assertNoBgOpInProgForDb(const char *db) { + uassert(12586, "cannot perform operation: a background operation is currently running for this database", + !inProgForDb(db)); + } + + void BackgroundOperation::assertNoBgOpInProgForNs(const char *ns) { + uassert(12587, "cannot perform operation: a background operation is currently running for this collection", + !inProgForNs(ns)); + } + + BackgroundOperation::BackgroundOperation(const char *ns) : _ns(ns) { + assertInWriteLock(); + dbsInProg[_ns.db]++; + assert( nsInProg.count(_ns.ns()) == 0 ); + nsInProg.insert(_ns.ns()); + } + + BackgroundOperation::~BackgroundOperation() { + assertInWriteLock(); + dbsInProg[_ns.db]--; + nsInProg.erase(_ns.ns()); + } + + void BackgroundOperation::dump(stringstream& ss) { + if( nsInProg.size() ) { + ss << "\n<b>Background Jobs in Progress</b>\n"; + for( set<string>::iterator i = nsInProg.begin(); i != nsInProg.end(); i++ ) + ss << " " << *i << '\n'; + } + for( map<string,unsigned>::iterator i = dbsInProg.begin(); i != dbsInProg.end(); i++ ) { + if( i->second ) + ss << "database " << i->first << ": " << i->second << '\n'; + } + } + + /* ----------------------------------------- */ + string dbpath = "/data/db/"; + bool directoryperdb = false; + string repairpath; DataFileMgr theDataFileMgr; DatabaseHolder dbHolder; @@ -53,7 +107,8 @@ namespace mongo { extern int otherTraceLevel; void addNewNamespaceToCatalog(const char *ns, const BSONObj *options = 0); void ensureIdIndexForNewNs(const char *ns) { - if ( !strstr( ns, ".system." ) && !strstr( ns, ".$freelist" ) ) { + if ( ( strstr( ns, ".system." ) == 0 || legalClientSystemNS( ns , false ) ) && + strstr( ns, ".$freelist" ) == 0 ){ log( 1 ) << "adding _id index for new collection" << endl; ensureHaveIdIndex( ns ); } @@ -63,10 +118,13 @@ namespace mongo { stringstream ss; Client * c = currentClient.get(); if ( c ){ - Database *database = c->database(); - if ( database ) { - ss << database->name << ' '; - ss << cc().ns() << ' '; + Client::Context * cx = c->getContext(); + if ( cx ){ + Database *database = cx->db(); + if ( database ) { + ss << database->name << ' '; + ss << cx->ns() << ' '; + } } } return ss.str(); @@ -105,7 +163,7 @@ namespace mongo { addNewNamespaceToCatalog(ns, j.isEmpty() ? 0 : &j); long long size = initialExtentSize(128); - BSONElement e = j.findElement("size"); + BSONElement e = j.getField("size"); if ( e.isNumber() ) { size = (long long) e.number(); size += 256; @@ -116,10 +174,10 @@ namespace mongo { bool newCapped = false; int mx = 0; - e = j.findElement("capped"); + e = j.getField("capped"); if ( e.type() == Bool && e.boolean() ) { newCapped = true; - e = j.findElement("max"); + e = j.getField("max"); if ( e.isNumber() ) { mx = (int) e.number(); } @@ -127,7 +185,7 @@ namespace mongo { // $nExtents just for debug/testing. We create '$nExtents' extents, // each of size 'size'. 
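The block above supplies the storage and bookkeeping behind background.h: a per-database counter plus a per-namespace set, maintained under the write lock by the BackgroundOperation constructor/destructor pair, so that drops and repairs can refuse to run while an index build is in flight. A simplified RAII sketch of the same idea (std containers only; an exception stands in for the uassert/assert calls, and the write-lock asserts are omitted):

#include <map>
#include <set>
#include <stdexcept>
#include <string>

class BgOpRegistry {
    std::map<std::string, unsigned> _dbs;   // database name -> number of bg ops
    std::set<std::string> _namespaces;      // namespaces with a bg op running
public:
    bool inProgForDb( const std::string& db ) const {
        std::map<std::string, unsigned>::const_iterator i = _dbs.find( db );
        return i != _dbs.end() && i->second != 0;
    }
    bool inProgForNs( const std::string& ns ) const {
        return _namespaces.count( ns ) != 0;
    }
    void add( const std::string& db , const std::string& ns ) {
        if ( !_namespaces.insert( ns ).second )
            throw std::runtime_error( "background operation already in progress for " + ns );
        _dbs[db]++;
    }
    void remove( const std::string& db , const std::string& ns ) {
        _dbs[db]--;
        _namespaces.erase( ns );
    }
};

// RAII guard mirroring BackgroundOperation's ctor/dtor: registers on
// construction, unregisters on destruction, so an exception thrown by the
// background job still clears the entry
class ScopedBgOp {
    BgOpRegistry& _reg;
    std::string _db , _ns;
public:
    ScopedBgOp( BgOpRegistry& reg , const std::string& db , const std::string& ns )
        : _reg( reg ) , _db( db ) , _ns( ns ) { _reg.add( _db , _ns ); }
    ~ScopedBgOp() { _reg.remove( _db , _ns ); }
};

dropCollection(), dropDatabase() and repairDatabase() further down then only need to call the assertNoBgOpInProg* wrappers before touching anything.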
- e = j.findElement( "$nExtents" ); + e = j.getField( "$nExtents" ); int nExtents = int( e.number() ); Database *database = cc().database(); if ( nExtents > 0 ) { @@ -487,13 +545,11 @@ namespace mongo { /*---------------------------------------------------------------------*/ auto_ptr<Cursor> DataFileMgr::findAll(const char *ns, const DiskLoc &startLoc) { - DiskLoc loc; - bool found = nsindex(ns)->find(ns, loc); - if ( !found ) { - // out() << "info: findAll() namespace does not exist: " << ns << endl; + NamespaceDetails * d = nsdetails( ns ); + if ( ! d ) return auto_ptr<Cursor>(new BasicCursor(DiskLoc())); - } + DiskLoc loc = d->firstExtent; Extent *e = getExtent(loc); DEBUGGING { @@ -512,40 +568,42 @@ namespace mongo { } out() << endl; - nsdetails(ns)->dumpDeleted(&extents); + d->dumpDeleted(&extents); } - if ( !nsdetails( ns )->capped ) { - if ( !startLoc.isNull() ) - return auto_ptr<Cursor>(new BasicCursor( startLoc )); - while ( e->firstRecord.isNull() && !e->xnext.isNull() ) { - /* todo: if extent is empty, free it for reuse elsewhere. - that is a bit complicated have to clean up the freelists. - */ - RARELY out() << "info DFM::findAll(): extent " << loc.toString() << " was empty, skipping ahead " << ns << endl; - // find a nonempty extent - // it might be nice to free the whole extent here! but have to clean up free recs then. - e = e->getNextExtent(); - } - return auto_ptr<Cursor>(new BasicCursor( e->firstRecord )); - } else { - return auto_ptr< Cursor >( new ForwardCappedCursor( nsdetails( ns ), startLoc ) ); + if ( d->capped ) + return auto_ptr< Cursor >( new ForwardCappedCursor( d , startLoc ) ); + + if ( !startLoc.isNull() ) + return auto_ptr<Cursor>(new BasicCursor( startLoc )); + + while ( e->firstRecord.isNull() && !e->xnext.isNull() ) { + /* todo: if extent is empty, free it for reuse elsewhere. + that is a bit complicated have to clean up the freelists. + */ + RARELY out() << "info DFM::findAll(): extent " << loc.toString() << " was empty, skipping ahead " << ns << endl; + // find a nonempty extent + // it might be nice to free the whole extent here! but have to clean up free recs then. + e = e->getNextExtent(); } + return auto_ptr<Cursor>(new BasicCursor( e->firstRecord )); } /* get a table scan cursor, but can be forward or reverse direction. order.$natural - if set, > 0 means forward (asc), < 0 backward (desc). 
*/ auto_ptr<Cursor> findTableScan(const char *ns, const BSONObj& order, const DiskLoc &startLoc) { - BSONElement el = order.findElement("$natural"); // e.g., { $natural : -1 } + BSONElement el = order.getField("$natural"); // e.g., { $natural : -1 } if ( el.number() >= 0 ) return DataFileMgr::findAll(ns, startLoc); - + // "reverse natural order" NamespaceDetails *d = nsdetails(ns); + if ( !d ) return auto_ptr<Cursor>(new BasicCursor(DiskLoc())); + if ( !d->capped ) { if ( !startLoc.isNull() ) return auto_ptr<Cursor>(new ReverseCursor( startLoc )); @@ -583,6 +641,8 @@ namespace mongo { NamespaceDetails* d = nsdetails(nsToDrop.c_str()); uassert( 10086 , (string)"ns not found: " + nsToDrop , d ); + BackgroundOperation::assertNoBgOpInProgForNs(nsToDrop.c_str()); + NamespaceString s(nsToDrop); assert( s.db == cc().database()->name ); if( s.isSystem() ) { @@ -634,29 +694,33 @@ namespace mongo { log(1) << "dropCollection: " << name << endl; NamespaceDetails *d = nsdetails(name.c_str()); assert( d ); + + BackgroundOperation::assertNoBgOpInProgForNs(name.c_str()); + if ( d->nIndexes != 0 ) { try { - assert( deleteIndexes(d, name.c_str(), "*", errmsg, result, true) ); + assert( dropIndexes(d, name.c_str(), "*", errmsg, result, true) ); } catch( DBException& ) { - uasserted(12503,"drop: deleteIndexes for collection failed - consider trying repair"); + uasserted(12503,"drop: dropIndexes for collection failed - consider trying repair"); } assert( d->nIndexes == 0 ); } - log(1) << "\t deleteIndexes done" << endl; + log(1) << "\t dropIndexes done" << endl; result.append("ns", name.c_str()); ClientCursor::invalidate(name.c_str()); + Top::global.collectionDropped( name ); dropNS(name); } int nUnindexes = 0; - void _unindexRecord(IndexDetails& id, BSONObj& obj, const DiskLoc& dl, bool logMissing = true) { + /* unindex all keys in index for this record. */ + static void _unindexRecord(IndexDetails& id, BSONObj& obj, const DiskLoc& dl, bool logMissing = true) { BSONObjSetDefaultOrder keys; id.getKeysFromObject(obj, keys); for ( BSONObjSetDefaultOrder::iterator i=keys.begin(); i != keys.end(); i++ ) { BSONObj j = *i; - // out() << "UNINDEX: j:" << j.toString() << " head:" << id.head.toString() << dl.toString() << endl; if ( otherTraceLevel >= 5 ) { out() << "_unindexRecord() " << obj.toString(); out() << "\n unindex:" << j.toString() << endl; @@ -666,9 +730,9 @@ namespace mongo { try { ok = id.head.btree()->unindex(id.head, id, j, dl); } - catch (AssertionException&) { + catch (AssertionException& e) { problem() << "Assertion failure: _unindex failed " << id.indexNamespace() << endl; - out() << "Assertion failure: _unindex failed" << '\n'; + out() << "Assertion failure: _unindex failed: " << e.what() << '\n'; out() << " obj:" << obj.toString() << '\n'; out() << " key:" << j.toString() << '\n'; out() << " dl:" << dl.toString() << endl; @@ -682,12 +746,14 @@ namespace mongo { } /* unindex all keys in all indexes for this record. 
*/ - void unindexRecord(NamespaceDetails *d, Record *todelete, const DiskLoc& dl, bool noWarn = false) { - if ( d->nIndexes == 0 ) return; + static void unindexRecord(NamespaceDetails *d, Record *todelete, const DiskLoc& dl, bool noWarn = false) { BSONObj obj(todelete); - NamespaceDetails::IndexIterator i = d->ii(); - while( i.more() ) { - _unindexRecord(i.next(), obj, dl, !noWarn); + int n = d->nIndexes; + for ( int i = 0; i < n; i++ ) + _unindexRecord(d->idx(i), obj, dl, !noWarn); + if( d->backgroundIndexBuildInProgress ) { + // always pass nowarn here, as this one may be missing for valid reasons as we are concurrently building it + _unindexRecord(d->idx(n), obj, dl, false); } } @@ -763,19 +829,20 @@ namespace mongo { /** Note: if the object shrinks a lot, we don't free up space, we leave extra at end of the record. */ - const DiskLoc DataFileMgr::update(const char *ns, - Record *toupdate, const DiskLoc& dl, - const char *_buf, int _len, OpDebug& debug) + const DiskLoc DataFileMgr::updateRecord( + const char *ns, + NamespaceDetails *d, + NamespaceDetailsTransient *nsdt, + Record *toupdate, const DiskLoc& dl, + const char *_buf, int _len, OpDebug& debug) { StringBuilder& ss = debug.str; dassert( toupdate == dl.rec() ); - NamespaceDetails *d = nsdetails(ns); - BSONObj objOld(toupdate); BSONObj objNew(_buf); - assert( objNew.objsize() == _len ); - assert( objNew.objdata() == _buf ); + DEV assert( objNew.objsize() == _len ); + DEV assert( objNew.objdata() == _buf ); if( !objNew.hasElement("_id") && objOld.hasElement("_id") ) { /* add back the old _id value if the update removes it. Note this implementation is slow @@ -795,7 +862,7 @@ namespace mongo { */ vector<IndexChanges> changes; getIndexChanges(changes, *d, objNew, objOld); - dupCheck(changes, *d); + dupCheck(changes, *d, dl); if ( toupdate->netLength() < objNew.objsize() ) { // doesn't fit. reallocate ----------------------------------------------------- @@ -807,13 +874,14 @@ namespace mongo { return insert(ns, objNew.objdata(), objNew.objsize(), false); } - NamespaceDetailsTransient::get_w( ns ).notifyOfWriteOp(); + nsdt->notifyOfWriteOp(); d->paddingFits(); /* have any index keys changed? */ { unsigned keyUpdates = 0; - for ( int x = 0; x < d->nIndexes; x++ ) { + int z = d->nIndexesBeingBuilt(); + for ( int x = 0; x < z; x++ ) { IndexDetails& idx = d->idx(x); for ( unsigned i = 0; i < changes[x].removed.size(); i++ ) { try { @@ -859,10 +927,8 @@ namespace mongo { return sz; } - int deb=0; - - /* add keys to indexes for a new record */ - inline void _indexRecord(NamespaceDetails *d, int idxNo, BSONObj& obj, DiskLoc newRecordLoc, bool dupsAllowed) { + /* add keys to index idxNo for a new record */ + static inline void _indexRecord(NamespaceDetails *d, int idxNo, BSONObj& obj, DiskLoc recordLoc, bool dupsAllowed) { IndexDetails& idx = d->idx(idxNo); BSONObjSetDefaultOrder keys; idx.getKeysFromObject(obj, keys); @@ -872,12 +938,16 @@ namespace mongo { if( ++n == 2 ) { d->setIndexIsMultikey(idxNo); } - assert( !newRecordLoc.isNull() ); + assert( !recordLoc.isNull() ); try { - idx.head.btree()->bt_insert(idx.head, newRecordLoc, + idx.head.btree()->bt_insert(idx.head, recordLoc, *i, order, dupsAllowed, idx); } - catch (AssertionException& ) { + catch (AssertionException& e) { + if( e.code == 10287 && idxNo == d->nIndexes ) { + DEV log() << "info: caught key already in index on bg indexing (ok)" << endl; + continue; + } if( !dupsAllowed ) { // dup key exception, presumably. 
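+ // rethrow: the caller (indexRecord) unwinds any keys this record already got into other indexes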
throw; @@ -913,10 +983,10 @@ } // throws DBException - /* _ TODO dropDups - */ unsigned long long fastBuildIndex(const char *ns, NamespaceDetails *d, IndexDetails& idx, int idxNo) { - // testSorting(); + assert( d->backgroundIndexBuildInProgress == 0 ); + CurOp * op = cc().curop(); + Timer t; log() << "Buildindex " << ns << " idxNo:" << idxNo << ' ' << idx.info.obj().toString() << endl; @@ -926,13 +996,16 @@ BSONObj order = idx.keyPattern(); idx.head.Null(); + + if ( logLevel > 1 ) printMemInfo( "before index start" ); /* get and sort all the keys ----- */ unsigned long long n = 0; auto_ptr<Cursor> c = theDataFileMgr.findAll(ns); BSONObjExternalSorter sorter(order); + sorter.hintNumObjects( d->nrecords ); unsigned long long nkeys = 0; - ProgressMeter pm( d->nrecords , 10 ); + ProgressMeter & pm = op->setMessage( "index: (1/3) external sort" , d->nrecords , 10 ); while ( c->ok() ) { BSONObj o = c->current(); DiskLoc loc = c->currLoc(); @@ -947,12 +1020,20 @@ sorter.add(*i, loc); nkeys++; } - + c->advance(); n++; pm.hit(); + if ( logLevel > 1 && n % 10000 == 0 ){ + printMemInfo( "\t iterating objects" ); + } + }; + pm.finished(); + + if ( logLevel > 1 ) printMemInfo( "before final sort" ); sorter.sort(); + if ( logLevel > 1 ) printMemInfo( "after final sort" ); log(t.seconds() > 5 ? 0 : 1) << "\t external sort used : " << sorter.numFiles() << " files " << " in " << t.seconds() << " secs" << endl; @@ -963,21 +1044,23 @@ BtreeBuilder btBuilder(dupsAllowed, idx); BSONObj keyLast; auto_ptr<BSONObjExternalSorter::Iterator> i = sorter.iterator(); - ProgressMeter pm2( nkeys , 10 ); + pm = op->setMessage( "index: (2/3) btree bottom up" , nkeys , 10 ); while( i->more() ) { RARELY killCurrentOp.checkForInterrupt(); BSONObjExternalSorter::Data d = i->next(); - //cout<<"TEMP SORTER next " << d.first.toString() << endl; try { btBuilder.addKey(d.first, d.second); } - catch( AssertionException& ) { + catch( AssertionException& e ) { if ( dupsAllowed ){ // unknown exception?? throw; } + if( e.interrupted() ) + throw; + if ( ! dropDups ) throw; @@ -987,8 +1070,11 @@ dupsToDrop.push_back(d.second); uassert( 10092 , "too many dups on index build with dropDups=true", dupsToDrop.size() < 1000000 ); } - pm2.hit(); + pm.hit(); } + pm.finished(); + op->setMessage( "index: (3/3) btree-middle" ); + log(t.seconds() > 10 ? 
0 : 1 ) << "\t done building bottom layer, going to commit" << endl; btBuilder.commit(); wassert( btBuilder.getn() == nkeys || dropDups ); } @@ -1001,32 +1087,61 @@ namespace mongo { return n; } - static class BackgroundIndexBuildJobs { + class BackgroundIndexBuildJob : public BackgroundOperation { unsigned long long addExistingToIndex(const char *ns, NamespaceDetails *d, IndexDetails& idx, int idxNo) { bool dupsAllowed = !idx.unique(); bool dropDups = idx.dropDups(); + ProgressMeter& progress = cc().curop()->setMessage( "bg index build" , d->nrecords ); + unsigned long long n = 0; - auto_ptr<Cursor> c = theDataFileMgr.findAll(ns); - while ( c->ok() ) { - BSONObj js = c->current(); + auto_ptr<ClientCursor> cc; + { + auto_ptr<Cursor> c = theDataFileMgr.findAll(ns); + cc.reset( new ClientCursor(c, ns, false) ); + } + CursorId id = cc->cursorid; + + while ( cc->c->ok() ) { + BSONObj js = cc->c->current(); try { - _indexRecord(d, idxNo, js, c->currLoc(),dupsAllowed); - c->advance(); + _indexRecord(d, idxNo, js, cc->c->currLoc(), dupsAllowed); + cc->c->advance(); } catch( AssertionException& e ) { + if( e.interrupted() ) + throw; + if ( dropDups ) { - DiskLoc toDelete = c->currLoc(); - c->advance(); + DiskLoc toDelete = cc->c->currLoc(); + bool ok = cc->c->advance(); + cc->updateLocation(); theDataFileMgr.deleteRecord( ns, toDelete.rec(), toDelete, false, true ); + if( ClientCursor::find(id, false) == 0 ) { + cc.release(); + if( !ok ) { + /* we were already at the end. normal. */ + } + else { + uasserted(12585, "cursor gone during bg index; dropDups"); + } + break; + } } else { - _log() << endl; - log(2) << "addExistingToIndex exception " << e.what() << endl; + log() << "background addExistingToIndex exception " << e.what() << endl; throw; } } n++; - }; + progress.hit(); + + if ( n % 128 == 0 && !cc->yield() ) { + cc.release(); + uasserted(12584, "cursor gone during bg index"); + break; + } + } + progress.done(); return n; } @@ -1034,72 +1149,76 @@ namespace mongo { that way on a crash/restart, we don't think we are still building one. */ set<NamespaceDetails*> bgJobsInProgress; - void prep(NamespaceDetails *d) { + void prep(const char *ns, NamespaceDetails *d) { assertInWriteLock(); - assert( bgJobsInProgress.count(d) == 0 ); bgJobsInProgress.insert(d); d->backgroundIndexBuildInProgress = 1; + d->nIndexes--; } - - public: - /* Note you cannot even do a foreground index build if a background is in progress, - as bg build assumes it is the last index in the array! - */ - void checkInProg(NamespaceDetails *d) { + void done(const char *ns, NamespaceDetails *d) { + d->nIndexes++; + d->backgroundIndexBuildInProgress = 0; + NamespaceDetailsTransient::get_w(ns).addedIndex(); // clear query optimizer cache assertInWriteLock(); - uassert(12580, "already building an index for this namespace in background", bgJobsInProgress.count(d) == 0); } -/* todo: clean bg flag on loading of NamespaceDetails */ + public: + BackgroundIndexBuildJob(const char *ns) : BackgroundOperation(ns) { } unsigned long long go(string ns, NamespaceDetails *d, IndexDetails& idx, int idxNo) { - unsigned long long n; - prep(d); + unsigned long long n = 0; + + prep(ns.c_str(), d); + assert( idxNo == d->nIndexes ); try { idx.head = BtreeBucket::addBucket(idx); n = addExistingToIndex(ns.c_str(), d, idx, idxNo); } catch(...) 
{ - assertInWriteLock(); - bgJobsInProgress.erase(d); - d->backgroundIndexBuildInProgress = 0; + if( cc().database() && nsdetails(ns.c_str()) == d ) { + assert( idxNo == d->nIndexes ); + done(ns.c_str(), d); + } + else { + log() << "ERROR: db gone during bg index?" << endl; + } throw; } + assert( idxNo == d->nIndexes ); + done(ns.c_str(), d); return n; } - } backgroundIndex; + }; // throws DBException - static void buildAnIndex(string ns, NamespaceDetails *d, IndexDetails& idx, int idxNo) { - log() << "building new index on " << idx.keyPattern() << " for " << ns << "..." << endl; + static void buildAnIndex(string ns, NamespaceDetails *d, IndexDetails& idx, int idxNo, bool background) { + log() << "building new index on " << idx.keyPattern() << " for " << ns << endl; Timer t; unsigned long long n; - BSONObj info = idx.info.obj(); - bool background = info["background"].trueValue(); - if( background ) { - log() << "WARNING: background index build not yet implemented" << endl; + if( background ) { + log(2) << "buildAnIndex: background=true\n"; } + assert( !BackgroundOperation::inProgForNs(ns.c_str()) ); // should have been checked earlier, better not be... if( !background ) { n = fastBuildIndex(ns.c_str(), d, idx, idxNo); assert( !idx.head.isNull() ); } else { - n = backgroundIndex.go(ns, d, idx, idxNo); + BackgroundIndexBuildJob j(ns.c_str()); + n = j.go(ns, d, idx, idxNo); } log() << "done for " << n << " records " << t.millis() / 1000.0 << "secs" << endl; } /* add keys to indexes for a new record */ - void indexRecord(NamespaceDetails *d, const void *buf, int len, DiskLoc newRecordLoc) { - BSONObj obj((const char *)buf); - - /*UNIQUE*/ - for ( int i = 0; i < d->nIndexes; i++ ) { + static void indexRecord(NamespaceDetails *d, BSONObj obj, DiskLoc loc) { + int n = d->nIndexesBeingBuilt(); + for ( int i = 0; i < n; i++ ) { try { bool unique = d->idx(i).unique(); - _indexRecord(d, i, obj, newRecordLoc, /*dupsAllowed*/!unique); + _indexRecord(d, i, obj, loc, /*dupsAllowed*/!unique); } catch( DBException& ) { /* try to roll back previously added index entries @@ -1108,7 +1227,7 @@ namespace mongo { */ for( int j = 0; j <= i; j++ ) { try { - _unindexRecord(d->idx(j), obj, newRecordLoc, false); + _unindexRecord(d->idx(j), obj, loc, false); } catch(...) { log(3) << "unindex fails on rollback after unique failure\n"; @@ -1119,7 +1238,7 @@ namespace mongo { } } - extern BSONObj id_obj; // { _id : ObjectId("000000000000000000000000") } + extern BSONObj id_obj; // { _id : 1 } void ensureHaveIdIndex(const char *ns) { NamespaceDetails *d = nsdetails(ns); @@ -1179,12 +1298,31 @@ namespace mongo { bool prepareToBuildIndex(const BSONObj& io, bool god, string& sourceNS, NamespaceDetails *&sourceCollection); + // We are now doing two btree scans for all unique indexes (one here, and one when we've + // written the record to the collection. This could be made more efficient inserting + // dummy data here, keeping pointers to the btree nodes holding the dummy data and then + // updating the dummy data with the DiskLoc of the real record. 
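One way to realize the optimization described in the comment above is a two-phase insert against each unique index: claim the key with a dummy entry (which doubles as the only duplicate check needed), then patch in the real location once the record is written, or release the claim on failure. A hedged sketch of the idea with the index modeled as an ordered map (every name here is invented for illustration; nothing like reserve/commit exists in the btree code):

#include <map>
#include <stdexcept>
#include <string>

typedef long long RecLoc;                  // simplified stand-in for DiskLoc
static const RecLoc kReserved = -1;        // sentinel: key claimed, record not written yet

class UniqueIndexSketch {
    std::map<std::string, RecLoc> _entries;
public:
    // phase 1: claim the key before allocating the record; replaces the
    // second full btree probe with a single insert attempt
    void reserve( const std::string& key ) {
        if ( !_entries.insert( std::make_pair( key , kReserved ) ).second )
            throw std::runtime_error( "E11000 duplicate key error (sketch)" );
    }
    // phase 2a: record written successfully - patch in its real location
    void commit( const std::string& key , RecLoc loc ) { _entries[key] = loc; }
    // phase 2b: the insert failed elsewhere - release the claim
    void abort( const std::string& key ) { _entries.erase( key ); }
};

The shipped code takes the simpler route: checkNoIndexConflicts() just below probes every unique index once up front and accepts the extra scan.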
+ void checkNoIndexConflicts( NamespaceDetails *d, const BSONObj &obj ) { + for ( int idxNo = 0; idxNo < d->nIndexes; idxNo++ ) { + if( d->idx(idxNo).unique() ) { + IndexDetails& idx = d->idx(idxNo); + BSONObjSetDefaultOrder keys; + idx.getKeysFromObject(obj, keys); + BSONObj order = idx.keyPattern(); + for ( BSONObjSetDefaultOrder::iterator i=keys.begin(); i != keys.end(); i++ ) { + uassert( 12582, "duplicate key insert for unique index of capped collection", + idx.head.btree()->findSingle(idx, idx.head, *i ).isNull() ); + } + } + } + } + /* note: if god==true, you may pass in obuf of NULL and then populate the returned DiskLoc after the call -- that will prevent a double buffer copy in some cases (btree.cpp). */ DiskLoc DataFileMgr::insert(const char *ns, const void *obuf, int len, bool god, const BSONElement &writeId, bool mayAddIndex) { bool wouldAddIndex = false; - uassert( 10093 , "cannot insert into reserved $ collection", god || strchr(ns, '$') == 0 ); + massert( 10093 , "cannot insert into reserved $ collection", god || strchr(ns, '$') == 0 ); uassert( 10094 , "invalid ns", strchr( ns , '.' ) > 0 ); const char *sys = strstr(ns, "system."); if ( sys ) { @@ -1212,8 +1350,8 @@ namespace mongo { /* todo: shouldn't be in the namespace catalog until after the allocations here work. also if this is an addIndex, those checks should happen before this! */ - // This creates first file in the database. - cc().database()->newestFile()->createExtent(ns, initialExtentSize(len)); + // This may create first file in the database. + cc().database()->allocExtent(ns, initialExtentSize(len), false); d = nsdetails(ns); if ( !god ) ensureIdIndexForNewNs(ns); @@ -1225,10 +1363,8 @@ namespace mongo { string tabletoidxns; if ( addIndex ) { BSONObj io((const char *) obuf); - backgroundIndex.checkInProg(d); - if( !prepareToBuildIndex(io, god, tabletoidxns, tableToIndex) ) { + if( !prepareToBuildIndex(io, god, tabletoidxns, tableToIndex) ) return DiskLoc(); - } } const BSONElement *newId = &writeId; @@ -1262,6 +1398,13 @@ namespace mongo { d->paddingFactor = 1.0; lenWHdr = len + Record::HeaderSize; } + + // If the collection is capped, check if the new object will violate a unique index + // constraint before allocating space. 
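+ // ('god' writes skip the probe; for a normal insert into a capped
+ // collection we must check up front, because the record cannot simply be
+ // deleted again afterwards to undo a duplicate - see massert 12583 below)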
+ if ( d->nIndexes && d->capped && !god ) { + checkNoIndexConflicts( d, BSONObj( reinterpret_cast<const char *>( obuf ) ) ); + } + DiskLoc loc = d->alloc(ns, lenWHdr, extentLoc); if ( loc.isNull() ) { // out of space @@ -1321,27 +1464,35 @@ namespace mongo { NamespaceDetailsTransient::get_w( ns ).notifyOfWriteOp(); if ( tableToIndex ) { + BSONObj info = loc.obj(); + bool background = info["background"].trueValue(); + int idxNo = tableToIndex->nIndexes; - IndexDetails& idx = tableToIndex->addIndex(tabletoidxns.c_str()); // clear transient info caches so they refresh; increments nIndexes + IndexDetails& idx = tableToIndex->addIndex(tabletoidxns.c_str(), !background); // clear transient info caches so they refresh; increments nIndexes idx.info = loc; try { - buildAnIndex(tabletoidxns, tableToIndex, idx, idxNo); + buildAnIndex(tabletoidxns, tableToIndex, idx, idxNo, background); } catch( DBException& ) { - // save our error msg string as an exception on deleteIndexes will overwrite our message + // save our error msg string as an exception or dropIndexes will overwrite our message LastError *le = lastError.get(); - assert( le ); - string saveerrmsg = le->msg; - assert( !saveerrmsg.empty() ); + int savecode = 0; + string saveerrmsg; + if ( le ) { + savecode = le->code; + saveerrmsg = le->msg; + } // roll back this index string name = idx.indexName(); BSONObjBuilder b; string errmsg; - bool ok = deleteIndexes(tableToIndex, tabletoidxns.c_str(), name.c_str(), errmsg, b, true); + bool ok = dropIndexes(tableToIndex, tabletoidxns.c_str(), name.c_str(), errmsg, b, true); if( !ok ) { log() << "failed to drop index after a unique key error building it: " << errmsg << ' ' << tabletoidxns << ' ' << name << endl; } - raiseError(12506,saveerrmsg.c_str()); + + assert( le && !saveerrmsg.empty() ); + raiseError(savecode,saveerrmsg.c_str()); throw; } } @@ -1349,11 +1500,13 @@ namespace mongo { /* add this record to our indexes */ if ( d->nIndexes ) { try { - indexRecord(d, r->data/*buf*/, len, loc); + BSONObj obj(r->data); + indexRecord(d, obj, loc); } catch( AssertionException& e ) { // should be a dup key error on _id index - if( tableToIndex || d->capped ) { + if( tableToIndex || d->capped ) { + massert( 12583, "unexpected index insertion failure on capped collection", !d->capped ); string s = e.toString(); s += " : on addIndex/capped - collection and its index will not match"; uassert_nothrow(s.c_str()); @@ -1406,19 +1559,6 @@ namespace mongo { return r; } - void DataFileMgr::init(const string& path ) { - /* boost::filesystem::path path( dir ); - path /= "temp.dat"; - string pathString = path.string(); - temp.open(pathString.c_str(), 64 * 1024 * 1024); - */ - } - - void pdfileInit() { - // namespaceIndex.init(dbpath); - theDataFileMgr.init(dbpath); - } - } // namespace mongo #include "clientcursor.h" @@ -1427,63 +1567,75 @@ namespace mongo { void dropDatabase(const char *ns) { // ns is of the form "<dbname>.$cmd" - char cl[256]; - nsToDatabase(ns, cl); - log(1) << "dropDatabase " << cl << endl; - assert( cc().database()->name == cl ); + char db[256]; + nsToDatabase(ns, db); + log(1) << "dropDatabase " << db << endl; + assert( cc().database()->name == db ); + + BackgroundOperation::assertNoBgOpInProgForDb(db); - closeDatabase( cl ); - _deleteDataFiles(cl); + closeDatabase( db ); + _deleteDataFiles(db); } typedef boost::filesystem::path Path; // back up original database files to 'temp' dir void _renameForBackup( const char *database, const Path &reservedPath ) { + Path newPath( reservedPath ); + if ( 
directoryperdb ) + newPath /= database; class Renamer : public FileOp { public: - Renamer( const Path &reservedPath ) : reservedPath_( reservedPath ) {} + Renamer( const Path &newPath ) : newPath_( newPath ) {} private: - const boost::filesystem::path &reservedPath_; + const boost::filesystem::path &newPath_; virtual bool apply( const Path &p ) { if ( !boost::filesystem::exists( p ) ) return false; - boost::filesystem::rename( p, reservedPath_ / ( p.leaf() + ".bak" ) ); + boost::filesystem::rename( p, newPath_ / ( p.leaf() + ".bak" ) ); return true; } virtual const char * op() const { return "renaming"; } - } renamer( reservedPath ); + } renamer( newPath ); _applyOpToDataFiles( database, renamer, true ); } // move temp files to standard data dir void _replaceWithRecovered( const char *database, const char *reservedPathString ) { - class : public FileOp { + Path newPath( dbpath ); + if ( directoryperdb ) + newPath /= database; + class Replacer : public FileOp { + public: + Replacer( const Path &newPath ) : newPath_( newPath ) {} + private: + const boost::filesystem::path &newPath_; virtual bool apply( const Path &p ) { if ( !boost::filesystem::exists( p ) ) return false; - boost::filesystem::rename( p, boost::filesystem::path(dbpath) / p.leaf() ); + boost::filesystem::rename( p, newPath_ / p.leaf() ); return true; } virtual const char * op() const { return "renaming"; } - } renamer; - _applyOpToDataFiles( database, renamer, true, reservedPathString ); + } replacer( newPath ); + _applyOpToDataFiles( database, replacer, true, reservedPathString ); } // generate a directory name for storing temp data files Path uniqueReservedPath( const char *prefix ) { - Path dbPath = Path( dbpath ); + Path repairPath = Path( repairpath ); Path reservedPath; int i = 0; bool exists = false; do { stringstream ss; ss << prefix << "_repairDatabase_" << i++; - reservedPath = dbPath / ss.str(); + reservedPath = repairPath / ss.str(); BOOST_CHECK_EXCEPTION( exists = boost::filesystem::exists( reservedPath ) ); } while ( exists ); return reservedPath; @@ -1540,6 +1692,8 @@ namespace mongo { problem() << "repairDatabase " << dbName << endl; assert( cc().database()->name == dbName ); + BackgroundOperation::assertNoBgOpInProgForDb(dbName); + boost::intmax_t totalSize = dbSize( dbName ); boost::intmax_t freeSize = freeSpace(); if ( freeSize > -1 && freeSize < totalSize ) { @@ -1553,14 +1707,19 @@ namespace mongo { Path reservedPath = uniqueReservedPath( ( preserveClonedFilesOnFailure || backupOriginalFiles ) ? 
- "backup" : "tmp" ); + "backup" : "$tmp" ); BOOST_CHECK_EXCEPTION( boost::filesystem::create_directory( reservedPath ) ); string reservedPathString = reservedPath.native_directory_string(); - assert( setClient( dbName, reservedPathString.c_str() ) ); - - bool res = cloneFrom(localhost.c_str(), errmsg, dbName, - /*logForReplication=*/false, /*slaveok*/false, /*replauth*/false, /*snapshot*/false); - closeDatabase( dbName, reservedPathString.c_str() ); + + bool res; + { // clone to temp location, which effectively does repair + Client::Context ctx( dbName, reservedPathString ); + assert( ctx.justCreated() ); + + res = cloneFrom(localhost.c_str(), errmsg, dbName, + /*logForReplication=*/false, /*slaveok*/false, /*replauth*/false, /*snapshot*/false); + closeDatabase( dbName, reservedPathString.c_str() ); + } if ( !res ) { problem() << "clone failed for " << dbName << " with error: " << errmsg << endl; @@ -1569,13 +1728,15 @@ namespace mongo { return false; } - assert( !setClient( dbName ) ); + Client::Context ctx( dbName ); closeDatabase( dbName ); - if ( backupOriginalFiles ) + if ( backupOriginalFiles ) { _renameForBackup( dbName, reservedPath ); - else + } else { _deleteDataFiles( dbName ); + BOOST_CHECK_EXCEPTION( boost::filesystem::create_directory( Path( dbpath ) / dbName ) ); + } _replaceWithRecovered( dbName, reservedPathString.c_str() ); @@ -1591,6 +1752,8 @@ namespace mongo { string c = database; c += '.'; boost::filesystem::path p(path); + if ( directoryperdb ) + p /= database; boost::filesystem::path q; q = p / (c+"ns"); bool ok = false; @@ -1619,8 +1782,8 @@ namespace mongo { NamespaceDetails* nsdetails_notinline(const char *ns) { return nsdetails(ns); } - bool DatabaseHolder::closeAll( const string& path , BSONObjBuilder& result ){ - log(2) << "DatabaseHolder::closeAll path:" << path << endl; + bool DatabaseHolder::closeAll( const string& path , BSONObjBuilder& result , bool force ){ + log() << "DatabaseHolder::closeAll path:" << path << endl; dbMutex.assertWriteLocked(); map<string,Database*>& m = _paths[path]; @@ -1633,14 +1796,23 @@ namespace mongo { BSONObjBuilder bb( result.subarrayStart( "dbs" ) ); int n = 0; + int nNotClosed = 0; for( set< string >::iterator i = dbs.begin(); i != dbs.end(); ++i ) { string name = *i; log(2) << "DatabaseHolder::closeAll path:" << path << " name:" << name << endl; - setClient( name.c_str() , path ); - closeDatabase( name.c_str() , path ); - bb.append( bb.numStr( n++ ).c_str() , name ); + Client::Context ctx( name , path ); + if( !force && BackgroundOperation::inProgForDb(name.c_str()) ) { + log() << "WARNING: can't close database " << name << " because a bg job is in progress - try killOp command" << endl; + nNotClosed++; + } + else { + closeDatabase( name.c_str() , path ); + bb.append( bb.numStr( n++ ).c_str() , name ); + } } bb.done(); + if( nNotClosed ) + result.append("nNotClosed", nNotClosed); return true; } diff --git a/db/pdfile.h b/db/pdfile.h index 19a8322..85dc191 100644 --- a/db/pdfile.h +++ b/db/pdfile.h @@ -27,7 +27,7 @@ #include "../stdafx.h" #include "../util/mmap.h" -#include "storage.h" +#include "diskloc.h" #include "jsobjmanipulator.h" #include "namespace.h" #include "client.h" @@ -98,8 +98,10 @@ namespace mongo { static Extent* allocFromFreeList(const char *ns, int approxSize, bool capped = false); /** @return DiskLoc where item ends up */ - const DiskLoc update( + const DiskLoc updateRecord( const char *ns, + NamespaceDetails *d, + NamespaceDetailsTransient *nsdt, Record *toupdate, const DiskLoc& dl, const char *buf, int 
len, OpDebug& debug); // The object o may be updated if modified on insert. @@ -392,6 +394,10 @@ namespace mongo { void _applyOpToDataFiles( const char *database, FileOp &fo, bool afterAllocator = false, const string& path = dbpath ); inline void _deleteDataFiles(const char *database) { + if ( directoryperdb ) { + BOOST_CHECK_EXCEPTION( boost::filesystem::remove_all( boost::filesystem::path( dbpath ) / database ) ); + return; + } class : public FileOp { virtual bool apply( const boost::filesystem::path &p ) { return boost::filesystem::remove( p ); @@ -443,6 +449,6 @@ namespace mongo { void ensureHaveIdIndex(const char *ns); - bool deleteIndexes( NamespaceDetails *d, const char *ns, const char *name, string &errmsg, BSONObjBuilder &anObjBuilder, bool maydeleteIdIndex ); + bool dropIndexes( NamespaceDetails *d, const char *ns, const char *name, string &errmsg, BSONObjBuilder &anObjBuilder, bool maydeleteIdIndex ); } // namespace mongo diff --git a/db/query.cpp b/db/query.cpp index 9c82609..761a312 100644 --- a/db/query.cpp +++ b/db/query.cpp @@ -55,11 +55,11 @@ namespace mongo { justOne_( justOne ), count_(), bestCount_( bestCount ), - nScanned_() { + _nscanned() { } virtual void init() { c_ = qp().newCursor(); - matcher_.reset( new CoveredIndexMatcher( qp().query(), qp().indexKey() ) ); + _matcher.reset( new CoveredIndexMatcher( qp().query(), qp().indexKey() ) ); } virtual void next() { if ( !c_->ok() ) { @@ -69,20 +69,20 @@ namespace mongo { DiskLoc rloc = c_->currLoc(); - if ( matcher_->matches(c_->currKey(), rloc ) ) { + if ( _matcher->matches(c_->currKey(), rloc ) ) { if ( !c_->getsetdup(rloc) ) ++count_; } c_->advance(); - ++nScanned_; + ++_nscanned; if ( count_ > bestCount_ ) bestCount_ = count_; if ( count_ > 0 ) { if ( justOne_ ) setComplete(); - else if ( nScanned_ >= 100 && count_ == bestCount_ ) + else if ( _nscanned >= 100 && count_ == bestCount_ ) setComplete(); } } @@ -95,16 +95,17 @@ namespace mongo { bool justOne_; int count_; int &bestCount_; - long long nScanned_; + long long _nscanned; auto_ptr< Cursor > c_; - auto_ptr< CoveredIndexMatcher > matcher_; + auto_ptr< CoveredIndexMatcher > _matcher; }; /* ns: namespace, e.g. <database>.<collection> pattern: the "where" clause / criteria justOne: stop after 1 match + god: allow access to system namespaces, and don't yield */ - int deleteObjects(const char *ns, BSONObj pattern, bool justOne, bool logop, bool god) { + long long deleteObjects(const char *ns, BSONObj pattern, bool justOne, bool logop, bool god) { if( !god ) { if ( strstr(ns, ".system.") ) { /* note a delete from system.indexes would corrupt the db @@ -124,7 +125,7 @@ namespace mongo { return 0; uassert( 10101 , "can't remove from a capped collection" , ! d->capped ); - int nDeleted = 0; + long long nDeleted = 0; QueryPlanSet s( ns, pattern, BSONObj() ); int best = 0; DeleteOp original( justOne, best ); @@ -136,18 +137,14 @@ namespace mongo { CoveredIndexMatcher matcher(pattern, creal->indexKeyPattern()); - auto_ptr<ClientCursor> cc; - cc.reset( new ClientCursor() ); - cc->c = creal; - cc->ns = ns; - cc->noTimeout(); + auto_ptr<ClientCursor> cc( new ClientCursor(creal, ns, false) ); cc->setDoingDeletes( true ); CursorId id = cc->cursorid; unsigned long long nScanned = 0; do { - if ( ++nScanned % 128 == 0 && !matcher.docMatcher().atomic() ) { + if ( ++nScanned % 128 == 0 && !god && !matcher.docMatcher().atomic() ) { if ( ! 
cc->yield() ){ cc.release(); // has already been deleted elsewhere break; @@ -233,32 +230,9 @@ namespace mongo { log( k == n ) << "killcursors: found " << k << " of " << n << '\n'; } - BSONObj id_obj = fromjson("{\"_id\":ObjectId( \"000000000000000000000000\" )}"); + BSONObj id_obj = fromjson("{\"_id\":1}"); BSONObj empty_obj = fromjson("{}"); - /* This is for languages whose "objects" are not well ordered (JSON is well ordered). - [ { a : ... } , { b : ... } ] -> { a : ..., b : ... } - */ - inline BSONObj transformOrderFromArrayFormat(BSONObj order) { - /* note: this is slow, but that is ok as order will have very few pieces */ - BSONObjBuilder b; - char p[2] = "0"; - - while ( 1 ) { - BSONObj j = order.getObjectField(p); - if ( j.isEmpty() ) - break; - BSONElement e = j.firstElement(); - uassert( 10102 , "bad order array", !e.eoo()); - uassert( 10103 , "bad order array [2]", e.isNumber()); - b.append(e); - (*p)++; - uassert( 10104 , "too many ordering elements", *p <= '9'); - } - - return b.obj(); - } - //int dump = 0; @@ -328,7 +302,7 @@ namespace mongo { } else { BSONObj js = c->current(); - fillQueryResultFromObj(b, cc->filter.get(), js); + fillQueryResultFromObj(b, cc->fields.get(), js); n++; if ( (ntoreturn>0 && (n >= ntoreturn || b.len() > MaxBytesToReturnToClientAtOnce)) || (ntoreturn==0 && b.len()>1*1024*1024) ) { @@ -365,8 +339,8 @@ namespace mongo { virtual void init() { query_ = spec_.getObjectField( "query" ); c_ = qp().newCursor(); - matcher_.reset( new CoveredIndexMatcher( query_, c_->indexKeyPattern() ) ); - if ( qp().exactKeyMatch() && ! matcher_->needRecord() ) { + _matcher.reset( new CoveredIndexMatcher( query_, c_->indexKeyPattern() ) ); + if ( qp().exactKeyMatch() && ! _matcher->needRecord() ) { query_ = qp().simplifiedQuery( qp().indexKey() ); bc_ = dynamic_cast< BtreeCursor* >( c_.get() ); bc_->forgetEndKey(); @@ -398,7 +372,7 @@ namespace mongo { _gotOne(); } } else { - if ( !matcher_->matches(c_->currKey(), c_->currLoc() ) ) { + if ( !_matcher->matches(c_->currKey(), c_->currLoc() ) ) { } else if( !c_->getsetdup(c_->currLoc()) ) { _gotOne(); @@ -434,7 +408,7 @@ namespace mongo { auto_ptr< Cursor > c_; BSONObj query_; BtreeCursor *bc_; - auto_ptr< CoveredIndexMatcher > matcher_; + auto_ptr< CoveredIndexMatcher > _matcher; BSONObj firstMatch_; }; @@ -479,438 +453,389 @@ namespace mongo { // Implements database 'query' requests using the query optimizer's QueryOp interface class UserQueryOp : public QueryOp { public: - UserQueryOp( int ntoskip, int ntoreturn, const BSONObj &order, bool wantMore, - bool explain, FieldMatcher *filter, int queryOptions ) : - b_( 32768 ), - ntoskip_( ntoskip ), - ntoreturn_( ntoreturn ), - order_( order ), - wantMore_( wantMore ), - explain_( explain ), - filter_( filter ), - ordering_(), - nscanned_(), - queryOptions_( queryOptions ), - n_(), - soSize_(), - saveClientCursor_(), - findingStart_( (queryOptions & QueryOption_OplogReplay) != 0 ), - findingStartCursor_() - { - uassert( 10105 , "bad skip value in query", ntoskip >= 0); - } - + + UserQueryOp( const ParsedQuery& pq ) : + //int ntoskip, int ntoreturn, const BSONObj &order, bool wantMore, + // bool explain, FieldMatcher *filter, int queryOptions ) : + _buf( 32768 ) , // TODO be smarter here + _pq( pq ) , + _ntoskip( pq.getSkip() ) , + _nscanned(0), _nscannedObjects(0), + _n(0), + _inMemSort(false), + _saveClientCursor(false), + _oplogReplay( pq.hasOption( QueryOption_OplogReplay) ) + {} + virtual void init() { - b_.skip( sizeof( QueryResult ) ); + _buf.skip( sizeof( QueryResult ) 
); - // findingStart mode is used to find the first operation of interest when - // we are scanning through a repl log. For efficiency in the common case, - // where the first operation of interest is closer to the tail than the head, - // we start from the tail of the log and work backwards until we find the - // first operation of interest. Then we scan forward from that first operation, - // actually returning results to the client. During the findingStart phase, - // we release the db mutex occasionally to avoid blocking the db process for - // an extended period of time. - if ( findingStart_ ) { - // Use a ClientCursor here so we can release db mutex while scanning - // oplog (can take quite a while with large oplogs). - findingStartCursor_ = new ClientCursor(); - findingStartCursor_->noTimeout(); - findingStartCursor_->c = qp().newReverseCursor(); - findingStartCursor_->ns = qp().ns(); + if ( _oplogReplay ) { + _findingStartCursor.reset( new FindingStartCursor( qp() ) ); } else { - c_ = qp().newCursor(); + _c = qp().newCursor( DiskLoc() , _pq.getNumToReturn() + _pq.getSkip() ); } - - matcher_.reset(new CoveredIndexMatcher(qp().query(), qp().indexKey())); - + _matcher.reset(new CoveredIndexMatcher( qp().query() , qp().indexKey())); + if ( qp().scanAndOrderRequired() ) { - ordering_ = true; - so_.reset( new ScanAndOrder( ntoskip_, ntoreturn_, order_ ) ); - wantMore_ = false; + _inMemSort = true; + _so.reset( new ScanAndOrder( _pq.getSkip() , _pq.getNumToReturn() , _pq.getOrder() ) ); } } + virtual void next() { - if ( findingStart_ ) { - if ( !findingStartCursor_ || !findingStartCursor_->c->ok() ) { - findingStart_ = false; - c_ = qp().newCursor(); - } else if ( !matcher_->matches( findingStartCursor_->c->currKey(), findingStartCursor_->c->currLoc() ) ) { - findingStart_ = false; - c_ = qp().newCursor( findingStartCursor_->c->currLoc() ); + if ( _findingStartCursor.get() ) { + if ( _findingStartCursor->done() ) { + _c = _findingStartCursor->cRelease(); + _findingStartCursor.reset( 0 ); } else { - findingStartCursor_->c->advance(); - RARELY { - CursorId id = findingStartCursor_->cursorid; - findingStartCursor_->updateLocation(); - { - dbtemprelease t; - } - findingStartCursor_ = ClientCursor::find( id, false ); - } - return; + _findingStartCursor->next(); } + return; } - if ( findingStartCursor_ ) { - ClientCursor::erase( findingStartCursor_->cursorid ); - findingStartCursor_ = 0; - } - - if ( !c_->ok() ) { + if ( !_c->ok() ) { finish(); return; } - bool mayCreateCursor1 = wantMore_ && ntoreturn_ != 1 && useCursors; + bool mayCreateCursor1 = _pq.wantMore() && ! _inMemSort && _pq.getNumToReturn() != 1 && useCursors; if( 0 ) { - BSONObj js = c_->current(); - cout << "SCANNING " << js << endl; + cout << "SCANNING this: " << this << " key: " << _c->currKey() << " obj: " << _c->current() << endl; } - nscanned_++; - if ( !matcher_->matches(c_->currKey(), c_->currLoc() ) ) { - ; + _nscanned++; + if ( !_matcher->matches(_c->currKey(), _c->currLoc() , &_details ) ) { + // not a match, continue onward + if ( _details.loadedObject ) + _nscannedObjects++; } else { - DiskLoc cl = c_->currLoc(); - if( !c_->getsetdup(cl) ) { - BSONObj js = c_->current(); + _nscannedObjects++; + DiskLoc cl = _c->currLoc(); + if( !_c->getsetdup(cl) ) { // got a match. + + BSONObj js = _pq.returnKey() ? _c->currKey() : _c->current(); assert( js.objsize() >= 0 ); //defensive for segfaults - if ( ordering_ ) { + + if ( _inMemSort ) { // note: no cursors for non-indexed, ordered results. results must be fairly small. 
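+                        // A minimal illustrative sketch (not part of this patch) of how the
+                        // ScanAndOrder members used here fit together: add() buffers each
+                        // match in memory, fill() later writes them out in the requested
+                        // order, which is why no cursor is handed back for this path.
+                        //
+                        //   ScanAndOrder so( skip, limit, order );
+                        //   for( ; c->ok(); c->advance() )
+                        //       if ( matcher.matches( c->currKey(), c->currLoc() ) )
+                        //           so.add( c->current() );
+                        //   so.fill( buf, fields, nReturned );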
- so_->add(js); + _so->add(js); } - else if ( ntoskip_ > 0 ) { - ntoskip_--; - } else { - if ( explain_ ) { - n_++; - if ( n_ >= ntoreturn_ && !wantMore_ ) { + else if ( _ntoskip > 0 ) { + _ntoskip--; + } + else { + if ( _pq.isExplain() ) { + _n++; + if ( _n >= _pq.getNumToReturn() && !_pq.wantMore() ) { // .limit() was used, show just that much. finish(); return; } } else { - fillQueryResultFromObj(b_, filter_, js); - n_++; - if ( (ntoreturn_>0 && (n_ >= ntoreturn_ || b_.len() > MaxBytesToReturnToClientAtOnce)) || - (ntoreturn_==0 && (b_.len()>1*1024*1024 || n_>=101)) ) { - /* if ntoreturn is zero, we return up to 101 objects. on the subsequent getmore, there - is only a size limit. The idea is that on a find() where one doesn't use much results, - we don't return much, but once getmore kicks in, we start pushing significant quantities. - - The n limit (vs. size) is important when someone fetches only one small field from big - objects, which causes massive scanning server-side. - */ + if ( _pq.returnKey() ){ + BSONObjBuilder bb( _buf ); + bb.appendKeys( _c->indexKeyPattern() , js ); + bb.done(); + } + else { + fillQueryResultFromObj( _buf , _pq.getFields() , js ); + } + _n++; + if ( ! _c->supportGetMore() ){ + if ( _pq.enough( _n ) || _buf.len() >= MaxBytesToReturnToClientAtOnce ){ + finish(); + return; + } + } + else if ( _pq.enoughForFirstBatch( _n , _buf.len() ) ){ /* if only 1 requested, no cursor saved for efficiency...we assume it is findOne() */ if ( mayCreateCursor1 ) { - c_->advance(); - if ( c_->ok() ) { + _c->advance(); + if ( _c->ok() ) { // more...so save a cursor - saveClientCursor_ = true; + _saveClientCursor = true; } } finish(); return; - } + } } } } } - c_->advance(); + _c->advance(); } + void finish() { - if ( explain_ ) { - n_ = ordering_ ? so_->size() : n_; - } else if ( ordering_ ) { - so_->fill(b_, filter_, n_); - } - if ( mayCreateCursor2() ) { - c_->setTailable(); + if ( _pq.isExplain() ) { + _n = _inMemSort ? _so->size() : _n; + } + else if ( _inMemSort ) { + _so->fill( _buf, _pq.getFields() , _n ); } + + if ( _pq.hasOption( QueryOption_CursorTailable ) && _pq.getNumToReturn() != 1 ) + _c->setTailable(); + // If the tailing request succeeded. 
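+            // i.e. the ClientCursor is saved even when there are no results yet,
+            // so a later getMore can return documents appended to the capped
+            // collection after this batch was built (runQuery below restricts
+            // tailable cursors to capped collections with $natural order via
+            // uasserts 13051 and 13052).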
- if ( c_->tailable() ) { - saveClientCursor_ = true; - } + if ( _c->tailable() ) + _saveClientCursor = true; + setComplete(); } - virtual bool mayRecordPlan() const { return ntoreturn_ != 1; } + + virtual bool mayRecordPlan() const { return _pq.getNumToReturn() != 1; } + virtual QueryOp *clone() const { - return new UserQueryOp( ntoskip_, ntoreturn_, order_, wantMore_, explain_, filter_, queryOptions_ ); - } - BufBuilder &builder() { return b_; } - bool scanAndOrderRequired() const { return ordering_; } - auto_ptr< Cursor > cursor() { return c_; } - auto_ptr< CoveredIndexMatcher > matcher() { return matcher_; } - int n() const { return n_; } - long long nscanned() const { return nscanned_; } - bool saveClientCursor() const { return saveClientCursor_; } - bool mayCreateCursor2() const { return ( queryOptions_ & QueryOption_CursorTailable ) && ntoreturn_ != 1; } + return new UserQueryOp( _pq ); + } + + BufBuilder &builder() { return _buf; } + bool scanAndOrderRequired() const { return _inMemSort; } + auto_ptr< Cursor > cursor() { return _c; } + auto_ptr< CoveredIndexMatcher > matcher() { return _matcher; } + int n() const { return _n; } + long long nscanned() const { return _nscanned; } + long long nscannedObjects() const { return _nscannedObjects; } + bool saveClientCursor() const { return _saveClientCursor; } + private: - BufBuilder b_; - int ntoskip_; - int ntoreturn_; - BSONObj order_; - bool wantMore_; - bool explain_; - FieldMatcher *filter_; - bool ordering_; - auto_ptr< Cursor > c_; - long long nscanned_; - int queryOptions_; - auto_ptr< CoveredIndexMatcher > matcher_; - int n_; - int soSize_; - bool saveClientCursor_; - auto_ptr< ScanAndOrder > so_; - bool findingStart_; - ClientCursor * findingStartCursor_; + BufBuilder _buf; + const ParsedQuery& _pq; + + long long _ntoskip; + long long _nscanned; + long long _nscannedObjects; + int _n; // found so far + + MatchDetails _details; + + bool _inMemSort; + auto_ptr< ScanAndOrder > _so; + + auto_ptr< Cursor > _c; + + auto_ptr< CoveredIndexMatcher > _matcher; + + bool _saveClientCursor; + bool _oplogReplay; + auto_ptr< FindingStartCursor > _findingStartCursor; }; /* run a query -- includes checking for and running a Command */ auto_ptr< QueryResult > runQuery(Message& m, QueryMessage& q, CurOp& curop ) { StringBuilder& ss = curop.debug().str; + ParsedQuery pq( q ); const char *ns = q.ns; int ntoskip = q.ntoskip; - int _ntoreturn = q.ntoreturn; BSONObj jsobj = q.query; - auto_ptr< FieldMatcher > filter = q.fields; // what fields to return (unspecified = full object) int queryOptions = q.queryOptions; BSONObj snapshotHint; - Timer t; if( logLevel >= 2 ) log() << "runQuery: " << ns << jsobj << endl; long long nscanned = 0; - bool wantMore = true; - int ntoreturn = _ntoreturn; - if ( _ntoreturn < 0 ) { - /* _ntoreturn greater than zero is simply a hint on how many objects to send back per - "cursor batch". - A negative number indicates a hard limit. - */ - ntoreturn = -_ntoreturn; - wantMore = false; - } - ss << "query " << ns << " ntoreturn:" << ntoreturn; + ss << ns << " ntoreturn:" << pq.getNumToReturn(); curop.setQuery(jsobj); - BufBuilder bb; BSONObjBuilder cmdResBuf; long long cursorid = 0; - bb.skip(sizeof(QueryResult)); - auto_ptr< QueryResult > qr; int n = 0; Client& c = cc(); - /* we assume you are using findOne() for running a cmd... 
*/ - if ( ntoreturn == 1 && runCommands(ns, jsobj, curop, bb, cmdResBuf, false, queryOptions) ) { - n = 1; - qr.reset( (QueryResult *) bb.buf() ); - bb.decouple(); - qr->setResultFlagsToOk(); - qr->len = bb.len(); - ss << " reslen:" << bb.len(); - // qr->channel = 0; - qr->setOperation(opReply); - qr->cursorId = cursorid; - qr->startingFrom = 0; - qr->nReturned = n; + + if ( pq.couldBeCommand() ){ + BufBuilder bb; + bb.skip(sizeof(QueryResult)); + + if ( runCommands(ns, jsobj, curop, bb, cmdResBuf, false, queryOptions) ) { + ss << " command "; + curop.markCommand(); + n = 1; + qr.reset( (QueryResult *) bb.buf() ); + bb.decouple(); + qr->setResultFlagsToOk(); + qr->len = bb.len(); + ss << " reslen:" << bb.len(); + // qr->channel = 0; + qr->setOperation(opReply); + qr->cursorId = cursorid; + qr->startingFrom = 0; + qr->nReturned = n; + } + return qr; } - else { - /* regular query */ - - AuthenticationInfo *ai = currentClient.get()->ai; - uassert( 10106 , "unauthorized", ai->isAuthorized(c.database()->name.c_str())); - - /* we allow queries to SimpleSlave's -- but not to the slave (nonmaster) member of a replica pair - so that queries to a pair are realtime consistent as much as possible. use setSlaveOk() to - query the nonmaster member of a replica pair. - */ - uassert( 10107 , "not master", isMaster() || (queryOptions & QueryOption_SlaveOk) || slave == SimpleSlave ); - - BSONElement hint; - BSONObj min; - BSONObj max; - bool explain = false; - bool _gotquery = false; - bool snapshot = false; - BSONObj query; - { - BSONElement e = jsobj.findElement("$query"); - if ( e.eoo() ) - e = jsobj.findElement("query"); - if ( !e.eoo() && (e.type() == Object || e.type() == Array) ) { - query = e.embeddedObject(); - _gotquery = true; - } + + // regular query + + mongolock lk(false); // read lock + Client::Context ctx( ns , dbpath , &lk ); + + /* we allow queries to SimpleSlave's -- but not to the slave (nonmaster) member of a replica pair + so that queries to a pair are realtime consistent as much as possible. use setSlaveOk() to + query the nonmaster member of a replica pair. + */ + uassert( 10107 , "not master" , isMaster() || pq.hasOption( QueryOption_SlaveOk ) || replSettings.slave == SimpleSlave ); + + BSONElement hint = useHints ? pq.getHint() : BSONElement(); + bool explain = pq.isExplain(); + bool snapshot = pq.isSnapshot(); + BSONObj query = pq.getFilter(); + BSONObj order = pq.getOrder(); + + if ( pq.hasOption( QueryOption_CursorTailable ) ) { + NamespaceDetails *d = nsdetails( ns ); + uassert( 13051, "tailable cursor requested on non capped collection", d && d->capped ); + if ( order.isEmpty() ) { + order = BSON( "$natural" << 1 ); + } else { + uassert( 13052, "only {$natural:1} order allowed for tailable cursor", order == BSON( "$natural" << 1 ) ); } - BSONObj order; - { - BSONElement e = jsobj.findElement("$orderby"); - if ( e.eoo() ) - e = jsobj.findElement("orderby"); - if ( !e.eoo() ) { - order = e.embeddedObjectUserCheck(); - if ( e.type() == Array ) - order = transformOrderFromArrayFormat(order); + } + + if( snapshot ) { + NamespaceDetails *d = nsdetails(ns); + if ( d ){ + int i = d->findIdIndex(); + if( i < 0 ) { + if ( strstr( ns , ".system." 
) == 0 ) + log() << "warning: no _id index on $snapshot query, ns:" << ns << endl; } - } - if ( !_gotquery && order.isEmpty() ) - query = jsobj; - else { - explain = jsobj.getBoolField("$explain"); - if ( useHints ) - hint = jsobj.getField("$hint"); - min = jsobj.getObjectField("$min"); - max = jsobj.getObjectField("$max"); - BSONElement e = jsobj.getField("$snapshot"); - snapshot = !e.eoo() && e.trueValue(); - if( snapshot ) { - uassert( 12001 , "E12001 can't sort with $snapshot", order.isEmpty()); - uassert( 12002 , "E12002 can't use hint with $snapshot", hint.eoo()); - NamespaceDetails *d = nsdetails(ns); - if ( d ){ - int i = d->findIdIndex(); - if( i < 0 ) { - if ( strstr( ns , ".system." ) == 0 ) - log() << "warning: no _id index on $snapshot query, ns:" << ns << endl; - } - else { - /* [dm] the name of an _id index tends to vary, so we build the hint the hard way here. - probably need a better way to specify "use the _id index" as a hint. if someone is - in the query optimizer please fix this then! - */ - BSONObjBuilder b; - b.append("$hint", d->idx(i).indexName()); - snapshotHint = b.obj(); - hint = snapshotHint.firstElement(); - } - } + else { + /* [dm] the name of an _id index tends to vary, so we build the hint the hard way here. + probably need a better way to specify "use the _id index" as a hint. if someone is + in the query optimizer please fix this then! + */ + BSONObjBuilder b; + b.append("$hint", d->idx(i).indexName()); + snapshotHint = b.obj(); + hint = snapshotHint.firstElement(); } } + } - /* The ElemIter will not be happy if this isn't really an object. So throw exception - here when that is true. - (Which may indicate bad data from client.) - */ - if ( query.objsize() == 0 ) { - out() << "Bad query object?\n jsobj:"; - out() << jsobj.toString() << "\n query:"; - out() << query.toString() << endl; - uassert( 10110 , "bad query object", false); - } + /* The ElemIter will not be happy if this isn't really an object. So throw exception + here when that is true. + (Which may indicate bad data from client.) + */ + if ( query.objsize() == 0 ) { + out() << "Bad query object?\n jsobj:"; + out() << jsobj.toString() << "\n query:"; + out() << query.toString() << endl; + uassert( 10110 , "bad query object", false); + } - bool idHackWorked = false; - if ( strcmp( query.firstElement().fieldName() , "_id" ) == 0 && query.nFields() == 1 && query.firstElement().isSimpleType() ){ - nscanned = 1; + if ( ! explain && isSimpleIdQuery( query ) && !pq.hasOption( QueryOption_CursorTailable ) ) { + nscanned = 1; - bool nsFound = false; - bool indexFound = false; + bool nsFound = false; + bool indexFound = false; - BSONObj resObject; - bool found = Helpers::findById( c, ns , query , resObject , &nsFound , &indexFound ); - if ( nsFound == false || indexFound == true ){ - idHackWorked = true; - if ( found ){ - n = 1; - fillQueryResultFromObj( bb , filter.get() , resObject ); - } - qr.reset( (QueryResult *) bb.buf() ); - bb.decouple(); - qr->setResultFlagsToOk(); - qr->len = bb.len(); - ss << " reslen:" << bb.len(); - qr->setOperation(opReply); - qr->cursorId = cursorid; - qr->startingFrom = 0; - qr->nReturned = n; - } - } - - if ( ! 
idHackWorked ){ // non-simple _id lookup - BSONObj oldPlan; - if ( explain && hint.eoo() && min.isEmpty() && max.isEmpty() ) { - QueryPlanSet qps( ns, query, order ); - if ( qps.usingPrerecordedPlan() ) - oldPlan = qps.explain(); - } - QueryPlanSet qps( ns, query, order, &hint, !explain, min, max ); - UserQueryOp original( ntoskip, ntoreturn, order, wantMore, explain, filter.get(), queryOptions ); - shared_ptr< UserQueryOp > o = qps.runOp( original ); - UserQueryOp &dqo = *o; - massert( 10362 , dqo.exceptionMessage(), dqo.complete() ); - n = dqo.n(); - nscanned = dqo.nscanned(); - if ( dqo.scanAndOrderRequired() ) - ss << " scanAndOrder "; - auto_ptr< Cursor > c = dqo.cursor(); - log( 5 ) << " used cursor: " << c.get() << endl; - if ( dqo.saveClientCursor() ) { - ClientCursor *cc = new ClientCursor(); - if ( queryOptions & QueryOption_NoCursorTimeout ) - cc->noTimeout(); - cc->c = c; - cursorid = cc->cursorid; - cc->query = jsobj.getOwned(); - DEV out() << " query has more, cursorid: " << cursorid << endl; - cc->matcher = dqo.matcher(); - cc->ns = ns; - cc->pos = n; - cc->filter = filter; - cc->originalMessage = m; - cc->updateLocation(); - if ( !cc->c->ok() && cc->c->tailable() ) { - DEV out() << " query has no more but tailable, cursorid: " << cursorid << endl; - } else { - DEV out() << " query has more, cursorid: " << cursorid << endl; - } - } - if ( explain ) { - BSONObjBuilder builder; - builder.append("cursor", c->toString()); - builder.append("startKey", c->prettyStartKey()); - builder.append("endKey", c->prettyEndKey()); - builder.append("nscanned", double( dqo.nscanned() ) ); - builder.append("n", n); - if ( dqo.scanAndOrderRequired() ) - builder.append("scanAndOrder", true); - builder.append("millis", t.millis()); - if ( !oldPlan.isEmpty() ) - builder.append( "oldPlan", oldPlan.firstElement().embeddedObject().firstElement().embeddedObject() ); - if ( hint.eoo() ) - builder.appendElements(qps.explain()); - BSONObj obj = builder.done(); - fillQueryResultFromObj(dqo.builder(), 0, obj); + BSONObj resObject; + bool found = Helpers::findById( c, ns , query , resObject , &nsFound , &indexFound ); + if ( nsFound == false || indexFound == true ){ + BufBuilder bb(sizeof(QueryResult)+resObject.objsize()+32); + bb.skip(sizeof(QueryResult)); + + ss << " idhack "; + if ( found ){ n = 1; + fillQueryResultFromObj( bb , pq.getFields() , resObject ); } - qr.reset( (QueryResult *) dqo.builder().buf() ); - dqo.builder().decouple(); - qr->cursorId = cursorid; + qr.reset( (QueryResult *) bb.buf() ); + bb.decouple(); qr->setResultFlagsToOk(); - qr->len = dqo.builder().len(); - ss << " reslen:" << qr->len; + qr->len = bb.len(); + ss << " reslen:" << bb.len(); qr->setOperation(opReply); + qr->cursorId = cursorid; qr->startingFrom = 0; - qr->nReturned = n; + qr->nReturned = n; + return qr; + } + } + + // regular, not QO bypass query + + BSONObj oldPlan; + if ( explain && ! 
pq.hasIndexSpecifier() ){ + QueryPlanSet qps( ns, query, order ); + if ( qps.usingPrerecordedPlan() ) + oldPlan = qps.explain(); + } + QueryPlanSet qps( ns, query, order, &hint, !explain, pq.getMin(), pq.getMax() ); + UserQueryOp original( pq ); + shared_ptr< UserQueryOp > o = qps.runOp( original ); + UserQueryOp &dqo = *o; + massert( 10362 , dqo.exceptionMessage(), dqo.complete() ); + n = dqo.n(); + nscanned = dqo.nscanned(); + if ( dqo.scanAndOrderRequired() ) + ss << " scanAndOrder "; + auto_ptr<Cursor> cursor = dqo.cursor(); + log( 5 ) << " used cursor: " << cursor.get() << endl; + if ( dqo.saveClientCursor() ) { + // the clientcursor now owns the Cursor* and 'c' is released: + ClientCursor *cc = new ClientCursor(cursor, ns, !(queryOptions & QueryOption_NoCursorTimeout)); + cursorid = cc->cursorid; + cc->query = jsobj.getOwned(); + DEV out() << " query has more, cursorid: " << cursorid << endl; + cc->matcher = dqo.matcher(); + cc->pos = n; + cc->fields = pq.getFieldPtr(); + cc->originalMessage = m; + cc->updateLocation(); + if ( !cc->c->ok() && cc->c->tailable() ) { + DEV out() << " query has no more but tailable, cursorid: " << cursorid << endl; + } else { + DEV out() << " query has more, cursorid: " << cursorid << endl; } } + if ( explain ) { + BSONObjBuilder builder; + builder.append("cursor", cursor->toString()); + builder.appendArray("indexBounds", cursor->prettyIndexBounds()); + builder.appendNumber("nscanned", dqo.nscanned() ); + builder.appendNumber("nscannedObjects", dqo.nscannedObjects() ); + builder.append("n", n); + if ( dqo.scanAndOrderRequired() ) + builder.append("scanAndOrder", true); + builder.append("millis", curop.elapsedMillis()); + if ( !oldPlan.isEmpty() ) + builder.append( "oldPlan", oldPlan.firstElement().embeddedObject().firstElement().embeddedObject() ); + if ( hint.eoo() ) + builder.appendElements(qps.explain()); + BSONObj obj = builder.done(); + fillQueryResultFromObj(dqo.builder(), 0, obj); + n = 1; + } + qr.reset( (QueryResult *) dqo.builder().buf() ); + dqo.builder().decouple(); + qr->cursorId = cursorid; + qr->setResultFlagsToOk(); + qr->len = dqo.builder().len(); + ss << " reslen:" << qr->len; + qr->setOperation(opReply); + qr->startingFrom = 0; + qr->nReturned = n; + - int duration = t.millis(); - Database *database = c.database(); - if ( (database && database->profile) || duration >= 100 ) { + int duration = curop.elapsedMillis(); + bool dbprofile = curop.shouldDBProfile( duration ); + if ( dbprofile || duration >= cmdLine.slowMS ) { ss << " nscanned:" << nscanned << ' '; if ( ntoskip ) ss << " ntoskip:" << ntoskip; - if ( database && database->profile ) + if ( dbprofile ) ss << " \nquery: "; ss << jsobj << ' '; } @@ -22,7 +22,7 @@ #include "../util/message.h" #include "dbmessage.h" #include "jsobj.h" -#include "storage.h" +#include "diskloc.h" /* db request message format @@ -71,13 +71,15 @@ namespace mongo { + extern const int MaxBytesToReturnToClientAtOnce; + // for an existing query (ie a ClientCursor), send back additional information. QueryResult* getMore(const char *ns, int ntoreturn, long long cursorid , CurOp& op); struct UpdateResult { bool existing; bool mod; - unsigned long long num; + long long num; UpdateResult( bool e, bool m, unsigned long long n ) : existing(e) , mod(m), num(n ){} @@ -100,16 +102,213 @@ namespace mongo { /* returns true if an existing object was updated, false if no existing object was found. 
multi - update multiple objects - mostly useful with things like $set + god - allow access to system namespaces and don't yield */ - UpdateResult updateObjects(const char *ns, BSONObj updateobj, BSONObj pattern, bool upsert, bool multi , bool logop , OpDebug& debug ); + UpdateResult updateObjects(const char *ns, const BSONObj& updateobj, BSONObj pattern, bool upsert, bool multi , bool logop , OpDebug& debug ); // If justOne is true, deletedId is set to the id of the deleted object. - int deleteObjects(const char *ns, BSONObj pattern, bool justOne, bool logop = false, bool god=false); + long long deleteObjects(const char *ns, BSONObj pattern, bool justOne, bool logop = false, bool god=false); long long runCount(const char *ns, const BSONObj& cmd, string& err); auto_ptr< QueryResult > runQuery(Message& m, QueryMessage& q, CurOp& curop ); + /* This is for languages whose "objects" are not well ordered (JSON is well ordered). + [ { a : ... } , { b : ... } ] -> { a : ..., b : ... } + */ + inline BSONObj transformOrderFromArrayFormat(BSONObj order) { + /* note: this is slow, but that is ok as order will have very few pieces */ + BSONObjBuilder b; + char p[2] = "0"; + + while ( 1 ) { + BSONObj j = order.getObjectField(p); + if ( j.isEmpty() ) + break; + BSONElement e = j.firstElement(); + uassert( 10102 , "bad order array", !e.eoo()); + uassert( 10103 , "bad order array [2]", e.isNumber()); + b.append(e); + (*p)++; + uassert( 10104 , "too many ordering elements", *p <= '9'); + } + + return b.obj(); + } + + /** + * this represents a total user query + * includes fields from the query message, both possible query levels + * parses everything up front + */ + class ParsedQuery { + public: + ParsedQuery( QueryMessage& qm ) + : _ns( qm.ns ) , _ntoskip( qm.ntoskip ) , _ntoreturn( qm.ntoreturn ) , _options( qm.queryOptions ){ + init( qm.query ); + initFields( qm.fields ); + } + ParsedQuery( const char* ns , int ntoskip , int ntoreturn , int queryoptions , const BSONObj& query , const BSONObj& fields ) + : _ns( ns ) , _ntoskip( ntoskip ) , _ntoreturn( ntoreturn ) , _options( queryoptions ){ + init( query ); + initFields( fields ); + } + + ~ParsedQuery(){} + + const char * ns() const { return _ns; } + + const BSONObj& getFilter() const { return _filter; } + FieldMatcher* getFields() const { return _fields.get(); } + shared_ptr<FieldMatcher> getFieldPtr() const { return _fields; } + + int getSkip() const { return _ntoskip; } + int getNumToReturn() const { return _ntoreturn; } + bool wantMore() const { return _wantMore; } + int getOptions() const { return _options; } + bool hasOption( int x ) const { return x & _options; } + + + bool isExplain() const { return _explain; } + bool isSnapshot() const { return _snapshot; } + bool returnKey() const { return _returnKey; } + + const BSONObj& getMin() const { return _min; } + const BSONObj& getMax() const { return _max; } + const BSONObj& getOrder() const { return _order; } + const BSONElement& getHint() const { return _hint; } + + bool couldBeCommand() const { + /* we assume you are using findOne() for running a cmd... */ + return _ntoreturn == 1 && strstr( _ns , ".$cmd" ); + } + + bool hasIndexSpecifier() const { + return ! _hint.eoo() || ! _min.isEmpty() || ! _max.isEmpty(); + } + + /* if ntoreturn is zero, we return up to 101 objects. on the subsequent getmore, there + is only a size limit. The idea is that on a find() where one doesn't use much results, + we don't return much, but once getmore kicks in, we start pushing significant quantities. 
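+
+           Concretely (using the constants in enoughForFirstBatch() below): a
+           find() sent with ntoreturn == 0 closes its first batch at 101
+           objects or ~1MB of reply, whichever comes first, while enough()
+           never cuts it off; later batches are bounded only by reply size.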
+ + The n limit (vs. size) is important when someone fetches only one small field from big + objects, which causes massive scanning server-side. + */ + bool enoughForFirstBatch( int n , int len ) const { + if ( _ntoreturn == 0 ) + return ( len > 1024 * 1024 ) || n >= 101; + return n >= _ntoreturn || len > MaxBytesToReturnToClientAtOnce; + } + + bool enough( int n ) const { + if ( _ntoreturn == 0 ) + return false; + return n >= _ntoreturn; + } + + private: + void init( const BSONObj& q ){ + _reset(); + uassert( 10105 , "bad skip value in query", _ntoskip >= 0); + + if ( _ntoreturn < 0 ){ + /* _ntoreturn greater than zero is simply a hint on how many objects to send back per + "cursor batch". + A negative number indicates a hard limit. + */ + _wantMore = false; + _ntoreturn = -_ntoreturn; + } + + + BSONElement e = q["query"]; + if ( ! e.isABSONObj() ) + e = q["$query"]; + + if ( e.isABSONObj() ){ + _filter = e.embeddedObject(); + _initTop( q ); + } + else { + _filter = q; + } + } + + void _reset(){ + _wantMore = true; + _explain = false; + _snapshot = false; + _returnKey = false; + } + + void _initTop( const BSONObj& top ){ + BSONObjIterator i( top ); + while ( i.more() ){ + BSONElement e = i.next(); + const char * name = e.fieldName(); + + if ( strcmp( "$orderby" , name ) == 0 || + strcmp( "orderby" , name ) == 0 ){ + if ( e.type() == Object ) + _order = e.embeddedObject(); + else if ( e.type() == Array ) + _order = transformOrderFromArrayFormat( _order ); + else + assert( 0 ); + } + else if ( strcmp( "$explain" , name ) == 0 ) + _explain = e.trueValue(); + else if ( strcmp( "$snapshot" , name ) == 0 ) + _snapshot = e.trueValue(); + else if ( strcmp( "$min" , name ) == 0 ) + _min = e.embeddedObject(); + else if ( strcmp( "$max" , name ) == 0 ) + _max = e.embeddedObject(); + else if ( strcmp( "$hint" , name ) == 0 ) + _hint = e; + else if ( strcmp( "$returnKey" , name ) == 0 ) + _returnKey = e.trueValue(); + + } + + if ( _snapshot ){ + uassert( 12001 , "E12001 can't sort with $snapshot", _order.isEmpty() ); + uassert( 12002 , "E12002 can't use hint with $snapshot", _hint.eoo() ); + } + + } + + void initFields( const BSONObj& fields ){ + if ( fields.isEmpty() ) + return; + _fields.reset( new FieldMatcher() ); + _fields->add( fields ); + } + + ParsedQuery( const ParsedQuery& other ){ + assert(0); + } + + const char* _ns; + int _ntoskip; + int _ntoreturn; + int _options; + + BSONObj _filter; + shared_ptr< FieldMatcher > _fields; + + bool _wantMore; + + bool _explain; + bool _snapshot; + bool _returnKey; + BSONObj _min; + BSONObj _max; + BSONElement _hint; + BSONObj _order; + }; + + } // namespace mongo #include "clientcursor.h" diff --git a/db/queryoptimizer.cpp b/db/queryoptimizer.cpp index 499417a..fa08323 100644 --- a/db/queryoptimizer.cpp +++ b/db/queryoptimizer.cpp @@ -24,6 +24,9 @@ #include "queryoptimizer.h" #include "cmdline.h" +//#define DEBUGQO(x) cout << x << endl; +#define DEBUGQO(x) + namespace mongo { void checkTableScanAllowed( const char * ns ){ @@ -39,7 +42,7 @@ namespace mongo { uassert( 10111 , (string)"table scans not allowed:" + ns , ! 
cmdLine.notablescan ); } - + double elementDirection( const BSONElement &e ) { if ( e.isNumber() ) return e.number(); @@ -48,7 +51,7 @@ namespace mongo { QueryPlan::QueryPlan( NamespaceDetails *_d, int _idxNo, - const FieldRangeSet &fbs, const BSONObj &order, const BSONObj &startKey, const BSONObj &endKey ) : + const FieldRangeSet &fbs, const BSONObj &order, const BSONObj &startKey, const BSONObj &endKey , string special ) : d(_d), idxNo(_idxNo), fbs_( fbs ), order_( order ), @@ -58,7 +61,9 @@ namespace mongo { exactKeyMatch_( false ), direction_( 0 ), endKeyInclusive_( endKey.isEmpty() ), - unhelpful_( false ) { + unhelpful_( false ), + _special( special ), + _type(0){ if ( !fbs_.matchPossible() ) { unhelpful_ = true; @@ -75,6 +80,14 @@ namespace mongo { return; } + if ( _special.size() ){ + optimal_ = true; + _type = index_->getSpec().getType(); + massert( 13040 , (string)"no type for special: " + _special , _type ); + scanAndOrderRequired_ = _type->scanAndOrderRequired( fbs.query() , order ); + return; + } + BSONObj idxKey = index_->keyPattern(); BSONObjIterator o( order ); BSONObjIterator k( idxKey ); @@ -163,7 +176,11 @@ namespace mongo { unhelpful_ = true; } - auto_ptr< Cursor > QueryPlan::newCursor( const DiskLoc &startLoc ) const { + auto_ptr< Cursor > QueryPlan::newCursor( const DiskLoc &startLoc , int numWanted ) const { + + if ( _type ) + return _type->newCursor( fbs_.query() , order_ , numWanted ); + if ( !fbs_.matchPossible() ){ if ( fbs_.nNontrivialRanges() ) checkTableScanAllowed( fbs_.ns() ); @@ -206,13 +223,14 @@ namespace mongo { void QueryPlan::registerSelf( long long nScanned ) const { if ( fbs_.matchPossible() ) { - boostlock lk(NamespaceDetailsTransient::_qcMutex); + scoped_lock lk(NamespaceDetailsTransient::_qcMutex); NamespaceDetailsTransient::get_inlock( ns() ).registerIndexForPattern( fbs_.pattern( order_ ), indexKey(), nScanned ); } } QueryPlanSet::QueryPlanSet( const char *_ns, const BSONObj &query, const BSONObj &order, const BSONElement *hint, bool honorRecordedPlan, const BSONObj &min, const BSONObj &max ) : ns(_ns), + query_( query.getOwned() ), fbs_( _ns, query ), mayRecordPlan_( true ), usingPrerecordedPlan_( false ), @@ -223,9 +241,7 @@ namespace mongo { min_( min.getOwned() ), max_( max.getOwned() ) { if ( hint && !hint->eoo() ) { - BSONObjBuilder b; - b.append( *hint ); - hint_ = b.obj(); + hint_ = hint->wrap(); } init(); } @@ -242,6 +258,7 @@ namespace mongo { } void QueryPlanSet::init() { + DEBUGQO( "QueryPlanSet::init " << ns << "\t" << query_ ); plans_.clear(); mayRecordPlan_ = true; usingPrerecordedPlan_ = false; @@ -297,9 +314,43 @@ namespace mongo { plans_.push_back( PlanPtr( new QueryPlan( d, d->idxNo(*idx), fbs_, order_, min_, max_ ) ) ); return; } - + + if ( isSimpleIdQuery( query_ ) ){ + int idx = d->findIdIndex(); + if ( idx >= 0 ){ + usingPrerecordedPlan_ = true; + mayRecordPlan_ = false; + plans_.push_back( PlanPtr( new QueryPlan( d , idx , fbs_ , order_ ) ) ); + return; + } + } + + if ( query_.isEmpty() && order_.isEmpty() ){ + plans_.push_back( PlanPtr( new QueryPlan( d, -1, fbs_, order_ ) ) ); + return; + } + + DEBUGQO( "\t special : " << fbs_.getSpecial() ); + if ( fbs_.getSpecial().size() ){ + _special = fbs_.getSpecial(); + NamespaceDetails::IndexIterator i = d->ii(); + while( i.more() ) { + int j = i.pos(); + IndexDetails& ii = i.next(); + const IndexSpec& spec = ii.getSpec(); + if ( spec.getTypeName() == _special && spec.suitability( query_ , order_ ) ){ + usingPrerecordedPlan_ = true; + mayRecordPlan_ = true; + plans_.push_back( 
PlanPtr( new QueryPlan( d , j , fbs_ , order_ , + BSONObj() , BSONObj() , _special ) ) ); + return; + } + } + uassert( 13038 , (string)"can't find special index: " + _special + " for: " + query_.toString() , 0 ); + } + if ( honorRecordedPlan_ ) { - boostlock lk(NamespaceDetailsTransient::_qcMutex); + scoped_lock lk(NamespaceDetailsTransient::_qcMutex); NamespaceDetailsTransient& nsd = NamespaceDetailsTransient::get_inlock( ns ); BSONObj bestIndex = nsd.indexForPattern( fbs_.pattern( order_ ) ); if ( !bestIndex.isEmpty() ) { @@ -334,7 +385,7 @@ namespace mongo { if ( !d ) return; - // If table scan is optimal or natural order requested + // If table scan is optimal or natural order requested or tailable cursor requested if ( !fbs_.matchPossible() || ( fbs_.nNontrivialRanges() == 0 && order_.isEmpty() ) || ( !order_.isEmpty() && !strcmp( order_.firstElement().fieldName(), "$natural" ) ) ) { // Table scan plan @@ -342,8 +393,19 @@ namespace mongo { return; } + bool normalQuery = hint_.isEmpty() && min_.isEmpty() && max_.isEmpty(); + PlanSet plans; for( int i = 0; i < d->nIndexes; ++i ) { + IndexDetails& id = d->idx(i); + const IndexSpec& spec = id.getSpec(); + IndexSuitability suitability = HELPFUL; + if ( normalQuery ){ + suitability = spec.suitability( query_ , order_ ); + if ( suitability == USELESS ) + continue; + } + PlanPtr p( new QueryPlan( d, i, fbs_, order_ ) ); if ( p->optimal() ) { addPlan( p, checkFirst ); @@ -367,7 +429,7 @@ namespace mongo { if ( res->complete() || plans_.size() > 1 ) return res; { - boostlock lk(NamespaceDetailsTransient::_qcMutex); + scoped_lock lk(NamespaceDetailsTransient::_qcMutex); NamespaceDetailsTransient::get_inlock( fbs_.ns() ).registerIndexForPattern( fbs_.pattern( order_ ), BSONObj(), 0 ); } init(); @@ -380,7 +442,10 @@ namespace mongo { vector< BSONObj > arr; for( PlanSet::const_iterator i = plans_.begin(); i != plans_.end(); ++i ) { auto_ptr< Cursor > c = (*i)->newCursor(); - arr.push_back( BSON( "cursor" << c->toString() << "startKey" << c->prettyStartKey() << "endKey" << c->prettyEndKey() ) ); + BSONObjBuilder explain; + explain.append( "cursor", c->toString() ); + explain.appendArray( "indexBounds", c->prettyIndexBounds() ); + arr.push_back( explain.obj() ); } BSONObjBuilder b; b.append( "allPlans", arr ); @@ -433,7 +498,7 @@ namespace mongo { } if ( errCount == ops.size() ) break; - if ( plans_.usingPrerecordedPlan_ && nScanned > plans_.oldNScanned_ * 10 ) { + if ( plans_.usingPrerecordedPlan_ && nScanned > plans_.oldNScanned_ * 10 && plans_._special.empty() ) { plans_.addOtherPlans( true ); PlanSet::iterator i = plans_.plans_.begin(); ++i; @@ -558,7 +623,7 @@ namespace mongo { return 0; } - setClient( ns ); + Client::Context ctx( ns ); IndexDetails *id = 0; NamespaceDetails *d = nsdetails( ns ); if ( !d ) { @@ -576,9 +641,11 @@ namespace mongo { while( i.more() ) { IndexDetails& ii = i.next(); if ( indexWorks( ii.keyPattern(), min.isEmpty() ? 
max : min, ret.first, ret.second ) ) {
-                id = &ii;
-                keyPattern = ii.keyPattern();
-                break;
+                if ( ii.getSpec().getType() == 0 ){
+                    id = &ii;
+                    keyPattern = ii.keyPattern();
+                    break;
+                }
             }
         }
diff --git a/db/queryoptimizer.h b/db/queryoptimizer.h
index e4a79d8..1cb5052 100644
--- a/db/queryoptimizer.h
+++ b/db/queryoptimizer.h
@@ -25,6 +25,8 @@ namespace mongo {
 
     class IndexDetails;
+    class IndexType;
+
     class QueryPlan : boost::noncopyable {
     public:
         QueryPlan(NamespaceDetails *_d,
@@ -32,7 +34,8 @@ namespace mongo {
                   const FieldRangeSet &fbs,
                   const BSONObj &order,
                   const BSONObj &startKey = BSONObj(),
-                  const BSONObj &endKey = BSONObj() );
+                  const BSONObj &endKey = BSONObj() ,
+                  string special="" );
 
         /* If true, no other index can do better. */
         bool optimal() const { return optimal_; }
@@ -46,10 +49,11 @@ namespace mongo {
            requested sort order */
         bool unhelpful() const { return unhelpful_; }
         int direction() const { return direction_; }
-        auto_ptr< Cursor > newCursor( const DiskLoc &startLoc = DiskLoc() ) const;
+        auto_ptr< Cursor > newCursor( const DiskLoc &startLoc = DiskLoc() , int numWanted=0 ) const;
         auto_ptr< Cursor > newReverseCursor() const;
         BSONObj indexKey() const;
         const char *ns() const { return fbs_.ns(); }
+        NamespaceDetails *nsd() const { return d; }
         BSONObj query() const { return fbs_.query(); }
         BSONObj simplifiedQuery( const BSONObj& fields = BSONObj() ) const { return fbs_.simplifiedQuery( fields ); }
         const FieldRange &range( const char *fieldName ) const { return fbs_.range( fieldName ); }
@@ -69,6 +73,8 @@ namespace mongo {
         BoundList indexBounds_;
         bool endKeyInclusive_;
         bool unhelpful_;
+        string _special;
+        IndexType * _type;
     };
 
     // Inherit from this interface to implement a new query operation.
@@ -78,11 +84,15 @@ namespace mongo {
     public:
         QueryOp() : complete_(), qp_(), error_() {}
         virtual ~QueryOp() {}
+
+        /** this gets called after a query plan is set? ERH 2/16/10 */
         virtual void init() = 0;
         virtual void next() = 0;
         virtual bool mayRecordPlan() const = 0;
-        // Return a copy of the inheriting class, which will be run with its own
-        // query plan.
+
+        /** @return a copy of the inheriting class, which will be run with its own
+            query plan.
+        */
         virtual QueryOp *clone() const = 0;
         bool complete() const { return complete_; }
         bool error() const { return error_; }
@@ -143,6 +153,7 @@ namespace mongo {
             static void nextOp( QueryOp &op );
         };
         const char *ns;
+        BSONObj query_;
         FieldRangeSet fbs_;
         PlanSet plans_;
         bool mayRecordPlan_;
@@ -153,9 +164,17 @@ namespace mongo {
         bool honorRecordedPlan_;
         BSONObj min_;
         BSONObj max_;
+        string _special;
     };
 
     // NOTE min, max, and keyPattern will be updated to be consistent with the selected index.
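    // A hedged usage sketch (names only from the declaration below, with the
    // failure behavior assumed): the caller supplies raw bounds and the
    // routine picks a plain, non-special index (see the getType() == 0 check
    // above), rewriting min, max and keyPattern to fit it:
    //
    //   BSONObj min = BSON( "a" << 1 ), max = BSON( "a" << 10 ), keyPattern;
    //   string errmsg;
    //   IndexDetails *id = indexDetailsForRange( ns, errmsg, min, max, keyPattern );
    //   if ( !id ) { /* assumed: no usable index, errmsg says why */ }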
IndexDetails *indexDetailsForRange( const char *ns, string &errmsg, BSONObj &min, BSONObj &max, BSONObj &keyPattern ); + + inline bool isSimpleIdQuery( const BSONObj& query ){ + return + strcmp( query.firstElement().fieldName() , "_id" ) == 0 && + query.nFields() == 1 && + query.firstElement().isSimpleType(); + } } // namespace mongo diff --git a/db/queryutil.cpp b/db/queryutil.cpp index d8854be..c01b89e 100644 --- a/db/queryutil.cpp +++ b/db/queryutil.cpp @@ -24,96 +24,118 @@ #include "../util/unittest.h" namespace mongo { - namespace { - /** returns a string that when used as a matcher, would match a super set of regex() - returns "" for complex regular expressions - used to optimize queries in some simple regex cases that start with '^' - */ - inline string simpleRegexHelper(const char* regex, const char* flags){ - string r = ""; - - bool extended = false; - while (*flags){ - switch (*(flags++)){ - case 'm': // multiline - continue; - case 'x': // extended - extended = true; - break; - default: - return r; // cant use index - } - } + /** returns a string that when used as a matcher, would match a super set of regex() + returns "" for complex regular expressions + used to optimize queries in some simple regex cases that start with '^' - if ( *(regex++) != '^' ) - return r; + if purePrefix != NULL, sets it to whether the regex can be converted to a range query + */ + string simpleRegex(const char* regex, const char* flags, bool* purePrefix){ + string r = ""; - stringstream ss; + if (purePrefix) *purePrefix = false; - while(*regex){ - char c = *(regex++); - if ( c == '*' || c == '?' ){ - // These are the only two symbols that make the last char optional - r = ss.str(); - r = r.substr( 0 , r.size() - 1 ); - return r; //breaking here fails with /^a?/ - } else if (c == '\\'){ - // slash followed by non-alphanumeric represents the following char - c = *(regex++); - if ((c >= 'A' && c <= 'Z') || - (c >= 'a' && c <= 'z') || - (c >= '0' && c <= '0') || - (c == '\0')) - { - r = ss.str(); - break; - } else { - ss << c; - } - } else if (strchr("^$.[|()+{", c)){ - // list of "metacharacters" from man pcrepattern - r = ss.str(); + bool extended = false; + while (*flags){ + switch (*(flags++)){ + case 'm': // multiline + continue; + case 'x': // extended + extended = true; break; - } else if (extended && c == '#'){ - // comment + default: + return r; // cant use index + } + } + + if ( *(regex++) != '^' ) + return r; + + stringstream ss; + + while(*regex){ + char c = *(regex++); + if ( c == '*' || c == '?' 
){ + // These are the only two symbols that make the last char optional + r = ss.str(); + r = r.substr( 0 , r.size() - 1 ); + return r; //breaking here fails with /^a?/ + } else if (c == '\\'){ + // slash followed by non-alphanumeric represents the following char + c = *(regex++); + if ((c >= 'A' && c <= 'Z') || + (c >= 'a' && c <= 'z') || + (c >= '0' && c <= '0') || + (c == '\0')) + { r = ss.str(); break; - } else if (extended && isspace(c)){ - continue; } else { - // self-matching char ss << c; } - } - - if ( r.size() == 0 && *regex == 0 ) + } else if (strchr("^$.[|()+{", c)){ + // list of "metacharacters" from man pcrepattern r = ss.str(); + break; + } else if (extended && c == '#'){ + // comment + r = ss.str(); + break; + } else if (extended && isspace(c)){ + continue; + } else { + // self-matching char + ss << c; + } + } - return r; + if ( r.empty() && *regex == 0 ){ + r = ss.str(); + if (purePrefix) *purePrefix = !r.empty(); } - inline string simpleRegex(const BSONElement& e){ - switch(e.type()){ - case RegEx: - return simpleRegexHelper(e.regex(), e.regexFlags()); - case Object:{ - BSONObj o = e.embeddedObject(); - return simpleRegexHelper(o["$regex"].valuestrsafe(), o["$options"].valuestrsafe()); - } - default: assert(false); return ""; //return squashes compiler warning + + return r; + } + inline string simpleRegex(const BSONElement& e){ + switch(e.type()){ + case RegEx: + return simpleRegex(e.regex(), e.regexFlags()); + case Object:{ + BSONObj o = e.embeddedObject(); + return simpleRegex(o["$regex"].valuestrsafe(), o["$options"].valuestrsafe()); } + default: assert(false); return ""; //return squashes compiler warning } } + + string simpleRegexEnd( string regex ) { + ++regex[ regex.length() - 1 ]; + return regex; + } - FieldRange::FieldRange( const BSONElement &e, bool optimize ) { - if ( !e.eoo() && e.type() != RegEx && e.getGtLtOp() == BSONObj::opIN ) { + + FieldRange::FieldRange( const BSONElement &e, bool isNot, bool optimize ) { + // NOTE with $not, we could potentially form a complementary set of intervals. 
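+        // e.g. (sketch): { x : { $not : { $lt : 5 } } } is handled below by
+        // flipping the operator ($lt -> $gte), giving the single interval
+        // [ 5, MaxKey ]; a true complement of { $in : [ 2, 3 ] } would need
+        // the multi-interval set (MinKey,2) U (2,3) U (3,MaxKey), which the
+        // new operator|= could represent but is not attempted here.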
+ if ( !isNot && !e.eoo() && e.type() != RegEx && e.getGtLtOp() == BSONObj::opIN ) { set< BSONElement, element_lt > vals; + vector< FieldRange > regexes; + uassert( 12580 , "invalid query" , e.isABSONObj() ); BSONObjIterator i( e.embeddedObject() ); - while( i.more() ) - vals.insert( i.next() ); + while( i.more() ) { + BSONElement ie = i.next(); + if ( ie.type() == RegEx ) { + regexes.push_back( FieldRange( ie, false, optimize ) ); + } else { + vals.insert( ie ); + } + } for( set< BSONElement, element_lt >::const_iterator i = vals.begin(); i != vals.end(); ++i ) intervals_.push_back( FieldInterval(*i) ); + for( vector< FieldRange >::const_iterator i = regexes.begin(); i != regexes.end(); ++i ) + *this |= *i; + return; } @@ -149,15 +171,66 @@ namespace mongo { || (e.type() == Object && !e.embeddedObject()["$regex"].eoo()) ) { - const string r = simpleRegex(e); - if ( r.size() ) { - lower = addObj( BSON( "" << r ) ).firstElement(); - upper = addObj( BSON( "" << simpleRegexEnd( r ) ) ).firstElement(); - upperInclusive = false; - } + if ( !isNot ) { // no optimization for negated regex - we could consider creating 2 intervals comprising all nonmatching prefixes + const string r = simpleRegex(e); + if ( r.size() ) { + lower = addObj( BSON( "" << r ) ).firstElement(); + upper = addObj( BSON( "" << simpleRegexEnd( r ) ) ).firstElement(); + upperInclusive = false; + } else { + BSONObjBuilder b1(32), b2(32); + b1.appendMinForType( "" , String ); + lower = addObj( b1.obj() ).firstElement(); + + b2.appendMaxForType( "" , String ); + upper = addObj( b2.obj() ).firstElement(); + upperInclusive = false; //MaxForType String is an empty Object + } + + // regex matches self - regex type > string type + if (e.type() == RegEx){ + BSONElement re = addObj( BSON( "" << e ) ).firstElement(); + intervals_.push_back( FieldInterval(re) ); + } else { + BSONObj orig = e.embeddedObject(); + BSONObjBuilder b; + b.appendRegex("", orig["$regex"].valuestrsafe(), orig["$options"].valuestrsafe()); + BSONElement re = addObj( b.obj() ).firstElement(); + intervals_.push_back( FieldInterval(re) ); + } + + } return; } - switch( e.getGtLtOp() ) { + int op = e.getGtLtOp(); + if ( isNot ) { + switch( op ) { + case BSONObj::Equality: + case BSONObj::opALL: + case BSONObj::opMOD: // NOTE for mod and type, we could consider having 1-2 intervals comprising the complementary types (multiple intervals already possible with $in) + case BSONObj::opTYPE: + op = BSONObj::NE; // no bound calculation + break; + case BSONObj::NE: + op = BSONObj::Equality; + break; + case BSONObj::LT: + op = BSONObj::GTE; + break; + case BSONObj::LTE: + op = BSONObj::GT; + break; + case BSONObj::GT: + op = BSONObj::LTE; + break; + case BSONObj::GTE: + op = BSONObj::LT; + break; + default: // otherwise doesn't matter + break; + } + } + switch( op ) { case BSONObj::Equality: lower = upper = e; break; @@ -174,8 +247,32 @@ namespace mongo { case BSONObj::opALL: { massert( 10370 , "$all requires array", e.type() == Array ); BSONObjIterator i( e.embeddedObject() ); - if ( i.more() ) - lower = upper = i.next(); + bool bound = false; + while ( i.more() ){ + BSONElement x = i.next(); + if ( x.type() == Object && x.embeddedObject().firstElement().getGtLtOp() == BSONObj::opELEM_MATCH ){ + // taken care of elsewhere + } + else if ( x.type() != RegEx ) { + lower = upper = x; + bound = true; + break; + } + } + if ( !bound ) { // if no good non regex bound found, try regex bounds + BSONObjIterator i( e.embeddedObject() ); + while( i.more() ) { + BSONElement x = i.next(); + if 
( x.type() != RegEx ) + continue; + string simple = simpleRegex( x.regex(), x.regexFlags() ); + if ( !simple.empty() ) { + lower = addObj( BSON( "" << simple ) ).firstElement(); + upper = addObj( BSON( "" << simpleRegexEnd( simple ) ) ).firstElement(); + break; + } + } + } break; } case BSONObj::opMOD: { @@ -206,10 +303,18 @@ namespace mongo { break; } + case BSONObj::opREGEX: + case BSONObj::opOPTIONS: + // do nothing + break; case BSONObj::opELEM_MATCH: { log() << "warning: shouldn't get here?" << endl; break; } + case BSONObj::opNEAR: + case BSONObj::opWITHIN: + _special = "2d"; + break; default: break; } @@ -269,19 +374,118 @@ namespace mongo { intervals_ = newIntervals; for( vector< BSONObj >::const_iterator i = other.objData_.begin(); i != other.objData_.end(); ++i ) objData_.push_back( *i ); + if ( _special.size() == 0 && other._special.size() ) + _special = other._special; return *this; } - string FieldRange::simpleRegexEnd( string regex ) { - ++regex[ regex.length() - 1 ]; - return regex; - } + void handleInterval( const FieldInterval &lower, FieldBound &low, FieldBound &high, vector< FieldInterval > &newIntervals ) { + if ( low.bound_.eoo() ) { + low = lower.lower_; high = lower.upper_; + } else { + if ( high.bound_.woCompare( lower.lower_.bound_, false ) < 0 ) { // when equal but neither inclusive, just assume they overlap, since current btree scanning code just as efficient either way + FieldInterval tmp; + tmp.lower_ = low; + tmp.upper_ = high; + newIntervals.push_back( tmp ); + low = lower.lower_; high = lower.upper_; + } else { + high = lower.upper_; + } + } + } + + const FieldRange &FieldRange::operator|=( const FieldRange &other ) { + vector< FieldInterval > newIntervals; + FieldBound low; + FieldBound high; + vector< FieldInterval >::const_iterator i = intervals_.begin(); + vector< FieldInterval >::const_iterator j = other.intervals_.begin(); + while( i != intervals_.end() && j != other.intervals_.end() ) { + int cmp = i->lower_.bound_.woCompare( j->lower_.bound_, false ); + if ( ( cmp == 0 && i->lower_.inclusive_ ) || cmp < 0 ) { + handleInterval( *i, low, high, newIntervals ); + ++i; + } else { + handleInterval( *j, low, high, newIntervals ); + ++j; + } + } + while( i != intervals_.end() ) { + handleInterval( *i, low, high, newIntervals ); + ++i; + } + while( j != other.intervals_.end() ) { + handleInterval( *j, low, high, newIntervals ); + ++j; + } + FieldInterval tmp; + tmp.lower_ = low; + tmp.upper_ = high; + newIntervals.push_back( tmp ); + intervals_ = newIntervals; + for( vector< BSONObj >::const_iterator i = other.objData_.begin(); i != other.objData_.end(); ++i ) + objData_.push_back( *i ); + if ( _special.size() == 0 && other._special.size() ) + _special = other._special; + return *this; + } BSONObj FieldRange::addObj( const BSONObj &o ) { objData_.push_back( o ); return o; } + string FieldRangeSet::getSpecial() const { + string s = ""; + for ( map<string,FieldRange>::iterator i=ranges_.begin(); i!=ranges_.end(); i++ ){ + if ( i->second.getSpecial().size() == 0 ) + continue; + uassert( 13033 , "can't have 2 special fields" , s.size() == 0 ); + s = i->second.getSpecial(); + } + return s; + } + + void FieldRangeSet::processOpElement( const char *fieldName, const BSONElement &f, bool isNot, bool optimize ) { + BSONElement g = f; + int op2 = g.getGtLtOp(); + if ( op2 == BSONObj::opALL ) { + BSONElement h = g; + massert( 13050 , "$all requires array", h.type() == Array ); + BSONObjIterator i( h.embeddedObject() ); + if( i.more() ) { + BSONElement x = i.next(); + if 
( x.type() == Object && x.embeddedObject().firstElement().getGtLtOp() == BSONObj::opELEM_MATCH ) { + g = x.embeddedObject().firstElement(); + op2 = g.getGtLtOp(); + } + } + } + if ( op2 == BSONObj::opELEM_MATCH ) { + BSONObjIterator k( g.embeddedObjectUserCheck() ); + while ( k.more() ){ + BSONElement h = k.next(); + StringBuilder buf(32); + buf << fieldName << "." << h.fieldName(); + string fullname = buf.str(); + + int op3 = getGtLtOp( h ); + if ( op3 == BSONObj::Equality ){ + ranges_[ fullname ] &= FieldRange( h , isNot , optimize ); + } + else { + BSONObjIterator l( h.embeddedObject() ); + while ( l.more() ){ + ranges_[ fullname ] &= FieldRange( l.next() , isNot , optimize ); + } + } + } + } else { + ranges_[ fieldName ] &= FieldRange( f , isNot , optimize ); + } + } + FieldRangeSet::FieldRangeSet( const char *ns, const BSONObj &query , bool optimize ) : ns_( ns ), query_( query.getOwned() ) { BSONObjIterator i( query_ ); @@ -293,36 +497,38 @@ namespace mongo { if ( strcmp( e.fieldName(), "$where" ) == 0 ) continue; - int op = getGtLtOp( e ); + bool equality = ( getGtLtOp( e ) == BSONObj::Equality ); + if ( equality && e.type() == Object ) { + equality = ( strcmp( e.embeddedObject().firstElement().fieldName(), "$not" ) != 0 ); + } - if ( op == BSONObj::Equality || op == BSONObj::opREGEX || op == BSONObj::opOPTIONS ) { - ranges_[ e.fieldName() ] &= FieldRange( e , optimize ); - } - else if ( op == BSONObj::opELEM_MATCH ){ - BSONObjIterator i( e.embeddedObjectUserCheck().firstElement().embeddedObjectUserCheck() ); - while ( i.more() ){ - BSONElement f = i.next(); - StringBuilder buf(32); - buf << e.fieldName() << "." << f.fieldName(); - string fullname = buf.str(); - - int op2 = getGtLtOp( f ); - if ( op2 == BSONObj::Equality ){ - ranges_[ fullname ] &= FieldRange( f , optimize ); - } - else { - BSONObjIterator j( f.embeddedObject() ); - while ( j.more() ){ - ranges_[ fullname ] &= FieldRange( j.next() , optimize ); + if ( equality || ( e.type() == Object && !e.embeddedObject()[ "$regex" ].eoo() ) ) { + ranges_[ e.fieldName() ] &= FieldRange( e , false , optimize ); + } + if ( !equality ) { + BSONObjIterator j( e.embeddedObject() ); + while( j.more() ) { + BSONElement f = j.next(); + if ( strcmp( f.fieldName(), "$not" ) == 0 ) { + switch( f.type() ) { + case Object: { + BSONObjIterator k( f.embeddedObject() ); + while( k.more() ) { + BSONElement g = k.next(); + uassert( 13034, "invalid use of $not", g.getGtLtOp() != BSONObj::Equality ); + processOpElement( e.fieldName(), g, true, optimize ); + } + break; + } + case RegEx: + processOpElement( e.fieldName(), f, true, optimize ); + break; + default: + uassert( 13041, "invalid use of $not", false ); } + } else { + processOpElement( e.fieldName(), f, false, optimize ); } - } - } - else { - BSONObjIterator i( e.embeddedObject() ); - while( i.more() ) { - BSONElement f = i.next(); - ranges_[ e.fieldName() ] &= FieldRange( f , optimize ); } } } @@ -445,8 +651,8 @@ namespace mongo { /////////////////// void FieldMatcher::add( const BSONObj& o ){ - massert( 10371 , "can only add to FieldMatcher once", source_.isEmpty()); - source_ = o; + massert( 10371 , "can only add to FieldMatcher once", _source.isEmpty()); + _source = o; BSONObjIterator i( o ); int true_false = -1; @@ -457,23 +663,24 @@ namespace mongo { // validate input if (true_false == -1){ true_false = e.trueValue(); - include_ = !e.trueValue(); - }else{ - if((bool) true_false != e.trueValue()) - errmsg = "You cannot currently mix including and excluding fields. 
Contact us if this is an issue."; + _include = !e.trueValue(); + } + else{ + uassert( 10053 , "You cannot currently mix including and excluding fields. Contact us if this is an issue." , + (bool)true_false == e.trueValue() ); } } } void FieldMatcher::add(const string& field, bool include){ if (field.empty()){ // this is the field the user referred to - include_ = include; + _include = include; } else { const size_t dot = field.find('.'); const string subfield = field.substr(0,dot); const string rest = (dot == string::npos ? "" : field.substr(dot+1,string::npos)); - boost::shared_ptr<FieldMatcher>& fm = fields_[subfield]; + boost::shared_ptr<FieldMatcher>& fm = _fields[subfield]; if (!fm) fm.reset(new FieldMatcher(!include)); @@ -482,7 +689,7 @@ namespace mongo { } BSONObj FieldMatcher::getSpec() const{ - return source_; + return _source; } //b will be the value part of an array-typed BSONElement @@ -509,7 +716,7 @@ namespace mongo { break; } default: - if (include_) + if (_include) b.appendAs(e, b.numStr(i++).c_str()); } @@ -518,18 +725,20 @@ namespace mongo { } void FieldMatcher::append( BSONObjBuilder& b , const BSONElement& e ) const { - FieldMap::const_iterator field = fields_.find( e.fieldName() ); + FieldMap::const_iterator field = _fields.find( e.fieldName() ); - if (field == fields_.end()){ - if (include_) + if (field == _fields.end()){ + if (_include) b.append(e); - } else { + } + else { FieldMatcher& subfm = *field->second; - - if (subfm.fields_.empty() || !(e.type()==Object || e.type()==Array) ){ - if (subfm.include_) + + if (subfm._fields.empty() || !(e.type()==Object || e.type()==Array) ){ + if (subfm._include) b.append(e); - } else if (e.type() == Object){ + } + else if (e.type() == Object){ BSONObjBuilder subb; BSONObjIterator it(e.embeddedObject()); while (it.more()){ @@ -537,7 +746,8 @@ namespace mongo { } b.append(e.fieldName(), subb.obj()); - } else { //Array + } + else { //Array BSONObjBuilder subb; subfm.appendArray(subb, e.embeddedObject()); b.appendArray(e.fieldName(), subb.obj()); diff --git a/db/queryutil.h b/db/queryutil.h index 2122a7f..7d8be78 100644 --- a/db/queryutil.h +++ b/db/queryutil.h @@ -48,8 +48,9 @@ namespace mongo { // determine index limits class FieldRange { public: - FieldRange( const BSONElement &e = BSONObj().firstElement() , bool optimize=true ); + FieldRange( const BSONElement &e = BSONObj().firstElement() , bool isNot=false , bool optimize=true ); const FieldRange &operator&=( const FieldRange &other ); + const FieldRange &operator|=( const FieldRange &other ); BSONElement min() const { assert( !empty() ); return intervals_[ 0 ].lower_.bound_; } BSONElement max() const { assert( !empty() ); return intervals_[ intervals_.size() - 1 ].upper_.bound_; } bool minInclusive() const { assert( !empty() ); return intervals_[ 0 ].lower_.inclusive_; } @@ -69,11 +70,13 @@ namespace mongo { } bool empty() const { return intervals_.empty(); } const vector< FieldInterval > &intervals() const { return intervals_; } + string getSpecial() const { return _special; } + private: BSONObj addObj( const BSONObj &o ); - string simpleRegexEnd( string regex ); vector< FieldInterval > intervals_; vector< BSONObj > objData_; + string _special; }; // implements query pattern matching, used to determine if a query is @@ -171,7 +174,9 @@ namespace mongo { } QueryPattern pattern( const BSONObj &sort = BSONObj() ) const; BoundList indexBounds( const BSONObj &keyPattern, int direction ) const; + string getSpecial() const; private: + void processOpElement( const char *fieldName, 
const BSONElement &f, bool isNot, bool optimize ); static FieldRange *trivialRange_; static FieldRange &trivialRange(); mutable map< string, FieldRange > ranges_; @@ -185,26 +190,34 @@ namespace mongo { class FieldMatcher { public: - FieldMatcher(bool include=false) : errmsg(NULL), include_(include) {} + FieldMatcher(bool include=false) : _include(include){} void add( const BSONObj& o ); void append( BSONObjBuilder& b , const BSONElement& e ) const; BSONObj getSpec() const; - - const char* errmsg; //null if FieldMatcher is valid private: void add( const string& field, bool include ); void appendArray( BSONObjBuilder& b , const BSONObj& a ) const; - bool include_; // true if default at this level is to include + bool _include; // true if default at this level is to include //TODO: benchmark vector<pair> vs map typedef map<string, boost::shared_ptr<FieldMatcher> > FieldMap; - FieldMap fields_; - BSONObj source_; + FieldMap _fields; + BSONObj _source; }; + /** returns a string that when used as a matcher, would match a super set of regex() + returns "" for complex regular expressions + used to optimize queries in some simple regex cases that start with '^' + + if purePrefix != NULL, sets it to whether the regex can be converted to a range query + */ + string simpleRegex(const char* regex, const char* flags, bool* purePrefix=NULL); + + /** returns the upper bound of a query that matches prefix */ + string simpleRegexEnd( string prefix ); } // namespace mongo @@ -1,4 +1,20 @@ // rec.h +/* + * Copyright (C) 2010 10gen Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License, version 3, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + /* TODO for _RECSTORE diff --git a/db/reccache.cpp b/db/reccache.cpp index 66dd4e3..6e1f3de 100644 --- a/db/reccache.cpp +++ b/db/reccache.cpp @@ -1,134 +1,150 @@ -// storage.cpp
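
The simpleRegex()/simpleRegexEnd() helpers declared in queryutil.h above turn an anchored, literal-prefix regex into an index range scan: /^abc/ can only match keys in [ "abc", "abd" ). A minimal standalone sketch of the idea (my own simplified version, not the patch's actual implementation, which also handles flags, escaping, and the purePrefix flag):

#include <cassert>
#include <cctype>
#include <string>

// Simplified sketch: extract a literal prefix from an anchored regex.
// The real simpleRegex() understands much more of the regex grammar.
static std::string literalPrefix(const std::string& regex) {
    if (regex.empty() || regex[0] != '^')
        return "";                              // not anchored: no range possible
    std::string out;
    for (size_t i = 1; i < regex.size(); ++i) {
        char c = regex[i];
        if (!isalnum((unsigned char)c))
            break;                              // stop at the first metacharacter
        out += c;
    }
    return out;
}

// Mirrors simpleRegexEnd(): increment the last byte of the (non-empty) prefix
// to get an exclusive upper bound for the index scan.
static std::string prefixEnd(std::string prefix) {
    ++prefix[prefix.size() - 1];
    return prefix;
}

int main() {
    std::string p = literalPrefix("^abc.*def");  // "abc"
    assert(p == "abc");
    assert(prefixEnd(p) == "abd");               // scan [ "abc", "abd" )
}
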
-
-#include "stdafx.h"
-#include "pdfile.h"
-#include "reccache.h"
-#include "rec.h"
-#include "db.h"
-
-namespace mongo {
-
-RecCache theRecCache(BucketSize);
-
-// 100k * 8KB = 800MB
-unsigned RecCache::MAXNODES = 50000;
-
-void setRecCacheSize(unsigned mb) {
- unsigned long long MB = mb;
- log(2) << "reccache size: " << MB << "MB\n";
- uassert( 10114 , "bad cache size", MB > 0 && MB < 1000000 );
- RecCache::MAXNODES = (unsigned) MB * 1024 * 1024 / 8192;
- log(3) << "RecCache::MAXNODES=" << RecCache::MAXNODES << '\n';
-}
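
The arithmetic in setRecCacheSize() above maps a megabyte budget onto a count of 8KB cache nodes. A small worked example (hypothetical values, not part of the patch):

#include <cstdio>

// mb megabytes of cache / 8KB per node = node budget, as computed above.
static unsigned nodesForCacheMB(unsigned long long mb) {
    return (unsigned)(mb * 1024 * 1024 / 8192);
}

int main() {
    std::printf("64 MB  -> %u nodes\n", nodesForCacheMB(64));   // 8192
    std::printf("800 MB -> %u nodes\n", nodesForCacheMB(800));  // 102400, i.e. ~100k
    return 0;
}

Note that the default of 50000 nodes therefore corresponds to roughly 400MB, not the 800MB in the "100k * 8KB" comment above; the comment appears to describe a different default than the one in the code.
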
-
-void writerThread() {
- sleepsecs(10);
- while( 1 ) {
- try {
- theRecCache.writeLazily();
- }
- catch(...) {
- log() << "exception in writerThread()" << endl;
- sleepsecs(3);
- }
- }
-}
-
+/* + * Copyright (C) 2010 10gen Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License, version 3, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +// storage.cpp + +#include "stdafx.h" +#include "pdfile.h" +#include "reccache.h" +#include "rec.h" +#include "db.h" + +namespace mongo { + +RecCache theRecCache(BucketSize); + +// 100k * 8KB = 800MB +unsigned RecCache::MAXNODES = 50000; + +void setRecCacheSize(unsigned mb) { + unsigned long long MB = mb; + log(2) << "reccache size: " << MB << "MB\n"; + uassert( 10114 , "bad cache size", MB > 0 && MB < 1000000 ); + RecCache::MAXNODES = (unsigned) MB * 1024 * 1024 / 8192; + log(3) << "RecCache::MAXNODES=" << RecCache::MAXNODES << '\n'; +} + +void writerThread() { + sleepsecs(10); + while( 1 ) { + try { + theRecCache.writeLazily(); + } + catch(...) { + log() << "exception in writerThread()" << endl; + sleepsecs(3); + } + } +} + // called on program exit. -void recCacheCloseAll() {
-#if defined(_RECSTORE)
- theRecCache.closing();
-#endif
-}
-
-int ndirtywritten;
-
-inline static string escape(const char *ns) {
- char buf[256];
- char *p = buf;
- while( 1 ) {
- if( *ns == '$' ) *p = '~';
- else
- *p = *ns;
- if( *ns == 0 )
- break;
- p++; ns++;
- }
- assert( p - buf < (int) sizeof(buf) );
- return buf;
-}
-
-inline static string unescape(const char *ns) {
- char buf[256];
- char *p = buf;
- while( 1 ) {
- if( *ns == '~' ) *p = '$';
- else
- *p = *ns;
- if( *ns == 0 )
- break;
- p++; ns++;
- }
- assert( p - buf < (int) sizeof(buf) );
- return buf;
-}
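
escape()/unescape() above exist so that namespaces containing '$' (such as local.oplog.$main) yield filesystem-safe <n>-<ns>.idx names. A hypothetical round-trip demo of the same character mapping:

#include <cassert>
#include <string>

// Same mapping as escape()/unescape() above, on std::string.
static std::string escapeNs(std::string ns) {
    for (size_t i = 0; i < ns.size(); ++i)
        if (ns[i] == '$') ns[i] = '~';
    return ns;
}
static std::string unescapeNs(std::string ns) {
    for (size_t i = 0; i < ns.size(); ++i)
        if (ns[i] == '~') ns[i] = '$';
    return ns;
}

int main() {
    std::string ns = "local.oplog.$main";
    std::string esc = escapeNs(ns);      // "local.oplog.~main"
    assert(unescapeNs(esc) == ns);       // lossless, provided no namespace
                                         // contains a literal '~'
}
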
-
-string RecCache::directory() {
- return cc().database()->path;
-}
-
-/* filename format is
-
- <n>-<ns>.idx
-*/
-
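
The <n>-<ns>.idx convention that _initStore() below parses is easiest to see with a concrete (hypothetical) name: "3-test.foo.idx" means store number 3 for namespace test.foo, and "7-local.oplog.~main.idx" unescapes to local.oplog.$main. A minimal parser mirroring the stringstream logic below:

#include <cassert>
#include <cstring>
#include <sstream>
#include <string>

// Parse "<n>-<ns>.idx" into its store number and escaped namespace,
// mirroring the checks _initStore() performs below.
static bool parseStoreName(const std::string& fname, int& n, std::string& escapedNs) {
    std::istringstream ss(fname);
    char dash;
    if (!(ss >> n >> dash) || n < 0 || dash != '-')
        return false;
    std::string rest;
    ss >> rest;
    const char* q = std::strstr(rest.c_str(), ".idx");
    if (!q)
        return false;
    escapedNs.assign(rest.c_str(), q - rest.c_str());
    return true;
}

int main() {
    int n;
    std::string ns;
    assert(parseStoreName("3-test.foo.idx", n, ns));
    assert(n == 3 && ns == "test.foo");
}
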
-BasicRecStore* RecCache::_initStore(string fname) {
-
- assert( strchr(fname.c_str(), '/') == 0 );
- assert( strchr(fname.c_str(), '\\') == 0 );
-
- stringstream ss(fname);
- int n;
- ss >> n;
- assert( n >= 0 );
- char ch;
- ss >> ch;
- assert( ch == '-' );
- string rest;
- ss >> rest;
- const char *p = rest.c_str();
- const char *q = strstr(p, ".idx");
- assert( q );
- string escaped_ns(p, q-p);
-
- // arbitrary limit. if you are hitting, we should use fewer files and put multiple
- // indexes in a single file (which is easy to do)
- massert( 10374 , "too many index files", n < 10000 );
-
- if( stores.size() < (unsigned)n+1 )
- stores.resize(n+1);
- assert( stores[n] == 0 );
- BasicRecStore *rs = new BasicRecStore(n);
- path pf(directory());
- pf /= fname;
- string full = pf.string();
- rs->init(full.c_str(), recsize);
- stores[n] = rs;
- string ns = unescape(escaped_ns.c_str());
- storesByNsKey[mknskey(ns.c_str())] = rs;
- return rs;
-}
-
-BasicRecStore* RecCache::initStore(int n) {
- string ns;
- {
- stringstream ss;
- ss << '/' << n << '-';
- ns = ss.str();
- }
-
- /* this will be slow if there are thousands of files */
- path dir(directory());
+void recCacheCloseAll() { +#if defined(_RECSTORE) + theRecCache.closing(); +#endif +} + +int ndirtywritten; + +inline static string escape(const char *ns) { + char buf[256]; + char *p = buf; + while( 1 ) { + if( *ns == '$' ) *p = '~'; + else + *p = *ns; + if( *ns == 0 ) + break; + p++; ns++; + } + assert( p - buf < (int) sizeof(buf) ); + return buf; +} + +inline static string unescape(const char *ns) { + char buf[256]; + char *p = buf; + while( 1 ) { + if( *ns == '~' ) *p = '$'; + else + *p = *ns; + if( *ns == 0 ) + break; + p++; ns++; + } + assert( p - buf < (int) sizeof(buf) ); + return buf; +} + +string RecCache::directory() { + return cc().database()->path; +} + +/* filename format is + + <n>-<ns>.idx +*/ + +BasicRecStore* RecCache::_initStore(string fname) { + + assert( strchr(fname.c_str(), '/') == 0 ); + assert( strchr(fname.c_str(), '\\') == 0 ); + + stringstream ss(fname); + int n; + ss >> n; + assert( n >= 0 ); + char ch; + ss >> ch; + assert( ch == '-' ); + string rest; + ss >> rest; + const char *p = rest.c_str(); + const char *q = strstr(p, ".idx"); + assert( q ); + string escaped_ns(p, q-p); + + // arbitrary limit. if you are hitting, we should use fewer files and put multiple + // indexes in a single file (which is easy to do) + massert( 10374 , "too many index files", n < 10000 ); + + if( stores.size() < (unsigned)n+1 ) + stores.resize(n+1); + assert( stores[n] == 0 ); + BasicRecStore *rs = new BasicRecStore(n); + path pf(directory()); + pf /= fname; + string full = pf.string(); + rs->init(full.c_str(), recsize); + stores[n] = rs; + string ns = unescape(escaped_ns.c_str()); + storesByNsKey[mknskey(ns.c_str())] = rs; + return rs; +} + +BasicRecStore* RecCache::initStore(int n) { + string ns; + { + stringstream ss; + ss << '/' << n << '-'; + ns = ss.str(); + } + + /* this will be slow if there are thousands of files */ + path dir(directory()); directory_iterator end; try { directory_iterator i(dir); @@ -152,27 +168,27 @@ BasicRecStore* RecCache::initStore(int n) { } stringstream ss; ss << "index datafile missing? n=" << n; - uasserted(12500,ss.str());
- return 0;
-}
-
-/* find the filename for a given ns.
- format is
- <n>-<escaped_ns>.idx
- returns filename. found is true if found. If false, a proposed name is returned for (optional) creation
- of the file.
-*/
-string RecCache::findStoreFilename(const char *_ns, bool& found) {
- string namefrag;
- {
- stringstream ss;
- ss << '-';
- ss << escape(_ns);
- ss << ".idx";
- namefrag = ss.str();
- }
-
- path dir(directory());
+ uasserted(12500,ss.str()); + return 0; +} + +/* find the filename for a given ns. + format is + <n>-<escaped_ns>.idx + returns filename. found is true if found. If false, a proposed name is returned for (optional) creation + of the file. +*/ +string RecCache::findStoreFilename(const char *_ns, bool& found) { + string namefrag; + { + stringstream ss; + ss << '-'; + ss << escape(_ns); + ss << ".idx"; + namefrag = ss.str(); + } + + path dir(directory()); directory_iterator end; int nmax = -1; try { @@ -204,198 +220,198 @@ string RecCache::findStoreFilename(const char *_ns, bool& found) { ss << nmax+1 << namefrag; found = false; return ss.str(); -}
-
-void RecCache::initStoreByNs(const char *_ns, const string& nskey) {
- bool found;
- string fn = findStoreFilename(_ns, found);
+} + +void RecCache::initStoreByNs(const char *_ns, const string& nskey) { + bool found; + string fn = findStoreFilename(_ns, found); _initStore(fn); -}
-
-inline void RecCache::writeIfDirty(Node *n) {
- if( n->dirty ) {
- ndirtywritten++;
- n->dirty = false;
- store(n->loc).update(fileOfs(n->loc), n->data, recsize);
- }
-}
-
-void RecCache::closeFiles(string dbname, string path) {
- assertInWriteLock();
- boostlock lk(rcmutex);
-
- // first we write all dirty pages. it is not easy to check which Nodes are for a particular
- // db, so we just write them all.
- writeDirty( dirtyl.begin(), true );
-
- string key = path + dbname + '.';
- unsigned sz = key.size();
- for( map<string, BasicRecStore*>::iterator i = storesByNsKey.begin(); i != storesByNsKey.end(); i++ ) {
- map<string, BasicRecStore*>::iterator j = i;
- i++;
- if( strncmp(j->first.c_str(), key.c_str(), sz) == 0 ) {
- assert( stores[j->second->fileNumber] != 0 );
- stores[j->second->fileNumber] = 0;
- delete j->second;
- storesByNsKey.erase(j);
- }
- }
-}
-
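One subtlety in closeFiles() above: the iterator i is advanced both by the for statement and inside the loop body, so the loop can skip every other entry and can even step past end() when the last element matches. A sketch of the conventional erase idiom, reusing the original's names (an observation about this code, not a change the patch itself makes):

    for( map<string, BasicRecStore*>::iterator i = storesByNsKey.begin(); i != storesByNsKey.end(); ) {
        map<string, BasicRecStore*>::iterator j = i++;   // advance exactly once
        if( strncmp(j->first.c_str(), key.c_str(), sz) == 0 ) {
            assert( stores[j->second->fileNumber] != 0 );
            stores[j->second->fileNumber] = 0;
            delete j->second;
            storesByNsKey.erase(j);                      // j is now invalid; i is safe
        }
    }
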
-void RecCache::closing() {
- boostlock lk(rcmutex);
- (cout << "TEMP: recCacheCloseAll() writing dirty pages...\n").flush();
- writeDirty( dirtyl.begin(), true );
- for( unsigned i = 0; i < stores.size(); i++ ) {
- if( stores[i] ) {
- delete stores[i];
- }
- }
- (cout << "TEMP: write dirty done\n").flush();
-}
-
-/* note that this is written in order, as much as possible, given that dirtyl is of type set. */
-void RecCache::writeDirty( set<DiskLoc>::iterator startAt, bool rawLog ) {
- try {
- ndirtywritten=0;
- for( set<DiskLoc>::iterator i = startAt; i != dirtyl.end(); i++ ) {
- map<DiskLoc, Node*>::iterator j = m.find(*i);
- if( j != m.end() )
- writeIfDirty(j->second);
- }
- OCCASIONALLY out() << "TEMP: ndirtywritten: " << ndirtywritten << endl;
- }
- catch(...) {
+} + +inline void RecCache::writeIfDirty(Node *n) { + if( n->dirty ) { + ndirtywritten++; + n->dirty = false; + store(n->loc).update(fileOfs(n->loc), n->data, recsize); + } +} + +void RecCache::closeFiles(string dbname, string path) { + assertInWriteLock(); + scoped_lock lk(rcmutex); + + // first we write all dirty pages. it is not easy to check which Nodes are for a particular + // db, so we just write them all. + writeDirty( dirtyl.begin(), true ); + + string key = path + dbname + '.'; + unsigned sz = key.size(); + for( map<string, BasicRecStore*>::iterator i = storesByNsKey.begin(); i != storesByNsKey.end(); i++ ) { + map<string, BasicRecStore*>::iterator j = i; + i++; + if( strncmp(j->first.c_str(), key.c_str(), sz) == 0 ) { + assert( stores[j->second->fileNumber] != 0 ); + stores[j->second->fileNumber] = 0; + delete j->second; + storesByNsKey.erase(j); + } + } +} + +void RecCache::closing() { + scoped_lock lk(rcmutex); + (cout << "TEMP: recCacheCloseAll() writing dirty pages...\n").flush(); + writeDirty( dirtyl.begin(), true ); + for( unsigned i = 0; i < stores.size(); i++ ) { + if( stores[i] ) { + delete stores[i]; + } + } + (cout << "TEMP: write dirty done\n").flush(); +} + +/* note that this is written in order, as much as possible, given that dirtyl is of type set. */ +void RecCache::writeDirty( set<DiskLoc>::iterator startAt, bool rawLog ) { + try { + ndirtywritten=0; + for( set<DiskLoc>::iterator i = startAt; i != dirtyl.end(); i++ ) { + map<DiskLoc, Node*>::iterator j = m.find(*i); + if( j != m.end() ) + writeIfDirty(j->second); + } + OCCASIONALLY out() << "TEMP: ndirtywritten: " << ndirtywritten << endl; + } + catch(...) { const char *message = "Problem: bad() in RecCache::writeDirty, file io error\n"; if ( rawLog ) rawOut( message ); else ( log() << message ).flush(); - }
- dirtyl.clear();
-}
-
-void RecCache::writeLazily() {
- int sleep = 0;
- int k;
- {
- boostlock lk(rcmutex);
- Timer t;
- set<DiskLoc>::iterator i = dirtyl.end();
- for( k = 0; k < 100; k++ ) {
- if( i == dirtyl.begin() ) {
- // we're not very far behind
- sleep = k < 20 ? 2000 : 1000;
- break;
- }
- i--;
- }
- writeDirty(i);
- if( sleep == 0 ) {
- sleep = t.millis() * 4 + 10;
- }
- }
-
- OCCASIONALLY cout << "writeLazily " << k << " sleep:" << sleep << '\n';
- sleepmillis(sleep);
-}
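
writeLazily() above flushes at most 100 of the oldest dirty pages per pass, then chooses its own sleep: a fixed 1-2s sleep when the backlog is small, otherwise a sleep proportional to how long the pass took. Restated as a standalone function (a sketch of the policy, not code from the patch):

#include <cstdio>

// k = number of dirty pages found this pass (capped at 100 above),
// passMillis = how long writing them took.
static int nextSleepMillis(int k, int passMillis) {
    if (k < 100)                       // backlog fit in one pass:
        return k < 20 ? 2000 : 1000;   // nearly idle sleeps longest
    return passMillis * 4 + 10;        // behind: writes use ~20% of wall time
}

int main() {
    std::printf("%d\n", nextSleepMillis(5, 0));     // 2000ms, cache nearly clean
    std::printf("%d\n", nextSleepMillis(100, 50));  // 210ms, still catching up
}
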
-
-void RecCache::_ejectOld() {
- boostlock lk(rcmutex);
- if( nnodes <= MAXNODES )
- return;
- Node *n = oldest;
- while( 1 ) {
- if( nnodes <= MAXNODES - 4 ) {
- n->older = 0;
- oldest = n;
- assert( oldest ) ;
- break;
- }
- nnodes--;
- assert(n);
- Node *nxt = n->newer;
- writeIfDirty(n);
- m.erase(n->loc);
- delete n;
- n = nxt;
- }
-}
-
-void RecCache::dump() {
- Node *n = oldest;
- Node *last = 0;
- while( n ) {
- assert( n->older == last );
- last = n;
-// cout << n << ' ' << n->older << ' ' << n->newer << '\n';
- n=n->newer;
- }
- assert( newest == last );
-// cout << endl;
-}
-
-/* cleans up everything EXCEPT storesByNsKey.
- note this function is slow and should not be invoked often
-*/
-void RecCache::closeStore(BasicRecStore *rs) {
- int n = rs->fileNumber + Base;
- for( set<DiskLoc>::iterator i = dirtyl.begin(); i != dirtyl.end(); ) {
- DiskLoc k = *i++;
- if( k.a() == n )
- dirtyl.erase(k);
- }
-
- for( map<DiskLoc,Node*>::iterator i = m.begin(); i != m.end(); ) {
- DiskLoc k = i->first;
- i++;
- if( k.a() == n )
- m.erase(k);
- }
-
- assert( stores[rs->fileNumber] != 0 );
- stores[rs->fileNumber] = 0;
-/*
- for( unsigned i = 0; i < stores.size(); i++ ) {
- if( stores[i] == rs ) {
- stores[i] = 0;
- break;
- }
- }*/
- delete rs; // closes file
-}
-
-void RecCache::drop(const char *_ns) {
- // todo: test with a non clean shutdown file
- boostlock lk(rcmutex);
-
- map<string, BasicRecStore*>::iterator it = storesByNsKey.find(mknskey(_ns));
- string fname;
- if( it != storesByNsKey.end() ) {
- fname = it->second->filename;
- closeStore(it->second); // cleans up stores[] etc.
- storesByNsKey.erase(it);
- }
- else {
- bool found;
- fname = findStoreFilename(_ns, found);
- if( !found ) {
- log() << "RecCache::drop: no idx file found for " << _ns << endl;
- return;
- }
- path pf(directory());
- pf /= fname;
- fname = pf.string();
- }
- try {
- if( !boost::filesystem::exists(fname) )
- log() << "RecCache::drop: can't find file to remove " << fname << endl;
- boost::filesystem::remove(fname);
- }
- catch(...) {
- log() << "RecCache::drop: exception removing file " << fname << endl;
- }
-}
-
-}
+ } + dirtyl.clear(); +} + +void RecCache::writeLazily() { + int sleep = 0; + int k; + { + scoped_lock lk(rcmutex); + Timer t; + set<DiskLoc>::iterator i = dirtyl.end(); + for( k = 0; k < 100; k++ ) { + if( i == dirtyl.begin() ) { + // we're not very far behind + sleep = k < 20 ? 2000 : 1000; + break; + } + i--; + } + writeDirty(i); + if( sleep == 0 ) { + sleep = t.millis() * 4 + 10; + } + } + + OCCASIONALLY cout << "writeLazily " << k << " sleep:" << sleep << '\n'; + sleepmillis(sleep); +} + +void RecCache::_ejectOld() { + scoped_lock lk(rcmutex); + if( nnodes <= MAXNODES ) + return; + Node *n = oldest; + while( 1 ) { + if( nnodes <= MAXNODES - 4 ) { + n->older = 0; + oldest = n; + assert( oldest ) ; + break; + } + nnodes--; + assert(n); + Node *nxt = n->newer; + writeIfDirty(n); + m.erase(n->loc); + delete n; + n = nxt; + } +} + +void RecCache::dump() { + Node *n = oldest; + Node *last = 0; + while( n ) { + assert( n->older == last ); + last = n; +// cout << n << ' ' << n->older << ' ' << n->newer << '\n'; + n=n->newer; + } + assert( newest == last ); +// cout << endl; +} + +/* cleans up everything EXCEPT storesByNsKey. + note this function is slow should not be invoked often +*/ +void RecCache::closeStore(BasicRecStore *rs) { + int n = rs->fileNumber + Base; + for( set<DiskLoc>::iterator i = dirtyl.begin(); i != dirtyl.end(); ) { + DiskLoc k = *i++; + if( k.a() == n ) + dirtyl.erase(k); + } + + for( map<DiskLoc,Node*>::iterator i = m.begin(); i != m.end(); ) { + DiskLoc k = i->first; + i++; + if( k.a() == n ) + m.erase(k); + } + + assert( stores[rs->fileNumber] != 0 ); + stores[rs->fileNumber] = 0; +/* + for( unsigned i = 0; i < stores.size(); i++ ) { + if( stores[i] == rs ) { + stores[i] = 0; + break; + } + }*/ + delete rs; // closes file +} + +void RecCache::drop(const char *_ns) { + // todo: test with a non clean shutdown file + scoped_lock lk(rcmutex); + + map<string, BasicRecStore*>::iterator it = storesByNsKey.find(mknskey(_ns)); + string fname; + if( it != storesByNsKey.end() ) { + fname = it->second->filename; + closeStore(it->second); // cleans up stores[] etc. + storesByNsKey.erase(it); + } + else { + bool found; + fname = findStoreFilename(_ns, found); + if( !found ) { + log() << "RecCache::drop: no idx file found for " << _ns << endl; + return; + } + path pf(directory()); + pf /= fname; + fname = pf.string(); + } + try { + if( !boost::filesystem::exists(fname) ) + log() << "RecCache::drop: can't find file to remove " << fname << endl; + boost::filesystem::remove(fname); + } + catch(...) { + log() << "RecCache::drop: exception removing file " << fname << endl; + } +} + +} diff --git a/db/reccache.h b/db/reccache.h index 42943c5..d354587 100644 --- a/db/reccache.h +++ b/db/reccache.h @@ -1,4 +1,20 @@ // reccache.h +/* + * Copyright (C) 2010 10gen Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License, version 3, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ */ + /* CachedBasicRecStore This is our store which implements a traditional page-cache type of storage @@ -33,7 +49,7 @@ class RecCache { bool dirty; Node *older, *newer; // lru }; - boost::mutex &rcmutex; // mainly to coordinate with the lazy writer thread + mongo::mutex rcmutex; // mainly to coordinate with the lazy writer thread unsigned recsize; map<DiskLoc, Node*> m; // the cache Node *newest, *oldest; @@ -118,7 +134,7 @@ private: public: /* all public functions (except constructor) should use the mutex */ - RecCache(unsigned recsz) : rcmutex( *( new boost::mutex() ) ), recsize(recsz) { + RecCache(unsigned recsz) : recsize(recsz) { nnodes = 0; newest = oldest = 0; } @@ -140,7 +156,7 @@ public: */ void dirty(DiskLoc d) { assert( d.a() >= Base ); - boostlock lk(rcmutex); + scoped_lock lk(rcmutex); map<DiskLoc, Node*>::iterator i = m.find(d); if( i != m.end() ) { Node *n = i->second; @@ -155,7 +171,7 @@ public: assert( d.a() >= Base ); assert( len == recsize ); - boostlock lk(rcmutex); + scoped_lock lk(rcmutex); map<DiskLoc, Node*>::iterator i = m.find(d); if( i != m.end() ) { touch(i->second); @@ -172,7 +188,7 @@ public: void drop(const char *ns); DiskLoc insert(const char *ns, const void *obuf, int len, bool god) { - boostlock lk(rcmutex); + scoped_lock lk(rcmutex); BasicRecStore& rs = store(ns); fileofs o = rs.insert((const char *) obuf, len); assert( o % recsize == 0 ); @@ -229,9 +245,11 @@ public: */ inline void dbunlocking_read() { + /* Client *c = currentClient.get(); if ( c ) c->top.clientStop(); + */ } inline void dbunlocking_write() { @@ -1,8 +1,24 @@ // reci.h +/* + * Copyright (C) 2010 10gen Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License, version 3, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + #pragma once -#include "storage.h" +#include "diskloc.h" namespace mongo { diff --git a/db/recstore.h b/db/recstore.h index 2e6a90a..bdb3d77 100644 --- a/db/recstore.h +++ b/db/recstore.h @@ -1,108 +1,124 @@ -// recstore.h
-
-#pragma once
-
-#include "../util/file.h"
-
-namespace mongo {
-
-using boost::uint32_t;
-using boost::uint64_t;
-
-/* Current version supports only consistent record sizes within a store. */
-
-class BasicRecStore {
- struct RecStoreHeader {
- uint32_t version;
- uint32_t recsize;
- uint64_t leof; // logical eof, actual file might be prealloc'd further
- uint64_t firstDeleted; // 0 = no deleted recs
- uint32_t cleanShutdown; // 0 = clean
- char reserved[8192-8-8-4-4-4]; // we want our records page-aligned in the file if they are a multiple of a page's size -- so we make this 8KB with that goal
- RecStoreHeader() {
- version = 65;
- recsize = 0;
- leof = sizeof(RecStoreHeader);
- firstDeleted = 0;
- cleanShutdown = 1;
- memset(reserved, 0, sizeof(reserved));
- }
- };
-
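The reserved array above is sized so the header occupies exactly one 8KB page: the live fields total 4+4+8+8+4 = 28 bytes, and 8192-28 = 8164 bytes of padding follow, which is also why writeHeader() further down writes only the first 28 bytes. A compile-time restatement (a hypothetical mirror of the struct; with this field order the uint64_t members fall on 8-byte offsets, so no implicit padding is inserted):

#include <boost/static_assert.hpp>
#include <boost/cstdint.hpp>

struct HeaderLayout {                         // mirror of RecStoreHeader above
    boost::uint32_t version;                  // offset 0
    boost::uint32_t recsize;                  // offset 4
    boost::uint64_t leof;                     // offset 8 (already 8-byte aligned)
    boost::uint64_t firstDeleted;             // offset 16
    boost::uint32_t cleanShutdown;            // offset 24
    char reserved[8192 - 8 - 8 - 4 - 4 - 4];  // offset 28, 8164 bytes
};

BOOST_STATIC_ASSERT(sizeof(HeaderLayout) == 8192);  // header fills one 8KB page
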
-public:
- BasicRecStore(int _fileNumber) : fileNumber(_fileNumber) { }
- ~BasicRecStore();
- void init(const char *fn, unsigned recsize);
- fileofs insert(const char *buf, unsigned len);
- void update(fileofs o, const char *buf, unsigned len);
- void remove(fileofs o, unsigned len);
- void get(fileofs o, char *buf, unsigned len);
-
- int fileNumber; // this goes in DiskLoc::a
-
- string filename;
-
-private:
-
- void writeHeader();
- File f;
- fileofs len;
- RecStoreHeader h; // h.reserved is wasteful here; fix later.
- void write(fileofs ofs, const char *data, unsigned len) {
- f.write(ofs, data, len);
- massert( 10380 , "basicrecstore write io error", !f.bad());
- }
-};
-
-/* --- implementation --- */
-
-inline BasicRecStore::~BasicRecStore() {
- h.cleanShutdown = 0;
- if( f.is_open() ) {
- writeHeader();
- f.fsync();
- }
-}
-
-inline void BasicRecStore::writeHeader() {
- write(0, (const char *) &h, 28); // update header in file for new leof
- uassert( 10115 , "file io error in BasicRecStore [1]", !f.bad());
-}
-
-inline fileofs BasicRecStore::insert(const char *buf, unsigned reclen) {
- if( h.firstDeleted ) {
- uasserted(11500, "deleted not yet implemented recstoreinsert");
- }
- massert( 10381 , "bad len", reclen == h.recsize);
- fileofs ofs = h.leof;
- h.leof += reclen;
- if( h.leof > len ) {
- // grow the file. we grow quite a bit to avoid excessive file system fragmentation
- len += (len / 8) + h.recsize;
- uassert( 10116 , "recstore file too big for 32 bit", len <= 0x7fffffff || sizeof(std::streamoff) > 4 );
- write(len, "", 0);
- }
- writeHeader();
- write(ofs, buf, reclen);
- uassert( 10117 , "file io error in BasicRecStore [2]", !f.bad());
- return ofs;
-}
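
The growth rule in insert() above, len += (len / 8) + h.recsize, expands the file by 12.5% of its current size plus one record each time the logical eof catches up with the physical length, trading a little preallocation for fewer filesystem fragments. A worked example with hypothetical numbers:

#include <cstdio>

typedef unsigned long long fileofs_t;   // stand-in for fileofs

static fileofs_t grownLength(fileofs_t len, unsigned recsize) {
    return len + (len / 8) + recsize;   // same rule as insert() above
}

int main() {
    // an 8MB store of 8KB records grows to ~9MB in one step:
    fileofs_t len = 8ULL * 1024 * 1024;
    std::printf("%llu -> %llu\n", len, grownLength(len, 8192));
    // 8388608 -> 9445376 (8MB + 1MB + 8KB)
}
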
-
-/* so far, it's ok to read or update a subset of a record */
-
-inline void BasicRecStore::update(fileofs o, const char *buf, unsigned len) {
- assert(o <= h.leof && o >= sizeof(RecStoreHeader));
- write(o, buf, len);
-}
-
-inline void BasicRecStore::get(fileofs o, char *buf, unsigned len) {
- assert(o <= h.leof && o >= sizeof(RecStoreHeader));
- f.read(o, buf, len);
- massert( 10382 , "basicrecstore::get I/O error", !f.bad());
-}
-
-inline void BasicRecStore::remove(fileofs o, unsigned len) {
- uasserted(11501, "not yet implemented recstoreremove");
-}
-
-}
+// recstore.h +/* + * Copyright (C) 2010 10gen Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License, version 3, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + + +#pragma once + +#include "../util/file.h" + +namespace mongo { + +using boost::uint32_t; +using boost::uint64_t; + +/* Current version supports only consistent record sizes within a store. */ + +class BasicRecStore { + struct RecStoreHeader { + uint32_t version; + uint32_t recsize; + uint64_t leof; // logical eof, actual file might be prealloc'd further + uint64_t firstDeleted; // 0 = no deleted recs + uint32_t cleanShutdown; // 0 = clean + char reserved[8192-8-8-4-4-4]; // we want our records page-aligned in the file if they are a multiple of a page's size -- so we make this 8KB with that goal + RecStoreHeader() { + version = 65; + recsize = 0; + leof = sizeof(RecStoreHeader); + firstDeleted = 0; + cleanShutdown = 1; + memset(reserved, 0, sizeof(reserved)); + } + }; + +public: + BasicRecStore(int _fileNumber) : fileNumber(_fileNumber) { } + ~BasicRecStore(); + void init(const char *fn, unsigned recsize); + fileofs insert(const char *buf, unsigned len); + void update(fileofs o, const char *buf, unsigned len); + void remove(fileofs o, unsigned len); + void get(fileofs o, char *buf, unsigned len); + + int fileNumber; // this goes in DiskLoc::a + + string filename; + +private: + + void writeHeader(); + File f; + fileofs len; + RecStoreHeader h; // h.reserved is wasteful here; fix later. + void write(fileofs ofs, const char *data, unsigned len) { + f.write(ofs, data, len); + massert( 10380 , "basicrecstore write io error", !f.bad()); + } +}; + +/* --- implementation --- */ + +inline BasicRecStore::~BasicRecStore() { + h.cleanShutdown = 0; + if( f.is_open() ) { + writeHeader(); + f.fsync(); + } +} + +inline void BasicRecStore::writeHeader() { + write(0, (const char *) &h, 28); // update header in file for new leof + uassert( 10115 , "file io error in BasicRecStore [1]", !f.bad()); +} + +inline fileofs BasicRecStore::insert(const char *buf, unsigned reclen) { + if( h.firstDeleted ) { + uasserted(11500, "deleted not yet implemented recstoreinsert"); + } + massert( 10381 , "bad len", reclen == h.recsize); + fileofs ofs = h.leof; + h.leof += reclen; + if( h.leof > len ) { + // grow the file. 
we grow quite a bit to avoid excessive file system fragmentations + len += (len / 8) + h.recsize; + uassert( 10116 , "recstore file too big for 32 bit", len <= 0x7fffffff || sizeof(std::streamoff) > 4 ); + write(len, "", 0); + } + writeHeader(); + write(ofs, buf, reclen); + uassert( 10117 , "file io error in BasicRecStore [2]", !f.bad()); + return ofs; +} + +/* so far, it's ok to read or update a subset of a record */ + +inline void BasicRecStore::update(fileofs o, const char *buf, unsigned len) { + assert(o <= h.leof && o >= sizeof(RecStoreHeader)); + write(o, buf, len); +} + +inline void BasicRecStore::get(fileofs o, char *buf, unsigned len) { + assert(o <= h.leof && o >= sizeof(RecStoreHeader)); + f.read(o, buf, len); + massert( 10382 , "basicrestore::get I/O error", !f.bad()); +} + +inline void BasicRecStore::remove(fileofs o, unsigned len) { + uasserted(11501, "not yet implemented recstoreremove"); +} + +} diff --git a/db/repl.cpp b/db/repl.cpp index 04c8d73..62b2986 100644 --- a/db/repl.cpp +++ b/db/repl.cpp @@ -1,10 +1,8 @@ // repl.cpp /* TODO - PAIRING _ on a syncexception, don't allow going back to master state? - */ /** @@ -39,6 +37,7 @@ #include "repl.h" #include "../util/message.h" #include "../client/dbclient.h" +#include "../client/connpool.h" #include "pdfile.h" #include "query.h" #include "db.h" @@ -47,6 +46,9 @@ #include "cmdline.h" namespace mongo { + + // our config from command line etc. + ReplSettings replSettings; void ensureHaveIdIndex(const char *ns); @@ -63,11 +65,12 @@ namespace mongo { */ const char *replAllDead = 0; - extern bool autoresync; time_t lastForcedResync = 0; IdTracker &idTracker = *( new IdTracker() ); + int __findingStartInitialTimeout = 5; // configurable for testing + } // namespace mongo #include "replset.h" @@ -137,6 +140,7 @@ namespace mongo { virtual bool logTheOp() { return false; } + virtual LockType locktype(){ return WRITE; } CmdReplacePeer() : Command("replacepeer") { } virtual bool run(const char *ns, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) { if ( replPair == 0 ) { @@ -196,9 +200,12 @@ namespace mongo { virtual bool logTheOp() { return false; } + virtual LockType locktype(){ return WRITE; } CmdForceDead() : Command("forcedead") { } virtual bool run(const char *ns, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) { - replAllDead = "forced by command"; + replAllDead = "replication forced to stop by 'forcedead' command"; + log() << "*********************************************************\n"; + log() << "received 'forcedead' command, replication forced to stop" << endl; return true; } } cmdForceDead; @@ -215,6 +222,7 @@ namespace mongo { virtual bool logTheOp() { return false; } + virtual LockType locktype(){ return WRITE; } CmdResync() : Command("resync") { } virtual bool run(const char *ns, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) { if ( cmdObj.getBoolField( "force" ) ) { @@ -253,12 +261,85 @@ namespace mongo { } } cmdResync; + bool anyReplEnabled(){ + return replPair || replSettings.slave || replSettings.master; + } + + void appendReplicationInfo( BSONObjBuilder& result , bool authed , int level ){ + + if ( replAllDead ) { + result.append("ismaster", 0.0); + if( authed ) { + if ( replPair ) + result.append("remote", replPair->remote); + } + string s = string("dead: ") + replAllDead; + result.append("info", s); + } + else if ( replPair ) { + result.append("ismaster", replPair->state); + if( authed ) { + result.append("remote", replPair->remote); + if ( 
!replPair->info.empty() ) + result.append("info", replPair->info); + } + } + else { + result.append("ismaster", replSettings.slave ? 0 : 1); + result.append("msg", "not paired"); + } + + if ( level ){ + BSONObjBuilder sources( result.subarrayStart( "sources" ) ); + + readlock lk( "local.sources" ); + Client::Context ctx( "local.sources" ); + auto_ptr<Cursor> c = findTableScan("local.sources", BSONObj()); + int n = 0; + while ( c->ok() ){ + BSONObj s = c->current(); + + BSONObjBuilder bb; + bb.append( s["host"] ); + string sourcename = s["source"].valuestr(); + if ( sourcename != "main" ) + bb.append( s["source"] ); + + { + BSONElement e = s["syncedTo"]; + BSONObjBuilder t( bb.subobjStart( "syncedTo" ) ); + t.appendDate( "time" , e.timestampTime() ); + t.append( "inc" , e.timestampInc() ); + t.done(); + } + + if ( level > 1 ){ + dbtemprelease unlock; + ScopedDbConnection conn( s["host"].valuestr() ); + BSONObj first = conn->findOne( (string)"local.oplog.$" + sourcename , Query().sort( BSON( "$natural" << 1 ) ) ); + BSONObj last = conn->findOne( (string)"local.oplog.$" + sourcename , Query().sort( BSON( "$natural" << -1 ) ) ); + bb.appendDate( "masterFirst" , first["ts"].timestampTime() ); + bb.appendDate( "masterLast" , last["ts"].timestampTime() ); + double lag = (double) (last["ts"].timestampTime() - s["syncedTo"].timestampTime()); + bb.append( "lagSeconds" , lag / 1000 ); + conn.done(); + } + + sources.append( BSONObjBuilder::numStr( n++ ) , bb.obj() ); + c->advance(); + } + + sources.done(); + } + } + class CmdIsMaster : public Command { public: virtual bool requiresAuth() { return false; } virtual bool slaveOk() { return true; } + virtual LockType locktype(){ return NONE; } CmdIsMaster() : Command("ismaster") { } virtual bool run(const char *ns, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool /*fromRepl*/) { /* currently request to arbiter is (somewhat arbitrarily) an ismaster request that is not @@ -266,30 +347,9 @@ namespace mongo { we allow unauthenticated ismaster but we aren't as verbose informationally if one is not authenticated for admin db to be safe. */ - AuthenticationInfo *ai = currentClient.get()->ai; - bool authed = ai->isAuthorized("admin"); - - if ( replAllDead ) { - result.append("ismaster", 0.0); - if( authed ) { - if ( replPair ) - result.append("remote", replPair->remote); - result.append("info", replAllDead); - } - } - else if ( replPair ) { - result.append("ismaster", replPair->state); - if( authed ) { - result.append("remote", replPair->remote); - if ( !replPair->info.empty() ) - result.append("info", replPair->info); - } - } - else { - result.append("ismaster", slave ? 
0 : 1); - result.append("msg", "not paired"); - } + bool authed = cc().getAuthenticationInfo()->isAuthorizedReads("admin"); + appendReplicationInfo( result , authed ); return true; } } cmdismaster; @@ -300,6 +360,7 @@ namespace mongo { virtual bool slaveOk() { return true; } + virtual LockType locktype(){ return WRITE; } CmdIsInitialSyncComplete() : Command( "isinitialsynccomplete" ) {} virtual bool run(const char *ns, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool /*fromRepl*/) { result.appendBool( "initialsynccomplete", getInitialSyncCompleted() ); @@ -333,7 +394,7 @@ namespace mongo { virtual bool adminOnly() { return true; } - + virtual LockType locktype(){ return WRITE; } virtual bool run(const char *ns, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool) { if ( replPair == 0 ) { massert( 10383 , "Another mongod instance believes incorrectly that this node is its peer", !cmdObj.getBoolField( "fromArbiter" ) ); @@ -541,12 +602,13 @@ namespace mongo { BSONObj o = jsobj(); log( 1 ) << "Saving repl source: " << o << endl; - OpDebug debug; - setClient("local.sources"); - UpdateResult res = updateObjects("local.sources", o, pattern, true/*upsert for pair feature*/, false,false,debug); - assert( ! res.mod ); - assert( res.num == 1 ); - cc().clearns(); + { + OpDebug debug; + Client::Context ctx("local.sources"); + UpdateResult res = updateObjects("local.sources", o, pattern, true/*upsert for pair feature*/, false,false,debug); + assert( ! res.mod ); + assert( res.num == 1 ); + } if ( replacing ) { /* if we were in "replace" mode, we now have synced up with the replacement, @@ -578,13 +640,13 @@ namespace mongo { and cursor in effect. */ void ReplSource::loadAll(SourceVector &v) { + Client::Context ctx("local.sources"); SourceVector old = v; v.clear(); bool gotPairWith = false; if ( !cmdLine.source.empty() ) { - setClient("local.sources"); // --source <host> specified. // check that no items are in sources other than that // add if missing @@ -594,8 +656,8 @@ namespace mongo { n++; ReplSource tmp(c->current()); if ( tmp.hostName != cmdLine.source ) { - log() << "--source " << cmdLine.source << " != " << tmp.hostName << " from local.sources collection" << endl; - log() << "terminating after 30 seconds" << endl; + log() << "repl: --source " << cmdLine.source << " != " << tmp.hostName << " from local.sources collection" << endl; + log() << "repl: terminating mongod after 30 seconds" << endl; sleepsecs(30); dbexit( EXIT_REPLICATION_ERROR ); } @@ -626,8 +688,10 @@ namespace mongo { if ( replPair ) { const string &remote = replPair->remote; - setClient( "local.sources" ); // --pairwith host specified. 
+ if ( replSettings.fastsync ) { + Helpers::emptyCollection( "local.sources" ); // ignore saved sources + } // check that no items are in sources other than that // add if missing auto_ptr<Cursor> c = findTableScan("local.sources", BSONObj()); @@ -652,7 +716,6 @@ namespace mongo { } } - setClient("local.sources"); auto_ptr<Cursor> c = findTableScan("local.sources", BSONObj()); while ( c->ok() ) { ReplSource tmp(c->current()); @@ -664,11 +727,21 @@ namespace mongo { tmp.syncedTo = OpTime(); tmp.replacing = true; } + } + if ( ( !replPair && tmp.syncedTo.isNull() ) || + ( replPair && replSettings.fastsync ) ) { + DBDirectClient c; + if ( c.exists( "local.oplog.$main" ) ) { + BSONObj op = c.findOne( "local.oplog.$main", Query().sort( BSON( "$natural" << -1 ) ) ); + if ( !op.isEmpty() ) { + tmp.syncedTo = op[ "ts" ].date(); + tmp._lastSavedLocalTs = op[ "ts" ].date(); + } + } } addSourceToList(v, tmp, c->current(), old); c->advance(); } - cc().clearns(); if ( !gotPairWith && replPair ) { /* add the --pairwith server */ @@ -732,7 +805,7 @@ namespace mongo { string ReplSource::resyncDrop( const char *db, const char *requester ) { log() << "resync: dropping database " << db << endl; string dummyns = string( db ) + "."; - setClient(dummyns.c_str()); + Client::Context ctx(dummyns); assert( cc().database()->name == db ); dropDatabase(dummyns.c_str()); return dummyns; @@ -741,9 +814,9 @@ namespace mongo { /* grab initial copy of a database from the master */ bool ReplSource::resync(string db) { string dummyNs = resyncDrop( db.c_str(), "internal" ); - setClient( dummyNs.c_str() ); + Client::Context ctx( dummyNs ); { - log() << "resync: cloning database " << db << endl; + log() << "resync: cloning database " << db << " to get an initial copy" << endl; ReplInfo r("resync: cloning a database"); string errmsg; bool ok = cloneFrom(hostName.c_str(), errmsg, cc().database()->name, false, /*slaveok*/ true, /*replauth*/ true, /*snapshot*/false); @@ -753,7 +826,7 @@ namespace mongo { } } - log() << "resync: done " << db << endl; + log() << "resync: done with initial clone for db: " << db << endl; return true; } @@ -864,29 +937,21 @@ namespace mongo { throw SyncException(); } - bool justCreated; - try { - justCreated = setClient(ns); - } catch ( AssertionException& ) { - problem() << "skipping bad(?) 
op in oplog, setClient() failed, ns: '" << ns << "'\n"; - addDbNextPass.erase(clientName); - return; - } + Client::Context ctx( ns ); - bool empty = cc().database()->isEmpty(); + bool empty = ctx.db()->isEmpty(); bool incompleteClone = incompleteCloneDbs.count( clientName ) != 0; - log( 6 ) << "ns: " << ns << ", justCreated: " << justCreated << ", empty: " << empty << ", incompleteClone: " << incompleteClone << endl; - - // always apply admin command command - // this is a bit hacky -- the semantics of replication/commands aren't well specified - if ( strcmp( clientName, "admin" ) == 0 && *op.getStringField( "op" ) == 'c' ) { - applyOperation( op ); - cc().clearns(); - return; - } + log( 6 ) << "ns: " << ns << ", justCreated: " << ctx.justCreated() << ", empty: " << empty << ", incompleteClone: " << incompleteClone << endl; - if ( justCreated || empty || incompleteClone ) { + // always apply admin command command + // this is a bit hacky -- the semantics of replication/commands aren't well specified + if ( strcmp( clientName, "admin" ) == 0 && *op.getStringField( "op" ) == 'c' ) { + applyOperation( op ); + return; + } + + if ( ctx.justCreated() || empty || incompleteClone ) { // we must add to incomplete list now that setClient has been called incompleteCloneDbs.insert( clientName ); if ( nClonedThisPass ) { @@ -901,9 +966,9 @@ namespace mongo { log() << "An earlier initial clone of '" << clientName << "' did not complete, now resyncing." << endl; } save(); - setClient( ns ); + Client::Context ctx(ns); nClonedThisPass++; - resync(cc().database()->name); + resync(ctx.db()->name); addDbNextPass.erase(clientName); incompleteCloneDbs.erase( clientName ); } @@ -927,7 +992,6 @@ namespace mongo { } addDbNextPass.erase( clientName ); } - cc().clearns(); } BSONObj ReplSource::idForOp( const BSONObj &op, bool &mod ) { @@ -981,16 +1045,20 @@ namespace mongo { void ReplSource::syncToTailOfRemoteLog() { string _ns = ns(); - BSONObj last = conn->findOne( _ns.c_str(), Query().sort( BSON( "$natural" << -1 ) ) ); + BSONObjBuilder b; + if ( !only.empty() ) { + b.appendRegex("ns", string("^") + only); + } + BSONObj last = conn->findOne( _ns.c_str(), Query( b.done() ).sort( BSON( "$natural" << -1 ) ) ); if ( !last.isEmpty() ) { - BSONElement ts = last.findElement( "ts" ); + BSONElement ts = last.getField( "ts" ); massert( 10386 , "non Date ts found", ts.type() == Date || ts.type() == Timestamp ); syncedTo = OpTime( ts.date() ); } } OpTime ReplSource::nextLastSavedLocalTs() const { - setClient( "local.oplog.$main" ); + Client::Context ctx( "local.oplog.$main" ); auto_ptr< Cursor > c = findTableScan( "local.oplog.$main", BSON( "$natural" << -1 ) ); if ( c->ok() ) return OpTime( c->current().getField( "ts" ).date() ); @@ -1003,7 +1071,10 @@ namespace mongo { } void ReplSource::resetSlave() { - massert( 10387 , "request to kill slave replication falied", + log() << "**********************************************************\n"; + log() << "Sending forcedead command to slave to stop its replication\n"; + log() << "Host: " << hostName << " paired: " << paired << endl; + massert( 10387 , "request to kill slave replication failed", conn->simpleCommand( "admin", 0, "forcedead" ) ); syncToTailOfRemoteLog(); { @@ -1015,7 +1086,7 @@ namespace mongo { } bool ReplSource::updateSetsWithLocalOps( OpTime &localLogTail, bool mayUnlock ) { - setClient( "local.oplog.$main" ); + Client::Context ctx( "local.oplog.$main" ); auto_ptr< Cursor > localLog = findTableScan( "local.oplog.$main", BSON( "$natural" << -1 ) ); OpTime newTail; 
for( ; localLog->ok(); localLog->advance() ) { @@ -1153,67 +1224,70 @@ namespace mongo { } return true; } - - int n = 0; - BSONObj op = c->next(); - BSONElement ts = op.findElement("ts"); - if ( ts.type() != Date && ts.type() != Timestamp ) { - string err = op.getStringField("$err"); - if ( !err.empty() ) { - problem() << "repl: $err reading remote oplog: " + err << '\n'; - massert( 10390 , "got $err reading remote oplog", false ); - } - else { - problem() << "repl: bad object read from remote oplog: " << op.toString() << '\n'; - massert( 10391 , "repl: bad object read from remote oplog", false); + + OpTime nextOpTime; + { + BSONObj op = c->next(); + BSONElement ts = op.getField("ts"); + if ( ts.type() != Date && ts.type() != Timestamp ) { + string err = op.getStringField("$err"); + if ( !err.empty() ) { + problem() << "repl: $err reading remote oplog: " + err << '\n'; + massert( 10390 , "got $err reading remote oplog", false ); + } + else { + problem() << "repl: bad object read from remote oplog: " << op.toString() << '\n'; + massert( 10391 , "repl: bad object read from remote oplog", false); + } } - } - if ( replPair && replPair->state == ReplPair::State_Master ) { + if ( replPair && replPair->state == ReplPair::State_Master ) { - OpTime nextOpTime( ts.date() ); - if ( !tailing && !initial && nextOpTime != syncedTo ) { - log() << "remote slave log filled, forcing slave resync" << endl; - resetSlave(); - return true; - } + OpTime next( ts.date() ); + if ( !tailing && !initial && next != syncedTo ) { + log() << "remote slave log filled, forcing slave resync" << endl; + resetSlave(); + return true; + } - dblock lk; - updateSetsWithLocalOps( localLogTail, true ); - } + dblock lk; + updateSetsWithLocalOps( localLogTail, true ); + } - OpTime nextOpTime( ts.date() ); - log(2) << "repl: first op time received: " << nextOpTime.toString() << '\n'; - if ( tailing || initial ) { - if ( initial ) - log(1) << "repl: initial run\n"; - else + nextOpTime = OpTime( ts.date() ); + log(2) << "repl: first op time received: " << nextOpTime.toString() << '\n'; + if ( tailing || initial ) { + if ( initial ) + log(1) << "repl: initial run\n"; + else + assert( syncedTo < nextOpTime ); + c->putBack( op ); // op will be processed in the loop below + nextOpTime = OpTime(); // will reread the op below + } + else if ( nextOpTime != syncedTo ) { // didn't get what we queried for - error + Nullstream& l = log(); + l << "repl: nextOpTime " << nextOpTime.toStringLong() << ' '; + if ( nextOpTime < syncedTo ) + l << "<??"; + else + l << ">"; + + l << " syncedTo " << syncedTo.toStringLong() << '\n'; + log() << "repl: time diff: " << (nextOpTime.getSecs() - syncedTo.getSecs()) << "sec\n"; + log() << "repl: tailing: " << tailing << '\n'; + log() << "repl: data too stale, halting replication" << endl; + replInfo = replAllDead = "data too stale halted replication"; assert( syncedTo < nextOpTime ); - sync_pullOpLog_applyOperation(op, &localLogTail); - n++; - } - else if ( nextOpTime != syncedTo ) { - Nullstream& l = log(); - l << "repl: nextOpTime " << nextOpTime.toStringLong() << ' '; - if ( nextOpTime < syncedTo ) - l << "<??"; - else - l << ">"; - - l << " syncedTo " << syncedTo.toStringLong() << '\n'; - log() << "repl: time diff: " << (nextOpTime.getSecs() - syncedTo.getSecs()) << "sec\n"; - log() << "repl: tailing: " << tailing << '\n'; - log() << "repl: data too stale, halting replication" << endl; - replInfo = replAllDead = "data too stale halted replication"; - assert( syncedTo < nextOpTime ); - throw SyncException(); - } 
- else { - /* t == syncedTo, so the first op was applied previously. */ + throw SyncException(); + } + else { + /* t == syncedTo, so the first op was applied previously. */ + } } // apply operations { + int n = 0; time_t saveLast = time(0); while ( 1 ) { /* from a.s.: @@ -1232,7 +1306,7 @@ namespace mongo { */ if ( !c->more() ) { dblock lk; - OpTime nextLastSaved = nextLastSavedLocalTs(); // this may make c->more() become true + OpTime nextLastSaved = nextLastSavedLocalTs(); { dbtemprelease t; if ( c->more() ) { @@ -1245,11 +1319,11 @@ namespace mongo { save(); // note how far we are synced up to now log() << "repl: applied " << n << " operations" << endl; nApplied = n; - log() << "repl: end sync_pullOpLog syncedTo: " << syncedTo.toStringLong() << endl; + log() << "repl: end sync_pullOpLog syncedTo: " << syncedTo.toStringLong() << endl; break; } - OCCASIONALLY if( n > 100000 || time(0) - saveLast > 60 ) { + OCCASIONALLY if( n > 0 && ( n > 100000 || time(0) - saveLast > 60 ) ) { // periodically note our progress, in case we are doing a lot of work and crash dblock lk; syncedTo = nextOpTime; @@ -1262,14 +1336,36 @@ namespace mongo { } BSONObj op = c->next(); - ts = op.findElement("ts"); - assert( ts.type() == Date || ts.type() == Timestamp ); + BSONElement ts = op.getField("ts"); + if( !( ts.type() == Date || ts.type() == Timestamp ) ) { + log() << "sync error: problem querying remote oplog record\n"; + log() << "op: " << op.toString() << '\n'; + log() << "halting replication" << endl; + replInfo = replAllDead = "sync error: no ts found querying remote oplog record"; + throw SyncException(); + } OpTime last = nextOpTime; - OpTime tmp( ts.date() ); - nextOpTime = tmp; + nextOpTime = OpTime( ts.date() ); if ( !( last < nextOpTime ) ) { - problem() << "sync error: last " << last.toString() << " >= nextOpTime " << nextOpTime.toString() << endl; - uassert( 10123 , "bad 'ts' value in sources", false); + log() << "sync error: last applied optime at slave >= nextOpTime from master" << endl; + log() << " last: " << last.toStringLong() << '\n'; + log() << " nextOpTime: " << nextOpTime.toStringLong() << '\n'; + log() << " halting replication" << endl; + replInfo = replAllDead = "sync error last >= nextOpTime"; + uassert( 10123 , "replication error last applied optime at slave >= nextOpTime from master", false); + } + if ( replSettings.slavedelay && ( unsigned( time( 0 ) ) < nextOpTime.getSecs() + replSettings.slavedelay ) ) { + c->putBack( op ); + _sleepAdviceTime = nextOpTime.getSecs() + replSettings.slavedelay + 1; + dblock lk; + if ( n > 0 ) { + syncedTo = last; + save(); + } + log() << "repl: applied " << n << " operations" << endl; + log() << "repl: syncedTo: " << syncedTo.toStringLong() << endl; + log() << "waiting until: " << _sleepAdviceTime << " to continue" << endl; + break; } sync_pullOpLog_applyOperation(op, &localLogTail); @@ -1283,8 +1379,7 @@ namespace mongo { BSONObj userReplQuery = fromjson("{\"user\":\"repl\"}"); bool replAuthenticate(DBClientConnection *conn) { - AuthenticationInfo *ai = currentClient.get()->ai; - if( !ai->isAuthorized("admin") ) { + if( ! 
cc().isAdmin() ){ log() << "replauthenticate: requires admin permissions, failing\n"; return false; } @@ -1324,7 +1419,7 @@ namespace mongo { ReplInfo r("trying to connect to sync source"); if ( !conn->connect(hostName.c_str(), errmsg) || !replAuthenticate(conn.get()) ) { resetConnection(); - log() << "repl: " << errmsg << endl; + log() << "repl: " << errmsg << endl; return false; } } @@ -1335,9 +1430,16 @@ namespace mongo { returns true if everything happy. return false if you want to reconnect. */ bool ReplSource::sync(int& nApplied) { + _sleepAdviceTime = 0; ReplInfo r("sync"); - if ( !cmdLine.quiet ) - log() << "repl: " << sourceName() << '@' << hostName << endl; + if ( !cmdLine.quiet ) { + Nullstream& l = log(); + l << "repl: from "; + if( sourceName() != "main" ) { + l << "source:" << sourceName() << ' '; + } + l << "host:" << hostName << endl; + } nClonedThisPass = 0; // FIXME Handle cases where this db isn't on default port, or default port is spec'd in hostName. @@ -1348,13 +1450,11 @@ namespace mongo { } if ( !connect() ) { + log(4) << "repl: can't connect to sync source" << endl; if ( replPair && paired ) { assert( startsWith(hostName.c_str(), replPair->remoteHost.c_str()) ); replPair->arbitrate(); } - { - ReplInfo r("can't connect to sync source"); - } return false; } @@ -1370,7 +1470,7 @@ namespace mongo { /* // get current mtime at the server. BSONObj o = conn->findOne("admin.$cmd", opTimeQuery); - BSONElement e = o.findElement("optime"); + BSONElement e = o.getField("optime"); if( e.eoo() ) { log() << "repl: failed to get cur optime from master" << endl; log() << " " << o.toString() << endl; @@ -1387,24 +1487,13 @@ namespace mongo { // cached copies of these...so don't rename them NamespaceDetails *localOplogMainDetails = 0; - Database *localOplogClient = 0; + Database *localOplogDB = 0; + + void replCheckCloseDatabase( Database * db ){ + localOplogDB = 0; + localOplogMainDetails = 0; + } - void logOp(const char *opstr, const char *ns, const BSONObj& obj, BSONObj *patt, bool *b) { - if ( master ) { - _logOp(opstr, ns, "local.oplog.$main", obj, patt, b, OpTime::now()); - char cl[ 256 ]; - nsToDatabase( ns, cl ); - } - NamespaceDetailsTransient &t = NamespaceDetailsTransient::get_w( ns ); - if ( t.cllEnabled() ) { - try { - _logOp(opstr, ns, t.cllNS().c_str(), obj, patt, b, OpTime::now()); - } catch ( const DBException & ) { - t.cllInvalidate(); - } - } - } - /* we write to local.opload.$main: { ts : ..., op: ..., ns: ..., o: ... } ts: an OpTime timestamp @@ -1415,6 +1504,7 @@ namespace mongo { "c" db cmd "db" declares presence of a database (ns is set to the db name + '.') "n" no op + logNS - e.g. "local.oplog.$main" bb: if not null, specifies a boolean to pass along to the other side as b: param. used for "justOne" or "upsert" flags on 'd', 'u' @@ -1422,7 +1512,7 @@ namespace mongo { when set, indicates this is the first thing we have logged for this database. thus, the slave does not need to copy down all the data when it sees this. 
*/ - void _logOp(const char *opstr, const char *ns, const char *logNS, const BSONObj& obj, BSONObj *o2, bool *bb, const OpTime &ts ) { + static void _logOp(const char *opstr, const char *ns, const char *logNS, const BSONObj& obj, BSONObj *o2, bool *bb, const OpTime &ts ) { if ( strncmp(ns, "local.", 6) == 0 ) return; @@ -1449,14 +1539,14 @@ namespace mongo { Record *r; if ( strncmp( logNS, "local.", 6 ) == 0 ) { // For now, assume this is olog main if ( localOplogMainDetails == 0 ) { - setClient("local."); - localOplogClient = cc().database(); + Client::Context ctx("local.", dbpath, 0, false); + localOplogDB = ctx.db(); localOplogMainDetails = nsdetails(logNS); } - cc().setns("", localOplogClient); // database = localOplogClient; + Client::Context ctx( "" , localOplogDB, false ); r = theDataFileMgr.fast_oplog_insert(localOplogMainDetails, logNS, len); } else { - setClient( logNS ); + Client::Context ctx( logNS, dbpath, 0, false ); assert( nsdetails( logNS ) ); r = theDataFileMgr.fast_oplog_insert( nsdetails( logNS ), logNS, len); } @@ -1478,6 +1568,27 @@ namespace mongo { } } + static void logKeepalive() { + BSONObj obj; + _logOp("n", "", "local.oplog.$main", obj, 0, 0, OpTime::now()); + } + + void logOp(const char *opstr, const char *ns, const BSONObj& obj, BSONObj *patt, bool *b) { + if ( replSettings.master ) { + _logOp(opstr, ns, "local.oplog.$main", obj, patt, b, OpTime::now()); + char cl[ 256 ]; + nsToDatabase( ns, cl ); + } + NamespaceDetailsTransient &t = NamespaceDetailsTransient::get_w( ns ); + if ( t.cllEnabled() ) { + try { + _logOp(opstr, ns, t.cllNS().c_str(), obj, patt, b, OpTime::now()); + } catch ( const DBException & ) { + t.cllInvalidate(); + } + } + } + /* --------------------------------------------------------------*/ /* @@ -1517,6 +1628,9 @@ namespace mongo { else if( moreToSync ) { sleepAdvice = 0; } + else if ( s->sleepAdvice() ) { + sleepAdvice = s->sleepAdvice(); + } if ( ok && !moreToSync /*&& !s->syncedTo.isNull()*/ ) { pairSync->setInitialSyncCompletedLocking(); } @@ -1560,10 +1674,10 @@ namespace mongo { { dblock lk; if ( replAllDead ) { - if ( !autoresync || !ReplSource::throttledForceResyncDead( "auto" ) ) + if ( !replSettings.autoresync || !ReplSource::throttledForceResyncDead( "auto" ) ) break; } - assert( syncing == 0 ); + assert( syncing == 0 ); // i.e., there is only one sync thread running. we will want to change/fix this. syncing++; } try { @@ -1590,7 +1704,8 @@ namespace mongo { stringstream ss; ss << "repl: sleep " << s << "sec before next pass"; string msg = ss.str(); - log() << msg << endl; + if ( ! cmdLine.quiet ) + log() << msg << endl; ReplInfo r(msg.c_str()); sleepsecs(s); } @@ -1599,14 +1714,38 @@ namespace mongo { int debug_stop_repl = 0; + static void replMasterThread() { + sleepsecs(4); + Client::initThread("replmaster"); + while( 1 ) { + { + dblock lk; + cc().getAuthenticationInfo()->authorize("admin"); + } + sleepsecs(10); + /* write a keep-alive like entry to the log. this will make things like + printReplicationStatus() and printSlaveReplicationStatus() stay up-to-date + even when things are idle. + */ + { + writelock lk(""); + try { + logKeepalive(); + } + catch(...) 
{ + log() << "caught exception in replMasterThread()" << endl; + } + } + } + } + void replSlaveThread() { sleepsecs(1); - + Client::initThread("replslave"); + { dblock lk; - - Client::initThread("replslave"); - currentClient.get()->ai->authorize("admin"); + cc().getAuthenticationInfo()->authorize("admin"); BSONObj obj; if ( Helpers::getSingleton("local.pair.startup", obj) ) { @@ -1642,10 +1781,16 @@ namespace mongo { dblock lk; const char * ns = "local.oplog.$main"; - setClient(ns); + Client::Context ctx(ns); - if ( nsdetails( ns ) ) + if ( nsdetails( ns ) ) { + DBDirectClient c; + BSONObj lastOp = c.findOne( ns, Query().sort( BSON( "$natural" << -1 ) ) ); + if ( !lastOp.isEmpty() ) { + OpTime::setLast( lastOp[ "ts" ].date() ); + } return; + } /* create an oplog collection, if it doesn't yet exist. */ BSONObjBuilder b; @@ -1653,13 +1798,19 @@ namespace mongo { if ( cmdLine.oplogSize != 0 ) sz = (double)cmdLine.oplogSize; else { + /* not specified. pick a default size */ sz = 50.0 * 1000 * 1000; if ( sizeof(int *) >= 8 ) { +#if defined(__APPLE__) + // typically these are desktops (dev machines), so keep it smallish + sz = (256-64) * 1000 * 1000; +#else sz = 990.0 * 1000 * 1000; boost::intmax_t free = freeSpace(); //-1 if call not supported. double fivePct = free * 0.05; if ( fivePct > sz ) sz = fivePct; +#endif } } @@ -1675,7 +1826,6 @@ namespace mongo { BSONObj o = b.done(); userCreateNS(ns, o, err, false); logOp( "n", "dummy", BSONObj() ); - cc().clearns(); } void startReplication() { @@ -1684,29 +1834,31 @@ namespace mongo { */ //boost::thread tempt(tempThread); - if ( !slave && !master && !replPair ) + if ( !replSettings.slave && !replSettings.master && !replPair ) return; { dblock lk; + cc().getAuthenticationInfo()->authorize("admin"); pairSync->init(); } - if ( slave || replPair ) { - if ( slave ) { - assert( slave == SimpleSlave ); + if ( replSettings.slave || replPair ) { + if ( replSettings.slave ) { + assert( replSettings.slave == SimpleSlave ); log(1) << "slave=true" << endl; } else - slave = ReplPairSlave; + replSettings.slave = ReplPairSlave; boost::thread repl_thread(replSlaveThread); } - if ( master || replPair ) { - if ( master ) + if ( replSettings.master || replPair ) { + if ( replSettings.master ) log(1) << "master=true" << endl; - master = true; + replSettings.master = true; createOplog(); + boost::thread t(replMasterThread); } } @@ -1720,6 +1872,7 @@ namespace mongo { virtual bool slaveOk() { return false; } + virtual LockType locktype(){ return WRITE; } CmdLogCollection() : Command( "logCollection" ) {} virtual void help( stringstream &help ) const { help << "examples: { logCollection: <collection ns>, start: 1 }, " @@ -32,6 +32,7 @@ #include "db.h" #include "dbhelpers.h" #include "query.h" +#include "queryoptimizer.h" #include "../client/dbclient.h" @@ -46,14 +47,31 @@ namespace mongo { --slave cmd line setting -> SimpleSlave */ typedef enum { NotSlave=0, SimpleSlave, ReplPairSlave } SlaveTypes; - extern SlaveTypes slave; - /* true means we are master and doing replication. if we are not writing to oplog (no --master or repl pairing), - this won't be true. - */ - extern bool master; + class ReplSettings { + public: + SlaveTypes slave; + + /* true means we are master and doing replication. if we are not writing to oplog (no --master or repl pairing), + this won't be true. 
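+ for example, a mongod started with neither --master nor --slave keeps the constructor defaults below (slave == NotSlave, master == false); startReplication() then returns without creating an oplog.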
+ */ + bool master; + + int opIdMem; + + bool fastsync; + + bool autoresync; + + int slavedelay; + + ReplSettings() + : slave(NotSlave) , master(false) , opIdMem(100000000) , fastsync() , autoresync(false), slavedelay() { + } + + }; - extern int opIdMem; + extern ReplSettings replSettings; bool cloneFrom(const char *masterHost, string& errmsg, const string& fromdb, bool logForReplication, bool slaveOk, bool useReplAuth, bool snapshot); @@ -115,6 +133,7 @@ namespace mongo { // returns false if the slave has been reset bool updateSetsWithLocalOps( OpTime &localLogTail, bool mayUnlock ); string ns() const { return string( "local.oplog.$" ) + sourceName(); } + unsigned _sleepAdviceTime; public: static void applyOperation(const BSONObj& op); @@ -131,11 +150,11 @@ namespace mongo { OpTime syncedTo; /* This is for repl pairs. - _lastSavedLocalTs is the most recent point in the local log that we know is consistent
- with the remote log ( ie say the local op log has entries ABCDE and the remote op log
- has ABCXY, then _lastSavedLocalTs won't be greater than C until we have reconciled
- the DE-XY difference.)
- */
+ _lastSavedLocalTs is the most recent point in the local log that we know is consistent + with the remote log ( ie say the local op log has entries ABCDE and the remote op log + has ABCXY, then _lastSavedLocalTs won't be greater than C until we have reconciled + the DE-XY difference.) + */ OpTime _lastSavedLocalTs; int nClonedThisPass; @@ -160,7 +179,13 @@ namespace mongo { operator string() const { return sourceName() + "@" + hostName; } bool haveMoreDbsToSync() const { return !addDbNextPass.empty(); } - + int sleepAdvice() const { + if ( !_sleepAdviceTime ) + return 0; + int wait = _sleepAdviceTime - unsigned( time( 0 ) ); + return wait > 0 ? wait : 0; + } + static bool throttledForceResyncDead( const char *requester ); static void forceResyncDead( const char *requester ); void forceResync( const char *requester ); @@ -173,7 +198,6 @@ namespace mongo { "c" db cmd "db" declares presence of a database (ns is set to the db name + '.') */ - void _logOp(const char *opstr, const char *ns, const char *logNs, const BSONObj& obj, BSONObj *patt, bool *b, const OpTime &ts); void logOp(const char *opstr, const char *ns, const BSONObj& obj, BSONObj *patt = 0, bool *b = 0); // class for managing a set of ids in memory @@ -239,9 +263,9 @@ namespace mongo { dbIds_( "local.temp.replIds" ), dbModIds_( "local.temp.replModIds" ), inMem_( true ), - maxMem_( opIdMem ) { + maxMem_( replSettings.opIdMem ) { } - void reset( int maxMem = opIdMem ) { + void reset( int maxMem = replSettings.opIdMem ) { memIds_.reset(); memModIds_.reset(); dbIds_.reset(); @@ -312,4 +336,146 @@ namespace mongo { int maxMem_; }; + bool anyReplEnabled(); + void appendReplicationInfo( BSONObjBuilder& result , bool authed , int level = 0 ); + + void replCheckCloseDatabase( Database * db ); + + extern int __findingStartInitialTimeout; // configurable for testing + + class FindingStartCursor { + public: + FindingStartCursor( const QueryPlan & qp ) : + _qp( qp ), + _findingStart( true ), + _findingStartMode(), + _findingStartTimer( 0 ), + _findingStartCursor( 0 ) + { init(); } + bool done() const { return !_findingStart; } + auto_ptr< Cursor > cRelease() { return _c; } + void next() { + if ( !_findingStartCursor || !_findingStartCursor->c->ok() ) { + _findingStart = false; + _c = _qp.newCursor(); // on error, start from beginning + destroyClientCursor(); + return; + } + switch( _findingStartMode ) { + case Initial: { + if ( !_matcher->matches( _findingStartCursor->c->currKey(), _findingStartCursor->c->currLoc() ) ) { + _findingStart = false; // found first record out of query range, so scan normally + _c = _qp.newCursor( _findingStartCursor->c->currLoc() ); + destroyClientCursor(); + return; + } + _findingStartCursor->c->advance(); + RARELY { + if ( _findingStartTimer.seconds() >= __findingStartInitialTimeout ) { + createClientCursor( startLoc( _findingStartCursor->c->currLoc() ) ); + _findingStartMode = FindExtent; + return; + } + } + maybeRelease(); + return; + } + case FindExtent: { + if ( !_matcher->matches( _findingStartCursor->c->currKey(), _findingStartCursor->c->currLoc() ) ) { + _findingStartMode = InExtent; + return; + } + DiskLoc prev = prevLoc( _findingStartCursor->c->currLoc() ); + if ( prev.isNull() ) { // hit beginning, so start scanning from here + createClientCursor(); + _findingStartMode = InExtent; + return; + } + // There might be a more efficient implementation than creating new cursor & client cursor each time, + // not worrying about that for now + createClientCursor( prev ); + maybeRelease(); + return; + } + case 
InExtent: { + if ( _matcher->matches( _findingStartCursor->c->currKey(), _findingStartCursor->c->currLoc() ) ) { + _findingStart = false; // found first record in query range, so scan normally + _c = _qp.newCursor( _findingStartCursor->c->currLoc() ); + destroyClientCursor(); + return; + } + _findingStartCursor->c->advance(); + maybeRelease(); + return; + } + default: { + massert( 12600, "invalid _findingStartMode", false ); + } + } + } + private: + enum FindingStartMode { Initial, FindExtent, InExtent }; + const QueryPlan &_qp; + bool _findingStart; + FindingStartMode _findingStartMode; + auto_ptr< CoveredIndexMatcher > _matcher; + Timer _findingStartTimer; + ClientCursor * _findingStartCursor; + auto_ptr< Cursor > _c; + DiskLoc startLoc( const DiskLoc &rec ) { + Extent *e = rec.rec()->myExtent( rec ); + if ( e->myLoc != _qp.nsd()->capExtent ) + return e->firstRecord; + // Likely we are on the fresh side of capExtent, so return first fresh record. + // If we are on the stale side of capExtent, then the collection is small and it + // doesn't matter if we start the extent scan with capFirstNewRecord. + return _qp.nsd()->capFirstNewRecord; + } + + DiskLoc prevLoc( const DiskLoc &rec ) { + Extent *e = rec.rec()->myExtent( rec ); + if ( e->xprev.isNull() ) + e = _qp.nsd()->lastExtent.ext(); + else + e = e->xprev.ext(); + if ( e->myLoc != _qp.nsd()->capExtent ) + return e->firstRecord; + return DiskLoc(); // reached beginning of collection + } + void createClientCursor( const DiskLoc &startLoc = DiskLoc() ) { + auto_ptr<Cursor> c = _qp.newCursor( startLoc ); + _findingStartCursor = new ClientCursor(c, _qp.ns(), false); + } + void destroyClientCursor() { + if ( _findingStartCursor ) { + ClientCursor::erase( _findingStartCursor->cursorid ); + _findingStartCursor = 0; + } + } + void maybeRelease() { + RARELY { + CursorId id = _findingStartCursor->cursorid; + _findingStartCursor->updateLocation(); + { + dbtemprelease t; + } + _findingStartCursor = ClientCursor::find( id, false ); + } + } + void init() { + // Use a ClientCursor here so we can release db mutex while scanning + // oplog (can take quite a while with large oplogs). + auto_ptr<Cursor> c = _qp.newReverseCursor(); + _findingStartCursor = new ClientCursor(c, _qp.ns(), false); + _findingStartTimer.reset(); + _findingStartMode = Initial; + BSONElement tsElt = _qp.query()[ "ts" ]; + massert( 13044, "no ts field in query", !tsElt.eoo() ); + BSONObjBuilder b; + b.append( tsElt ); + BSONObj tsQuery = b.obj(); + _matcher.reset(new CoveredIndexMatcher(tsQuery, _qp.indexKey())); + } + }; + } // namespace mongo diff --git a/db/replset.h b/db/replset.h index 98d80d6..66a8604 100644 --- a/db/replset.h +++ b/db/replset.h @@ -49,13 +49,13 @@ namespace mongo { }; int state; - string info; // commentary about our current state + ThreadSafeString info; // commentary about our current state string arbHost; // "-" for no arbiter. "host[:port]" int remotePort; string remoteHost; string remote; // host:port if port specified. // int date; // -1 not yet set; 0=slave; 1=master - + string getInfo() { stringstream ss; ss << " state: "; @@ -111,7 +111,7 @@ namespace mongo { If 'client' is not specified, the current client is used. */ inline bool isMaster( const char *client = 0 ) { - if( !slave ) + if( ! replSettings.slave ) return true; if ( !client ) { @@ -128,7 +128,7 @@ namespace mongo { return true; } else { - if( master ) { + if( replSettings.master ) { // if running with --master --slave, allow. 
note that master is also true // for repl pairs so the check for replPair above is important. return true; diff --git a/db/scanandorder.h b/db/scanandorder.h index 3f41433..f038069 100644 --- a/db/scanandorder.h +++ b/db/scanandorder.h @@ -40,7 +40,7 @@ namespace mongo { // returns the key value for o BSONObj getKeyFromObject(BSONObj o) { - return o.extractFields(pattern); + return o.extractFields(pattern,true); } }; diff --git a/db/security.cpp b/db/security.cpp index 747b04a..6a01627 100644 --- a/db/security.cpp +++ b/db/security.cpp @@ -21,12 +21,44 @@ #include "instance.h" #include "client.h" #include "curop.h" +#include "db.h" +#include "dbhelpers.h" namespace mongo { bool noauth = true; - + int AuthenticationInfo::warned = 0; + void AuthenticationInfo::print(){ + cout << "AuthenticationInfo: " << this << "\n"; + for ( map<string,Auth>::iterator i=m.begin(); i!=m.end(); i++ ){ + cout << "\t" << i->first << "\t" << i->second.level << "\n"; + } + cout << "END" << endl; + } + + + bool AuthenticationInfo::_isAuthorizedSpecialChecks( const string& dbname ) { + if ( cc().isGod() ){ + return true; + } + + if ( isLocalHost ){ + atleastreadlock l(""); + Client::GodScope gs; + Client::Context c("admin.system.users"); + BSONObj result; + if( ! Helpers::getSingleton("admin.system.users", result) ){ + if( warned == 0 ) { + warned++; + log() << "note: no users configured in admin.system.users, allowing localhost access" << endl; + } + return true; + } + } + return false; + } + } // namespace mongo diff --git a/db/security.h b/db/security.h index f61d5e1..261b123 100644 --- a/db/security.h +++ b/db/security.h @@ -22,9 +22,8 @@ #undef assert #define assert xassert -#include "db.h" -#include "dbhelpers.h" #include "nonce.h" +#include "concurrency.h" namespace mongo { @@ -38,40 +37,42 @@ namespace mongo { }; class AuthenticationInfo : boost::noncopyable { + mongo::mutex _lock; map<string, Auth> m; // dbname -> auth static int warned; public: bool isLocalHost; AuthenticationInfo() { isLocalHost = false; } - virtual ~AuthenticationInfo() { + ~AuthenticationInfo() { } - void logout(const char *dbname) { - assertInWriteLock(); + void logout(const string& dbname ) { + scoped_lock lk(_lock); m.erase(dbname); } - void authorize(const char *dbname) { - assertInWriteLock(); + void authorize(const string& dbname ) { + scoped_lock lk(_lock); m[dbname].level = 2; } - virtual bool isAuthorized(const char *dbname) { - if( m[dbname].level == 2 ) return true; + void authorizeReadOnly(const string& dbname) { + scoped_lock lk(_lock); + m[dbname].level = 1; + } + bool isAuthorized(const string& dbname) { return _isAuthorized( dbname, 2 ); } + bool isAuthorizedReads(const string& dbname) { return _isAuthorized( dbname, 1 ); } + bool isAuthorizedForLock(const string& dbname, int lockType ) { return _isAuthorized( dbname , lockType > 0 ? 
2 : 1 ); } + + void print(); + + protected: + bool _isAuthorized(const string& dbname, int level) { + if( m[dbname].level >= level ) return true; if( noauth ) return true; - if( m["admin"].level == 2 ) return true; - if( m["local"].level == 2 ) return true; - if( isLocalHost ) { - readlock l(""); - Client::Context c("admin.system.users"); - BSONObj result; - if( Helpers::getSingleton("admin.system.users", result) ) - return false; - if( warned == 0 ) { - warned++; - log() << "warning: no users configured in admin.system.users, allowing localhost access" << endl; - } - return true; - } - return false; + if( m["admin"].level >= level ) return true; + if( m["local"].level >= level ) return true; + return _isAuthorizedSpecialChecks( dbname ); } + + bool _isAuthorizedSpecialChecks( const string& dbname ); }; } // namespace mongo diff --git a/db/security_commands.cpp b/db/security_commands.cpp index 9d63744..326d6e4 100644 --- a/db/security_commands.cpp +++ b/db/security_commands.cpp @@ -1,4 +1,20 @@ // security_commands.cpp +/* + * Copyright (C) 2010 10gen Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License, version 3, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + // security.cpp links with both dbgrid and db. this file db only -- at least for now. // security.cpp @@ -39,6 +55,7 @@ namespace mongo { virtual bool slaveOk() { return true; } + virtual LockType locktype(){ return NONE; } CmdGetNonce() : Command("getnonce") {} bool run(const char *ns, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) { nonce *n = new nonce(security.getNonce()); @@ -58,12 +75,12 @@ namespace mongo { virtual bool slaveOk() { return true; } + virtual LockType locktype(){ return NONE; } CmdLogout() : Command("logout") {} bool run(const char *ns, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) { // database->name is the one we are logging out... - Client& client = cc(); - AuthenticationInfo *ai = client.ai; - ai->logout(client.database()->name.c_str()); + AuthenticationInfo *ai = cc().getAuthenticationInfo(); + ai->logout(nsToDatabase(ns)); return true; } } cmdLogout; @@ -77,6 +94,7 @@ namespace mongo { virtual bool slaveOk() { return true; } + virtual LockType locktype(){ return WRITE; } // TODO: make this READ CmdAuthenticate() : Command("authenticate") {} bool run(const char *ns, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) { log(1) << " authenticate: " << cmdObj << endl; @@ -88,7 +106,7 @@ namespace mongo { if( user.empty() || key.empty() || received_nonce.empty() ) { log() << "field missing/wrong type in received authenticate command " << cc().database()->name - << '\n'; + << endl; errmsg = "auth fails"; sleepmillis(10); return false; @@ -107,7 +125,7 @@ namespace mongo { } if ( reject ) { - log() << "auth: bad nonce received or getnonce not called. could be a driver bug or a security attack. db:" << cc().database()->name << '\n'; + log() << "auth: bad nonce received or getnonce not called. could be a driver bug or a security attack. 
db:" << cc().database()->name << endl; errmsg = "auth fails"; sleepmillis(30); return false; @@ -124,7 +142,7 @@ namespace mongo { b << "user" << user; BSONObj query = b.done(); if( !Helpers::findOne(systemUsers.c_str(), query, userObj) ) { - log() << "auth: couldn't find user " << user << ", " << systemUsers << '\n'; + log() << "auth: couldn't find user " << user << ", " << systemUsers << endl; errmsg = "auth fails"; return false; } @@ -146,13 +164,24 @@ namespace mongo { string computed = digestToString( d ); if ( key != computed ){ - log() << "auth: key mismatch " << user << ", ns:" << ns << '\n'; + log() << "auth: key mismatch " << user << ", ns:" << ns << endl; errmsg = "auth fails"; return false; } - AuthenticationInfo *ai = currentClient.get()->ai; - ai->authorize(cc().database()->name.c_str()); + AuthenticationInfo *ai = cc().getAuthenticationInfo(); + + if ( userObj[ "readOnly" ].isBoolean() && userObj[ "readOnly" ].boolean() ) { + if ( readLockSupported() ){ + ai->authorizeReadOnly( cc().database()->name.c_str() ); + } + else { + log() << "warning: old version of boost, read-only users not supported" << endl; + ai->authorize( cc().database()->name.c_str() ); + } + } else { + ai->authorize( cc().database()->name.c_str() ); + } return true; } } cmdAuthenticate; diff --git a/db/stats/counters.cpp b/db/stats/counters.cpp new file mode 100644 index 0000000..8e90902 --- /dev/null +++ b/db/stats/counters.cpp @@ -0,0 +1,131 @@ +// counters.cpp +/* + * Copyright (C) 2010 10gen Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License, version 3, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + + +#include "stdafx.h" +#include "../jsobj.h" +#include "counters.h" + +namespace mongo { + + OpCounters::OpCounters(){ + int zero = 0; + + BSONObjBuilder b; + b.append( "insert" , zero ); + b.append( "query" , zero ); + b.append( "update" , zero ); + b.append( "delete" , zero ); + b.append( "getmore" , zero ); + b.append( "command" , zero ); + _obj = b.obj(); + + _insert = (int*)_obj["insert"].value(); + _query = (int*)_obj["query"].value(); + _update = (int*)_obj["update"].value(); + _delete = (int*)_obj["delete"].value(); + _getmore = (int*)_obj["getmore"].value(); + _command = (int*)_obj["command"].value(); + } + + void OpCounters::gotOp( int op , bool isCommand ){ + switch ( op ){ + case dbInsert: gotInsert(); break; + case dbQuery: + if ( isCommand ) + gotCommand(); + else + gotQuery(); + break; + + case dbUpdate: gotUpdate(); break; + case dbDelete: gotDelete(); break; + case dbGetMore: gotGetMore(); break; + case dbKillCursors: + case opReply: + case dbMsg: + break; + default: log() << "OpCounters::gotOp unknown op: " << op << endl; + } + } + + IndexCounters::IndexCounters(){ + _memSupported = _pi.blockCheckSupported(); + + _btreeMemHits = 0; + _btreeMemMisses = 0; + _btreeAccesses = 0; + + + _maxAllowed = ( numeric_limits< long long >::max() ) / 2; + _resets = 0; + + _sampling = 0; + _samplingrate = 100; + } + + void IndexCounters::append( BSONObjBuilder& b ){ + if ( ! 
_memSupported ){ + b.append( "note" , "not supported on this platform" ); + return; + } + + BSONObjBuilder bb( b.subobjStart( "btree" ) ); + bb.appendNumber( "accesses" , _btreeAccesses ); + bb.appendNumber( "hits" , _btreeMemHits ); + bb.appendNumber( "misses" , _btreeMemMisses ); + + bb.append( "resets" , _resets ); + + bb.append( "missRatio" , (_btreeAccesses ? (_btreeMemMisses / (double)_btreeAccesses) : 0) ); + + bb.done(); + + if ( _btreeAccesses > _maxAllowed ){ + _btreeAccesses = 0; + _btreeMemMisses = 0; + _btreeMemHits = 0; + _resets++; + } + } + + FlushCounters::FlushCounters() + : _total_time(0) + , _flushes(0) + , _last() + {} + + void FlushCounters::flushed(int ms){ + _flushes++; + _total_time += ms; + _last_time = ms; + _last = jsTime(); + } + + void FlushCounters::append( BSONObjBuilder& b ){ + b.appendNumber( "flushes" , _flushes ); + b.appendNumber( "total_ms" , _total_time ); + b.appendNumber( "average_ms" , (_flushes ? (_total_time / double(_flushes)) : 0.0) ); + b.appendNumber( "last_ms" , _last_time ); + b.append("last_finished", _last); + } + + + OpCounters globalOpCounters; + IndexCounters globalIndexCounters; + FlushCounters globalFlushCounters; +} diff --git a/db/stats/counters.h b/db/stats/counters.h new file mode 100644 index 0000000..41c2cd2 --- /dev/null +++ b/db/stats/counters.h @@ -0,0 +1,121 @@ +// counters.h +/* + * Copyright (C) 2010 10gen Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License, version 3, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + + +#include "../../stdafx.h" +#include "../jsobj.h" +#include "../../util/message.h" +#include "../../util/processinfo.h" + +namespace mongo { + + /** + * for storing operation counters + * note: not thread safe. ok with that for speed + */ + class OpCounters { + public: + + OpCounters(); + + int * getInsert(){ return _insert; } + int * getQuery(){ return _query; } + int * getUpdate(){ return _update; } + int * getDelete(){ return _delete; } + int * getGetMore(){ return _getmore; } + int * getCommand(){ return _command; } + + void gotInsert(){ _insert[0]++; } + void gotQuery(){ _query[0]++; } + void gotUpdate(){ _update[0]++; } + void gotDelete(){ _delete[0]++; } + void gotGetMore(){ _getmore[0]++; } + void gotCommand(){ _command[0]++; } + + void gotOp( int op , bool isCommand ); + + BSONObj& getObj(){ return _obj; } + private: + BSONObj _obj; + int * _insert; + int * _query; + int * _update; + int * _delete; + int * _getmore; + int * _command; + }; + + extern OpCounters globalOpCounters; + + class IndexCounters { + public: + IndexCounters(); + + void btree( char * node ){ + if ( ! 
_memSupported ) + return; + if ( _sampling++ % _samplingrate ) + return; + btree( _pi.blockInMemory( node ) ); + } + + void btree( bool memHit ){ + if ( memHit ) + _btreeMemHits++; + else + _btreeMemMisses++; + _btreeAccesses++; + } + void btreeHit(){ _btreeMemHits++; _btreeAccesses++; } + void btreeMiss(){ _btreeMemMisses++; _btreeAccesses++; } + + void append( BSONObjBuilder& b ); + + private: + ProcessInfo _pi; + bool _memSupported; + + int _sampling; + int _samplingrate; + + int _resets; + long long _maxAllowed; + + long long _btreeMemMisses; + long long _btreeMemHits; + long long _btreeAccesses; + }; + + extern IndexCounters globalIndexCounters; + + class FlushCounters { + public: + FlushCounters(); + + void flushed(int ms); + + void append( BSONObjBuilder& b ); + + private: + long long _total_time; + long long _flushes; + int _last_time; + Date_t _last; + }; + + extern FlushCounters globalFlushCounters; +} diff --git a/db/stats/snapshots.cpp b/db/stats/snapshots.cpp new file mode 100644 index 0000000..71ddd72 --- /dev/null +++ b/db/stats/snapshots.cpp @@ -0,0 +1,144 @@ +// snapshots.cpp + +/** +* Copyright (C) 2008 10gen Inc. +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +#include "stdafx.h" +#include "snapshots.h" +#include "../client.h" +#include "../clientcursor.h" + +/** + handles snapshotting performance metrics and other such things + */ +namespace mongo { + void SnapshotData::takeSnapshot(){ + _created = curTimeMicros64(); + _globalUsage = Top::global.getGlobalData(); + _totalWriteLockedTime = dbMutex.info().getTimeLocked(); + Top::global.cloneMap(_usage); + } + + SnapshotDelta::SnapshotDelta( const SnapshotData& older , const SnapshotData& newer ) + : _older( older ) , _newer( newer ) + { + assert( _newer._created > _older._created ); + _elapsed = _newer._created - _older._created; + + } + + Top::CollectionData SnapshotDelta::globalUsageDiff(){ + return Top::CollectionData( _older._globalUsage , _newer._globalUsage ); + } + Top::UsageMap SnapshotDelta::collectionUsageDiff(){ + Top::UsageMap u; + + for ( Top::UsageMap::const_iterator i=_newer._usage.begin(); i != _newer._usage.end(); i++ ){ + Top::UsageMap::const_iterator j = _older._usage.find(i->first); + if (j != _older._usage.end()) + u[i->first] = Top::CollectionData( j->second , i->second ); + } + return u; + } + + Snapshots::Snapshots(int n) + : _n(n) + , _snapshots(new SnapshotData[n]) + , _loc(0) + , _stored(0) + {} + + const SnapshotData* Snapshots::takeSnapshot(){ + scoped_lock lk(_lock); + _loc = ( _loc + 1 ) % _n; + _snapshots[_loc].takeSnapshot(); + if ( _stored < _n ) + _stored++; + return &_snapshots[_loc]; + } + + auto_ptr<SnapshotDelta> Snapshots::computeDelta( int numBack ){ + scoped_lock lk(_lock); + auto_ptr<SnapshotDelta> p; + if ( numBack < numDeltas() ) + p.reset( new SnapshotDelta( getPrev(numBack+1) , getPrev(numBack) ) ); + return p; + } + + const SnapshotData& Snapshots::getPrev( int numBack ){ + int x = _loc - numBack; + if ( x < 0 ) + x += _n; 
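+ // e.g. (illustrative): with _n == 100, _loc == 2, numBack == 5, x == -3 wraps to 97, the slot filled five snapshots earlier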
+ return _snapshots[x]; + } + + void Snapshots::outputLockInfoHTML( stringstream& ss ){ + scoped_lock lk(_lock); + ss << "\n<table>"; + ss << "<tr><th>elapsed(ms)</th><th>% write locked</th></tr>\n"; + + for ( int i=0; i<numDeltas(); i++ ){ + SnapshotDelta d( getPrev(i+1) , getPrev(i) ); + ss << "<tr>" + << "<td>" << ( d.elapsed() / 1000 ) << "</td>" + << "<td>" << (unsigned)(100*d.percentWriteLocked()) << "%</td>" + << "</tr>" + ; + } + + ss << "</table>\n"; + } + + void SnapshotThread::run(){ + Client::initThread("snapshotthread"); + Client& client = cc(); + + long long numLoops = 0; + + const SnapshotData* prev = 0; + + while ( ! inShutdown() ){ + try { + const SnapshotData* s = statsSnapshots.takeSnapshot(); + + if ( prev ){ + unsigned long long elapsed = s->_created - prev->_created; + + if ( cmdLine.cpu ){ + SnapshotDelta d( *prev , *s ); + log() << "cpu: elapsed:" << (elapsed/1000) <<" writelock: " << (int)(100*d.percentWriteLocked()) << "%" << endl; + } + + // TODO: this should really be somewhere else, like in a special ClientCursor thread + ClientCursor::idleTimeReport( (unsigned)(elapsed/1000) ); + } + + prev = s; + } + catch ( std::exception& e ){ + log() << "ERROR in SnapshotThread: " << e.what() << endl; + } + + numLoops++; + sleepsecs(4); + } + + client.shutdown(); + } + + Snapshots statsSnapshots; + SnapshotThread snapshotThread; +} diff --git a/db/stats/snapshots.h b/db/stats/snapshots.h new file mode 100644 index 0000000..542318a --- /dev/null +++ b/db/stats/snapshots.h @@ -0,0 +1,113 @@ +// snapshots.h + +/** +* Copyright (C) 2008 10gen Inc. +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +#pragma once +#include "../../stdafx.h" +#include "../jsobj.h" +#include "top.h" +#include "../../util/background.h" + +/** + handles snapshotting performance metrics and other such things + */ +namespace mongo { + + class SnapshotThread; + + /** + * stores a point in time snapshot + * i.e. 
all counters at a given time + */ + class SnapshotData { + void takeSnapshot(); + + unsigned long long _created; + Top::CollectionData _globalUsage; + unsigned long long _totalWriteLockedTime; // micros of total time locked + Top::UsageMap _usage; + + friend class SnapshotThread; + friend class SnapshotDelta; + friend class Snapshots; + }; + + /** + * contains performance information for a time period + */ + class SnapshotDelta { + public: + SnapshotDelta( const SnapshotData& older , const SnapshotData& newer ); + + unsigned long long start() const { + return _older._created; + } + + unsigned long long elapsed() const { + return _elapsed; + } + + unsigned long long timeInWriteLock() const { + return _newer._totalWriteLockedTime - _older._totalWriteLockedTime; + } + double percentWriteLocked() const { + double e = (double) elapsed(); + double w = (double) timeInWriteLock(); + return w/e; + } + + Top::CollectionData globalUsageDiff(); + Top::UsageMap collectionUsageDiff(); + + private: + const SnapshotData& _older; + const SnapshotData& _newer; + + unsigned long long _elapsed; + }; + + class Snapshots { + public: + Snapshots(int n=100); + + const SnapshotData* takeSnapshot(); + + int numDeltas() const { return _stored-1; } + + const SnapshotData& getPrev( int numBack = 0 ); + auto_ptr<SnapshotDelta> computeDelta( int numBack = 0 ); + + + void outputLockInfoHTML( stringstream& ss ); + private: + mongo::mutex _lock; + int _n; + boost::scoped_array<SnapshotData> _snapshots; + int _loc; + int _stored; + }; + + class SnapshotThread : public BackgroundJob { + public: + void run(); + }; + + extern Snapshots statsSnapshots; + extern SnapshotThread snapshotThread; + + +} diff --git a/db/stats/top.cpp b/db/stats/top.cpp new file mode 100644 index 0000000..0f27943 --- /dev/null +++ b/db/stats/top.cpp @@ -0,0 +1,181 @@ +// top.cpp +/* + * Copyright (C) 2010 10gen Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License, version 3, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ */ + + +#include "stdafx.h" +#include "top.h" +#include "../../util/message.h" +#include "../commands.h" + +namespace mongo { + + Top::UsageData::UsageData( const UsageData& older , const UsageData& newer ) + : time(newer.time-older.time) , + count(newer.count-older.count) + { + + } + + Top::CollectionData::CollectionData( const CollectionData& older , const CollectionData& newer ) + : total( older.total , newer.total ) , + readLock( older.readLock , newer.readLock ) , + writeLock( older.writeLock , newer.writeLock ) , + queries( older.queries , newer.queries ) , + getmore( older.getmore , newer.getmore ) , + insert( older.insert , newer.insert ) , + update( older.update , newer.update ) , + remove( older.remove , newer.remove ), + commands( older.commands , newer.commands ) + { + + } + + + void Top::record( const string& ns , int op , int lockType , long long micros , bool command ){ + //cout << "record: " << ns << "\t" << op << "\t" << command << endl; + scoped_lock lk(_lock); + + if ( ( command || op == dbQuery ) && ns == _lastDropped ){ + _lastDropped = ""; + return; + } + + CollectionData& coll = _usage[ns]; + _record( coll , op , lockType , micros , command ); + _record( _global , op , lockType , micros , command ); + } + + void Top::collectionDropped( const string& ns ){ + //cout << "collectionDropped: " << ns << endl; + scoped_lock lk(_lock); + _usage.erase(ns); + _lastDropped = ns; + } + + void Top::_record( CollectionData& c , int op , int lockType , long long micros , bool command ){ + c.total.inc( micros ); + + if ( lockType > 0 ) + c.writeLock.inc( micros ); + else if ( lockType < 0 ) + c.readLock.inc( micros ); + + switch ( op ){ + case 0: + // use 0 for unknown, non-specific + break; + case dbUpdate: + c.update.inc( micros ); + break; + case dbInsert: + c.insert.inc( micros ); + break; + case dbQuery: + if ( command ) + c.commands.inc( micros ); + else + c.queries.inc( micros ); + break; + case dbGetMore: + c.getmore.inc( micros ); + break; + case dbDelete: + c.remove.inc( micros ); + break; + case opReply: + case dbMsg: + case dbKillCursors: + log() << "unexpected op in Top::record: " << op << endl; + break; + default: + log() << "unknown op in Top::record: " << op << endl; + } + + } + + void Top::cloneMap(Top::UsageMap& out){ + scoped_lock lk(_lock); + out = _usage; + } + + void Top::append( BSONObjBuilder& b ){ + scoped_lock lk( _lock ); + append( b , _usage ); + } + + void Top::append( BSONObjBuilder& b , const char * name , const UsageData& map ){ + BSONObjBuilder bb( b.subobjStart( name ) ); + bb.appendNumber( "time" , map.time ); + bb.appendNumber( "count" , map.count ); + bb.done(); + } + + void Top::append( BSONObjBuilder& b , const UsageMap& map ){ + for ( UsageMap::const_iterator i=map.begin(); i!=map.end(); i++ ){ + BSONObjBuilder bb( b.subobjStart( i->first.c_str() ) ); + + const CollectionData& coll = i->second; + + append( b , "total" , coll.total ); + + append( b , "readLock" , coll.readLock ); + append( b , "writeLock" , coll.writeLock ); + + append( b , "queries" , coll.queries ); + append( b , "getmore" , coll.getmore ); + append( b , "insert" , coll.insert ); + append( b , "update" , coll.update ); + append( b , "remove" , coll.remove ); + append( b , "commands" , coll.commands ); + + bb.done(); + } + } + + class TopCmd : public Command { + public: + TopCmd() : Command( "top" ){} + + virtual bool slaveOk(){ return true; } + virtual bool adminOnly(){ return true; } + virtual LockType locktype(){ return READ; } + virtual void help( stringstream& help ) 
const { help << "usage by collection"; } + + virtual bool run(const char *ns, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl){ + { + BSONObjBuilder b( result.subobjStart( "totals" ) ); + Top::global.append( b ); + b.done(); + } + return true; + } + + } topCmd; + + Top Top::global; + + TopOld::T TopOld::_snapshotStart = TopOld::currentTime(); + TopOld::D TopOld::_snapshotDuration; + TopOld::UsageMap TopOld::_totalUsage; + TopOld::UsageMap TopOld::_snapshotA; + TopOld::UsageMap TopOld::_snapshotB; + TopOld::UsageMap &TopOld::_snapshot = TopOld::_snapshotA; + TopOld::UsageMap &TopOld::_nextSnapshot = TopOld::_snapshotB; + mongo::mutex TopOld::topMutex; + + +} diff --git a/db/stats/top.h b/db/stats/top.h new file mode 100644 index 0000000..8dab3b0 --- /dev/null +++ b/db/stats/top.h @@ -0,0 +1,248 @@ +// top.h : DB usage monitor. + +/* Copyright 2009 10gen Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include <boost/date_time/posix_time/posix_time.hpp> +#undef assert +#define assert xassert + +namespace mongo { + + /** + * tracks usage by collection + */ + class Top { + + public: + class UsageData { + public: + UsageData() : time(0) , count(0){} + UsageData( const UsageData& older , const UsageData& newer ); + long long time; + long long count; + + void inc( long long micros ){ + count++; + time += micros; + } + }; + + class CollectionData { + public: + /** + * constructs a diff + */ + CollectionData(){} + CollectionData( const CollectionData& older , const CollectionData& newer ); + + UsageData total; + + UsageData readLock; + UsageData writeLock; + + UsageData queries; + UsageData getmore; + UsageData insert; + UsageData update; + UsageData remove; + UsageData commands; + }; + + typedef map<string,CollectionData> UsageMap; + + public: + void record( const string& ns , int op , int lockType , long long micros , bool command ); + void append( BSONObjBuilder& b ); + void cloneMap(UsageMap& out); + CollectionData getGlobalData(){ return _global; } + void collectionDropped( const string& ns ); + + public: // static stuff + static Top global; + + void append( BSONObjBuilder& b , const char * name , const UsageData& map ); + void append( BSONObjBuilder& b , const UsageMap& map ); + + private: + + void _record( CollectionData& c , int op , int lockType , long long micros , bool command ); + + mongo::mutex _lock; + CollectionData _global; + UsageMap _usage; + string _lastDropped; + }; + + /* Records per namespace utilization of the mongod process. + No two functions of this class may be called concurrently. + */ + class TopOld { + typedef boost::posix_time::ptime T; + typedef boost::posix_time::time_duration D; + typedef boost::tuple< D, int, int, int > UsageData; + public: + TopOld() : _read(false), _write(false) { } + + /* these are used to record activity: */ + + void clientStart( const char *client ) { + clientStop(); + _currentStart = currentTime(); + _current = client; + } + + /* indicate current request is a read operation. 
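setRead()/setWrite() mark the current request; clientStop() then folds them into the per-namespace counts that usage() reports as reads and writes.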
*/ + void setRead() { _read = true; } + + void setWrite() { _write = true; } + + void clientStop() { + if ( _currentStart == T() ) + return; + D d = currentTime() - _currentStart; + + { + scoped_lock L(topMutex); + recordUsage( _current, d ); + } + + _currentStart = T(); + _read = false; + _write = false; + } + + /* these are used to fetch the stats: */ + + struct Usage { + string ns; + D time; + double pct; + int reads, writes, calls; + }; + + static void usage( vector< Usage > &res ) { + scoped_lock L(topMutex); + + // Populate parent namespaces + UsageMap snapshot; + UsageMap totalUsage; + fillParentNamespaces( snapshot, _snapshot ); + fillParentNamespaces( totalUsage, _totalUsage ); + + multimap< D, string, more > sorted; + for( UsageMap::iterator i = snapshot.begin(); i != snapshot.end(); ++i ) + sorted.insert( make_pair( i->second.get<0>(), i->first ) ); + for( multimap< D, string, more >::iterator i = sorted.begin(); i != sorted.end(); ++i ) { + if ( trivialNs( i->second.c_str() ) ) + continue; + Usage u; + u.ns = i->second; + u.time = totalUsage[ u.ns ].get<0>(); + u.pct = _snapshotDuration != D() ? 100.0 * i->first.ticks() / _snapshotDuration.ticks() : 0; + u.reads = snapshot[ u.ns ].get<1>(); + u.writes = snapshot[ u.ns ].get<2>(); + u.calls = snapshot[ u.ns ].get<3>(); + res.push_back( u ); + } + for( UsageMap::iterator i = totalUsage.begin(); i != totalUsage.end(); ++i ) { + if ( snapshot.count( i->first ) != 0 || trivialNs( i->first.c_str() ) ) + continue; + Usage u; + u.ns = i->first; + u.time = i->second.get<0>(); + u.pct = 0; + u.reads = 0; + u.writes = 0; + u.calls = 0; + res.push_back( u ); + } + } + + static void completeSnapshot() { + scoped_lock L(topMutex); + + if ( &_snapshot == &_snapshotA ) { + _snapshot = _snapshotB; + _nextSnapshot = _snapshotA; + } else { + _snapshot = _snapshotA; + _nextSnapshot = _snapshotB; + } + _snapshotDuration = currentTime() - _snapshotStart; + _snapshotStart = currentTime(); + _nextSnapshot.clear(); + } + + private: + static mongo::mutex topMutex; + static bool trivialNs( const char *ns ) { + const char *ret = strrchr( ns, '.' ); + return ret && ret[ 1 ] == '\0'; + } + typedef map<string,UsageData> UsageMap; // duration, # reads, # writes, # total calls + static T currentTime() { + return boost::posix_time::microsec_clock::universal_time(); + } + void recordUsage( const string &client, D duration ) { + recordUsageForMap( _totalUsage, client, duration ); + recordUsageForMap( _nextSnapshot, client, duration ); + } + void recordUsageForMap( UsageMap &map, const string &client, D duration ) { + UsageData& g = map[client]; + g.get< 0 >() += duration; + if ( _read && !_write ) + g.get< 1 >()++; + else if ( !_read && _write ) + g.get< 2 >()++; + g.get< 3 >()++; + } + static void fillParentNamespaces( UsageMap &to, const UsageMap &from ) { + for( UsageMap::const_iterator i = from.begin(); i != from.end(); ++i ) { + string current = i->first; + size_t dot = current.rfind( "." ); + if ( dot == string::npos || dot != current.length() - 1 ) { + inc( to[ current ], i->second ); + } + while( dot != string::npos ) { + current = current.substr( 0, dot ); + inc( to[ current ], i->second ); + dot = current.rfind( "." 
); + } + } + } + static void inc( UsageData &to, const UsageData &from ) { + to.get<0>() += from.get<0>(); + to.get<1>() += from.get<1>(); + to.get<2>() += from.get<2>(); + to.get<3>() += from.get<3>(); + } + struct more { bool operator()( const D &a, const D &b ) { return a > b; } }; + string _current; + T _currentStart; + static T _snapshotStart; + static D _snapshotDuration; + static UsageMap _totalUsage; + static UsageMap _snapshotA; + static UsageMap _snapshotB; + static UsageMap &_snapshot; + static UsageMap &_nextSnapshot; + bool _read; + bool _write; + }; + +} // namespace mongo diff --git a/db/storage.cpp b/db/storage.cpp index 4da2d82..7ddfc65 100644 --- a/db/storage.cpp +++ b/db/storage.cpp @@ -1,4 +1,20 @@ // storage.cpp +/* + * Copyright (C) 2010 10gen Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License, version 3, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + #include "stdafx.h" #include "pdfile.h" diff --git a/db/update.cpp b/db/update.cpp index 0639a99..d6a5c5e 100644 --- a/db/update.cpp +++ b/db/update.cpp @@ -26,8 +26,11 @@ namespace mongo { + //#define DEBUGUPDATE(x) cout << x << endl; +#define DEBUGUPDATE(x) + const char* Mod::modNames[] = { "$inc", "$set", "$push", "$pushAll", "$pull", "$pullAll" , "$pop", "$unset" , - "$bitand" , "$bitor" , "$bit" }; + "$bitand" , "$bitor" , "$bit" , "$addToSet" }; unsigned Mod::modNamesNum = sizeof(Mod::modNames)/sizeof(char*); bool Mod::_pullElementMatch( BSONElement& toMatch ) const { @@ -46,13 +49,42 @@ namespace mongo { return matcher->matches( toMatch.embeddedObject() ); } - void Mod::apply( BSONObjBuilder& b , BSONElement in ){ + template< class Builder > + void Mod::appendIncremented( Builder& bb , const BSONElement& in, ModState& ms ) const { + BSONType a = in.type(); + BSONType b = elt.type(); + + if ( a == NumberDouble || b == NumberDouble ){ + ms.incType = NumberDouble; + ms.incdouble = elt.numberDouble() + in.numberDouble(); + } + else if ( a == NumberLong || b == NumberLong ){ + ms.incType = NumberLong; + ms.inclong = elt.numberLong() + in.numberLong(); + } + else { + ms.incType = NumberInt; + ms.incint = elt.numberInt() + in.numberInt(); + } + + ms.appendIncValue( bb ); + } + + template< class Builder > + void appendUnset( Builder &b ) { + } + + template<> + void appendUnset( BSONArrayBuilder &b ) { + b.appendNull(); + } + + template< class Builder > + void Mod::apply( Builder& b , BSONElement in , ModState& ms ) const { switch ( op ){ case INC: { - // TODO: this is horrible - inc( in ); - b.appendAs( elt , shortFieldName ); + appendIncremented( b , in , ms ); break; } @@ -63,10 +95,10 @@ namespace mongo { } case UNSET: { - //Explicit NOOP + appendUnset( b ); break; } - + case PUSH: { uassert( 10131 , "$push can only be applied to an array" , in.type() == Array ); BSONObjBuilder bb( b.subarrayStart( shortFieldName ) ); @@ -77,13 +109,60 @@ namespace mongo { n++; } - pushStartSize = n; + ms.pushStartSize = n; bb.appendAs( elt , bb.numStr( n ) ); bb.done(); break; } + case ADDTOSET: { + uassert( 12592 , 
"$addToSet can only be applied to an array" , in.type() == Array ); + BSONObjBuilder bb( b.subarrayStart( shortFieldName ) ); + + BSONObjIterator i( in.embeddedObject() ); + int n=0; + + if ( isEach() ){ + + BSONElementSet toadd; + parseEach( toadd ); + + while ( i.more() ){ + BSONElement cur = i.next(); + bb.append( cur ); + n++; + toadd.erase( cur ); + } + + for ( BSONElementSet::iterator j=toadd.begin(); j!=toadd.end(); j++ ){ + bb.appendAs( *j , BSONObjBuilder::numStr( n++ ) ); + } + + } + else { + + bool found = false; + + while ( i.more() ){ + BSONElement cur = i.next(); + bb.append( cur ); + n++; + if ( elt.woCompare( cur , false ) == 0 ) + found = true; + } + + if ( ! found ) + bb.appendAs( elt , bb.numStr( n ) ); + + } + + bb.done(); + break; + } + + + case PUSH_ALL: { uassert( 10132 , "$pushAll can only be applied to an array" , in.type() == Array ); uassert( 10133 , "$pushAll has to be passed an array" , elt.type() ); @@ -97,7 +176,7 @@ namespace mongo { n++; } - pushStartSize = n; + ms.pushStartSize = n; i = BSONObjIterator( elt.embeddedObject() ); while ( i.more() ){ @@ -172,8 +251,8 @@ namespace mongo { } } - pushStartSize = n; - assert( pushStartSize == in.embeddedObject().nFields() ); + ms.pushStartSize = n; + assert( ms.pushStartSize == in.embeddedObject().nFields() ); bb.done(); break; } @@ -226,97 +305,130 @@ namespace mongo { } } - bool ModSet::canApplyInPlaceAndVerify(const BSONObj &obj) const { - bool inPlacePossible = true; + auto_ptr<ModSetState> ModSet::prepare(const BSONObj &obj) const { + ModSetState * mss = new ModSetState( obj ); // Perform this check first, so that we don't leave a partially modified object on uassert. for ( ModHolder::const_iterator i = _mods.begin(); i != _mods.end(); ++i ) { + ModState& ms = mss->_mods[i->first]; + const Mod& m = i->second; BSONElement e = obj.getFieldDotted(m.fieldName); - + + ms.m = &m; + ms.old = e; + if ( e.eoo() ) { - inPlacePossible = (m.op == Mod::UNSET); + mss->amIInPlacePossible( m.op == Mod::UNSET ); + continue; } - else { - switch( m.op ) { - case Mod::INC: - uassert( 10140 , "Cannot apply $inc modifier to non-number", e.isNumber() || e.eoo() ); - if ( !e.isNumber() ) - inPlacePossible = false; - break; - case Mod::SET: - inPlacePossible = - m.elt.type() == e.type() && - m.elt.valuesize() == e.valuesize(); - break; - case Mod::PUSH: - case Mod::PUSH_ALL: - uassert( 10141 , "Cannot apply $push/$pushAll modifier to non-array", e.type() == Array || e.eoo() ); - inPlacePossible = false; - break; - case Mod::PULL: - case Mod::PULL_ALL: { - uassert( 10142 , "Cannot apply $pull/$pullAll modifier to non-array", e.type() == Array || e.eoo() ); - BSONObjIterator i( e.embeddedObject() ); - while( inPlacePossible && i.more() ) { - BSONElement arrI = i.next(); - if ( m.op == Mod::PULL ) { - if ( m._pullElementMatch( arrI ) ) - inPlacePossible = false; - } - else if ( m.op == Mod::PULL_ALL ) { - BSONObjIterator j( m.elt.embeddedObject() ); - while( inPlacePossible && j.moreWithEOO() ) { - BSONElement arrJ = j.next(); - if ( arrJ.eoo() ) - break; - if ( arrI.woCompare( arrJ, false ) == 0 ) { - inPlacePossible = false; - } - } + + switch( m.op ) { + case Mod::INC: + uassert( 10140 , "Cannot apply $inc modifier to non-number", e.isNumber() || e.eoo() ); + if ( mss->amIInPlacePossible( e.isNumber() ) ){ + // check more typing info here + if ( m.elt.type() != e.type() ){ + // if i'm incrememnting with a double, then the storage has to be a double + mss->amIInPlacePossible( m.elt.type() != NumberDouble ); + } + } + break; + + case 
Mod::SET: + mss->amIInPlacePossible( m.elt.type() == e.type() && + m.elt.valuesize() == e.valuesize() ); + break; + + case Mod::PUSH: + case Mod::PUSH_ALL: + uassert( 10141 , "Cannot apply $push/$pushAll modifier to non-array", e.type() == Array || e.eoo() ); + mss->amIInPlacePossible( false ); + break; + + case Mod::PULL: + case Mod::PULL_ALL: { + uassert( 10142 , "Cannot apply $pull/$pullAll modifier to non-array", e.type() == Array || e.eoo() ); + BSONObjIterator i( e.embeddedObject() ); + while( mss->_inPlacePossible && i.more() ) { + BSONElement arrI = i.next(); + if ( m.op == Mod::PULL ) { + mss->amIInPlacePossible( ! m._pullElementMatch( arrI ) ); + } + else if ( m.op == Mod::PULL_ALL ) { + BSONObjIterator j( m.elt.embeddedObject() ); + while( mss->_inPlacePossible && j.moreWithEOO() ) { + BSONElement arrJ = j.next(); + if ( arrJ.eoo() ) + break; + mss->amIInPlacePossible( arrI.woCompare( arrJ, false ) ); } } - break; } - case Mod::POP: { - uassert( 10143 , "Cannot apply $pop modifier to non-array", e.type() == Array || e.eoo() ); - if ( ! e.embeddedObject().isEmpty() ) - inPlacePossible = false; - break; + break; + } + + case Mod::POP: { + uassert( 10143 , "Cannot apply $pop modifier to non-array", e.type() == Array || e.eoo() ); + mss->amIInPlacePossible( e.embeddedObject().isEmpty() ); + break; + } + + case Mod::ADDTOSET: { + uassert( 12591 , "Cannot apply $addToSet modifier to non-array", e.type() == Array || e.eoo() ); + + BSONObjIterator i( e.embeddedObject() ); + if ( m.isEach() ){ + BSONElementSet toadd; + m.parseEach( toadd ); + while( i.more() ) { + BSONElement arrI = i.next(); + toadd.erase( arrI ); + } + mss->amIInPlacePossible( toadd.size() == 0 ); } - default: - // mods we don't know about shouldn't be done in place - inPlacePossible = false; + else { + bool found = false; + while( i.more() ) { + BSONElement arrI = i.next(); + if ( arrI.woCompare( m.elt , false ) == 0 ){ + found = true; + break; + } + } + mss->amIInPlacePossible( found ); } + break; + } + + default: + // mods we don't know about shouldn't be done in place + mss->amIInPlacePossible( false ); } } - return inPlacePossible; + return auto_ptr<ModSetState>( mss ); } - void ModSet::applyModsInPlace(const BSONObj &obj) const { - for ( ModHolder::const_iterator i = _mods.begin(); i != _mods.end(); ++i ) { - const Mod& m = i->second; - BSONElement e = obj.getFieldDotted(m.fieldName); + void ModSetState::applyModsInPlace() { + for ( ModStateHolder::iterator i = _mods.begin(); i != _mods.end(); ++i ) { + ModState& m = i->second; - switch ( m.op ){ + switch ( m.m->op ){ case Mod::UNSET: case Mod::PULL: case Mod::PULL_ALL: + case Mod::ADDTOSET: + // this should have been handled by prepare break; // [dm] the BSONElementManipulator statements below are for replication (correct?) 
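// note (inferred, not an original comment): an in-place $inc is re-logged as a $set of the computed value via fixedName/fixed below, so the op log records an absolute value rather than a relative increment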
case Mod::INC: - m.inc(e); - m.setElementToOurNumericValue(e); + m.m->incrementMe( m.old ); + m.fixedName = "$set"; + m.fixed = &(m.old); break; case Mod::SET: - if ( e.isNumber() && m.elt.isNumber() ) { - // todo: handle NumberLong: - m.setElementToOurNumericValue(e); - } - else { - BSONElementManipulator( e ).replaceTypeAndValue( m.elt ); - } + BSONElementManipulator( m.old ).replaceTypeAndValue( m.m->elt ); break; default: uassert( 10144 , "can't apply mod in place - shouldn't have gotten here" , 0 ); @@ -342,18 +454,19 @@ namespace mongo { fields[ base + top.fieldName() ] = top; } - void ModSet::_appendNewFromMods( const string& root , Mod& m , BSONObjBuilder& b , set<string>& onedownseen ){ - const char * temp = m.fieldName; + template< class Builder > + void ModSetState::_appendNewFromMods( const string& root , ModState& m , Builder& b , set<string>& onedownseen ){ + const char * temp = m.fieldName(); temp += root.size(); const char * dot = strchr( temp , '.' ); if ( dot ){ - string nr( m.fieldName , 0 , 1 + ( dot - m.fieldName ) ); + string nr( m.fieldName() , 0 , 1 + ( dot - m.fieldName() ) ); string nf( temp , 0 , dot - temp ); if ( onedownseen.count( nf ) ) return; onedownseen.insert( nf ); BSONObjBuilder bb ( b.subobjStart( nf.c_str() ) ); - createNewFromMods( nr , bb , BSONObj() ); + createNewFromMods( nr , bb , BSONObj() ); // don't infer an array from name bb.done(); } else { @@ -362,29 +475,37 @@ namespace mongo { } - void ModSet::createNewFromMods( const string& root , BSONObjBuilder& b , const BSONObj &obj ){ + template< class Builder > + void ModSetState::createNewFromMods( const string& root , Builder& b , const BSONObj &obj ){ BSONObjIteratorSorted es( obj ); BSONElement e = es.next(); - ModHolder::iterator m = _mods.lower_bound( root ); - ModHolder::iterator mend = _mods.lower_bound( root + "{" ); + ModStateHolder::iterator m = _mods.lower_bound( root ); + ModStateHolder::iterator mend = _mods.lower_bound( root + '{' ); set<string> onedownseen; while ( e.type() && m != mend ){ string field = root + e.fieldName(); - FieldCompareResult cmp = compareDottedFieldNames( m->second.fieldName , field ); - + FieldCompareResult cmp = compareDottedFieldNames( m->second.m->fieldName , field ); + switch ( cmp ){ case LEFT_SUBFIELD: { // Mod is embedded under this element uassert( 10145 , "LEFT_SUBFIELD only supports Object" , e.type() == Object || e.type() == Array ); if ( onedownseen.count( e.fieldName() ) == 0 ){ onedownseen.insert( e.fieldName() ); - BSONObjBuilder bb ( e.type() == Object ?
b.subobjStart( e.fieldName() ) : b.subarrayStart( e.fieldName() ) ); - stringstream nr; nr << root << e.fieldName() << "."; - createNewFromMods( nr.str() , bb , e.embeddedObject() ); - bb.done(); + if ( e.type() == Object ) { + BSONObjBuilder bb( b.subobjStart( e.fieldName() ) ); + stringstream nr; nr << root << e.fieldName() << "."; + createNewFromMods( nr.str() , bb , e.embeddedObject() ); + bb.done(); + } else { + BSONArrayBuilder ba( b.subarrayStart( e.fieldName() ) ); + stringstream nr; nr << root << e.fieldName() << "."; + createNewFromMods( nr.str() , ba , e.embeddedObject() ); + ba.done(); + } // inc both as we handled both e = es.next(); m++; @@ -401,7 +522,7 @@ namespace mongo { m++; continue; case RIGHT_BEFORE: // field that doesn't have a MOD - b.append( e ); + b.append( e ); // if array, ignore field name e = es.next(); continue; case RIGHT_SUBFIELD: @@ -414,7 +535,7 @@ namespace mongo { // finished looping the mods, just adding the rest of the elements while ( e.type() ){ - b.append( e ); + b.append( e ); // if array, ignore field name e = es.next(); } @@ -424,9 +545,9 @@ namespace mongo { } } - BSONObj ModSet::createNewFromMods( const BSONObj &obj ) { - BSONObjBuilder b( (int)(obj.objsize() * 1.1) ); - createNewFromMods( "" , b , obj ); + BSONObj ModSetState::createNewFromMods() { + BSONObjBuilder b( (int)(_obj.objsize() * 1.1) ); + createNewFromMods( "" , b , _obj ); return b.obj(); } @@ -451,10 +572,12 @@ namespace mongo { newObj = bb.obj(); } - if ( canApplyInPlaceAndVerify( newObj ) ) - applyModsInPlace( newObj ); + auto_ptr<ModSetState> mss = prepare( newObj ); + + if ( mss->canApplyInPlace() ) + mss->applyModsInPlace(); else - newObj = createNewFromMods( newObj ); + newObj = mss->createNewFromMods(); return newObj; } @@ -468,17 +591,24 @@ namespace mongo { { $pullAll : { a:[99,1010] } } NOTE: MODIFIES source from object! */ - void ModSet::getMods(const BSONObj &from) { + ModSet::ModSet( + const BSONObj &from , + const set<string>& idxKeys, + const set<string> *backgroundKeys) + : _isIndexed(0) , _hasDynamicArray( false ) { + BSONObjIterator it(from); + while ( it.more() ) { BSONElement e = it.next(); const char *fn = e.fieldName(); + uassert( 10147 , "Invalid modifier specified" + string( fn ), e.type() == Object ); BSONObj j = e.embeddedObject(); + BSONObjIterator jt(j); Mod::Op op = opFromStr( fn ); - if ( op == Mod::INC ) - strcpy((char *) fn, "$set"); // rewrite for op log + while ( jt.more() ) { BSONElement f = jt.next(); // x:44 @@ -490,28 +620,46 @@ namespace mongo { uassert( 10151 , "have conflict mod" , ! 
haveConflictingMod( fieldName ) ); uassert( 10152 , "Modifier $inc allowed for numbers only", f.isNumber() || op != Mod::INC ); uassert( 10153 , "Modifier $pushAll/pullAll allowed for arrays only", f.type() == Array || ( op != Mod::PUSH_ALL && op != Mod::PULL_ALL ) ); - + + _hasDynamicArray = _hasDynamicArray || strstr( fieldName , ".$" ) > 0; + Mod m; m.init( op , f ); m.setFieldName( f.fieldName() ); - - // horrible - to be cleaned up - if ( f.type() == NumberDouble ) { - m.ndouble = (double *) f.value(); - m.nint = 0; - } else if ( f.type() == NumberInt ) { - m.ndouble = 0; - m.nint = (int *) f.value(); - } - else if( f.type() == NumberLong ) { - m.ndouble = 0; - m.nint = 0; - m.nlong = (long long *) f.value(); + + if ( m.isIndexed( idxKeys ) || + (backgroundKeys && m.isIndexed(*backgroundKeys)) ) { + _isIndexed++; } _mods[m.fieldName] = m; + + DEBUGUPDATE( "\t\t " << fieldName << "\t" << _hasDynamicArray ); + } + } + + } + + ModSet * ModSet::fixDynamicArray( const char * elemMatchKey ) const { + ModSet * n = new ModSet(); + n->_isIndexed = _isIndexed; + n->_hasDynamicArray = _hasDynamicArray; + for ( ModHolder::const_iterator i=_mods.begin(); i!=_mods.end(); i++ ){ + string s = i->first; + size_t idx = s.find( ".$" ); + if ( idx == string::npos ){ + n->_mods[s] = i->second; + continue; } + StringBuilder buf(s.size()+strlen(elemMatchKey)); + buf << s.substr(0,idx+1) << elemMatchKey << s.substr(idx+2); + string fixed = buf.str(); + DEBUGUPDATE( "fixed dynamic: " << s << " -->> " << fixed ); + n->_mods[fixed] = i->second; + ModHolder::iterator temp = n->_mods.find( fixed ); + temp->second.setFieldName( temp->first.c_str() ); } + return n; } void checkNoMods( BSONObj o ) { @@ -526,46 +674,58 @@ namespace mongo { class UpdateOp : public QueryOp { public: - UpdateOp() : nscanned_() {} + UpdateOp() : _nscanned() {} virtual void init() { BSONObj pattern = qp().query(); - c_.reset( qp().newCursor().release() ); - if ( !c_->ok() ) + _c.reset( qp().newCursor().release() ); + if ( ! _c->ok() ) setComplete(); else - matcher_.reset( new CoveredIndexMatcher( pattern, qp().indexKey() ) ); + _matcher.reset( new CoveredIndexMatcher( pattern, qp().indexKey() ) ); } virtual void next() { - if ( !c_->ok() ) { + if ( ! 
_c->ok() ) {
setComplete();
return;
}
- nscanned_++;
- if ( matcher_->matches(c_->currKey(), c_->currLoc()) ) {
+ _nscanned++;
+ if ( _matcher->matches(_c->currKey(), _c->currLoc(), &_details ) ) {
setComplete();
return;
}
- c_->advance();
+ _c->advance();
}
bool curMatches(){
- return matcher_->matches(c_->currKey(), c_->currLoc() );
+ return _matcher->matches(_c->currKey(), _c->currLoc() , &_details );
}
virtual bool mayRecordPlan() const { return false; }
virtual QueryOp *clone() const {
return new UpdateOp();
}
- shared_ptr< Cursor > c() { return c_; }
- long long nscanned() const { return nscanned_; }
+ shared_ptr< Cursor > c() { return _c; }
+ long long nscanned() const { return _nscanned; }
+ MatchDetails& getMatchDetails(){ return _details; }
private:
- shared_ptr< Cursor > c_;
- long long nscanned_;
- auto_ptr< CoveredIndexMatcher > matcher_;
+ shared_ptr< Cursor > _c;
+ long long _nscanned;
+ auto_ptr< CoveredIndexMatcher > _matcher;
+ MatchDetails _details;
};
- UpdateResult updateObjects(const char *ns, BSONObj updateobjOrig, BSONObj patternOrig, bool upsert, bool multi, bool logop , OpDebug& debug ) {
+ UpdateResult updateObjects(const char *ns, const BSONObj& updateobj, BSONObj patternOrig, bool upsert, bool multi, bool logop , OpDebug& debug ) {
+ DEBUGUPDATE( "update: " << ns << " update: " << updateobj << " query: " << patternOrig << " upsert: " << upsert << " multi: " << multi );
int profile = cc().database()->profile;
StringBuilder& ss = debug.str;
+
+ if ( logLevel > 2 )
+ ss << " update: " << updateobj;
+
+ /* idea with these here is to make them loop invariant for multi updates, and thus be a bit faster for that case */
+ /* NOTE: when yield() is added herein, these must be refreshed after each call to yield! */
+ NamespaceDetails *d = nsdetails(ns); // can be null if an upsert...
+ NamespaceDetailsTransient *nsdt = &NamespaceDetailsTransient::get_w(ns);
+ /* end note */
uassert( 10155 , "cannot update reserved $ collection", strchr(ns, '$') == 0 );
if ( strstr(ns, ".system.") ) {
@@ -573,6 +733,21 @@ namespace mongo {
uassert( 10156 , "cannot update system collection", legalClientSystemNS( ns , true ) );
}
+ auto_ptr<ModSet> mods;
+ bool isOperatorUpdate = updateobj.firstElement().fieldName()[0] == '$';
+ int modsIsIndexed = false; // really the # of indexes
+ if ( isOperatorUpdate ){
+ if( d && d->backgroundIndexBuildInProgress ) {
+ set<string> bgKeys;
+ d->backgroundIdx().keyPattern().getFieldNames(bgKeys);
+ mods.reset( new ModSet(updateobj, nsdt->indexKeys(), &bgKeys) );
+ }
+ else {
+ mods.reset( new ModSet(updateobj, nsdt->indexKeys()) );
+ }
+ modsIsIndexed = mods->isIndexed();
+ }
+
set<DiskLoc> seenObjects;
QueryPlanSet qps( ns, patternOrig, BSONObj() );
@@ -593,11 +768,10 @@ namespace mongo {
c->advance();
continue;
}
-
+
BSONObj js(r);
BSONObj pattern = patternOrig;
- BSONObj updateobj = updateobjOrig;
if ( logop ) {
BSONObjBuilder idPattern;
@@ -620,43 +794,46 @@
/* look for $inc etc. note as listed here,
all fields to inc must be this type, you can't set some regular ones at the moment.
*/
-
- const char *firstField = updateobj.firstElement().fieldName();
-
- if ( firstField[0] == '$' ) {
-
+ if ( isOperatorUpdate ) {
+
if ( multi ){
c->advance(); // go to next record in case this one moves
if ( seenObjects.count( loc ) )
continue;
- updateobj = updateobj.copy();
}
- ModSet mods;
- mods.getMods(updateobj);
- NamespaceDetailsTransient& ndt = NamespaceDetailsTransient::get_w(ns);
- set<string>& idxKeys = ndt.indexKeys();
- int isIndexed = mods.isIndexed( idxKeys );
-
- if ( isIndexed && multi ){
+ if ( modsIsIndexed && multi ){
c->noteLocation();
}
- if ( isIndexed <= 0 && mods.canApplyInPlaceAndVerify( loc.obj() ) ) {
- mods.applyModsInPlace( loc.obj() );
- //seenObjects.insert( loc );
+ const BSONObj& onDisk = loc.obj();
+
+ ModSet * useMods = mods.get();
+
+ auto_ptr<ModSet> mymodset;
+ if ( u->getMatchDetails().elemMatchKey && mods->hasDynamicArray() ){
+ useMods = mods->fixDynamicArray( u->getMatchDetails().elemMatchKey );
+ mymodset.reset( useMods );
+ }
+
+
+ auto_ptr<ModSetState> mss = useMods->prepare( onDisk );
+
+ if ( modsIsIndexed <= 0 && mss->canApplyInPlace() ){
+ mss->applyModsInPlace();// const_cast<BSONObj&>(onDisk) );
+
if ( profile )
ss << " fastmod ";
- if ( isIndexed ){
+ if ( modsIsIndexed ){
seenObjects.insert( loc );
}
}
else {
- BSONObj newObj = mods.createNewFromMods( loc.obj() );
- uassert( 12522 , "$ operator made objcet too large" , newObj.isValid() );
- DiskLoc newLoc = theDataFileMgr.update(ns, r, loc , newObj.objdata(), newObj.objsize(), debug);
- if ( newLoc != loc || isIndexed ){
+ BSONObj newObj = mss->createNewFromMods();
+ uassert( 12522 , "$ operator made object too large" , newObj.objsize() <= ( 4 * 1024 * 1024 ) );
+ DiskLoc newLoc = theDataFileMgr.updateRecord(ns, d, nsdt, r, loc , newObj.objdata(), newObj.objsize(), debug);
+ if ( newLoc != loc || modsIsIndexed ) {
// object moved, need to make sure we don't get it again
seenObjects.insert( newLoc );
}
@@ -664,25 +841,27 @@ namespace mongo {
}
if ( logop ) {
-
- assert( mods.size() );
+ DEV assert( mods->size() );
- if ( mods.haveArrayDepMod() ) {
+ if ( mss->haveArrayDepMod() ) {
BSONObjBuilder patternBuilder;
patternBuilder.appendElements( pattern );
- mods.appendSizeSpecForArrayDepMods( patternBuilder );
+ mss->appendSizeSpecForArrayDepMods( patternBuilder );
pattern = patternBuilder.obj();
}
- if ( mods.needOpLogRewrite() )
- updateobj = mods.getOpLogRewrite();
-
- logOp("u", ns, updateobj, &pattern );
+ if ( mss->needOpLogRewrite() ){
+ DEBUGUPDATE( "\t rewrite update: " << mss->getOpLogRewrite() );
+ logOp("u", ns, mss->getOpLogRewrite() , &pattern );
+ }
+ else {
+ logOp("u", ns, updateobj, &pattern );
+ }
}
numModded++;
if ( ! multi )
break;
- if ( multi && isIndexed )
+ if ( multi && modsIsIndexed )
c->checkLocation();
continue;
}
@@ -691,7 +870,7 @@ namespace mongo {
BSONElementManipulator::lookForTimestamps( updateobj );
checkNoMods( updateobj );
- theDataFileMgr.update(ns, r, loc , updateobj.objdata(), updateobj.objsize(), debug);
+ theDataFileMgr.updateRecord(ns, d, nsdt, r, loc , updateobj.objdata(), updateobj.objsize(), debug);
if ( logop )
logOp("u", ns, updateobj, &pattern );
return UpdateResult( 1 , 0 , 1 );
@@ -705,13 +884,9 @@ namespace mongo {
ss << " nscanned:" << u->nscanned();
if ( upsert ) {
- if ( updateobjOrig.firstElement().fieldName()[0] == '$' ) {
+ if ( updateobj.firstElement().fieldName()[0] == '$' ) {
/* upsert of an $inc. build a default */
- ModSet mods;
- mods.getMods(updateobjOrig);
-
- BSONObj newObj = mods.createNewFromQuery( patternOrig );
-
+ BSONObj newObj = mods->createNewFromQuery( patternOrig );
if ( profile )
ss << " fastmodinsert ";
theDataFileMgr.insert(ns, newObj);
@@ -722,12 +897,13 @@ namespace mongo {
return UpdateResult( 0 , 1 , 1 );
}
uassert( 10159 , "multi update only works with $ operators" , ! multi );
- checkNoMods( updateobjOrig );
+ checkNoMods( updateobj );
if ( profile )
ss << " upsert ";
- theDataFileMgr.insert(ns, updateobjOrig);
+ BSONObj no = updateobj;
+ theDataFileMgr.insert(ns, no);
if ( logop )
- logOp( "i", ns, updateobjOrig );
+ logOp( "i", ns, no );
return UpdateResult( 0 , 0 , 1 );
}
return UpdateResult( 0 , 0 , 0 );
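
[Editor's sketch, not part of the patch] The update.cpp changes above split update application into two phases: a ModSet is parsed once per update statement and stays const, and ModSet::prepare() then builds a per-document ModSetState that chooses between the in-place fast path and a full object rewrite. A minimal sketch of the calling pattern, using the names from this patch (the standalone function and its arguments are illustrative only):

    // sketch only: assumes db/update.h is included and that updateobj,
    // onDisk and idxKeys come from the surrounding update path
    void applyOneUpdate( const BSONObj& updateobj , const BSONObj& onDisk ,
                         const set<string>& idxKeys ) {
        ModSet mods( updateobj , idxKeys );                 // parsed once, reusable across documents
        auto_ptr<ModSetState> mss = mods.prepare( onDisk ); // per-document state
        if ( mods.isIndexed() <= 0 && mss->canApplyInPlace() ) {
            mss->applyModsInPlace();                        // patches onDisk's buffer directly
        }
        else {
            BSONObj newObj = mss->createNewFromMods();      // fresh object for updateRecord()
        }
    }

For a multi update this is the loop-invariant win: the parse and index analysis happen once, and only the cheap per-document state is rebuilt for each matched record.
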
diff --git a/db/update.h b/db/update.h
index 26a8a8d..e14b0fb 100644
--- a/db/update.h
+++ b/db/update.h
@@ -23,11 +23,17 @@
namespace mongo {
- /* Used for modifiers such as $inc, $set, $push, ... */
+ class ModState;
+ class ModSetState;
+
+ /* Used for modifiers such as $inc, $set, $push, ...
+ * stores the info about a single operation
+ * once created should never be modified
+ */
struct Mod {
// See opFromStr below
- // 0 1 2 3 4 5 6 7 8 9 10
- enum Op { INC, SET, PUSH, PUSH_ALL, PULL, PULL_ALL , POP, UNSET, BITAND, BITOR , BIT } op;
+ // 0 1 2 3 4 5 6 7 8 9 10 11
+ enum Op { INC, SET, PUSH, PUSH_ALL, PULL, PULL_ALL , POP, UNSET, BITAND, BITOR , BIT , ADDTOSET } op;
static const char* modNames[];
static unsigned modNamesNum;
@@ -35,13 +41,7 @@ namespace mongo {
const char *fieldName;
const char *shortFieldName;
- // kind of lame; fix one day?
- double *ndouble;
- int *nint;
- long long *nlong;
-
BSONElement elt; // x:5 note: this is the actual element from the updateobj
- int pushStartSize;
boost::shared_ptr<Matcher> matcher;
void init( Op o , BSONElement& e ){
@@ -59,36 +59,32 @@ namespace mongo {
else
shortFieldName = fieldName;
}
-
- /* [dm] why is this const? (or rather, why was setn const?) i see why but think maybe clearer if were not.
*/
- void inc(BSONElement& n) const {
- uassert( 10160 , "$inc value is not a number", n.isNumber() );
- if( ndouble )
- *ndouble += n.numberDouble();
- else if( nint )
- *nint += n.numberInt();
- else
- *nlong += n.numberLong();
- }
-
- void setElementToOurNumericValue(BSONElement& e) const {
- BSONElementManipulator manip(e);
- if( e.type() == NumberLong )
- manip.setLong(_getlong());
- else
- manip.setNumber(_getn());
- }
-
- double _getn() const {
- if( ndouble ) return *ndouble;
- if( nint ) return *nint;
- return (double) *nlong;
- }
- long long _getlong() const {
- if( nlong ) return *nlong;
- if( ndouble ) return (long long) *ndouble;
- return *nint;
+
+ /**
+ * @param in increments the actual value inside in
+ */
+ void incrementMe( BSONElement& in ) const {
+ BSONElementManipulator manip( in );
+
+ switch ( in.type() ){
+ case NumberDouble:
+ manip.setNumber( elt.numberDouble() + in.numberDouble() );
+ break;
+ case NumberLong:
+ manip.setLong( elt.numberLong() + in.numberLong() );
+ break;
+ case NumberInt:
+ manip.setInt( elt.numberInt() + in.numberInt() );
+ break;
+ default:
+ assert(0);
+ }
+ }
+
+ template< class Builder >
+ void appendIncremented( Builder& bb , const BSONElement& in, ModState& ms ) const;
+
bool operator<( const Mod &other ) const {
return strcmp( fieldName, other.fieldName ) < 0;
}
@@ -120,34 +116,15 @@ namespace mongo {
return false;
}
- void apply( BSONObjBuilder& b , BSONElement in );
+ template< class Builder >
+ void apply( Builder& b , BSONElement in , ModState& ms ) const;
/**
* @return true iff toMatch should be removed from the array
*/
bool _pullElementMatch( BSONElement& toMatch ) const;
- bool needOpLogRewrite() const {
- switch( op ){
- case BIT:
- case BITAND:
- case BITOR:
- // TODO: should we convert this to $set?
- return false; - default: - return false; - } - } - - void appendForOpLog( BSONObjBuilder& b ) const { - const char * name = modNames[op]; - - BSONObjBuilder bb( b.subobjStart( name ) ); - bb.append( elt ); - bb.done(); - } - - void _checkForAppending( BSONElement& e ){ + void _checkForAppending( const BSONElement& e ) const { if ( e.type() == Object ){ // this is a tiny bit slow, but rare and important // only when setting something TO an object, not setting something in an object @@ -157,12 +134,38 @@ namespace mongo { } } + bool isEach() const { + if ( elt.type() != Object ) + return false; + BSONElement e = elt.embeddedObject().firstElement(); + if ( e.type() != Array ) + return false; + return strcmp( e.fieldName() , "$each" ) == 0; + } + + BSONObj getEach() const { + return elt.embeddedObjectUserCheck().firstElement().embeddedObjectUserCheck(); + } + + void parseEach( BSONElementSet& s ) const { + BSONObjIterator i(getEach()); + while ( i.more() ){ + s.insert( i.next() ); + } + } + }; - class ModSet { + /** + * stores a set of Mods + * once created, should never be changed + */ + class ModSet : boost::noncopyable { typedef map<string,Mod> ModHolder; ModHolder _mods; - + int _isIndexed; + bool _hasDynamicArray; + static void extractFields( map< string, BSONElement > &fields, const BSONElement &top, const string &base ); FieldCompareResult compare( const ModHolder::iterator &m, map< string, BSONElement >::iterator &p, const map< string, BSONElement >::iterator &pEnd ) const { @@ -180,45 +183,6 @@ namespace mongo { return compareDottedFieldNames( m->first, p->first.c_str() ); } - - void _appendNewFromMods( const string& root , Mod& m , BSONObjBuilder& b , set<string>& onedownseen ); - - void appendNewFromMod( Mod& m , BSONObjBuilder& b ){ - switch ( m.op ){ - - case Mod::PUSH: { - BSONObjBuilder arr( b.subarrayStart( m.shortFieldName ) ); - arr.appendAs( m.elt, "0" ); - arr.done(); - m.pushStartSize = -1; - break; - } - - case Mod::PUSH_ALL: { - b.appendAs( m.elt, m.shortFieldName ); - m.pushStartSize = -1; - break; - } - - case Mod::UNSET: - case Mod::PULL: - case Mod::PULL_ALL: - // no-op b/c unset/pull of nothing does nothing - break; - - case Mod::INC: - case Mod::SET: { - m._checkForAppending( m.elt ); - b.appendAs( m.elt, m.shortFieldName ); - break; - } - default: - stringstream ss; - ss << "unknown mod in appendNewFromMod: " << m.op; - throw UserException( 9015, ss.str() ); - } - - } bool mayAddEmbedded( map< string, BSONElement > &existing, string right ) { for( string left = EmbeddedBuilder::splitDot( right ); @@ -279,39 +243,51 @@ namespace mongo { } break; } + case 'a': { + if ( fn[2] == 'd' && fn[3] == 'd' ){ + // add + if ( fn[4] == 'T' && fn[5] == 'o' && fn[6] == 'S' && fn[7] == 'e' && fn[8] == 't' && fn[9] == 0 ) + return Mod::ADDTOSET; + + } + } default: break; } uassert( 10161 , "Invalid modifier specified " + string( fn ), false ); return Mod::INC; } - public: + ModSet(){} - void getMods( const BSONObj &from ); - /** - will return if can be done in place, or uassert if there is an error - @return whether or not the mods can be done in place - */ - bool canApplyInPlaceAndVerify( const BSONObj &obj ) const; - void applyModsInPlace( const BSONObj &obj ) const; + public: + + ModSet( const BSONObj &from , + const set<string>& idxKeys = set<string>(), + const set<string>* backgroundKeys = 0 + ); - // new recursive version, will replace at some point - void createNewFromMods( const string& root , BSONObjBuilder& b , const BSONObj &obj ); + // TODO: this is inefficient - 
should probably just handle when iterating
+ ModSet * fixDynamicArray( const char * elemMatchKey ) const;
- BSONObj createNewFromMods( const BSONObj &obj );
+ bool hasDynamicArray() const { return _hasDynamicArray; }
+ /**
+ * creates a ModSetState suitable for operation on obj
+ * doesn't change or modify this ModSet or any underlying Mod
+ */
+ auto_ptr<ModSetState> prepare( const BSONObj& obj ) const;
+
+ /**
+ * given a query pattern, builds an object suitable for an upsert
+ * will take the query spec and combine all $ operators
+ */
BSONObj createNewFromQuery( const BSONObj& query );
/**
*
*/
- int isIndexed( const set<string>& idxKeys ) const {
- int numIndexes = 0;
- for ( ModHolder::const_iterator i = _mods.begin(); i != _mods.end(); i++ ){
- if ( i->second.isIndexed( idxKeys ) )
- numIndexes++;
- }
- return numIndexes;
+ int isIndexed() const {
+ return _isIndexed;
}
unsigned size() const { return _mods.size(); }
@@ -341,10 +317,190 @@ namespace mongo {
}
+ };
+
+ /**
+ * stores any information about a single Mod operating on a single Object
+ */
+ class ModState {
+ public:
+ const Mod * m;
+ BSONElement old;
+
+ const char * fixedName;
+ BSONElement * fixed;
+ int pushStartSize;
+
+ BSONType incType;
+ int incint;
+ double incdouble;
+ long long inclong;
+
+ ModState(){
+ fixedName = 0;
+ fixed = 0;
+ pushStartSize = -1;
+ incType = EOO;
+ }
+
+ Mod::Op op() const {
+ return m->op;
+ }
+
+ const char * fieldName() const {
+ return m->fieldName;
+ }
+
+ bool needOpLogRewrite() const {
+ if ( fixed || fixedName || incType )
+ return true;
+
+ switch( op() ){
+ case Mod::BIT:
+ case Mod::BITAND:
+ case Mod::BITOR:
+ // TODO: should we convert this to $set?
+ return false;
+ default:
+ return false;
+ }
+ }
+
+ void appendForOpLog( BSONObjBuilder& b ) const {
+ if ( incType ){
+ BSONObjBuilder bb( b.subobjStart( "$set" ) );
+ appendIncValue( bb );
+ bb.done();
+ return;
+ }
+
+ const char * name = fixedName ? fixedName : Mod::modNames[op()];
+
+ BSONObjBuilder bb( b.subobjStart( name ) );
+ if ( fixed )
+ bb.appendAs( *fixed , m->fieldName );
+ else
+ bb.append( m->elt );
+ bb.done();
+ }
+
+ template< class Builder >
+ void apply( Builder& b , BSONElement in ){
+ m->apply( b , in , *this );
+ }
+
+ template< class Builder >
+ void appendIncValue( Builder& b ) const {
+ switch ( incType ){
+ case NumberDouble:
+ b.append( m->shortFieldName , incdouble ); break;
+ case NumberLong:
+ b.append( m->shortFieldName , inclong ); break;
+ case NumberInt:
+ b.append( m->shortFieldName , incint ); break;
+ default:
+ assert(0);
+ }
+ }
+ };
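
[Editor's note, not part of the patch] ModState::appendForOpLog() above is what keeps the in-place $inc fast path replication-safe: once incType is recorded, the oplog entry carries the computed result as a $set rather than the original operator, so replaying it is idempotent on a secondary. For example (values invented for illustration):

    update sent by client:   { $inc : { n : 1 } }   applied to a document { n : 4 }
    entry written to oplog:  { $set : { n : 5 } }
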
+
+ /**
+ * this is used to hold state, meta data while applying a ModSet to a BSONObj
+ * the goal is to make ModSet const so it's re-usable
+ */
+ class ModSetState : boost::noncopyable {
+ struct FieldCmp {
+ bool operator()( const string &l, const string &r ) const {
+ return lexNumCmp( l.c_str(), r.c_str() ) < 0;
+ }
+ };
+ typedef map<string,ModState,FieldCmp> ModStateHolder;
+ const BSONObj& _obj;
+ ModStateHolder _mods;
+ bool _inPlacePossible;
+
+ ModSetState( const BSONObj& obj )
+ : _obj( obj ) , _inPlacePossible(true){
+ }
+
+ /**
+ * @return if in place is still possible
+ */
+ bool amIInPlacePossible( bool inPlacePossible ){
+ if ( ! inPlacePossible )
+ _inPlacePossible = false;
+ return _inPlacePossible;
+ }
+
+ template< class Builder >
+ void createNewFromMods( const string& root , Builder& b , const BSONObj &obj );
+
+ template< class Builder >
+ void _appendNewFromMods( const string& root , ModState& m , Builder& b , set<string>& onedownseen );
+
+ template< class Builder >
+ void appendNewFromMod( ModState& ms , Builder& b ){
+ //const Mod& m = *(ms.m); // HACK
+ Mod& m = *((Mod*)(ms.m)); // HACK
+
+ switch ( m.op ){
+
+ case Mod::PUSH:
+ case Mod::ADDTOSET: {
+ if ( m.isEach() ){
+ b.appendArray( m.shortFieldName , m.getEach() );
+ }
+ else {
+ BSONObjBuilder arr( b.subarrayStart( m.shortFieldName ) );
+ arr.appendAs( m.elt, "0" );
+ arr.done();
+ }
+ break;
+ }
+
+ case Mod::PUSH_ALL: {
+ b.appendAs( m.elt, m.shortFieldName );
+ break;
+ }
+
+ case Mod::UNSET:
+ case Mod::PULL:
+ case Mod::PULL_ALL:
+ // no-op b/c unset/pull of nothing does nothing
+ break;
+
+ case Mod::INC:
+ ms.fixedName = "$set";
+ case Mod::SET: {
+ m._checkForAppending( m.elt );
+ b.appendAs( m.elt, m.shortFieldName );
+ break;
+ }
+ default:
+ stringstream ss;
+ ss << "unknown mod in appendNewFromMod: " << m.op;
+ throw UserException( 9015, ss.str() );
+ }
+
+ }
+
+ public:
+
+ bool canApplyInPlace() const {
+ return _inPlacePossible;
+ }
+
+ /**
+ * modifies the underlying _obj
+ */
+ void applyModsInPlace();
+
+ BSONObj createNewFromMods();
+
// re-writing for oplog
bool needOpLogRewrite() const {
- for ( ModHolder::const_iterator i = _mods.begin(); i != _mods.end(); i++ )
+ for ( ModStateHolder::const_iterator i = _mods.begin(); i != _mods.end(); i++ )
if ( i->second.needOpLogRewrite() )
return true;
return false;
@@ -352,31 +508,33 @@ namespace mongo {
BSONObj getOpLogRewrite() const {
BSONObjBuilder b;
- for ( ModHolder::const_iterator i = _mods.begin(); i != _mods.end(); i++ )
+ for ( ModStateHolder::const_iterator i = _mods.begin(); i != _mods.end(); i++ )
i->second.appendForOpLog( b );
return b.obj();
}
bool haveArrayDepMod() const {
- for ( ModHolder::const_iterator i = _mods.begin(); i != _mods.end(); i++ )
- if ( i->second.arrayDep() )
+ for ( ModStateHolder::const_iterator i = _mods.begin(); i != _mods.end(); i++ )
+ if ( i->second.m->arrayDep() )
return true;
return false;
}
void appendSizeSpecForArrayDepMods( BSONObjBuilder &b ) const {
- for ( ModHolder::const_iterator i = _mods.begin(); i != _mods.end(); i++ ) {
- const Mod& m = i->second;
- if ( m.arrayDep() ){
+ for ( ModStateHolder::const_iterator i = _mods.begin(); i != _mods.end(); i++ ) {
+ const ModState& m = i->second;
+ if ( m.m->arrayDep() ){
if ( m.pushStartSize == -1 )
- b.appendNull( m.fieldName );
+ b.appendNull( m.fieldName() );
else
- b << m.fieldName << BSON( "$size" << m.pushStartSize );
+ b << m.fieldName() << BSON( "$size" << m.pushStartSize );
}
}
}
+
+
+ friend class ModSet;
};
-
}
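
[Editor's sketch, not part of the patch] The new ADDTOSET branch in appendNewFromMod() accepts either a plain value or the { $each : [...] } form that Mod::isEach() detects by peeking at the first embedded element. A minimal sketch of building the bulk form with the same builder pattern the patch itself uses (the field name "tags" is made up):

    // builds { $addToSet : { tags : { $each : [ "x" , "y" ] } } }
    BSONObjBuilder b;
    BSONObjBuilder mod( b.subobjStart( "$addToSet" ) );
    BSONObjBuilder field( mod.subobjStart( "tags" ) );
    BSONObjBuilder each( field.subarrayStart( "$each" ) );
    each.append( "0" , "x" ); // array elements are keyed "0", "1", ...
    each.append( "1" , "y" );
    each.done();
    field.done();
    mod.done();
    BSONObj update = b.obj(); // Mod::isEach() returns true for this shape

Mod::parseEach() then flattens the $each array into a BSONElementSet, presumably feeding the duplicate check against an existing array when the modifier is applied.
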