diff options
author | Antonin Kral <a.kral@bobek.cz> | 2011-09-14 17:08:06 +0200 |
---|---|---|
committer | Antonin Kral <a.kral@bobek.cz> | 2011-09-14 17:08:06 +0200 |
commit | 5d342a758c6095b4d30aba0750b54f13b8916f51 (patch) | |
tree | 762e9aa84781f5e3b96db2c02d356c29cf0217c0 /db/ops | |
parent | cbe2d992e9cd1ea66af9fa91df006106775d3073 (diff) | |
download | mongodb-5d342a758c6095b4d30aba0750b54f13b8916f51.tar.gz |
Imported Upstream version 2.0.0
Diffstat (limited to 'db/ops')
-rw-r--r-- | db/ops/delete.cpp | 242 | ||||
-rw-r--r-- | db/ops/delete.h | 33 | ||||
-rw-r--r-- | db/ops/query.cpp | 1020 | ||||
-rw-r--r-- | db/ops/query.h | 250 | ||||
-rw-r--r-- | db/ops/update.cpp | 1369 | ||||
-rw-r--r-- | db/ops/update.h | 681 |
6 files changed, 3595 insertions, 0 deletions
diff --git a/db/ops/delete.cpp b/db/ops/delete.cpp new file mode 100644 index 0000000..3009047 --- /dev/null +++ b/db/ops/delete.cpp @@ -0,0 +1,242 @@ +// delete.cpp + +/** +* Copyright (C) 2008 10gen Inc. +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +#include "pch.h" +#include "delete.h" +#include "../queryoptimizer.h" +#include "../oplog.h" + +namespace mongo { + + // Just try to identify best plan. + class DeleteOp : public MultiCursor::CursorOp { + public: + DeleteOp( bool justOne, int& bestCount, int orClauseIndex = -1 ) : + justOne_( justOne ), + count_(), + bestCount_( bestCount ), + _nscanned(), + _orClauseIndex( orClauseIndex ) { + } + virtual void _init() { + c_ = qp().newCursor(); + } + virtual bool prepareToYield() { + if ( _orClauseIndex > 0 ) { + return false; + } + if ( ! _cc ) { + _cc.reset( new ClientCursor( QueryOption_NoCursorTimeout , c_ , qp().ns() ) ); + } + return _cc->prepareToYield( _yieldData ); + } + virtual void recoverFromYield() { + if ( !ClientCursor::recoverFromYield( _yieldData ) ) { + _cc.reset(); + c_.reset(); + massert( 13340, "cursor dropped during delete", false ); + } + } + virtual long long nscanned() { + return c_.get() ? 
c_->nscanned() : _nscanned; + } + virtual void next() { + if ( !c_->ok() ) { + setComplete(); + return; + } + + DiskLoc rloc = c_->currLoc(); + + if ( matcher( c_ )->matchesCurrent(c_.get()) ) { + if ( !c_->getsetdup(rloc) ) + ++count_; + } + + c_->advance(); + _nscanned = c_->nscanned(); + + if ( _orClauseIndex > 0 && _nscanned >= 100 ) { + setComplete(); + return; + } + + if ( count_ > bestCount_ ) + bestCount_ = count_; + + if ( count_ > 0 ) { + if ( justOne_ ) + setComplete(); + else if ( _nscanned >= 100 && count_ == bestCount_ ) + setComplete(); + } + } + virtual bool mayRecordPlan() const { return !justOne_; } + virtual QueryOp *_createChild() const { + bestCount_ = 0; // should be safe to reset this in contexts where createChild() is called + return new DeleteOp( justOne_, bestCount_, _orClauseIndex + 1 ); + } + virtual shared_ptr<Cursor> newCursor() const { return qp().newCursor(); } + private: + bool justOne_; + int count_; + int &bestCount_; + long long _nscanned; + shared_ptr<Cursor> c_; + ClientCursor::CleanupPointer _cc; + ClientCursor::YieldData _yieldData; + // Avoid yielding in the MultiPlanScanner when not the first $or clause - just a temporary implementaiton for now. SERVER-3555 + int _orClauseIndex; + }; + + /* ns: namespace, e.g. <database>.<collection> + pattern: the "where" clause / criteria + justOne: stop after 1 match + god: allow access to system namespaces, and don't yield + */ + long long deleteObjects(const char *ns, BSONObj pattern, bool justOneOrig, bool logop, bool god, RemoveSaver * rs ) { + if( !god ) { + if ( strstr(ns, ".system.") ) { + /* note a delete from system.indexes would corrupt the db + if done here, as there are pointers into those objects in + NamespaceDetails. 
+ */ + uassert(12050, "cannot delete from system namespace", legalClientSystemNS( ns , true ) ); + } + if ( strchr( ns , '$' ) ) { + log() << "cannot delete from collection with reserved $ in name: " << ns << endl; + uassert( 10100 , "cannot delete from collection with reserved $ in name", strchr(ns, '$') == 0 ); + } + } + + { + NamespaceDetails *d = nsdetails( ns ); + if ( ! d ) + return 0; + uassert( 10101 , "can't remove from a capped collection" , ! d->capped ); + } + + long long nDeleted = 0; + + int best = 0; + shared_ptr< MultiCursor::CursorOp > opPtr( new DeleteOp( justOneOrig, best ) ); + shared_ptr< MultiCursor > creal( new MultiCursor( ns, pattern, BSONObj(), opPtr, !god ) ); + + if( !creal->ok() ) + return nDeleted; + + shared_ptr< Cursor > cPtr = creal; + auto_ptr<ClientCursor> cc( new ClientCursor( QueryOption_NoCursorTimeout, cPtr, ns) ); + cc->setDoingDeletes( true ); + + CursorId id = cc->cursorid(); + + bool justOne = justOneOrig; + bool canYield = !god && !creal->matcher()->docMatcher().atomic(); + + do { + // TODO: we can generalize this I believe + // + bool willNeedRecord = creal->matcher()->needRecord() || pattern.isEmpty() || isSimpleIdQuery( pattern ); + if ( ! willNeedRecord ) { + // TODO: this is a total hack right now + // check if the index full encompasses query + + if ( pattern.nFields() == 1 && + str::equals( pattern.firstElement().fieldName() , creal->indexKeyPattern().firstElement().fieldName() ) ) + willNeedRecord = true; + } + + if ( canYield && ! cc->yieldSometimes( willNeedRecord ? ClientCursor::WillNeed : ClientCursor::MaybeCovered ) ) { + cc.release(); // has already been deleted elsewhere + // TODO should we assert or something? 
+ break; + } + if ( !cc->ok() ) { + break; // if we yielded, could have hit the end + } + + // this way we can avoid calling updateLocation() every time (expensive) + // as well as some other nuances handled + cc->setDoingDeletes( true ); + + DiskLoc rloc = cc->currLoc(); + BSONObj key = cc->currKey(); + + // NOTE Calling advance() may change the matcher, so it's important + // to try to match first. + bool match = creal->matcher()->matchesCurrent(creal.get()); + + if ( ! cc->advance() ) + justOne = true; + + if ( ! match ) + continue; + + assert( !cc->c()->getsetdup(rloc) ); // can't be a dup, we deleted it! + + if ( !justOne ) { + /* NOTE: this is SLOW. this is not good, noteLocation() was designed to be called across getMore + blocks. here we might call millions of times which would be bad. + */ + cc->c()->noteLocation(); + } + + if ( logop ) { + BSONElement e; + if( BSONObj( rloc.rec() ).getObjectID( e ) ) { + BSONObjBuilder b; + b.append( e ); + bool replJustOne = true; + logOp( "d", ns, b.done(), 0, &replJustOne ); + } + else { + problem() << "deleted object without id, not logging" << endl; + } + } + + if ( rs ) + rs->goingToDelete( rloc.obj() /*cc->c->current()*/ ); + + theDataFileMgr.deleteRecord(ns, rloc.rec(), rloc); + nDeleted++; + if ( justOne ) { + break; + } + cc->c()->checkLocation(); + + if( !god ) + getDur().commitIfNeeded(); + + if( debug && god && nDeleted == 100 ) + log() << "warning high number of deletes with god=true which could use significant memory" << endl; + } + while ( cc->ok() ); + + if ( cc.get() && ClientCursor::find( id , false ) == 0 ) { + // TODO: remove this and the id declaration above if this doesn't trigger + // if it does, then i'm very confused (ERH 06/2011) + error() << "this should be impossible" << endl; + printStackTrace(); + cc.release(); + } + + return nDeleted; + } + +} diff --git a/db/ops/delete.h b/db/ops/delete.h new file mode 100644 index 0000000..a74b7a6 --- /dev/null +++ b/db/ops/delete.h @@ -0,0 +1,33 @@ +// 
delete.h + +/** +* Copyright (C) 2008 10gen Inc. +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +#pragma once + +#include "../../pch.h" +#include "../jsobj.h" +#include "../clientcursor.h" + +namespace mongo { + + class RemoveSaver; + + // If justOne is true, deletedId is set to the id of the deleted object. + long long deleteObjects(const char *ns, BSONObj pattern, bool justOne, bool logop = false, bool god=false, RemoveSaver * rs=0); + + +} diff --git a/db/ops/query.cpp b/db/ops/query.cpp new file mode 100644 index 0000000..cf4dc98 --- /dev/null +++ b/db/ops/query.cpp @@ -0,0 +1,1020 @@ +// query.cpp + +/** + * Copyright (C) 2008 10gen Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License, version 3, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ */ + +#include "pch.h" +#include "query.h" +#include "../pdfile.h" +#include "../jsobjmanipulator.h" +#include "../../bson/util/builder.h" +#include <time.h> +#include "../introspect.h" +#include "../btree.h" +#include "../../util/lruishmap.h" +#include "../json.h" +#include "../repl.h" +#include "../replutil.h" +#include "../scanandorder.h" +#include "../security.h" +#include "../curop-inl.h" +#include "../commands.h" +#include "../queryoptimizer.h" +#include "../lasterror.h" +#include "../../s/d_logic.h" +#include "../repl_block.h" +#include "../../server.h" + +namespace mongo { + + /* We cut off further objects once we cross this threshold; thus, you might get + a little bit more than this, it is a threshold rather than a limit. + */ + const int MaxBytesToReturnToClientAtOnce = 4 * 1024 * 1024; + + //ns->query->DiskLoc +// LRUishMap<BSONObj,DiskLoc,5> lrutest(123); + + extern bool useCursors; + extern bool useHints; + + bool runCommands(const char *ns, BSONObj& jsobj, CurOp& curop, BufBuilder &b, BSONObjBuilder& anObjBuilder, bool fromRepl, int queryOptions) { + try { + return _runCommands(ns, jsobj, b, anObjBuilder, fromRepl, queryOptions); + } + catch ( AssertionException& e ) { + e.getInfo().append( anObjBuilder , "assertion" , "assertionCode" ); + curop.debug().exceptionInfo = e.getInfo(); + } + anObjBuilder.append("errmsg", "db assertion failure"); + anObjBuilder.append("ok", 0.0); + BSONObj x = anObjBuilder.done(); + b.appendBuf((void*) x.objdata(), x.objsize()); + return true; + } + + + BSONObj id_obj = fromjson("{\"_id\":1}"); + BSONObj empty_obj = fromjson("{}"); + + + //int dump = 0; + + /* empty result for error conditions */ + QueryResult* emptyMoreResult(long long cursorid) { + BufBuilder b(32768); + b.skip(sizeof(QueryResult)); + QueryResult *qr = (QueryResult *) b.buf(); + qr->cursorId = 0; // 0 indicates no more data to retrieve. 
+ qr->startingFrom = 0; + qr->len = b.len(); + qr->setOperation(opReply); + qr->initializeResultFlags(); + qr->nReturned = 0; + b.decouple(); + return qr; + } + + QueryResult* processGetMore(const char *ns, int ntoreturn, long long cursorid , CurOp& curop, int pass, bool& exhaust ) { + exhaust = false; + ClientCursor::Pointer p(cursorid); + ClientCursor *cc = p.c(); + + int bufSize = 512 + sizeof( QueryResult ) + MaxBytesToReturnToClientAtOnce; + + BufBuilder b( bufSize ); + b.skip(sizeof(QueryResult)); + int resultFlags = ResultFlag_AwaitCapable; + int start = 0; + int n = 0; + + if ( unlikely(!cc) ) { + log() << "getMore: cursorid not found " << ns << " " << cursorid << endl; + cursorid = 0; + resultFlags = ResultFlag_CursorNotFound; + } + else { + // check for spoofing of the ns such that it does not match the one originally there for the cursor + uassert(14833, "auth error", str::equals(ns, cc->ns().c_str())); + + if ( pass == 0 ) + cc->updateSlaveLocation( curop ); + + int queryOptions = cc->queryOptions(); + + curop.debug().query = cc->query(); + + start = cc->pos(); + Cursor *c = cc->c(); + c->checkLocation(); + DiskLoc last; + + scoped_ptr<Projection::KeyOnly> keyFieldsOnly; + if ( cc->modifiedKeys() == false && cc->isMultiKey() == false && cc->fields ) + keyFieldsOnly.reset( cc->fields->checkKey( cc->indexKeyPattern() ) ); + + // This manager may be stale, but it's the state of chunking when the cursor was created. + ShardChunkManagerPtr manager = cc->getChunkManager(); + + while ( 1 ) { + if ( !c->ok() ) { + if ( c->tailable() ) { + /* when a tailable cursor hits "EOF", ok() goes false, and current() is null. however + advance() can still be retries as a reactivation attempt. when there is new data, it will + return true. that's what we are doing here. 
+ */ + if ( c->advance() ) + continue; + + if( n == 0 && (queryOptions & QueryOption_AwaitData) && pass < 1000 ) { + return 0; + } + + break; + } + p.release(); + bool ok = ClientCursor::erase(cursorid); + assert(ok); + cursorid = 0; + cc = 0; + break; + } + + // in some cases (clone collection) there won't be a matcher + if ( c->matcher() && !c->matcher()->matchesCurrent( c ) ) { + } + else if ( manager && ! manager->belongsToMe( cc ) ){ + LOG(2) << "cursor skipping document in un-owned chunk: " << c->current() << endl; + } + else { + if( c->getsetdup(c->currLoc()) ) { + //out() << " but it's a dup \n"; + } + else { + last = c->currLoc(); + n++; + + if ( keyFieldsOnly ) { + fillQueryResultFromObj(b, 0, keyFieldsOnly->hydrate( c->currKey() ) ); + } + else { + BSONObj js = c->current(); + // show disk loc should be part of the main query, not in an $or clause, so this should be ok + fillQueryResultFromObj(b, cc->fields.get(), js, ( cc->pq.get() && cc->pq->showDiskLoc() ? &last : 0)); + } + + if ( ( ntoreturn && n >= ntoreturn ) || b.len() > MaxBytesToReturnToClientAtOnce ) { + c->advance(); + cc->incPos( n ); + break; + } + } + } + c->advance(); + + if ( ! 
cc->yieldSometimes( ClientCursor::MaybeCovered ) ) { + ClientCursor::erase(cursorid); + cursorid = 0; + cc = 0; + p.deleted(); + break; + } + } + + if ( cc ) { + cc->updateLocation(); + cc->mayUpgradeStorage(); + cc->storeOpForSlave( last ); + exhaust = cc->queryOptions() & QueryOption_Exhaust; + } + } + + QueryResult *qr = (QueryResult *) b.buf(); + qr->len = b.len(); + qr->setOperation(opReply); + qr->_resultFlags() = resultFlags; + qr->cursorId = cursorid; + qr->startingFrom = start; + qr->nReturned = n; + b.decouple(); + + return qr; + } + + class CountOp : public QueryOp { + public: + CountOp( const string& ns , const BSONObj &spec ) : + _ns(ns), _capped(false), _count(), _myCount(), + _skip( spec["skip"].numberLong() ), + _limit( spec["limit"].numberLong() ), + _nscanned(), + _bc() { + } + + virtual void _init() { + _c = qp().newCursor(); + _capped = _c->capped(); + if ( qp().exactKeyMatch() && ! matcher( _c )->needRecord() ) { + _query = qp().simplifiedQuery( qp().indexKey() ); + _bc = dynamic_cast< BtreeCursor* >( _c.get() ); + _bc->forgetEndKey(); + } + } + + virtual long long nscanned() { + return _c.get() ? _c->nscanned() : _nscanned; + } + + virtual bool prepareToYield() { + if ( _c && !_cc ) { + _cc.reset( new ClientCursor( QueryOption_NoCursorTimeout , _c , _ns.c_str() ) ); + } + if ( _cc ) { + return _cc->prepareToYield( _yieldData ); + } + // no active cursor - ok to yield + return true; + } + + virtual void recoverFromYield() { + if ( _cc && !ClientCursor::recoverFromYield( _yieldData ) ) { + _c.reset(); + _cc.reset(); + + if ( _capped ) { + msgassertedNoTrace( 13337, str::stream() << "capped cursor overrun during count: " << _ns ); + } + else if ( qp().mustAssertOnYieldFailure() ) { + msgassertedNoTrace( 15891, str::stream() << "CountOp::recoverFromYield() failed to recover: " << _ns ); + } + else { + // we don't fail query since we're fine with returning partial data if collection dropped + } + } + } + + virtual void next() { + if ( ! 
_c || !_c->ok() ) { + setComplete(); + return; + } + + _nscanned = _c->nscanned(); + if ( _bc ) { + if ( _firstMatch.isEmpty() ) { + _firstMatch = _bc->currKey().getOwned(); + // if not match + if ( _query.woCompare( _firstMatch, BSONObj(), false ) ) { + setComplete(); + return; + } + _gotOne(); + } + else { + if ( ! _firstMatch.equal( _bc->currKey() ) ) { + setComplete(); + return; + } + _gotOne(); + } + } + else { + if ( !matcher( _c )->matchesCurrent( _c.get() ) ) { + } + else if( !_c->getsetdup(_c->currLoc()) ) { + _gotOne(); + } + } + _c->advance(); + } + virtual QueryOp *_createChild() const { + CountOp *ret = new CountOp( _ns , BSONObj() ); + ret->_count = _count; + ret->_skip = _skip; + ret->_limit = _limit; + return ret; + } + long long count() const { return _count; } + virtual bool mayRecordPlan() const { + return ( _myCount > _limit / 2 ) || ( complete() && !stopRequested() ); + } + private: + + void _gotOne() { + if ( _skip ) { + _skip--; + return; + } + + if ( _limit > 0 && _count >= _limit ) { + setStop(); + return; + } + + _count++; + _myCount++; + } + + string _ns; + bool _capped; + + long long _count; + long long _myCount; + long long _skip; + long long _limit; + long long _nscanned; + shared_ptr<Cursor> _c; + BSONObj _query; + BtreeCursor * _bc; + BSONObj _firstMatch; + + ClientCursor::CleanupPointer _cc; + ClientCursor::YieldData _yieldData; + }; + + /* { count: "collectionname"[, query: <query>] } + returns -1 on ns does not exist error. 
+ */ + long long runCount( const char *ns, const BSONObj &cmd, string &err ) { + Client::Context cx(ns); + NamespaceDetails *d = nsdetails( ns ); + if ( !d ) { + err = "ns missing"; + return -1; + } + BSONObj query = cmd.getObjectField("query"); + + // count of all objects + if ( query.isEmpty() ) { + return applySkipLimit( d->stats.nrecords , cmd ); + } + MultiPlanScanner mps( ns, query, BSONObj(), 0, true, BSONObj(), BSONObj(), false, true ); + CountOp original( ns , cmd ); + shared_ptr< CountOp > res = mps.runOp( original ); + if ( !res->complete() ) { + log() << "Count with ns: " << ns << " and query: " << query + << " failed with exception: " << res->exception() + << endl; + return 0; + } + return res->count(); + } + + class ExplainBuilder { + // Note: by default we filter out allPlans and oldPlan in the shell's + // explain() function. If you add any recursive structures, make sure to + // edit the JS to make sure everything gets filtered. + public: + ExplainBuilder() : _i() {} + void ensureStartScan() { + if ( !_a.get() ) { + _a.reset( new BSONArrayBuilder() ); + } + } + void noteCursor( Cursor *c ) { + BSONObjBuilder b( _a->subobjStart() ); + b << "cursor" << c->toString() << "indexBounds" << c->prettyIndexBounds(); + b.done(); + } + void noteScan( Cursor *c, long long nscanned, long long nscannedObjects, int n, bool scanAndOrder, + int millis, bool hint, int nYields , int nChunkSkips , bool indexOnly ) { + if ( _i == 1 ) { + _c.reset( new BSONArrayBuilder() ); + *_c << _b->obj(); + } + if ( _i == 0 ) { + _b.reset( new BSONObjBuilder() ); + } + else { + _b.reset( new BSONObjBuilder( _c->subobjStart() ) ); + } + *_b << "cursor" << c->toString(); + _b->appendNumber( "nscanned", nscanned ); + _b->appendNumber( "nscannedObjects", nscannedObjects ); + *_b << "n" << n; + + if ( scanAndOrder ) + *_b << "scanAndOrder" << true; + + *_b << "millis" << millis; + + *_b << "nYields" << nYields; + *_b << "nChunkSkips" << nChunkSkips; + *_b << "isMultiKey" << 
c->isMultiKey(); + *_b << "indexOnly" << indexOnly; + + *_b << "indexBounds" << c->prettyIndexBounds(); + + c->explainDetails( *_b ); + + if ( !hint ) { + *_b << "allPlans" << _a->arr(); + } + if ( _i != 0 ) { + _b->done(); + } + _a.reset( 0 ); + ++_i; + } + BSONObj finishWithSuffix( long long nscanned, long long nscannedObjects, int n, int millis, const BSONObj &suffix ) { + if ( _i > 1 ) { + BSONObjBuilder b; + b << "clauses" << _c->arr(); + b.appendNumber( "nscanned", nscanned ); + b.appendNumber( "nscannedObjects", nscannedObjects ); + b << "n" << n; + b << "millis" << millis; + b.appendElements( suffix ); + return b.obj(); + } + else { + _b->appendElements( suffix ); + return _b->obj(); + } + } + private: + auto_ptr< BSONArrayBuilder > _a; + auto_ptr< BSONObjBuilder > _b; + auto_ptr< BSONArrayBuilder > _c; + int _i; + }; + + // Implements database 'query' requests using the query optimizer's QueryOp interface + class UserQueryOp : public QueryOp { + public: + + UserQueryOp( const ParsedQuery& pq, Message &response, ExplainBuilder &eb, CurOp &curop ) : + _buf( 32768 ) , // TODO be smarter here + _pq( pq ) , + _ntoskip( pq.getSkip() ) , + _nscanned(0), _oldNscanned(0), _nscannedObjects(0), _oldNscannedObjects(0), + _n(0), + _oldN(0), + _nYields(), + _nChunkSkips(), + _chunkManager( shardingState.needShardChunkManager(pq.ns()) ? + shardingState.getShardChunkManager(pq.ns()) : ShardChunkManagerPtr() ), + _inMemSort(false), + _capped(false), + _saveClientCursor(false), + _wouldSaveClientCursor(false), + _oplogReplay( pq.hasOption( QueryOption_OplogReplay) ), + _response( response ), + _eb( eb ), + _curop( curop ) + {} + + virtual void _init() { + // only need to put the QueryResult fields there if we're building the first buffer in the message. 
+ if ( _response.empty() ) { + _buf.skip( sizeof( QueryResult ) ); + } + + if ( _oplogReplay ) { + _findingStartCursor.reset( new FindingStartCursor( qp() ) ); + _capped = true; + } + else { + _c = qp().newCursor( DiskLoc() , _pq.getNumToReturn() + _pq.getSkip() ); + _capped = _c->capped(); + + // setup check for if we can only use index to extract + if ( _c->modifiedKeys() == false && _c->isMultiKey() == false && _pq.getFields() ) { + _keyFieldsOnly.reset( _pq.getFields()->checkKey( _c->indexKeyPattern() ) ); + } + } + + if ( qp().scanAndOrderRequired() ) { + _inMemSort = true; + _so.reset( new ScanAndOrder( _pq.getSkip() , _pq.getNumToReturn() , _pq.getOrder(), qp().multikeyFrs() ) ); + } + + if ( _pq.isExplain() ) { + _eb.noteCursor( _c.get() ); + } + + } + + virtual bool prepareToYield() { + if ( _findingStartCursor.get() ) { + return _findingStartCursor->prepareToYield(); + } + else { + if ( _c && !_cc ) { + _cc.reset( new ClientCursor( QueryOption_NoCursorTimeout , _c , _pq.ns() ) ); + } + if ( _cc ) { + return _cc->prepareToYield( _yieldData ); + } + } + // no active cursor - ok to yield + return true; + } + + virtual void recoverFromYield() { + _nYields++; + + if ( _findingStartCursor.get() ) { + _findingStartCursor->recoverFromYield(); + } + else if ( _cc && !ClientCursor::recoverFromYield( _yieldData ) ) { + _c.reset(); + _cc.reset(); + _so.reset(); + + if ( _capped ) { + msgassertedNoTrace( 13338, str::stream() << "capped cursor overrun during query: " << _pq.ns() ); + } + else if ( qp().mustAssertOnYieldFailure() ) { + msgassertedNoTrace( 15890, str::stream() << "UserQueryOp::recoverFromYield() failed to recover: " << _pq.ns() ); + } + else { + // we don't fail query since we're fine with returning partial data if collection dropped + + // todo: this is wrong. 
the cursor could be gone if closeAllDatabases command just ran + } + + } + } + + virtual long long nscanned() { + if ( _findingStartCursor.get() ) { + return 0; // should only be one query plan, so value doesn't really matter. + } + return _c.get() ? _c->nscanned() : _nscanned; + } + + virtual void next() { + if ( _findingStartCursor.get() ) { + if ( !_findingStartCursor->done() ) { + _findingStartCursor->next(); + } + if ( _findingStartCursor->done() ) { + _c = _findingStartCursor->cursor(); + _findingStartCursor.reset( 0 ); + } + _capped = true; + return; + } + + if ( !_c || !_c->ok() ) { + finish( false ); + return; + } + + bool mayCreateCursor1 = _pq.wantMore() && ! _inMemSort && _pq.getNumToReturn() != 1 && useCursors; + + if( 0 ) { + cout << "SCANNING this: " << this << " key: " << _c->currKey() << " obj: " << _c->current() << endl; + } + + if ( _pq.getMaxScan() && _nscanned >= _pq.getMaxScan() ) { + finish( true ); //? + return; + } + + _nscanned = _c->nscanned(); + if ( !matcher( _c )->matchesCurrent(_c.get() , &_details ) ) { + // not a match, continue onward + if ( _details._loadedObject ) + _nscannedObjects++; + } + else { + _nscannedObjects++; + DiskLoc cl = _c->currLoc(); + if ( _chunkManager && ! _chunkManager->belongsToMe( cl.obj() ) ) { // TODO: should make this covered at some point + _nChunkSkips++; + // log() << "TEMP skipping un-owned chunk: " << _c->current() << endl; + } + else if( _c->getsetdup(cl) ) { + // dup + } + else { + // got a match. + + if ( _inMemSort ) { + // note: no cursors for non-indexed, ordered results. results must be fairly small. + _so->add( _pq.returnKey() ? _c->currKey() : _c->current(), _pq.showDiskLoc() ? &cl : 0 ); + } + else if ( _ntoskip > 0 ) { + _ntoskip--; + } + else { + if ( _pq.isExplain() ) { + _n++; + if ( n() >= _pq.getNumToReturn() && !_pq.wantMore() ) { + // .limit() was used, show just that much. + finish( true ); //? 
+ return; + } + } + else { + + if ( _pq.returnKey() ) { + BSONObjBuilder bb( _buf ); + bb.appendKeys( _c->indexKeyPattern() , _c->currKey() ); + bb.done(); + } + else if ( _keyFieldsOnly ) { + fillQueryResultFromObj( _buf , 0 , _keyFieldsOnly->hydrate( _c->currKey() ) ); + } + else { + BSONObj js = _c->current(); + assert( js.isValid() ); + + if ( _oplogReplay ) { + BSONElement e = js["ts"]; + if ( e.type() == Date || e.type() == Timestamp ) + _slaveReadTill = e._opTime(); + } + + fillQueryResultFromObj( _buf , _pq.getFields() , js , (_pq.showDiskLoc() ? &cl : 0)); + } + _n++; + if ( ! _c->supportGetMore() ) { + if ( _pq.enough( n() ) || _buf.len() >= MaxBytesToReturnToClientAtOnce ) { + finish( true ); + return; + } + } + else if ( _pq.enoughForFirstBatch( n() , _buf.len() ) ) { + /* if only 1 requested, no cursor saved for efficiency...we assume it is findOne() */ + if ( mayCreateCursor1 ) { + _wouldSaveClientCursor = true; + if ( _c->advance() ) { + // more...so save a cursor + _saveClientCursor = true; + } + } + finish( true ); + return; + } + } + } + } + } + _c->advance(); + } + + // this plan won, so set data for response broadly + void finish( bool stop ) { + massert( 13638, "client cursor dropped during explain query yield", !_pq.isExplain() || _c.get() ); + + if ( _pq.isExplain() ) { + _n = _inMemSort ? _so->size() : _n; + } + else if ( _inMemSort ) { + if( _so.get() ) + _so->fill( _buf, _pq.getFields() , _n ); + } + + if ( _c.get() ) { + _nscanned = _c->nscanned(); + + if ( _pq.hasOption( QueryOption_CursorTailable ) && _pq.getNumToReturn() != 1 ) + _c->setTailable(); + + // If the tailing request succeeded. 
+ if ( _c->tailable() ) + _saveClientCursor = true; + } + + if ( _pq.isExplain() ) { + _eb.noteScan( _c.get(), _nscanned, _nscannedObjects, _n, scanAndOrderRequired(), + _curop.elapsedMillis(), useHints && !_pq.getHint().eoo(), _nYields , + _nChunkSkips, _keyFieldsOnly.get() > 0 ); + } + else { + if ( _buf.len() ) { + _response.appendData( _buf.buf(), _buf.len() ); + _buf.decouple(); + } + } + + if ( stop ) { + setStop(); + } + else { + setComplete(); + } + + } + + void finishExplain( const BSONObj &suffix ) { + BSONObj obj = _eb.finishWithSuffix( totalNscanned(), nscannedObjects(), n(), _curop.elapsedMillis(), suffix); + fillQueryResultFromObj(_buf, 0, obj); + _n = 1; + _oldN = 0; + _response.appendData( _buf.buf(), _buf.len() ); + _buf.decouple(); + } + + virtual bool mayRecordPlan() const { + return ( _pq.getNumToReturn() != 1 ) && ( ( _n > _pq.getNumToReturn() / 2 ) || ( complete() && !stopRequested() ) ); + } + + virtual QueryOp *_createChild() const { + if ( _pq.isExplain() ) { + _eb.ensureStartScan(); + } + UserQueryOp *ret = new UserQueryOp( _pq, _response, _eb, _curop ); + ret->_oldN = n(); + ret->_oldNscanned = totalNscanned(); + ret->_oldNscannedObjects = nscannedObjects(); + ret->_ntoskip = _ntoskip; + return ret; + } + + bool scanAndOrderRequired() const { return _inMemSort; } + shared_ptr<Cursor> cursor() { return _c; } + int n() const { return _oldN + _n; } + long long totalNscanned() const { return _nscanned + _oldNscanned; } + long long nscannedObjects() const { return _nscannedObjects + _oldNscannedObjects; } + bool saveClientCursor() const { return _saveClientCursor; } + bool wouldSaveClientCursor() const { return _wouldSaveClientCursor; } + + void finishForOplogReplay( ClientCursor * cc ) { + if ( _oplogReplay && ! 
_slaveReadTill.isNull() ) + cc->slaveReadTill( _slaveReadTill ); + + } + + ShardChunkManagerPtr getChunkManager(){ return _chunkManager; } + + private: + BufBuilder _buf; + const ParsedQuery& _pq; + scoped_ptr<Projection::KeyOnly> _keyFieldsOnly; + + long long _ntoskip; + long long _nscanned; + long long _oldNscanned; + long long _nscannedObjects; + long long _oldNscannedObjects; + int _n; // found so far + int _oldN; + + int _nYields; + int _nChunkSkips; + + MatchDetails _details; + + ShardChunkManagerPtr _chunkManager; + + bool _inMemSort; + auto_ptr< ScanAndOrder > _so; + + shared_ptr<Cursor> _c; + ClientCursor::CleanupPointer _cc; + ClientCursor::YieldData _yieldData; + + bool _capped; + bool _saveClientCursor; + bool _wouldSaveClientCursor; + bool _oplogReplay; + auto_ptr< FindingStartCursor > _findingStartCursor; + + Message &_response; + ExplainBuilder &_eb; + CurOp &_curop; + OpTime _slaveReadTill; + }; + + /* run a query -- includes checking for and running a Command \ + @return points to ns if exhaust mode. 
       0=normal mode
     */
    /* Top-level entry point for an OP_QUERY message.
       Dispatches commands (ns ends in ".$cmd"), the fast _id-lookup path, and the
       general query-optimizer path.  Returns the namespace string when the client
       requested exhaust mode (caller keeps streaming), otherwise 0/NULL. */
    const char *runQuery(Message& m, QueryMessage& q, CurOp& curop, Message &result) {
        shared_ptr<ParsedQuery> pq_shared( new ParsedQuery(q) );
        ParsedQuery& pq( *pq_shared );
        int ntoskip = q.ntoskip;
        BSONObj jsobj = q.query;
        int queryOptions = q.queryOptions;
        const char *ns = q.ns;

        if( logLevel >= 2 )
            log() << "runQuery called " << ns << " " << jsobj << endl;

        curop.debug().ns = ns;
        curop.debug().ntoreturn = pq.getNumToReturn();
        curop.setQuery(jsobj);

        // Command path: a findOne() against a ".$cmd" namespace is a command.
        if ( pq.couldBeCommand() ) {
            BufBuilder bb;
            bb.skip(sizeof(QueryResult));   // reserve space for the reply header
            BSONObjBuilder cmdResBuf;
            if ( runCommands(ns, jsobj, curop, bb, cmdResBuf, false, queryOptions) ) {
                curop.debug().iscommand = true;
                curop.debug().query = jsobj;
                curop.markCommand();

                auto_ptr< QueryResult > qr;
                qr.reset( (QueryResult *) bb.buf() );
                bb.decouple();              // qr now owns the buffer
                qr->setResultFlagsToOk();
                qr->len = bb.len();
                curop.debug().responseLength = bb.len();
                qr->setOperation(opReply);
                qr->cursorId = 0;
                qr->startingFrom = 0;
                qr->nReturned = 1;
                result.setData( qr.release(), true );
            }
            else {
                uasserted(13530, "bad or malformed command request?");
            }
            return 0;
        }

        /* --- regular query --- */

        int n = 0;
        BSONElement hint = useHints ? pq.getHint() : BSONElement();
        bool explain = pq.isExplain();
        bool snapshot = pq.isSnapshot();
        BSONObj order = pq.getOrder();
        BSONObj query = pq.getFilter();

        /* The ElemIter will not be happy if this isn't really an object. So throw exception
           here when that is true.
           (Which may indicate bad data from client.)
        */
        if ( query.objsize() == 0 ) {
            out() << "Bad query object?\n  jsobj:";
            out() << jsobj.toString() << "\n  query:";
            out() << query.toString() << endl;
            uassert( 10110 , "bad query object", false);
        }

        /* --- read lock --- */

        mongolock lk(false);

        Client::Context ctx( ns , dbpath , &lk );

        replVerifyReadsOk(pq);

        // Tailable cursors only make sense on capped collections in natural order.
        if ( pq.hasOption( QueryOption_CursorTailable ) ) {
            NamespaceDetails *d = nsdetails( ns );
            uassert( 13051, "tailable cursor requested on non capped collection", d && d->capped );
            const BSONObj nat1 = BSON( "$natural" << 1 );
            if ( order.isEmpty() ) {
                order = nat1;
            }
            else {
                uassert( 13052, "only {$natural:1} order allowed for tailable cursor", order == nat1 );
            }
        }

        BSONObj snapshotHint; // put here to keep the data in scope
        if( snapshot ) {
            NamespaceDetails *d = nsdetails(ns);
            if ( d ) {
                int i = d->findIdIndex();
                if( i < 0 ) {
                    if ( strstr( ns , ".system." ) == 0 )
                        log() << "warning: no _id index on $snapshot query, ns:" << ns << endl;
                }
                else {
                    /* [dm] the name of an _id index tends to vary, so we build the hint the hard way here.
                       probably need a better way to specify "use the _id index" as a hint.  if someone is
                       in the query optimizer please fix this then!
                    */
                    BSONObjBuilder b;
                    b.append("$hint", d->idx(i).indexName());
                    snapshotHint = b.obj();
                    hint = snapshotHint.firstElement();
                }
            }
        }

        // "idhack" fast path: plain {_id: <simple value>} query bypasses the
        // query optimizer entirely and does a direct index lookup.
        if ( !(explain || pq.showDiskLoc()) && isSimpleIdQuery( query ) && !pq.hasOption( QueryOption_CursorTailable ) ) {

            bool nsFound = false;
            bool indexFound = false;

            BSONObj resObject;
            Client& c = cc();
            bool found = Helpers::findById( c, ns , query , resObject , &nsFound , &indexFound );
            // only take this path if the ns is missing or the _id index exists;
            // otherwise fall through to the regular plan below
            if ( nsFound == false || indexFound == true ) {
                BufBuilder bb(sizeof(QueryResult)+resObject.objsize()+32);
                bb.skip(sizeof(QueryResult));

                curop.debug().idhack = true;
                if ( found ) {
                    n = 1;
                    fillQueryResultFromObj( bb , pq.getFields() , resObject );
                }
                auto_ptr< QueryResult > qr;
                qr.reset( (QueryResult *) bb.buf() );
                bb.decouple();
                qr->setResultFlagsToOk();
                qr->len = bb.len();

                curop.debug().responseLength = bb.len();
                qr->setOperation(opReply);
                qr->cursorId = 0;
                qr->startingFrom = 0;
                qr->nReturned = n;
                result.setData( qr.release(), true );
                return NULL;
            }
        }

        // regular, not QO bypass query

        BSONObj oldPlan;
        if ( explain && ! pq.hasIndexSpecifier() ) {
            MultiPlanScanner mps( ns, query, order );
            if ( mps.usingPrerecordedPlan() )
                oldPlan = mps.oldExplain();
        }
        auto_ptr< MultiPlanScanner > mps( new MultiPlanScanner( ns, query, order, &hint, !explain, pq.getMin(), pq.getMax(), false, true ) );
        BSONObj explainSuffix;
        if ( explain ) {
            BSONObjBuilder bb;
            if ( !oldPlan.isEmpty() )
                bb.append( "oldPlan", oldPlan.firstElement().embeddedObject().firstElement().embeddedObject() );
            explainSuffix = bb.obj();
        }
        ExplainBuilder eb;
        UserQueryOp original( pq, result, eb, curop );
        shared_ptr< UserQueryOp > o = mps->runOp( original );
        UserQueryOp &dqo = *o;
        if ( ! dqo.complete() )
            throw MsgAssertionException( dqo.exception() );
        if ( explain ) {
            dqo.finishExplain( explainSuffix );
        }
        n = dqo.n();
        long long nscanned = dqo.totalNscanned();
        curop.debug().scanAndOrder = dqo.scanAndOrderRequired();

        shared_ptr<Cursor> cursor = dqo.cursor();
        if( logLevel >= 5 )
            log() << "   used cursor: " << cursor.get() << endl;
        long long cursorid = 0;
        const char * exhaust = 0;
        // Keep a server-side ClientCursor if there may be more results to return
        // (either this plan has more, or further $or clauses remain to run).
        if ( dqo.saveClientCursor() || ( dqo.wouldSaveClientCursor() && mps->mayRunMore() ) ) {
            ClientCursor *cc;
            bool moreClauses = mps->mayRunMore();
            if ( moreClauses ) {
                // this MultiCursor will use a dumb NoOp to advance(), so no need to specify mayYield
                shared_ptr< Cursor > multi( new MultiCursor( mps, cursor, dqo.matcher( cursor ), dqo ) );
                cc = new ClientCursor(queryOptions, multi, ns, jsobj.getOwned());
            }
            else {
                if( ! cursor->matcher() ) cursor->setMatcher( dqo.matcher( cursor ) );
                cc = new ClientCursor( queryOptions, cursor, ns, jsobj.getOwned() );
            }

            cc->setChunkManager( dqo.getChunkManager() );

            cursorid = cc->cursorid();
            DEV tlog(2) << "query has more, cursorid: " << cursorid << endl;
            cc->setPos( n );
            cc->pq = pq_shared;
            cc->fields = pq.getFieldPtr();
            cc->originalMessage = m;
            cc->updateLocation();
            if ( !cc->ok() && cc->c()->tailable() )
                DEV tlog() << "query has no more but tailable, cursorid: " << cursorid << endl;
            if( queryOptions & QueryOption_Exhaust ) {
                // non-zero return value tells the caller to keep streaming batches
                exhaust = ns;
                curop.debug().exhaust = true;
            }
            dqo.finishForOplogReplay(cc);
        }

        QueryResult *qr = (QueryResult *) result.header();
        qr->cursorId = cursorid;
        qr->setResultFlagsToOk();
        // qr->len is updated automatically by appendData()
        curop.debug().responseLength = qr->len;
        qr->setOperation(opReply);
        qr->startingFrom = 0;
        qr->nReturned = n;

        // Only pay for the nscanned/ntoskip bookkeeping when profiling or slow.
        int duration = curop.elapsedMillis();
        bool dbprofile = curop.shouldDBProfile( duration );
        if ( dbprofile || duration >= cmdLine.slowMS ) {
            curop.debug().nscanned = (int) nscanned;
            curop.debug().ntoskip = ntoskip;
        }
        curop.debug().nreturned = n;
        return exhaust;
    }

} // namespace mongo
diff --git a/db/ops/query.h b/db/ops/query.h
new file mode 100644
index 0000000..ada2e90
--- /dev/null
+++ b/db/ops/query.h
@@ -0,0 +1,250 @@
// query.h

/**
* Copyright (C) 2008 10gen Inc.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License, version 3,
* as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program.  If not, see <http://www.gnu.org/licenses/>.
*/

#pragma once

#include "../../pch.h"
#include "../../util/net/message.h"
#include "../dbmessage.h"
#include "../jsobj.h"
#include "../diskloc.h"
#include "../projection.h"

// struct QueryOptions, QueryResult, QueryResultFlags in:
#include "../../client/dbclient.h"

namespace mongo {

    extern const int MaxBytesToReturnToClientAtOnce;

    QueryResult* processGetMore(const char *ns, int ntoreturn, long long cursorid , CurOp& op, int pass, bool& exhaust);

    long long runCount(const char *ns, const BSONObj& cmd, string& err);

    const char * runQuery(Message& m, QueryMessage& q, CurOp& curop, Message &result);

    /* This is for languages whose "objects" are not well ordered (JSON is well ordered).
       [ { a : ... } , { b : ... } ] -> { a : ..., b : ...
       }
    */
    // Converts the array form of an order spec ([{a:1},{b:1}] arrives as an
    // object with keys "0","1",...) into a single flat order object.
    inline BSONObj transformOrderFromArrayFormat(BSONObj order) {
        /* note: this is slow, but that is ok as order will have very few pieces */
        BSONObjBuilder b;
        char p[2] = "0";

        while ( 1 ) {
            BSONObj j = order.getObjectField(p);
            if ( j.isEmpty() )
                break;
            BSONElement e = j.firstElement();
            uassert( 10102 , "bad order array", !e.eoo());
            uassert( 10103 , "bad order array [2]", e.isNumber());
            b.append(e);
            (*p)++;   // "0" -> "1" -> ... ; capped at "9" below
            uassert( 10104 , "too many ordering elements", *p <= '9');
        }

        return b.obj();
    }

    /**
     * this represents a total user query
     * includes fields from the query message, both possible query levels
     * parses everything up front
     */
    class ParsedQuery : boost::noncopyable {
    public:
        ParsedQuery( QueryMessage& qm )
            : _ns( qm.ns ) , _ntoskip( qm.ntoskip ) , _ntoreturn( qm.ntoreturn ) , _options( qm.queryOptions ) {
            init( qm.query );
            initFields( qm.fields );
        }
        ParsedQuery( const char* ns , int ntoskip , int ntoreturn , int queryoptions , const BSONObj& query , const BSONObj& fields )
            : _ns( ns ) , _ntoskip( ntoskip ) , _ntoreturn( ntoreturn ) , _options( queryoptions ) {
            init( query );
            initFields( fields );
        }

        const char * ns() const { return _ns; }
        bool isLocalDB() const { return strncmp(_ns, "local.", 6) == 0; }

        const BSONObj& getFilter() const { return _filter; }
        Projection* getFields() const { return _fields.get(); }
        shared_ptr<Projection> getFieldPtr() const { return _fields; }

        int getSkip() const { return _ntoskip; }
        int getNumToReturn() const { return _ntoreturn; }
        bool wantMore() const { return _wantMore; }
        int getOptions() const { return _options; }
        bool hasOption( int x ) const { return x & _options; }

        bool isExplain() const { return _explain; }
        bool isSnapshot() const { return _snapshot; }
        bool returnKey() const { return _returnKey; }
        bool showDiskLoc() const { return _showDiskLoc; }

        const BSONObj& getMin() const { return _min; }
        const BSONObj& getMax() const { return _max; }
        const BSONObj& getOrder() const { return _order; }
        const BSONElement& getHint() const { return _hint; }
        int getMaxScan() const { return _maxScan; }

        bool couldBeCommand() const {
            /* we assume you are using findOne() for running a cmd... */
            return _ntoreturn == 1 && strstr( _ns , ".$cmd" );
        }

        bool hasIndexSpecifier() const {
            return ! _hint.eoo() || ! _min.isEmpty() || ! _max.isEmpty();
        }

        /* if ntoreturn is zero, we return up to 101 objects.  on the subsequent getmore, there
           is only a size limit.  The idea is that on a find() where one doesn't use much results,
           we don't return much, but once getmore kicks in, we start pushing significant quantities.

           The n limit (vs. size) is important when someone fetches only one small field from big
           objects, which causes massive scanning server-side.
        */
        bool enoughForFirstBatch( int n , int len ) const {
            if ( _ntoreturn == 0 )
                return ( len > 1024 * 1024 ) || n >= 101;
            return n >= _ntoreturn || len > MaxBytesToReturnToClientAtOnce;
        }

        bool enough( int n ) const {
            if ( _ntoreturn == 0 )
                return false;
            return n >= _ntoreturn;
        }

    private:
        // Parse the query object: handles both the bare-filter form and the
        // wrapped {query:..., $orderby:..., ...} form.
        void init( const BSONObj& q ) {
            _reset();
            uassert( 10105 , "bad skip value in query", _ntoskip >= 0);

            if ( _ntoreturn < 0 ) {
                /* _ntoreturn greater than zero is simply a hint on how many objects to send back per
                   "cursor batch".
                   A negative number indicates a hard limit.
                */
                _wantMore = false;
                _ntoreturn = -_ntoreturn;
            }


            BSONElement e = q["query"];
            if ( !
e.isABSONObj() ) + e = q["$query"]; + + if ( e.isABSONObj() ) { + _filter = e.embeddedObject(); + _initTop( q ); + } + else { + _filter = q; + } + } + + void _reset() { + _wantMore = true; + _explain = false; + _snapshot = false; + _returnKey = false; + _showDiskLoc = false; + _maxScan = 0; + } + + void _initTop( const BSONObj& top ) { + BSONObjIterator i( top ); + while ( i.more() ) { + BSONElement e = i.next(); + const char * name = e.fieldName(); + + if ( strcmp( "$orderby" , name ) == 0 || + strcmp( "orderby" , name ) == 0 ) { + if ( e.type() == Object ) { + _order = e.embeddedObject(); + } + else if ( e.type() == Array ) { + _order = transformOrderFromArrayFormat( _order ); + } + else { + uasserted(13513, "sort must be an object or array"); + } + continue; + } + + if( *name == '$' ) { + name++; + if ( strcmp( "explain" , name ) == 0 ) + _explain = e.trueValue(); + else if ( strcmp( "snapshot" , name ) == 0 ) + _snapshot = e.trueValue(); + else if ( strcmp( "min" , name ) == 0 ) + _min = e.embeddedObject(); + else if ( strcmp( "max" , name ) == 0 ) + _max = e.embeddedObject(); + else if ( strcmp( "hint" , name ) == 0 ) + _hint = e; + else if ( strcmp( "returnKey" , name ) == 0 ) + _returnKey = e.trueValue(); + else if ( strcmp( "maxScan" , name ) == 0 ) + _maxScan = e.numberInt(); + else if ( strcmp( "showDiskLoc" , name ) == 0 ) + _showDiskLoc = e.trueValue(); + else if ( strcmp( "comment" , name ) == 0 ) { + ; // no-op + } + } + } + + if ( _snapshot ) { + uassert( 12001 , "E12001 can't sort with $snapshot", _order.isEmpty() ); + uassert( 12002 , "E12002 can't use hint with $snapshot", _hint.eoo() ); + } + + } + + void initFields( const BSONObj& fields ) { + if ( fields.isEmpty() ) + return; + _fields.reset( new Projection() ); + _fields->init( fields ); + } + + const char * const _ns; + const int _ntoskip; + int _ntoreturn; + BSONObj _filter; + BSONObj _order; + const int _options; + shared_ptr< Projection > _fields; + bool _wantMore; + bool _explain; + bool 
_snapshot;
        bool _returnKey;
        bool _showDiskLoc;
        BSONObj _min;
        BSONObj _max;
        BSONElement _hint;
        int _maxScan;
    };


} // namespace mongo

diff --git a/db/ops/update.cpp b/db/ops/update.cpp
new file mode 100644
index 0000000..fd9798a
--- /dev/null
+++ b/db/ops/update.cpp
@@ -0,0 +1,1369 @@
// update.cpp

/**
 *    Copyright (C) 2008 10gen Inc.
 *
 *    This program is free software: you can redistribute it and/or modify
 *    it under the terms of the GNU Affero General Public License, version 3,
 *    as published by the Free Software Foundation.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU Affero General Public License for more details.
 *
 *    You should have received a copy of the GNU Affero General Public License
 *    along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

#include "pch.h"
#include "query.h"
#include "../pdfile.h"
#include "../jsobjmanipulator.h"
#include "../queryoptimizer.h"
#include "../repl.h"
#include "../btree.h"
#include "../../util/stringutils.h"
#include "update.h"

//#define DEBUGUPDATE(x) cout << x << endl;
#define DEBUGUPDATE(x)

namespace mongo {

    // Printable names indexed by Mod::Op. "$rename" appears twice on purpose:
    // both RENAME_FROM and RENAME_TO render as "$rename".
    const char* Mod::modNames[] = { "$inc", "$set", "$push", "$pushAll", "$pull", "$pullAll" , "$pop", "$unset" ,
                                    "$bitand" , "$bitor" , "$bit" , "$addToSet", "$rename", "$rename"
                                  };
    unsigned Mod::modNamesNum = sizeof(Mod::modNames)/sizeof(char*);

    // True if 'toMatch' (one element of the array being $pull-ed) matches this
    // mod's argument 'elt' — either by direct comparison or, when elt is an
    // object, by running the precompiled matcher.
    bool Mod::_pullElementMatch( BSONElement& toMatch ) const {

        if ( elt.type() != Object ) {
            // if elt isn't an object, then comparison will work
            return toMatch.woCompare( elt , false ) == 0;
        }

        if ( matcherOnPrimitive )
            return matcher->matches( toMatch.wrap( "" ) );

        if ( toMatch.type() != Object ) {
            // looking for an object, so this can't match
            return false;
        }

        // now we have an object on both sides
        return matcher->matches( toMatch.embeddedObject() );
    }

    // Append the result of $inc-ing 'in' by this mod's 'elt' to 'bb',
    // recording the widened numeric type (int -> long on overflow, or
    // double/long if either operand already is one) in 'ms'.
    template< class Builder >
    void Mod::appendIncremented( Builder& bb , const BSONElement& in, ModState& ms ) const {
        BSONType a = in.type();
        BSONType b = elt.type();

        if ( a == NumberDouble || b == NumberDouble ) {
            ms.incType = NumberDouble;
            ms.incdouble = elt.numberDouble() + in.numberDouble();
        }
        else if ( a == NumberLong || b == NumberLong ) {
            ms.incType = NumberLong;
            ms.inclong = elt.numberLong() + in.numberLong();
        }
        else {
            int x = elt.numberInt() + in.numberInt();
            if ( x < 0 && elt.numberInt() > 0 && in.numberInt() > 0 ) {
                // overflow
                ms.incType = NumberLong;
                ms.inclong = elt.numberLong() + in.numberLong();
            }
            else {
                ms.incType = NumberInt;
                ms.incint = elt.numberInt() + in.numberInt();
            }
        }

        ms.appendIncValue( bb , false );
    }

    // $unset inside an object: simply omit the field.
    template< class Builder >
    void appendUnset( Builder &b ) {
    }

    // $unset inside an array: positions must be preserved, so write a null.
    template<>
    void appendUnset( BSONArrayBuilder &b ) {
        b.appendNull();
    }

    // Apply this mod to existing element 'in', writing the resulting field
    // into builder 'b'.  'ms' carries per-application state (push sizes,
    // inc results) used later for the oplog entry.
    template< class Builder >
    void Mod::apply( Builder& b , BSONElement in , ModState& ms ) const {
        if ( ms.dontApply ) {
            return;
        }

        switch ( op ) {

        case INC: {
            appendIncremented( b , in , ms );
            break;
        }

        case SET: {
            _checkForAppending( elt );
            b.appendAs( elt , shortFieldName );
            break;
        }

        case UNSET: {
            appendUnset( b );
            break;
        }

        case PUSH: {
            uassert( 10131 ,  "$push can only be applied to an array" , in.type() == Array );
            BSONObjBuilder bb( b.subarrayStart( shortFieldName ) );
            BSONObjIterator i( in.embeddedObject() );
            int n=0;
            // copy existing elements, then append the pushed value at the end
            while ( i.more() ) {
                bb.append( i.next() );
                n++;
            }

            ms.pushStartSize = n;

            bb.appendAs( elt ,  bb.numStr( n ) );
            bb.done();
            break;
        }

        case ADDTOSET: {
            uassert( 12592 ,  "$addToSet can only be applied to an array" , in.type() == Array );
            BSONObjBuilder bb( b.subarrayStart( shortFieldName ) );

            BSONObjIterator i( in.embeddedObject() );
            int n=0;

            if ( isEach() ) {

                // $addToSet with $each: append only those candidates not
                // already present (toadd is pruned as duplicates are seen).
                BSONElementSet toadd;
                parseEach( toadd );

                while ( i.more() ) {
                    BSONElement cur = i.next();
                    bb.append( cur );
                    n++;
                    toadd.erase( cur );
                }

                {
                    BSONObjIterator i( getEach() );
                    while ( i.more() ) {
                        BSONElement e = i.next();
                        if ( toadd.count(e) ) {
                            bb.appendAs( e , BSONObjBuilder::numStr( n++ ) );
                            toadd.erase( e );
                        }
                    }
                }

            }
            else {

                bool found = false;

                while ( i.more() ) {
                    BSONElement cur = i.next();
                    bb.append( cur );
                    n++;
                    if ( elt.woCompare( cur , false ) == 0 )
                        found = true;
                }

                if ( ! found )
                    bb.appendAs( elt ,  bb.numStr( n ) );

            }

            bb.done();
            break;
        }



        case PUSH_ALL: {
            uassert( 10132 ,  "$pushAll can only be applied to an array" , in.type() == Array );
            // NOTE(review): this only checks elt.type() != EOO, not == Array as the
            // message claims; a non-array argument may slip through — confirm intent.
            uassert( 10133 ,  "$pushAll has to be passed an array" , elt.type() );

            BSONObjBuilder bb( b.subarrayStart( shortFieldName ) );

            BSONObjIterator i( in.embeddedObject() );
            int n=0;
            while ( i.more() ) {
                bb.append( i.next() );
                n++;
            }

            ms.pushStartSize = n;

            i = BSONObjIterator( elt.embeddedObject() );
            while ( i.more() ) {
                bb.appendAs( i.next() , bb.numStr( n++ ) );
            }

            bb.done();
            break;
        }

        case PULL:
        case PULL_ALL: {
            uassert( 10134 ,  "$pull/$pullAll can only be applied to an array" , in.type() == Array );
            BSONObjBuilder bb( b.subarrayStart( shortFieldName ) );

            int n = 0;

            // keep only the elements that do NOT match the pull criteria
            BSONObjIterator i( in.embeddedObject() );
            while ( i.more() ) {
                BSONElement e = i.next();
                bool allowed = true;

                if ( op == PULL ) {
                    allowed = ! _pullElementMatch( e );
                }
                else {
                    BSONObjIterator j( elt.embeddedObject() );
                    while( j.more() ) {
                        BSONElement arrJ = j.next();
                        if ( e.woCompare( arrJ, false ) == 0 ) {
                            allowed = false;
                            break;
                        }
                    }
                }

                if ( allowed )
                    bb.appendAs( e , bb.numStr( n++ ) );
            }

            bb.done();
            break;
        }

        case POP: {
            uassert( 10135 ,  "$pop can only be applied to an array" , in.type() == Array );
            BSONObjBuilder bb( b.subarrayStart( shortFieldName ) );

            int n = 0;

            BSONObjIterator i( in.embeddedObject() );
            if ( elt.isNumber() && elt.number() < 0 ) {
                // pop from front
                if ( i.more() ) {
                    i.next();
                    n++;
                }

                while( i.more() ) {
                    bb.appendAs( i.next() , bb.numStr( n - 1 ) );
                    n++;
                }
            }
            else {
                // pop from back
                while( i.more() ) {
                    n++;
                    BSONElement arrI = i.next();
                    if ( i.more() ) {
                        bb.append( arrI );
                    }
                }
            }

            ms.pushStartSize = n;
            assert( ms.pushStartSize == in.embeddedObject().nFields() );
            bb.done();
            break;
        }

        case BIT: {
            uassert( 10136 ,  "$bit needs an array" , elt.type() == Object );
            uassert( 10137 ,  "$bit can only be applied to numbers" , in.isNumber() );
            uassert( 10138 ,  "$bit cannot update a value of type double" , in.type() != NumberDouble );

            int x = in.numberInt();
            long long y = in.numberLong();

            // apply each and/or sub-operation to the int or long accumulator,
            // whichever matches the stored value's type
            BSONObjIterator it( elt.embeddedObject() );
            while ( it.more() ) {
                BSONElement e = it.next();
                uassert( 10139 ,  "$bit field must be number" , e.isNumber() );
                if ( str::equals(e.fieldName(), "and") ) {
                    switch( in.type() ) {
                    case NumberInt: x = x&e.numberInt(); break;
                    case NumberLong: y = y&e.numberLong(); break;
                    default: assert( 0 );
                    }
                }
                else if ( str::equals(e.fieldName(), "or") ) {
                    switch( in.type() ) {
                    case NumberInt: x = x|e.numberInt(); break;
                    case NumberLong: y = y|e.numberLong(); break;
                    default: assert( 0 );
                    }
                }
                else {
                    uasserted(9016, str::stream() << "unknown $bit operation: " << e.fieldName());
                }
            }

            switch( in.type() ) {
            case NumberInt: b.append( shortFieldName , x ); break;
            case NumberLong: b.append( shortFieldName , y ); break;
            default: assert( 0 );
            }

            break;
        }

        case RENAME_FROM: {
            // source side of $rename: the field is simply dropped here;
            // RENAME_TO writes the value at its new location.
            break;
        }

        case RENAME_TO: {
            ms.handleRename( b, shortFieldName );
            break;
        }

        default:
            stringstream ss;
            ss << "Mod::apply can't handle type: " << op;
            throw UserException( 9017, ss.str() );
        }
    }

    // -1 inside a non-object (non-object could be array)
    // 0 missing
    // 1 found
    int validRenamePath( BSONObj obj, const char *path ) {
        // walk each dotted component; fail with -1 if a prefix resolves to a
        // non-object, 0 if any component is absent
        while( const char *p = strchr( path, '.' ) ) {
            string left( path, p - path );
            BSONElement e = obj.getField( left );
            if ( e.eoo() ) {
                return 0;
            }
            if ( e.type() != Object ) {
                return -1;
            }
            obj = e.embeddedObject();
            path = p + 1;
        }
        return !obj.getField( path ).eoo();
    }

    // Evaluate every mod against 'obj' and build a ModSetState describing how
    // to apply them — including whether the whole update can be done in place.
    auto_ptr<ModSetState> ModSet::prepare(const BSONObj &obj) const {
        DEBUGUPDATE( "\t start prepare" );
        auto_ptr<ModSetState> mss( new ModSetState( obj ) );


        // Perform this check first, so that we don't leave a partially modified object on uassert.
        // One pass over all mods: record each mod's current target element and
        // decide per-mod whether in-place application remains possible.
        for ( ModHolder::const_iterator i = _mods.begin(); i != _mods.end(); ++i ) {
            DEBUGUPDATE( "\t\t prepare : " << i->first );
            ModState& ms = mss->_mods[i->first];

            const Mod& m = i->second;
            BSONElement e = obj.getFieldDotted(m.fieldName);

            ms.m = &m;
            ms.old = e;

            if ( m.op == Mod::RENAME_FROM ) {
                int source = validRenamePath( obj, m.fieldName );
                uassert( 13489, "$rename source field invalid", source != -1 );
                if ( source != 1 ) {
                    // source missing: the rename pair becomes a no-op
                    ms.dontApply = true;
                }
                continue;
            }

            if ( m.op == Mod::RENAME_TO ) {
                int source = validRenamePath( obj, m.renameFrom() );
                if ( source == 1 ) {
                    int target = validRenamePath( obj, m.fieldName );
                    uassert( 13490, "$rename target field invalid", target != -1 );
                    ms.newVal = obj.getFieldDotted( m.renameFrom() );
                    mss->amIInPlacePossible( false );
                }
                else {
                    ms.dontApply = true;
                }
                continue;
            }

            if ( e.eoo() ) {
                // field absent: only $unset is a no-op that can stay in place
                mss->amIInPlacePossible( m.op == Mod::UNSET );
                continue;
            }

            switch( m.op ) {
            case Mod::INC:
                uassert( 10140 ,  "Cannot apply $inc modifier to non-number", e.isNumber() || e.eoo() );
                if ( mss->amIInPlacePossible( e.isNumber() ) ) {
                    // check more typing info here
                    if ( m.elt.type() != e.type() ) {
                        // if i'm incrementing with a double, then the storage has to be a double
                        mss->amIInPlacePossible( m.elt.type() != NumberDouble );
                    }

                    // check for overflow
                    if ( e.type() == NumberInt && e.numberLong() + m.elt.numberLong() > numeric_limits<int>::max() ) {
                        mss->amIInPlacePossible( false );
                    }
                }
                break;

            case Mod::SET:
                // in place only if the replacement has identical type and size
                mss->amIInPlacePossible( m.elt.type() == e.type() &&
                                         m.elt.valuesize() == e.valuesize() );
                break;

            case Mod::PUSH:
            case Mod::PUSH_ALL:
                uassert( 10141 ,  "Cannot apply $push/$pushAll modifier to non-array", e.type() == Array || e.eoo() );
                mss->amIInPlacePossible( false );
                break;

            case Mod::PULL:
            case Mod::PULL_ALL: {
                uassert( 10142 ,  "Cannot apply $pull/$pullAll modifier to non-array", e.type() == Array || e.eoo() );
                // in place only if nothing would actually be pulled
                BSONObjIterator i( e.embeddedObject() );
                while( mss->_inPlacePossible && i.more() ) {
                    BSONElement arrI = i.next();
                    if ( m.op == Mod::PULL ) {
                        mss->amIInPlacePossible( ! m._pullElementMatch( arrI ) );
                    }
                    else if ( m.op == Mod::PULL_ALL ) {
                        BSONObjIterator j( m.elt.embeddedObject() );
                        while( mss->_inPlacePossible && j.moreWithEOO() ) {
                            BSONElement arrJ = j.next();
                            if ( arrJ.eoo() )
                                break;
                            mss->amIInPlacePossible( arrI.woCompare( arrJ, false ) );
                        }
                    }
                }
                break;
            }

            case Mod::POP: {
                uassert( 10143 ,  "Cannot apply $pop modifier to non-array", e.type() == Array || e.eoo() );
                mss->amIInPlacePossible( e.embeddedObject().isEmpty() );
                break;
            }

            case Mod::ADDTOSET: {
                uassert( 12591 ,  "Cannot apply $addToSet modifier to non-array", e.type() == Array || e.eoo() );

                // in place only if every candidate element is already present
                BSONObjIterator i( e.embeddedObject() );
                if ( m.isEach() ) {
                    BSONElementSet toadd;
                    m.parseEach( toadd );
                    while( i.more() ) {
                        BSONElement arrI = i.next();
                        toadd.erase( arrI );
                    }
                    mss->amIInPlacePossible( toadd.size() == 0 );
                }
                else {
                    bool found = false;
                    while( i.more() ) {
                        BSONElement arrI = i.next();
                        if ( arrI.woCompare( m.elt , false ) == 0 ) {
                            found = true;
                            break;
                        }
                    }
                    mss->amIInPlacePossible( found );
                }
                break;
            }

            default:
                // mods we don't know about shouldn't be done in place
                mss->amIInPlacePossible( false );
            }
        }

        DEBUGUPDATE( "\t mss\n" << mss->toString() << "\t--" );

        return mss;
    }

    // Emit this mod's effect in oplog form ($set/$unset with resolved values)
    // into builder 'b'.
    void ModState::appendForOpLog( BSONObjBuilder& b ) const {
        if ( dontApply ) {
            return;
        }

        if ( incType ) {
            // $inc is logged as a $set of the computed value for idempotency
            DEBUGUPDATE( "\t\t\t\t\t appendForOpLog inc fieldname: " << m->fieldName << " short:" << m->shortFieldName );
            BSONObjBuilder bb( b.subobjStart( "$set" ) );
            appendIncValue( bb , true );
            bb.done();
            return;
        }

        if ( m->op == Mod::RENAME_FROM ) {
            DEBUGUPDATE( "\t\t\t\t\t appendForOpLog RENAME_FROM fieldName:" << m->fieldName );
            BSONObjBuilder bb( b.subobjStart( "$unset" ) );
            bb.append(
                m->fieldName, 1 );
            bb.done();
            return;
        }

        if ( m->op == Mod::RENAME_TO ) {
            DEBUGUPDATE( "\t\t\t\t\t appendForOpLog RENAME_TO fieldName:" << m->fieldName );
            BSONObjBuilder bb( b.subobjStart( "$set" ) );
            bb.appendAs( newVal, m->fieldName );
            // NOTE(review): unlike the RENAME_FROM branch above, no bb.done()
            // before returning — presumably the builder's destructor closes the
            // subobject; confirm that is intended.
            return;
        }

        const char * name = fixedOpName ? fixedOpName : Mod::modNames[op()];

        DEBUGUPDATE( "\t\t\t\t\t appendForOpLog name:" << name << " fixed: " << fixed << " fn: " << m->fieldName );

        BSONObjBuilder bb( b.subobjStart( name ) );
        if ( fixed ) {
            bb.appendAs( *fixed , m->fieldName );
        }
        else {
            bb.appendAs( m->elt , m->fieldName );
        }
        bb.done();
    }

    // Debug representation of any op-name/value overrides recorded for oplog use.
    string ModState::toString() const {
        stringstream ss;
        if ( fixedOpName )
            ss << " fixedOpName: " << fixedOpName;
        if ( fixed )
            ss << " fixed: " << fixed;
        return ss.str();
    }

    // Write the renamed value under its new (short) name and re-point newVal at
    // a privately owned copy so it stays valid after the source object goes away.
    template< class Builder >
    void ModState::handleRename( Builder &newObjBuilder, const char *shortFieldName ) {
        newObjBuilder.appendAs( newVal , shortFieldName );
        BSONObjBuilder b;
        b.appendAs( newVal, shortFieldName );
        assert( _objData.isEmpty() );
        _objData = b.obj();
        newVal = _objData.firstElement();
    }

    // Mutate the existing object buffer directly (only legal when prepare()
    // determined every mod can be applied in place).
    void ModSetState::applyModsInPlace( bool isOnDisk ) {
        // TODO i think this assert means that we can get rid of the isOnDisk param
        // and just use isOwned as the determination
        DEV assert( isOnDisk == ! _obj.isOwned() );

        for ( ModStateHolder::iterator i = _mods.begin(); i != _mods.end(); ++i ) {
            ModState& m = i->second;

            if ( m.dontApply ) {
                continue;
            }

            switch ( m.m->op ) {
            case Mod::UNSET:
            case Mod::ADDTOSET:
            case Mod::RENAME_FROM:
            case Mod::RENAME_TO:
                // this should have been handled by prepare
                break;
            case Mod::PULL:
            case Mod::PULL_ALL:
                // this should have been handled by prepare
                break;
            case Mod::POP:
                assert( m.old.eoo() || ( m.old.isABSONObj() && m.old.Obj().isEmpty() ) );
                break;
            // [dm] the BSONElementManipulator statements below are for replication (correct?)
            case Mod::INC:
                if ( isOnDisk )
                    m.m->IncrementMe( m.old );
                else
                    m.m->incrementMe( m.old );
                m.fixedOpName = "$set";
                m.fixed = &(m.old);
                break;
            case Mod::SET:
                if ( isOnDisk )
                    BSONElementManipulator( m.old ).ReplaceTypeAndValue( m.m->elt );
                else
                    BSONElementManipulator( m.old ).replaceTypeAndValue( m.m->elt );
                break;
            default:
                uassert( 13478 ,  "can't apply mod in place - shouldn't have gotten here" , 0 );
            }
        }
    }

    // Flatten 'top' into dotted-path -> leaf-element entries in 'fields'
    // (recursing through embedded objects; empty objects map to themselves).
    void ModSet::extractFields( map< string, BSONElement > &fields, const BSONElement &top, const string &base ) {
        if ( top.type() != Object ) {
            fields[ base + top.fieldName() ] = top;
            return;
        }
        BSONObjIterator i( top.embeddedObject() );
        bool empty = true;
        while( i.moreWithEOO() ) {
            BSONElement e = i.next();
            if ( e.eoo() )
                break;
            extractFields( fields, e, base + top.fieldName() + "." );
            empty = false;
        }
        if ( empty )
            fields[ base + top.fieldName() ] = top;
    }

    // Emit a mod whose target field does not exist in the original object,
    // creating intermediate subobjects as needed (one level per recursion).
    template< class Builder >
    void ModSetState::_appendNewFromMods( const string& root , ModState& m , Builder& b , set<string>& onedownseen ) {
        const char * temp = m.fieldName();
        temp += root.size();
        const char * dot = strchr( temp , '.'
 );
        if ( dot ) {
            // mod lives deeper: open (at most once per name) the next-level
            // subobject and recurse with the extended root prefix
            string nr( m.fieldName() , 0 , 1 + ( dot - m.fieldName() ) );
            string nf( temp , 0 , dot - temp );
            if ( onedownseen.count( nf ) )
                return;
            onedownseen.insert( nf );
            BSONObjBuilder bb ( b.subobjStart( nf ) );
            createNewFromMods( nr , bb , BSONObj() ); // don't infer an array from name
            bb.done();
        }
        else {
            appendNewFromMod( m , b );
        }

    }

    // Merge the original object 'obj' (restricted to prefix 'root') with all
    // mods under that prefix, writing the merged result into builder 'b'.
    // Walks the sorted elements and the sorted mods in lockstep.
    template< class Builder >
    void ModSetState::createNewFromMods( const string& root , Builder& b , const BSONObj &obj ) {
        DEBUGUPDATE( "\t\t createNewFromMods root: " << root );
        BSONObjIteratorSorted es( obj );
        BSONElement e = es.next();

        // [m, mend) = the range of mods whose field names start with 'root'
        // (0xff sorts after any legal field-name character)
        ModStateHolder::iterator m = _mods.lower_bound( root );
        StringBuilder buf(root.size() + 2 );
        buf << root << (char)255;
        ModStateHolder::iterator mend = _mods.lower_bound( buf.str() );

        set<string> onedownseen;

        while ( e.type() && m != mend ) {
            string field = root + e.fieldName();
            FieldCompareResult cmp = compareDottedFieldNames( m->second.m->fieldName , field );

            DEBUGUPDATE( "\t\t\t field:" << field << "\t mod:" << m->second.m->fieldName << "\t cmp:" << cmp << "\t short: " << e.fieldName() );

            switch ( cmp ) {

            case LEFT_SUBFIELD: { // Mod is embedded under this element
                uassert( 10145 ,  str::stream() << "LEFT_SUBFIELD only supports Object: " << field << " not: " << e.type() , e.type() == Object || e.type() == Array );
                if ( onedownseen.count( e.fieldName() ) == 0 ) {
                    onedownseen.insert( e.fieldName() );
                    if ( e.type() == Object ) {
                        BSONObjBuilder bb( b.subobjStart( e.fieldName() ) );
                        stringstream nr; nr << root << e.fieldName() << ".";
                        createNewFromMods( nr.str() , bb , e.embeddedObject() );
                        bb.done();
                    }
                    else {
                        BSONArrayBuilder ba( b.subarrayStart( e.fieldName() ) );
                        stringstream nr; nr << root << e.fieldName() << ".";
                        createNewFromMods( nr.str() , ba , e.embeddedObject() );
                        ba.done();
                    }
                    // inc both as we handled both
                    e = es.next();
                    m++;
                }
                else {
                    // this is a very weird case
                    // have seen it in production, but can't reproduce
                    // this assert prevents an inf. loop
                    // but likely isn't the correct solution
                    assert(0);
                }
                continue;
            }
            case LEFT_BEFORE: // Mod on a field that doesn't exist
                DEBUGUPDATE( "\t\t\t\t creating new field for: " << m->second.m->fieldName );
                _appendNewFromMods( root , m->second , b , onedownseen );
                m++;
                continue;
            case SAME:
                DEBUGUPDATE( "\t\t\t\t applying mod on: " << m->second.m->fieldName );
                m->second.apply( b , e );
                e = es.next();
                m++;
                continue;
            case RIGHT_BEFORE: // field that doesn't have a MOD
                DEBUGUPDATE( "\t\t\t\t just copying" );
                b.append( e ); // if array, ignore field name
                e = es.next();
                continue;
            case RIGHT_SUBFIELD:
                massert( 10399 ,  "ModSet::createNewFromMods - RIGHT_SUBFIELD should be impossible" , 0 );
                break;
            default:
                massert( 10400 ,  "unhandled case" , 0 );
            }
        }

        // finished looping the mods, just adding the rest of the elements
        while ( e.type() ) {
            DEBUGUPDATE( "\t\t\t copying: " << e.fieldName() );
            b.append( e ); // if array, ignore field name
            e = es.next();
        }

        // do mods that don't have fields already
        for ( ; m != mend; m++ ) {
            DEBUGUPDATE( "\t\t\t\t appending from mod at end: " << m->second.m->fieldName );
            _appendNewFromMods( root , m->second , b , onedownseen );
        }
    }

    // Build and cache the fully modified object (used when in-place application
    // was ruled out by prepare()).
    BSONObj ModSetState::createNewFromMods() {
        BSONObjBuilder b( (int)(_obj.objsize() * 1.1) );   // ~10% headroom for growth
        createNewFromMods( "" , b , _obj );
        return _newFromMods = b.obj();
    }

    string ModSetState::toString() const {
        stringstream ss;
        for ( ModStateHolder::const_iterator i=_mods.begin(); i!=_mods.end(); ++i ) {
            ss << "\t\t" << i->first << "\t" << i->second.toString() << "\n";
        }
        return ss.str();
    }

    // Orders mod field names numerically where components are numbers
    // (so "a.2" < "a.10"), used as the ModStateHolder key comparator.
    bool ModSetState::FieldCmp::operator()( const string &l, const string &r ) const {
        return lexNumCmp( l.c_str(), r.c_str() ) < 0;
    }

    // For upsert: synthesize the initial object from the (equality parts of
    // the) query, then apply this mod set to it.
    BSONObj ModSet::createNewFromQuery( const BSONObj& query ) {
        BSONObj newObj;

        {
            BSONObjBuilder bb;
            EmbeddedBuilder eb( &bb );
            BSONObjIteratorSorted i( query );
            while ( i.more() ) {
                BSONElement e = i.next();
                if ( e.fieldName()[0] == '$' ) // for $atomic and anything else we add
                    continue;

                if ( e.type() == Object && e.embeddedObject().firstElementFieldName()[0] == '$' ) {
                    // this means this is a $gt type filter, so don't make part of the new object
                    continue;
                }

                eb.appendAs( e , e.fieldName() );
            }
            eb.done();
            newObj = bb.obj();
        }

        auto_ptr<ModSetState> mss = prepare( newObj );

        if ( mss->canApplyInPlace() )
            mss->applyModsInPlace( false );
        else
            newObj = mss->createNewFromMods();

        return newObj;
    }

    /* get special operations like $inc
       { $inc: { a:1, b:1 } }
       { $set: { a:77 } }
       { $push: { a:55 } }
       { $pushAll: { a:[77,88] } }
       { $pull: { a:66 } }
       { $pullAll : { a:[99,1010] } }
       NOTE: MODIFIES source from object!
    */
    // Parses an update spec into individual Mod entries, validating field
    // names, detecting conflicts, and recording whether any mod touches an
    // indexed field (idxKeys/backgroundKeys) or a dynamic-array ".$" path.
    ModSet::ModSet(
        const BSONObj &from ,
        const set<string>& idxKeys,
        const set<string> *backgroundKeys)
        : _isIndexed(0) , _hasDynamicArray( false ) {

        BSONObjIterator it(from);

        while ( it.more() ) {
            BSONElement e = it.next();
            const char *fn = e.fieldName();

            uassert( 10147 ,  "Invalid modifier specified: " + string( fn ), e.type() == Object );
            BSONObj j = e.embeddedObject();
            DEBUGUPDATE( "\t" << j );

            BSONObjIterator jt(j);
            Mod::Op op = opFromStr( fn );

            while ( jt.more() ) {
                BSONElement f = jt.next(); // x:44

                const char * fieldName = f.fieldName();

                uassert( 10148 ,  "Mod on _id not allowed", strcmp( fieldName, "_id" ) != 0 );
                uassert( 10149 ,  "Invalid mod field name, may not end in a period", fieldName[ strlen( fieldName ) - 1 ] != '.' );
                uassert( 10150 ,  "Field name duplication not allowed with modifiers", ! haveModForField( fieldName ) );
                uassert( 10151 ,  "have conflicting mods in update" , ! haveConflictingMod( fieldName ) );
                uassert( 10152 ,  "Modifier $inc allowed for numbers only", f.isNumber() || op != Mod::INC );
                uassert( 10153 ,  "Modifier $pushAll/pullAll allowed for arrays only", f.type() == Array || ( op != Mod::PUSH_ALL && op != Mod::PULL_ALL ) );

                if ( op == Mod::RENAME_TO ) {
                    // $rename expands into a RENAME_FROM mod on the source and a
                    // RENAME_TO mod on the target, after extensive validation.
                    uassert( 13494, "$rename target must be a string", f.type() == String );
                    const char *target = f.valuestr();
                    uassert( 13495, "$rename source must differ from target", strcmp( fieldName, target ) != 0 );
                    uassert( 13496, "invalid mod field name, source may not be empty", fieldName[0] );
                    uassert( 13479, "invalid mod field name, target may not be empty", target[0] );
                    uassert( 13480, "invalid mod field name, source may not begin or end in period", fieldName[0] != '.' && fieldName[ strlen( fieldName ) - 1 ] != '.' );
                    uassert( 13481, "invalid mod field name, target may not begin or end in period", target[0] != '.' && target[ strlen( target ) - 1 ] != '.' );
                    uassert( 13482, "$rename affecting _id not allowed", !( fieldName[0] == '_' && fieldName[1] == 'i' && fieldName[2] == 'd' && ( !fieldName[3] || fieldName[3] == '.' ) ) );
                    uassert( 13483, "$rename affecting _id not allowed", !( target[0] == '_' && target[1] == 'i' && target[2] == 'd' && ( !target[3] || target[3] == '.' ) ) );
                    uassert( 13484, "field name duplication not allowed with $rename target", !haveModForField( target ) );
                    uassert( 13485, "conflicting mods not allowed with $rename target", !haveConflictingMod( target ) );
                    uassert( 13486, "$rename target may not be a parent of source", !( strncmp( fieldName, target, strlen( target ) ) == 0 && fieldName[ strlen( target ) ] == '.' ) );
                    uassert( 13487, "$rename source may not be dynamic array", strstr( fieldName , ".$" ) == 0 );
                    uassert( 13488, "$rename target may not be dynamic array", strstr( target , ".$" ) == 0 );

                    Mod from;
                    from.init( Mod::RENAME_FROM, f );
                    from.setFieldName( fieldName );
                    updateIsIndexed( from, idxKeys, backgroundKeys );
                    _mods[ from.fieldName ] = from;

                    Mod to;
                    to.init( Mod::RENAME_TO, f );
                    to.setFieldName( target );
                    updateIsIndexed( to, idxKeys, backgroundKeys );
                    _mods[ to.fieldName ] = to;

                    DEBUGUPDATE( "\t\t " << fieldName << "\t" << from.fieldName << "\t" << to.fieldName );
                    continue;
                }

                _hasDynamicArray = _hasDynamicArray || strstr( fieldName , ".$" ) > 0;

                Mod m;
                m.init( op , f );
                m.setFieldName( f.fieldName() );
                updateIsIndexed( m, idxKeys, backgroundKeys );
                _mods[m.fieldName] = m;

                DEBUGUPDATE( "\t\t " << fieldName << "\t" << m.fieldName << "\t" << _hasDynamicArray );
            }
        }

    }

    // Clone this ModSet, substituting the matched array position
    // 'elemMatchKey' for each ".$" placeholder in mod field names.
    ModSet * ModSet::fixDynamicArray( const char * elemMatchKey ) const {
        ModSet * n = new ModSet();
        n->_isIndexed = _isIndexed;
        n->_hasDynamicArray = _hasDynamicArray;
        for ( ModHolder::const_iterator i=_mods.begin(); i!=_mods.end(); i++ ) {
            string s = i->first;
            size_t idx = s.find( ".$" );
            if ( idx == string::npos ) {
                n->_mods[s] = i->second;
                continue;
            }
            StringBuilder buf(s.size()+strlen(elemMatchKey));
            buf << s.substr(0,idx+1) << elemMatchKey << s.substr(idx+2);
            string fixed = buf.str();
            DEBUGUPDATE( "fixed dynamic: " << s << " -->> " << fixed );
            n->_mods[fixed] = i->second;
            // re-point the Mod's fieldName at the map key so it stays valid
            ModHolder::iterator temp = n->_mods.find( fixed );
            temp->second.setFieldName( temp->first.c_str() );
        }
        return n;
    }

    // Reject replacement-style update documents that contain $-operators.
    void checkNoMods( BSONObj o ) {
        BSONObjIterator i( o );
        while( i.moreWithEOO() ) {
            BSONElement e = i.next();
            if ( e.eoo() )
                break;
            uassert( 10154 ,  "Modifiers and non-modifiers cannot be mixed", e.fieldName()[ 0 ] != '$' );
        }
    }

    class UpdateOp : public MultiCursor::CursorOp {
    public:
UpdateOp( bool hasPositionalField, int orClauseIndex = -1 ) : + _nscanned(), + _hasPositionalField( hasPositionalField ), + _orClauseIndex( orClauseIndex ) {} + virtual void _init() { + _c = qp().newCursor(); + if ( ! _c->ok() ) { + setComplete(); + } + } + virtual bool prepareToYield() { + if ( _orClauseIndex > 0 ) { + return false; + } + if ( ! _cc ) { + _cc.reset( new ClientCursor( QueryOption_NoCursorTimeout , _c , qp().ns() ) ); + } + return _cc->prepareToYield( _yieldData ); + } + virtual void recoverFromYield() { + if ( !ClientCursor::recoverFromYield( _yieldData ) ) { + _c.reset(); + _cc.reset(); + massert( 13339, "cursor dropped during update", false ); + } + } + virtual long long nscanned() { + return _c.get() ? _c->nscanned() : _nscanned; + } + virtual void next() { + if ( ! _c->ok() ) { + setComplete(); + return; + } + _nscanned = _c->nscanned(); + if ( _orClauseIndex > 0 && _nscanned >= 100 ) { + setComplete(); + return; + } + if ( matcher( _c )->matchesCurrent(_c.get(), &_details ) ) { + setComplete(); + return; + } + _c->advance(); + } + + virtual bool mayRecordPlan() const { return false; } + virtual QueryOp *_createChild() const { + return new UpdateOp( _hasPositionalField, _orClauseIndex + 1 ); + } + // already scanned to the first match, so return _c + virtual shared_ptr< Cursor > newCursor() const { return _c; } + virtual bool alwaysUseRecord() const { return _hasPositionalField; } + private: + shared_ptr< Cursor > _c; + long long _nscanned; + bool _hasPositionalField; + MatchDetails _details; + ClientCursor::CleanupPointer _cc; + ClientCursor::YieldData _yieldData; + // Avoid yielding in the MultiPlanScanner when not the first $or clause - just a temporary implementaiton for now. 
SERVER-3555 + int _orClauseIndex; + }; + + static void checkTooLarge(const BSONObj& newObj) { + uassert( 12522 , "$ operator made object too large" , newObj.objsize() <= BSONObjMaxUserSize ); + } + + /* note: this is only (as-is) called for + + - not multi + - not mods is indexed + - not upsert + */ + static UpdateResult _updateById(bool isOperatorUpdate, int idIdxNo, ModSet *mods, int profile, NamespaceDetails *d, + NamespaceDetailsTransient *nsdt, + bool god, const char *ns, + const BSONObj& updateobj, BSONObj patternOrig, bool logop, OpDebug& debug) { + + DiskLoc loc; + { + IndexDetails& i = d->idx(idIdxNo); + BSONObj key = i.getKeyFromQuery( patternOrig ); + loc = i.idxInterface().findSingle(i, i.head, key); + if( loc.isNull() ) { + // no upsert support in _updateById yet, so we are done. + return UpdateResult(0, 0, 0); + } + } + Record *r = loc.rec(); + + if ( ! r->likelyInPhysicalMemory() ) { + { + auto_ptr<RWLockRecursive::Shared> lk( new RWLockRecursive::Shared( MongoFile::mmmutex) ); + dbtempreleasewritelock t; + r->touch(); + lk.reset(0); // we have to release mmmutex before we can re-acquire dbmutex + } + + { + // we need to re-find in case something changed + d = nsdetails( ns ); + if ( ! d ) { + // dropped + return UpdateResult(0, 0, 0); + } + nsdt = &NamespaceDetailsTransient::get_w(ns); + IndexDetails& i = d->idx(idIdxNo); + BSONObj key = i.getKeyFromQuery( patternOrig ); + loc = i.idxInterface().findSingle(i, i.head, key); + if( loc.isNull() ) { + // no upsert support in _updateById yet, so we are done. + return UpdateResult(0, 0, 0); + } + + r = loc.rec(); + } + } + + /* look for $inc etc. note as listed here, all fields to inc must be this type, you can't set some + regular ones at the moment. 
*/ + if ( isOperatorUpdate ) { + const BSONObj& onDisk = loc.obj(); + auto_ptr<ModSetState> mss = mods->prepare( onDisk ); + + if( mss->canApplyInPlace() ) { + mss->applyModsInPlace(true); + DEBUGUPDATE( "\t\t\t updateById doing in place update" ); + } + else { + BSONObj newObj = mss->createNewFromMods(); + checkTooLarge(newObj); + assert(nsdt); + theDataFileMgr.updateRecord(ns, d, nsdt, r, loc , newObj.objdata(), newObj.objsize(), debug); + } + + if ( logop ) { + DEV assert( mods->size() ); + + BSONObj pattern = patternOrig; + if ( mss->haveArrayDepMod() ) { + BSONObjBuilder patternBuilder; + patternBuilder.appendElements( pattern ); + mss->appendSizeSpecForArrayDepMods( patternBuilder ); + pattern = patternBuilder.obj(); + } + + if( mss->needOpLogRewrite() ) { + DEBUGUPDATE( "\t rewrite update: " << mss->getOpLogRewrite() ); + logOp("u", ns, mss->getOpLogRewrite() , &pattern ); + } + else { + logOp("u", ns, updateobj, &pattern ); + } + } + return UpdateResult( 1 , 1 , 1); + } // end $operator update + + // regular update + BSONElementManipulator::lookForTimestamps( updateobj ); + checkNoMods( updateobj ); + assert(nsdt); + theDataFileMgr.updateRecord(ns, d, nsdt, r, loc , updateobj.objdata(), updateobj.objsize(), debug ); + if ( logop ) { + logOp("u", ns, updateobj, &patternOrig ); + } + return UpdateResult( 1 , 0 , 1 ); + } + + UpdateResult _updateObjects(bool god, const char *ns, const BSONObj& updateobj, BSONObj patternOrig, bool upsert, bool multi, bool logop , OpDebug& debug, RemoveSaver* rs ) { + DEBUGUPDATE( "update: " << ns << " update: " << updateobj << " query: " << patternOrig << " upsert: " << upsert << " multi: " << multi ); + Client& client = cc(); + int profile = client.database()->profile; + + debug.updateobj = updateobj; + + // idea with these here it to make them loop invariant for multi updates, and thus be a bit faster for that case + // The pointers may be left invalid on a failed or terminal yield recovery. 
+ NamespaceDetails *d = nsdetails(ns); // can be null if an upsert... + NamespaceDetailsTransient *nsdt = &NamespaceDetailsTransient::get_w(ns); + + auto_ptr<ModSet> mods; + bool isOperatorUpdate = updateobj.firstElementFieldName()[0] == '$'; + int modsIsIndexed = false; // really the # of indexes + if ( isOperatorUpdate ) { + if( d && d->indexBuildInProgress ) { + set<string> bgKeys; + d->inProgIdx().keyPattern().getFieldNames(bgKeys); + mods.reset( new ModSet(updateobj, nsdt->indexKeys(), &bgKeys) ); + } + else { + mods.reset( new ModSet(updateobj, nsdt->indexKeys()) ); + } + modsIsIndexed = mods->isIndexed(); + } + + if( !multi && isSimpleIdQuery(patternOrig) && d && !modsIsIndexed ) { + int idxNo = d->findIdIndex(); + if( idxNo >= 0 ) { + debug.idhack = true; + UpdateResult result = _updateById(isOperatorUpdate, idxNo, mods.get(), profile, d, nsdt, god, ns, updateobj, patternOrig, logop, debug); + if ( result.existing || ! upsert ) { + return result; + } + else if ( upsert && ! isOperatorUpdate && ! logop) { + // this handles repl inserts + checkNoMods( updateobj ); + debug.upsert = true; + BSONObj no = updateobj; + theDataFileMgr.insertWithObjMod(ns, no, god); + return UpdateResult( 0 , 0 , 1 , no ); + } + } + } + + int numModded = 0; + long long nscanned = 0; + shared_ptr< MultiCursor::CursorOp > opPtr( new UpdateOp( mods.get() && mods->hasDynamicArray() ) ); + shared_ptr< MultiCursor > c( new MultiCursor( ns, patternOrig, BSONObj(), opPtr, true ) ); + + d = nsdetails(ns); + nsdt = &NamespaceDetailsTransient::get_w(ns); + + if( c->ok() ) { + set<DiskLoc> seenObjects; + MatchDetails details; + auto_ptr<ClientCursor> cc; + do { + nscanned++; + + bool atomic = c->matcher()->docMatcher().atomic(); + + if ( !atomic ) { + // ***************** + if ( cc.get() == 0 ) { + shared_ptr< Cursor > cPtr = c; + cc.reset( new ClientCursor( QueryOption_NoCursorTimeout , cPtr , ns ) ); + } + + bool didYield; + if ( ! 
cc->yieldSometimes( ClientCursor::WillNeed, &didYield ) ) { + cc.release(); + break; + } + if ( !c->ok() ) { + break; + } + + if ( didYield ) { + d = nsdetails(ns); + nsdt = &NamespaceDetailsTransient::get_w(ns); + } + // ***************** + } + + // May have already matched in UpdateOp, but do again to get details set correctly + if ( ! c->matcher()->matchesCurrent( c.get(), &details ) ) { + c->advance(); + + if ( nscanned % 256 == 0 && ! atomic ) { + if ( cc.get() == 0 ) { + shared_ptr< Cursor > cPtr = c; + cc.reset( new ClientCursor( QueryOption_NoCursorTimeout , cPtr , ns ) ); + } + if ( ! cc->yield() ) { + cc.release(); + // TODO should we assert or something? + break; + } + if ( !c->ok() ) { + break; + } + d = nsdetails(ns); + nsdt = &NamespaceDetailsTransient::get_w(ns); + } + continue; + } + + Record *r = c->_current(); + DiskLoc loc = c->currLoc(); + + // TODO Maybe this is unnecessary since we have seenObjects + if ( c->getsetdup( loc ) ) { + c->advance(); + continue; + } + + BSONObj js(r); + + BSONObj pattern = patternOrig; + + if ( logop ) { + BSONObjBuilder idPattern; + BSONElement id; + // NOTE: If the matching object lacks an id, we'll log + // with the original pattern. This isn't replay-safe. + // It might make sense to suppress the log instead + // if there's no id. + if ( js.getObjectID( id ) ) { + idPattern.append( id ); + pattern = idPattern.obj(); + } + else { + uassert( 10157 , "multi-update requires all modified objects to have an _id" , ! multi ); + } + } + + if ( profile && !multi ) + debug.nscanned = (int) nscanned; + + /* look for $inc etc. note as listed here, all fields to inc must be this type, you can't set some + regular ones at the moment. 
*/ + if ( isOperatorUpdate ) { + + if ( multi ) { + c->advance(); // go to next record in case this one moves + if ( seenObjects.count( loc ) ) + continue; + } + + const BSONObj& onDisk = loc.obj(); + + ModSet * useMods = mods.get(); + bool forceRewrite = false; + + auto_ptr<ModSet> mymodset; + if ( details._elemMatchKey && mods->hasDynamicArray() ) { + useMods = mods->fixDynamicArray( details._elemMatchKey ); + mymodset.reset( useMods ); + forceRewrite = true; + } + + auto_ptr<ModSetState> mss = useMods->prepare( onDisk ); + + bool indexHack = multi && ( modsIsIndexed || ! mss->canApplyInPlace() ); + + if ( indexHack ) { + if ( cc.get() ) + cc->updateLocation(); + else + c->noteLocation(); + } + + if ( modsIsIndexed <= 0 && mss->canApplyInPlace() ) { + mss->applyModsInPlace( true );// const_cast<BSONObj&>(onDisk) ); + + DEBUGUPDATE( "\t\t\t doing in place update" ); + if ( profile && !multi ) + debug.fastmod = true; + + if ( modsIsIndexed ) { + seenObjects.insert( loc ); + } + } + else { + if ( rs ) + rs->goingToDelete( onDisk ); + + BSONObj newObj = mss->createNewFromMods(); + checkTooLarge(newObj); + DiskLoc newLoc = theDataFileMgr.updateRecord(ns, d, nsdt, r, loc , newObj.objdata(), newObj.objsize(), debug); + if ( newLoc != loc || modsIsIndexed ) { + // object moved, need to make sure we don' get again + seenObjects.insert( newLoc ); + } + + } + + if ( logop ) { + DEV assert( mods->size() ); + + if ( mss->haveArrayDepMod() ) { + BSONObjBuilder patternBuilder; + patternBuilder.appendElements( pattern ); + mss->appendSizeSpecForArrayDepMods( patternBuilder ); + pattern = patternBuilder.obj(); + } + + if ( forceRewrite || mss->needOpLogRewrite() ) { + DEBUGUPDATE( "\t rewrite update: " << mss->getOpLogRewrite() ); + logOp("u", ns, mss->getOpLogRewrite() , &pattern ); + } + else { + logOp("u", ns, updateobj, &pattern ); + } + } + numModded++; + if ( ! 
multi ) + return UpdateResult( 1 , 1 , numModded ); + if ( indexHack ) + c->checkLocation(); + + if ( nscanned % 64 == 0 && ! atomic ) { + if ( cc.get() == 0 ) { + shared_ptr< Cursor > cPtr = c; + cc.reset( new ClientCursor( QueryOption_NoCursorTimeout , cPtr , ns ) ); + } + if ( ! cc->yield() ) { + cc.release(); + break; + } + if ( !c->ok() ) { + break; + } + d = nsdetails(ns); + nsdt = &NamespaceDetailsTransient::get_w(ns); + } + + getDur().commitIfNeeded(); + + continue; + } + + uassert( 10158 , "multi update only works with $ operators" , ! multi ); + + BSONElementManipulator::lookForTimestamps( updateobj ); + checkNoMods( updateobj ); + theDataFileMgr.updateRecord(ns, d, nsdt, r, loc , updateobj.objdata(), updateobj.objsize(), debug, god); + if ( logop ) { + DEV wassert( !god ); // god doesn't get logged, this would be bad. + logOp("u", ns, updateobj, &pattern ); + } + return UpdateResult( 1 , 0 , 1 ); + } while ( c->ok() ); + } // endif + + if ( numModded ) + return UpdateResult( 1 , 1 , numModded ); + + if ( profile ) + debug.nscanned = (int) nscanned; + + if ( upsert ) { + if ( updateobj.firstElementFieldName()[0] == '$' ) { + /* upsert of an $inc. build a default */ + BSONObj newObj = mods->createNewFromQuery( patternOrig ); + checkNoMods( newObj ); + debug.fastmodinsert = true; + theDataFileMgr.insertWithObjMod(ns, newObj, god); + if ( logop ) + logOp( "i", ns, newObj ); + + return UpdateResult( 0 , 1 , 1 , newObj ); + } + uassert( 10159 , "multi update only works with $ operators" , ! 
multi ); + checkNoMods( updateobj ); + debug.upsert = true; + BSONObj no = updateobj; + theDataFileMgr.insertWithObjMod(ns, no, god); + if ( logop ) + logOp( "i", ns, no ); + return UpdateResult( 0 , 0 , 1 , no ); + } + return UpdateResult( 0 , 0 , 0 ); + } + + UpdateResult updateObjects(const char *ns, const BSONObj& updateobj, BSONObj patternOrig, bool upsert, bool multi, bool logop , OpDebug& debug ) { + uassert( 10155 , "cannot update reserved $ collection", strchr(ns, '$') == 0 ); + if ( strstr(ns, ".system.") ) { + /* dm: it's very important that system.indexes is never updated as IndexDetails has pointers into it */ + uassert( 10156 , str::stream() << "cannot update system collection: " << ns << " q: " << patternOrig << " u: " << updateobj , legalClientSystemNS( ns , true ) ); + } + return _updateObjects(false, ns, updateobj, patternOrig, upsert, multi, logop, debug); + } + +} diff --git a/db/ops/update.h b/db/ops/update.h new file mode 100644 index 0000000..de5805a --- /dev/null +++ b/db/ops/update.h @@ -0,0 +1,681 @@ +// update.h + +/** + * Copyright (C) 2008 10gen Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License, version 3, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ */ + +#include "../../pch.h" +#include "../jsobj.h" +#include "../../util/embedded_builder.h" +#include "../matcher.h" + +namespace mongo { + + // ---------- public ------------- + + struct UpdateResult { + bool existing; // if existing objects were modified + bool mod; // was this a $ mod + long long num; // how many objects touched + OID upserted; // if something was upserted, the new _id of the object + + UpdateResult( bool e, bool m, unsigned long long n , const BSONObj& upsertedObject = BSONObj() ) + : existing(e) , mod(m), num(n) { + upserted.clear(); + + BSONElement id = upsertedObject["_id"]; + if ( ! e && n == 1 && id.type() == jstOID ) { + upserted = id.OID(); + } + } + + }; + + + class RemoveSaver; + + /* returns true if an existing object was updated, false if no existing object was found. + multi - update multiple objects - mostly useful with things like $set + god - allow access to system namespaces + */ + UpdateResult updateObjects(const char *ns, const BSONObj& updateobj, BSONObj pattern, bool upsert, bool multi , bool logop , OpDebug& debug ); + UpdateResult _updateObjects(bool god, const char *ns, const BSONObj& updateobj, BSONObj pattern, + bool upsert, bool multi , bool logop , OpDebug& debug , RemoveSaver * rs = 0 ); + + + + // ---------- private ------------- + + class ModState; + class ModSetState; + + /* Used for modifiers such as $inc, $set, $push, ... 
+ * stores the info about a single operation + * once created should never be modified + */ + struct Mod { + // See opFromStr below + // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 + enum Op { INC, SET, PUSH, PUSH_ALL, PULL, PULL_ALL , POP, UNSET, BITAND, BITOR , BIT , ADDTOSET, RENAME_FROM, RENAME_TO } op; + + static const char* modNames[]; + static unsigned modNamesNum; + + const char *fieldName; + const char *shortFieldName; + + BSONElement elt; // x:5 note: this is the actual element from the updateobj + boost::shared_ptr<Matcher> matcher; + bool matcherOnPrimitive; + + void init( Op o , BSONElement& e ) { + op = o; + elt = e; + if ( op == PULL && e.type() == Object ) { + BSONObj t = e.embeddedObject(); + if ( t.firstElement().getGtLtOp() == 0 ) { + matcher.reset( new Matcher( t ) ); + matcherOnPrimitive = false; + } + else { + matcher.reset( new Matcher( BSON( "" << t ) ) ); + matcherOnPrimitive = true; + } + } + } + + void setFieldName( const char * s ) { + fieldName = s; + shortFieldName = strrchr( fieldName , '.' 
); + if ( shortFieldName ) + shortFieldName++; + else + shortFieldName = fieldName; + } + + /** + * @param in incrememnts the actual value inside in + */ + void incrementMe( BSONElement& in ) const { + BSONElementManipulator manip( in ); + switch ( in.type() ) { + case NumberDouble: + manip.setNumber( elt.numberDouble() + in.numberDouble() ); + break; + case NumberLong: + manip.setLong( elt.numberLong() + in.numberLong() ); + break; + case NumberInt: + manip.setInt( elt.numberInt() + in.numberInt() ); + break; + default: + assert(0); + } + } + void IncrementMe( BSONElement& in ) const { + BSONElementManipulator manip( in ); + switch ( in.type() ) { + case NumberDouble: + manip.SetNumber( elt.numberDouble() + in.numberDouble() ); + break; + case NumberLong: + manip.SetLong( elt.numberLong() + in.numberLong() ); + break; + case NumberInt: + manip.SetInt( elt.numberInt() + in.numberInt() ); + break; + default: + assert(0); + } + } + + template< class Builder > + void appendIncremented( Builder& bb , const BSONElement& in, ModState& ms ) const; + + bool operator<( const Mod &other ) const { + return strcmp( fieldName, other.fieldName ) < 0; + } + + bool arrayDep() const { + switch (op) { + case PUSH: + case PUSH_ALL: + case POP: + return true; + default: + return false; + } + } + + static bool isIndexed( const string& fullName , const set<string>& idxKeys ) { + const char * fieldName = fullName.c_str(); + // check if there is an index key that is a parent of mod + for( const char *dot = strchr( fieldName, '.' ); dot; dot = strchr( dot + 1, '.' ) ) + if ( idxKeys.count( string( fieldName, dot - fieldName ) ) ) + return true; + + // check if there is an index key equal to mod + if ( idxKeys.count(fullName) ) + return true; + // check if there is an index key that is a child of mod + set< string >::const_iterator j = idxKeys.upper_bound( fullName ); + if ( j != idxKeys.end() && j->find( fullName ) == 0 && (*j)[fullName.size()] == '.' 
) + return true; + + return false; + } + + bool isIndexed( const set<string>& idxKeys ) const { + string fullName = fieldName; + + if ( isIndexed( fullName , idxKeys ) ) + return true; + + if ( strstr( fieldName , "." ) ) { + // check for a.0.1 + StringBuilder buf( fullName.size() + 1 ); + for ( size_t i=0; i<fullName.size(); i++ ) { + char c = fullName[i]; + + if ( c == '$' && + i > 0 && fullName[i-1] == '.' && + i+1<fullName.size() && + fullName[i+1] == '.' ) { + i++; + continue; + } + + buf << c; + + if ( c != '.' ) + continue; + + if ( ! isdigit( fullName[i+1] ) ) + continue; + + bool possible = true; + size_t j=i+2; + for ( ; j<fullName.size(); j++ ) { + char d = fullName[j]; + if ( d == '.' ) + break; + if ( isdigit( d ) ) + continue; + possible = false; + break; + } + + if ( possible ) + i = j; + } + string x = buf.str(); + if ( isIndexed( x , idxKeys ) ) + return true; + } + + return false; + } + + template< class Builder > + void apply( Builder& b , BSONElement in , ModState& ms ) const; + + /** + * @return true iff toMatch should be removed from the array + */ + bool _pullElementMatch( BSONElement& toMatch ) const; + + void _checkForAppending( const BSONElement& e ) const { + if ( e.type() == Object ) { + // this is a tiny bit slow, but rare and important + // only when setting something TO an object, not setting something in an object + // and it checks for { $set : { x : { 'a.b' : 1 } } } + // which is feel has been common + uassert( 12527 , "not okForStorage" , e.embeddedObject().okForStorage() ); + } + } + + bool isEach() const { + if ( elt.type() != Object ) + return false; + BSONElement e = elt.embeddedObject().firstElement(); + if ( e.type() != Array ) + return false; + return strcmp( e.fieldName() , "$each" ) == 0; + } + + BSONObj getEach() const { + return elt.embeddedObjectUserCheck().firstElement().embeddedObjectUserCheck(); + } + + void parseEach( BSONElementSet& s ) const { + BSONObjIterator i(getEach()); + while ( i.more() ) { + s.insert( 
i.next() ); + } + } + + const char *renameFrom() const { + massert( 13492, "mod must be RENAME_TO type", op == Mod::RENAME_TO ); + return elt.fieldName(); + } + }; + + /** + * stores a set of Mods + * once created, should never be changed + */ + class ModSet : boost::noncopyable { + typedef map<string,Mod> ModHolder; + ModHolder _mods; + int _isIndexed; + bool _hasDynamicArray; + + static void extractFields( map< string, BSONElement > &fields, const BSONElement &top, const string &base ); + + FieldCompareResult compare( const ModHolder::iterator &m, map< string, BSONElement >::iterator &p, const map< string, BSONElement >::iterator &pEnd ) const { + bool mDone = ( m == _mods.end() ); + bool pDone = ( p == pEnd ); + assert( ! mDone ); + assert( ! pDone ); + if ( mDone && pDone ) + return SAME; + // If one iterator is done we want to read from the other one, so say the other one is lower. + if ( mDone ) + return RIGHT_BEFORE; + if ( pDone ) + return LEFT_BEFORE; + + return compareDottedFieldNames( m->first, p->first.c_str() ); + } + + bool mayAddEmbedded( map< string, BSONElement > &existing, string right ) { + for( string left = EmbeddedBuilder::splitDot( right ); + left.length() > 0 && left[ left.length() - 1 ] != '.'; + left += "." 
+ EmbeddedBuilder::splitDot( right ) ) { + if ( existing.count( left ) > 0 && existing[ left ].type() != Object ) + return false; + if ( haveModForField( left.c_str() ) ) + return false; + } + return true; + } + static Mod::Op opFromStr( const char *fn ) { + assert( fn[0] == '$' ); + switch( fn[1] ) { + case 'i': { + if ( fn[2] == 'n' && fn[3] == 'c' && fn[4] == 0 ) + return Mod::INC; + break; + } + case 's': { + if ( fn[2] == 'e' && fn[3] == 't' && fn[4] == 0 ) + return Mod::SET; + break; + } + case 'p': { + if ( fn[2] == 'u' ) { + if ( fn[3] == 's' && fn[4] == 'h' ) { + if ( fn[5] == 0 ) + return Mod::PUSH; + if ( fn[5] == 'A' && fn[6] == 'l' && fn[7] == 'l' && fn[8] == 0 ) + return Mod::PUSH_ALL; + } + else if ( fn[3] == 'l' && fn[4] == 'l' ) { + if ( fn[5] == 0 ) + return Mod::PULL; + if ( fn[5] == 'A' && fn[6] == 'l' && fn[7] == 'l' && fn[8] == 0 ) + return Mod::PULL_ALL; + } + } + else if ( fn[2] == 'o' && fn[3] == 'p' && fn[4] == 0 ) + return Mod::POP; + break; + } + case 'u': { + if ( fn[2] == 'n' && fn[3] == 's' && fn[4] == 'e' && fn[5] == 't' && fn[6] == 0 ) + return Mod::UNSET; + break; + } + case 'b': { + if ( fn[2] == 'i' && fn[3] == 't' ) { + if ( fn[4] == 0 ) + return Mod::BIT; + if ( fn[4] == 'a' && fn[5] == 'n' && fn[6] == 'd' && fn[7] == 0 ) + return Mod::BITAND; + if ( fn[4] == 'o' && fn[5] == 'r' && fn[6] == 0 ) + return Mod::BITOR; + } + break; + } + case 'a': { + if ( fn[2] == 'd' && fn[3] == 'd' ) { + // add + if ( fn[4] == 'T' && fn[5] == 'o' && fn[6] == 'S' && fn[7] == 'e' && fn[8] == 't' && fn[9] == 0 ) + return Mod::ADDTOSET; + + } + break; + } + case 'r': { + if ( fn[2] == 'e' && fn[3] == 'n' && fn[4] == 'a' && fn[5] == 'm' && fn[6] =='e' ) { + return Mod::RENAME_TO; // with this return code we handle both RENAME_TO and RENAME_FROM + } + break; + } + default: break; + } + uassert( 10161 , "Invalid modifier specified " + string( fn ), false ); + return Mod::INC; + } + + ModSet() {} + + void updateIsIndexed( const Mod &m, const set<string> 
&idxKeys, const set<string> *backgroundKeys ) { + if ( m.isIndexed( idxKeys ) || + (backgroundKeys && m.isIndexed(*backgroundKeys)) ) { + _isIndexed++; + } + } + + public: + + ModSet( const BSONObj &from , + const set<string>& idxKeys = set<string>(), + const set<string>* backgroundKeys = 0 + ); + + // TODO: this is inefficient - should probably just handle when iterating + ModSet * fixDynamicArray( const char * elemMatchKey ) const; + + bool hasDynamicArray() const { return _hasDynamicArray; } + + /** + * creates a ModSetState suitable for operation on obj + * doesn't change or modify this ModSet or any underying Mod + */ + auto_ptr<ModSetState> prepare( const BSONObj& obj ) const; + + /** + * given a query pattern, builds an object suitable for an upsert + * will take the query spec and combine all $ operators + */ + BSONObj createNewFromQuery( const BSONObj& query ); + + /** + * + */ + int isIndexed() const { + return _isIndexed; + } + + unsigned size() const { return _mods.size(); } + + bool haveModForField( const char *fieldName ) const { + return _mods.find( fieldName ) != _mods.end(); + } + + bool haveConflictingMod( const string& fieldName ) { + size_t idx = fieldName.find( '.' 
); + if ( idx == string::npos ) + idx = fieldName.size(); + + ModHolder::const_iterator start = _mods.lower_bound(fieldName.substr(0,idx)); + for ( ; start != _mods.end(); start++ ) { + FieldCompareResult r = compareDottedFieldNames( fieldName , start->first ); + switch ( r ) { + case LEFT_SUBFIELD: return true; + case LEFT_BEFORE: return false; + case SAME: return true; + case RIGHT_BEFORE: return false; + case RIGHT_SUBFIELD: return true; + } + } + return false; + + + } + + }; + + /** + * stores any information about a single Mod operating on a single Object + */ + class ModState { + public: + const Mod * m; + BSONElement old; + BSONElement newVal; + BSONObj _objData; + + const char * fixedOpName; + BSONElement * fixed; + int pushStartSize; + + BSONType incType; + int incint; + double incdouble; + long long inclong; + + bool dontApply; + + ModState() { + fixedOpName = 0; + fixed = 0; + pushStartSize = -1; + incType = EOO; + dontApply = false; + } + + Mod::Op op() const { + return m->op; + } + + const char * fieldName() const { + return m->fieldName; + } + + bool needOpLogRewrite() const { + if ( dontApply ) + return false; + + if ( fixed || fixedOpName || incType ) + return true; + + switch( op() ) { + case Mod::RENAME_FROM: + case Mod::RENAME_TO: + return true; + case Mod::BIT: + case Mod::BITAND: + case Mod::BITOR: + // TODO: should we convert this to $set? + return false; + default: + return false; + } + } + + void appendForOpLog( BSONObjBuilder& b ) const; + + template< class Builder > + void apply( Builder& b , BSONElement in ) { + m->apply( b , in , *this ); + } + + template< class Builder > + void appendIncValue( Builder& b , bool useFullName ) const { + const char * n = useFullName ? 
m->fieldName : m->shortFieldName; + + switch ( incType ) { + case NumberDouble: + b.append( n , incdouble ); break; + case NumberLong: + b.append( n , inclong ); break; + case NumberInt: + b.append( n , incint ); break; + default: + assert(0); + } + } + + string toString() const; + + template< class Builder > + void handleRename( Builder &newObjBuilder, const char *shortFieldName ); + }; + + /** + * this is used to hold state, meta data while applying a ModSet to a BSONObj + * the goal is to make ModSet const so its re-usable + */ + class ModSetState : boost::noncopyable { + struct FieldCmp { + bool operator()( const string &l, const string &r ) const; + }; + typedef map<string,ModState,FieldCmp> ModStateHolder; + const BSONObj& _obj; + ModStateHolder _mods; + bool _inPlacePossible; + BSONObj _newFromMods; // keep this data alive, as oplog generation may depend on it + + ModSetState( const BSONObj& obj ) + : _obj( obj ) , _inPlacePossible(true) { + } + + /** + * @return if in place is still possible + */ + bool amIInPlacePossible( bool inPlacePossible ) { + if ( ! 
inPlacePossible ) + _inPlacePossible = false; + return _inPlacePossible; + } + + template< class Builder > + void createNewFromMods( const string& root , Builder& b , const BSONObj &obj ); + + template< class Builder > + void _appendNewFromMods( const string& root , ModState& m , Builder& b , set<string>& onedownseen ); + + template< class Builder > + void appendNewFromMod( ModState& ms , Builder& b ) { + if ( ms.dontApply ) { + return; + } + + //const Mod& m = *(ms.m); // HACK + Mod& m = *((Mod*)(ms.m)); // HACK + + switch ( m.op ) { + + case Mod::PUSH: + case Mod::ADDTOSET: { + if ( m.isEach() ) { + b.appendArray( m.shortFieldName , m.getEach() ); + } + else { + BSONObjBuilder arr( b.subarrayStart( m.shortFieldName ) ); + arr.appendAs( m.elt, "0" ); + arr.done(); + } + break; + } + + case Mod::PUSH_ALL: { + b.appendAs( m.elt, m.shortFieldName ); + break; + } + + case Mod::UNSET: + case Mod::PULL: + case Mod::PULL_ALL: + // no-op b/c unset/pull of nothing does nothing + break; + + case Mod::INC: + ms.fixedOpName = "$set"; + case Mod::SET: { + m._checkForAppending( m.elt ); + b.appendAs( m.elt, m.shortFieldName ); + break; + } + // shouldn't see RENAME_FROM here + case Mod::RENAME_TO: + ms.handleRename( b, m.shortFieldName ); + break; + default: + stringstream ss; + ss << "unknown mod in appendNewFromMod: " << m.op; + throw UserException( 9015, ss.str() ); + } + + } + + public: + + bool canApplyInPlace() const { + return _inPlacePossible; + } + + /** + * modified underlying _obj + * @param isOnDisk - true means this is an on disk object, and this update needs to be made durable + */ + void applyModsInPlace( bool isOnDisk ); + + BSONObj createNewFromMods(); + + // re-writing for oplog + + bool needOpLogRewrite() const { + for ( ModStateHolder::const_iterator i = _mods.begin(); i != _mods.end(); i++ ) + if ( i->second.needOpLogRewrite() ) + return true; + return false; + } + + BSONObj getOpLogRewrite() const { + BSONObjBuilder b; + for ( 
ModStateHolder::const_iterator i = _mods.begin(); i != _mods.end(); i++ ) + i->second.appendForOpLog( b ); + return b.obj(); + } + + bool haveArrayDepMod() const { + for ( ModStateHolder::const_iterator i = _mods.begin(); i != _mods.end(); i++ ) + if ( i->second.m->arrayDep() ) + return true; + return false; + } + + void appendSizeSpecForArrayDepMods( BSONObjBuilder &b ) const { + for ( ModStateHolder::const_iterator i = _mods.begin(); i != _mods.end(); i++ ) { + const ModState& m = i->second; + if ( m.m->arrayDep() ) { + if ( m.pushStartSize == -1 ) + b.appendNull( m.fieldName() ); + else + b << m.fieldName() << BSON( "$size" << m.pushStartSize ); + } + } + } + + string toString() const; + + friend class ModSet; + }; + +} + |