// query.cpp

/**
 *    Copyright (C) 2008 10gen Inc.
 *
 *    This program is free software: you can redistribute it and/or modify
 *    it under the terms of the GNU Affero General Public License, version 3,
 *    as published by the Free Software Foundation.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU Affero General Public License for more details.
 *
 *    You should have received a copy of the GNU Affero General Public License
 *    along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

#include "pch.h"
#include "query.h"
#include "pdfile.h"
#include "jsobjmanipulator.h"
#include "../bson/util/builder.h"
#include <time.h>
#include "introspect.h"
#include "btree.h"
#include "../util/lruishmap.h"
#include "json.h"
#include "repl.h"
#include "replpair.h"
#include "scanandorder.h"
#include "security.h"
#include "curop-inl.h"
#include "commands.h"
#include "queryoptimizer.h"
#include "lasterror.h"
#include "../s/d_logic.h"
#include "repl_block.h"

namespace mongo {

    /* We cut off further objects once we cross this threshold; thus, you
       might get a little bit more than this, it is a threshold rather than a limit. */
    const int MaxBytesToReturnToClientAtOnce = 4 * 1024 * 1024;

    //ns->query->DiskLoc
    // LRUishMap lrutest(123);

    extern bool useCursors;
    extern bool useHints;

    // Just try to identify best plan.
    class DeleteOp : public MultiCursor::CursorOp {
    public:
        DeleteOp( bool justOne, int& bestCount ) :
            justOne_( justOne ),
            count_(),
            bestCount_( bestCount ),
            _nscanned() {
        }

        virtual void _init() {
            c_ = qp().newCursor();
        }

        virtual bool prepareToYield() {
            if ( ! _cc ) {
                _cc.reset( new ClientCursor( QueryOption_NoCursorTimeout , c_ , qp().ns() ) );
            }
            return _cc->prepareToYield( _yieldData );
        }

        virtual void recoverFromYield() {
            if ( !ClientCursor::recoverFromYield( _yieldData ) ) {
                _cc.reset();
                c_.reset();
                massert( 13340, "cursor dropped during delete", false );
            }
        }

        virtual long long nscanned() {
            return c_.get() ? c_->nscanned() : _nscanned;
        }

        virtual void next() {
            if ( !c_->ok() ) {
                setComplete();
                return;
            }

            DiskLoc rloc = c_->currLoc();

            if ( matcher()->matches(c_->currKey(), rloc ) ) {
                if ( !c_->getsetdup(rloc) )
                    ++count_;
            }

            c_->advance();
            _nscanned = c_->nscanned();
            if ( count_ > bestCount_ )
                bestCount_ = count_;

            if ( count_ > 0 ) {
                if ( justOne_ )
                    setComplete();
                else if ( _nscanned >= 100 && count_ == bestCount_ )
                    setComplete();
            }
        }

        virtual bool mayRecordPlan() const { return !justOne_; }

        virtual QueryOp *_createChild() const {
            bestCount_ = 0; // should be safe to reset this in contexts where createChild() is called
            return new DeleteOp( justOne_, bestCount_ );
        }

        virtual shared_ptr<Cursor> newCursor() const { return qp().newCursor(); }

    private:
        bool justOne_;
        int count_;
        int &bestCount_;
        long long _nscanned;
        shared_ptr<Cursor> c_;
        ClientCursor::CleanupPointer _cc;
        ClientCursor::YieldData _yieldData;
    };
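    // Illustrative usage only (not part of this file): deleteObjects(), defined below, could be
    // called like this to remove at most one matching document and log the delete to the oplog:
    //   deleteObjects( "test.foo", BSON( "x" << 1 ), /*justOne=*/true, /*logop=*/true, /*god=*/false );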
    /* ns:      namespace, e.g. <database>.<collection>
       pattern: the "where" clause / criteria
       justOne: stop after 1 match
       god:     allow access to system namespaces, and don't yield
    */
    long long deleteObjects(const char *ns, BSONObj pattern, bool justOneOrig, bool logop, bool god, RemoveSaver * rs ) {
        if( !god ) {
            if ( strstr(ns, ".system.") ) {
                /* note a delete from system.indexes would corrupt the db if done here, as there are
                   pointers into those objects in NamespaceDetails.
                */
                uassert(12050, "cannot delete from system namespace", legalClientSystemNS( ns , true ) );
            }
            if ( strchr( ns , '$' ) ) {
                log() << "cannot delete from collection with reserved $ in name: " << ns << endl;
                uassert( 10100 , "cannot delete from collection with reserved $ in name", strchr(ns, '$') == 0 );
            }
        }

        NamespaceDetails *d = nsdetails( ns );
        if ( ! d )
            return 0;
        uassert( 10101 , "can't remove from a capped collection" , ! d->capped );

        long long nDeleted = 0;

        int best = 0;
        shared_ptr< MultiCursor::CursorOp > opPtr( new DeleteOp( justOneOrig, best ) );
        shared_ptr< MultiCursor > creal( new MultiCursor( ns, pattern, BSONObj(), opPtr, !god ) );

        if( !creal->ok() )
            return nDeleted;

        shared_ptr< Cursor > cPtr = creal;
        auto_ptr<ClientCursor> cc( new ClientCursor( QueryOption_NoCursorTimeout, cPtr, ns) );
        cc->setDoingDeletes( true );

        CursorId id = cc->cursorid();

        bool justOne = justOneOrig;
        bool canYield = !god && !creal->matcher()->docMatcher().atomic();

        do {
            if ( canYield && ! cc->yieldSometimes() ) {
                cc.release(); // has already been deleted elsewhere
                // TODO should we assert or something?
                break;
            }
            if ( !cc->ok() ) {
                break; // if we yielded, could have hit the end
            }

            // this way we can avoid calling updateLocation() every time (expensive)
            // as well as some other nuances handled
            cc->setDoingDeletes( true );

            DiskLoc rloc = cc->currLoc();
            BSONObj key = cc->currKey();

            // NOTE Calling advance() may change the matcher, so it's important
            // to try to match first.
            bool match = creal->matcher()->matches( key , rloc );

            if ( ! cc->advance() )
                justOne = true;

            if ( ! match )
                continue;

            assert( !cc->c()->getsetdup(rloc) ); // can't be a dup, we deleted it!

            if ( !justOne ) {
                /* NOTE: this is SLOW. this is not good, noteLocation() was designed to be called across
                   getMore blocks.  here we might call millions of times which would be bad.
                */
                cc->c()->noteLocation();
            }

            if ( logop ) {
                BSONElement e;
                if( BSONObj( rloc.rec() ).getObjectID( e ) ) {
                    BSONObjBuilder b;
                    b.append( e );
                    bool replJustOne = true;
                    logOp( "d", ns, b.done(), 0, &replJustOne );
                }
                else {
                    problem() << "deleted object without id, not logging" << endl;
                }
            }

            if ( rs )
                rs->goingToDelete( rloc.obj() /*cc->c->current()*/ );

            theDataFileMgr.deleteRecord(ns, rloc.rec(), rloc);
            nDeleted++;
            if ( justOne ) {
                break;
            }
            cc->c()->checkLocation();

            if( !god )
                getDur().commitIfNeeded();

            if( debug && god && nDeleted == 100 )
                log() << "warning high number of deletes with god=true which could use significant memory" << endl;
        }
        while ( cc->ok() );

        if ( cc.get() && ClientCursor::find( id , false ) == 0 ) {
            cc.release();
        }

        return nDeleted;
    }

    int otherTraceLevel = 0;

    int initialExtentSize(int len);

    bool runCommands(const char *ns, BSONObj& jsobj, CurOp& curop, BufBuilder &b, BSONObjBuilder& anObjBuilder, bool fromRepl, int queryOptions) {
        try {
            return _runCommands(ns, jsobj, b, anObjBuilder, fromRepl, queryOptions);
        }
        catch ( AssertionException& e ) {
            e.getInfo().append( anObjBuilder , "assertion" , "assertionCode" );
        }
        curop.debug().str << " assertion ";
        anObjBuilder.append("errmsg", "db assertion failure");
        anObjBuilder.append("ok", 0.0);
        BSONObj x = anObjBuilder.done();
        b.appendBuf((void*) x.objdata(), x.objsize());
        return true;
    }
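    // Rough shape of the error reply assembled above when _runCommands throws
    // (sketch; fields appear in the order they are appended):
    //   { assertion: "<message>", assertionCode: <code>, errmsg: "db assertion failure", ok: 0.0 }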
    int nCaught = 0;

    BSONObj id_obj = fromjson("{\"_id\":1}");
    BSONObj empty_obj = fromjson("{}");

    //int dump = 0;

    /* empty result for error conditions */
    QueryResult* emptyMoreResult(long long cursorid) {
        BufBuilder b(32768);
        b.skip(sizeof(QueryResult));
        QueryResult *qr = (QueryResult *) b.buf();
        qr->cursorId = 0; // 0 indicates no more data to retrieve.
        qr->startingFrom = 0;
        qr->len = b.len();
        qr->setOperation(opReply);
        qr->nReturned = 0;
        b.decouple();
        return qr;
    }

    QueryResult* processGetMore(const char *ns, int ntoreturn, long long cursorid , CurOp& curop, int pass, bool& exhaust ) {
        exhaust = false;
        ClientCursor::Pointer p(cursorid);
        ClientCursor *cc = p.c();

        int bufSize = 512;
        if ( cc ) {
            bufSize += sizeof( QueryResult );
            bufSize += MaxBytesToReturnToClientAtOnce;
        }

        BufBuilder b( bufSize );
        b.skip(sizeof(QueryResult));

        int resultFlags = ResultFlag_AwaitCapable;
        int start = 0;
        int n = 0;

        if ( !cc ) {
            log() << "getMore: cursorid not found " << ns << " " << cursorid << endl;
            cursorid = 0;
            resultFlags = ResultFlag_CursorNotFound;
        }
        else {
            if ( pass == 0 )
                cc->updateSlaveLocation( curop );

            int queryOptions = cc->queryOptions();

            if( pass == 0 ) {
                StringBuilder& ss = curop.debug().str;
                ss << " getMore: " << cc->query().toString() << " ";
            }

            start = cc->pos();
            Cursor *c = cc->c();
            c->checkLocation();
            DiskLoc last;

            scoped_ptr<Projection::KeyOnly> keyFieldsOnly;
            if ( cc->modifiedKeys() == false && cc->isMultiKey() == false && cc->fields )
                keyFieldsOnly.reset( cc->fields->checkKey( cc->indexKeyPattern() ) );

            while ( 1 ) {
                if ( !c->ok() ) {
                    if ( c->tailable() ) {
                        /* when a tailable cursor hits "EOF", ok() goes false, and current() is null.
                           however advance() can still be retried as a reactivation attempt. when there
                           is new data, it will return true. that's what we are doing here.
                        */
                        if ( c->advance() )
                            continue;

                        if( n == 0 && (queryOptions & QueryOption_AwaitData) && pass < 1000 ) {
                            throw GetMoreWaitException();
                        }

                        break;
                    }
                    p.release();
                    bool ok = ClientCursor::erase(cursorid);
                    assert(ok);
                    cursorid = 0;
                    cc = 0;
                    break;
                }
                // in some cases (clone collection) there won't be a matcher
                if ( c->matcher() && !c->matcher()->matches(c->currKey(), c->currLoc() ) ) {
                }
                /* TODO
                else if ( _chunkMatcher && ! _chunkMatcher->belongsToMe( c->currKey(), c->currLoc() ) ){
                    cout << "TEMP skipping un-owned chunk: " << c->current() << endl;
                }
                */
                else {
                    if( c->getsetdup(c->currLoc()) ) {
                        //out() << " but it's a dup \n";
                    }
                    else {
                        last = c->currLoc();
                        n++;

                        if ( keyFieldsOnly ) {
                            fillQueryResultFromObj(b, 0, keyFieldsOnly->hydrate( c->currKey() ) );
                        }
                        else {
                            BSONObj js = c->current();
                            // show disk loc should be part of the main query, not in an $or clause, so this should be ok
                            fillQueryResultFromObj(b, cc->fields.get(), js, ( cc->pq.get() && cc->pq->showDiskLoc() ? &last : 0));
                        }

                        if ( ( ntoreturn && n >= ntoreturn ) || b.len() > MaxBytesToReturnToClientAtOnce ) {
                            c->advance();
                            cc->incPos( n );
                            break;
                        }
                    }
                }
                c->advance();

                if ( ! cc->yieldSometimes() ) {
                    ClientCursor::erase(cursorid);
                    cursorid = 0;
                    cc = 0;
                    p.deleted();
                    break;
                }
            }

            if ( cc ) {
                cc->updateLocation();
                cc->mayUpgradeStorage();
                cc->storeOpForSlave( last );
                exhaust = cc->queryOptions() & QueryOption_Exhaust;
            }
        }

        QueryResult *qr = (QueryResult *) b.buf();
        qr->len = b.len();
        qr->setOperation(opReply);
        qr->_resultFlags() = resultFlags;
        qr->cursorId = cursorid;
        qr->startingFrom = start;
        qr->nReturned = n;
        b.decouple();

        return qr;
    }
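    // CountOp consumes a count command spec of the form (illustrative):
    //   { count: "collectionname", query: { ... }, skip: <n>, limit: <n> }
    // Only "skip" and "limit" are read here; runCount() below extracts "query".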
    class CountOp : public QueryOp {
    public:
        CountOp( const string& ns , const BSONObj &spec ) :
            _ns(ns), _capped(false), _count(), _myCount(),
            _skip( spec["skip"].numberLong() ),
            _limit( spec["limit"].numberLong() ),
            _nscanned(), _bc() {
        }

        virtual void _init() {
            _c = qp().newCursor();
            _capped = _c->capped();
            if ( qp().exactKeyMatch() && ! matcher()->needRecord() ) {
                _query = qp().simplifiedQuery( qp().indexKey() );
                _bc = dynamic_cast< BtreeCursor* >( _c.get() );
                _bc->forgetEndKey();
            }
        }

        virtual long long nscanned() {
            return _c.get() ? _c->nscanned() : _nscanned;
        }

        virtual bool prepareToYield() {
            if ( _c && !_cc ) {
                _cc.reset( new ClientCursor( QueryOption_NoCursorTimeout , _c , _ns.c_str() ) );
            }
            if ( _cc ) {
                return _cc->prepareToYield( _yieldData );
            }
            // no active cursor - ok to yield
            return true;
        }

        virtual void recoverFromYield() {
            if ( _cc && !ClientCursor::recoverFromYield( _yieldData ) ) {
                _c.reset();
                _cc.reset();

                if ( _capped ) {
                    msgassertedNoTrace( 13337, str::stream() << "capped cursor overrun during count: " << _ns );
                }
                else {
                    // we don't fail query since we're fine with returning partial data if collection dropped
                }
            }
        }

        virtual void next() {
            if ( ! _c || !_c->ok() ) {
                setComplete();
                return;
            }

            _nscanned = _c->nscanned();
            if ( _bc ) {
                if ( _firstMatch.isEmpty() ) {
                    _firstMatch = _bc->currKeyNode().key.copy();
                    // if not match
                    if ( _query.woCompare( _firstMatch, BSONObj(), false ) ) {
                        setComplete();
                        return;
                    }
                    _gotOne();
                }
                else {
                    if ( ! _firstMatch.woEqual( _bc->currKeyNode().key ) ) {
                        setComplete();
                        return;
                    }
                    _gotOne();
                }
            }
            else {
                if ( !matcher()->matches(_c->currKey(), _c->currLoc() ) ) {
                }
                else if( !_c->getsetdup(_c->currLoc()) ) {
                    _gotOne();
                }
            }
            _c->advance();
        }

        virtual QueryOp *_createChild() const {
            CountOp *ret = new CountOp( _ns , BSONObj() );
            ret->_count = _count;
            ret->_skip = _skip;
            ret->_limit = _limit;
            return ret;
        }

        long long count() const { return _count; }

        virtual bool mayRecordPlan() const {
            return ( _myCount > _limit / 2 ) || ( complete() && !stopRequested() );
        }

    private:
        void _gotOne() {
            if ( _skip ) {
                _skip--;
                return;
            }

            if ( _limit > 0 && _count >= _limit ) {
                setStop();
                return;
            }

            _count++;
            _myCount++;
        }

        string _ns;
        bool _capped;

        long long _count;
        long long _myCount;
        long long _skip;
        long long _limit;
        long long _nscanned;
        shared_ptr<Cursor> _c;
        BSONObj _query;
        BtreeCursor * _bc;
        BSONObj _firstMatch;

        ClientCursor::CleanupPointer _cc;
        ClientCursor::YieldData _yieldData;
    };

    /* { count: "collectionname"[, query: <query>] }
       returns -1 on ns does not exist error.
    */
    long long runCount( const char *ns, const BSONObj &cmd, string &err ) {
        Client::Context cx(ns);
        NamespaceDetails *d = nsdetails( ns );
        if ( !d ) {
            err = "ns missing";
            return -1;
        }
        BSONObj query = cmd.getObjectField("query");

        // count of all objects
        if ( query.isEmpty() ) {
            return applySkipLimit( d->stats.nrecords , cmd );
        }
        MultiPlanScanner mps( ns, query, BSONObj(), 0, true, BSONObj(), BSONObj(), false, true );
        CountOp original( ns , cmd );
        shared_ptr< CountOp > res = mps.runOp( original );
        if ( !res->complete() ) {
            log() << "Count with ns: " << ns << " and query: " << query
                  << " failed with exception: " << res->exception()
                  << endl;
            return 0;
        }
        return res->count();
    }
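    // ExplainBuilder assembles the document returned for explain() queries.  For each plan scanned
    // it records (see noteScan): cursor, nscanned, nscannedObjects, n, scanAndOrder, millis, nYields,
    // nChunkSkips, isMultiKey, indexOnly, indexBounds and, unless a hint was used, allPlans; multi
    // clause ($or) runs are wrapped in a "clauses" array by finishWithSuffix().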
    class ExplainBuilder {
        // Note: by default we filter out allPlans and oldPlan in the shell's
        // explain() function. If you add any recursive structures, make sure to
        // edit the JS to make sure everything gets filtered.
    public:
        ExplainBuilder() : _i() {}
        void ensureStartScan() {
            if ( !_a.get() ) {
                _a.reset( new BSONArrayBuilder() );
            }
        }
        void noteCursor( Cursor *c ) {
            BSONObjBuilder b( _a->subobjStart() );
            b << "cursor" << c->toString() << "indexBounds" << c->prettyIndexBounds();
            b.done();
        }
        void noteScan( Cursor *c, long long nscanned, long long nscannedObjects, int n, bool scanAndOrder,
                       int millis, bool hint, int nYields , int nChunkSkips , bool indexOnly ) {
            if ( _i == 1 ) {
                _c.reset( new BSONArrayBuilder() );
                *_c << _b->obj();
            }
            if ( _i == 0 ) {
                _b.reset( new BSONObjBuilder() );
            }
            else {
                _b.reset( new BSONObjBuilder( _c->subobjStart() ) );
            }

            *_b << "cursor" << c->toString();
            _b->appendNumber( "nscanned", nscanned );
            _b->appendNumber( "nscannedObjects", nscannedObjects );
            *_b << "n" << n;

            if ( scanAndOrder )
                *_b << "scanAndOrder" << true;

            *_b << "millis" << millis;

            *_b << "nYields" << nYields;
            *_b << "nChunkSkips" << nChunkSkips;
            *_b << "isMultiKey" << c->isMultiKey();
            *_b << "indexOnly" << indexOnly;

            *_b << "indexBounds" << c->prettyIndexBounds();

            if ( !hint ) {
                *_b << "allPlans" << _a->arr();
            }
            if ( _i != 0 ) {
                _b->done();
            }
            _a.reset( 0 );
            ++_i;
        }
        BSONObj finishWithSuffix( long long nscanned, long long nscannedObjects, int n, int millis, const BSONObj &suffix ) {
            if ( _i > 1 ) {
                BSONObjBuilder b;
                b << "clauses" << _c->arr();
                b.appendNumber( "nscanned", nscanned );
                b.appendNumber( "nscannedObjects", nscannedObjects );
                b << "n" << n;
                b << "millis" << millis;
                b.appendElements( suffix );
                return b.obj();
            }
            else {
                _b->appendElements( suffix );
                return _b->obj();
            }
        }
    private:
        auto_ptr< BSONArrayBuilder > _a;
        auto_ptr< BSONObjBuilder > _b;
        auto_ptr< BSONArrayBuilder > _c;
        int _i;
    };

    // Implements database 'query' requests using the query optimizer's QueryOp interface
    class UserQueryOp : public QueryOp {
    public:

        UserQueryOp( const ParsedQuery& pq, Message &response, ExplainBuilder &eb, CurOp &curop ) :
            _buf( 32768 ) , // TODO be smarter here
            _pq( pq ) ,
            _ntoskip( pq.getSkip() ) ,
            _nscanned(0), _oldNscanned(0), _nscannedObjects(0), _oldNscannedObjects(0),
            _n(0),
            _oldN(0),
            _nYields(),
            _nChunkSkips(),
            _chunkManager( shardingState.needShardChunkManager(pq.ns()) ?
                           shardingState.getShardChunkManager(pq.ns()) : ShardChunkManagerPtr() ),
            _inMemSort(false),
            _capped(false),
            _saveClientCursor(false),
            _wouldSaveClientCursor(false),
            _oplogReplay( pq.hasOption( QueryOption_OplogReplay) ),
            _response( response ),
            _eb( eb ),
            _curop( curop )
        {}
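        // Driven by the query optimizer (see MultiPlanScanner::runOp in runQuery below): _init()
        // sets up the cursor for a candidate plan, next() is called once per scanned document, and
        // finish() writes the response buffer; _createChild() clones the op (e.g. for another $or
        // clause), carrying the running counters forward.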
        virtual void _init() {
            // only need to put the QueryResult fields there if we're building the first buffer in the message.
            if ( _response.empty() ) {
                _buf.skip( sizeof( QueryResult ) );
            }

            if ( _oplogReplay ) {
                _findingStartCursor.reset( new FindingStartCursor( qp() ) );
                _capped = true;
            }
            else {
                _c = qp().newCursor( DiskLoc() , _pq.getNumToReturn() + _pq.getSkip() );
                _capped = _c->capped();

                // setup check for if we can only use index to extract
                if ( _c->modifiedKeys() == false && _c->isMultiKey() == false && _pq.getFields() ) {
                    _keyFieldsOnly.reset( _pq.getFields()->checkKey( _c->indexKeyPattern() ) );
                }
            }

            if ( qp().scanAndOrderRequired() ) {
                _inMemSort = true;
                _so.reset( new ScanAndOrder( _pq.getSkip() , _pq.getNumToReturn() , _pq.getOrder() ) );
            }

            if ( _pq.isExplain() ) {
                _eb.noteCursor( _c.get() );
            }
        }

        virtual bool prepareToYield() {
            if ( _findingStartCursor.get() ) {
                return _findingStartCursor->prepareToYield();
            }
            else {
                if ( _c && !_cc ) {
                    _cc.reset( new ClientCursor( QueryOption_NoCursorTimeout , _c , _pq.ns() ) );
                }
                if ( _cc ) {
                    return _cc->prepareToYield( _yieldData );
                }
            }
            // no active cursor - ok to yield
            return true;
        }

        virtual void recoverFromYield() {
            _nYields++;

            if ( _findingStartCursor.get() ) {
                _findingStartCursor->recoverFromYield();
            }
            else if ( _cc && !ClientCursor::recoverFromYield( _yieldData ) ) {
                _c.reset();
                _cc.reset();
                _so.reset();

                if ( _capped ) {
                    msgassertedNoTrace( 13338, str::stream() << "capped cursor overrun during query: " << _pq.ns() );
                }
                else {
                    // we don't fail query since we're fine with returning partial data if collection dropped
                    // todo: this is wrong. the cursor could be gone if closeAllDatabases command just ran
                }
            }
        }

        virtual long long nscanned() {
            if ( _findingStartCursor.get() ) {
                return 0; // should only be one query plan, so value doesn't really matter.
            }
            return _c.get() ? _c->nscanned() : _nscanned;
        }

        virtual void next() {
            if ( _findingStartCursor.get() ) {
                if ( _findingStartCursor->done() ) {
                    _c = _findingStartCursor->cRelease();
                    _findingStartCursor.reset( 0 );
                }
                else {
                    _findingStartCursor->next();
                }
                _capped = true;
                return;
            }

            if ( !_c || !_c->ok() ) {
                finish( false );
                return;
            }

            bool mayCreateCursor1 = _pq.wantMore() && ! _inMemSort && _pq.getNumToReturn() != 1 && useCursors;

            if( 0 ) {
                cout << "SCANNING this: " << this << " key: " << _c->currKey() << " obj: " << _c->current() << endl;
            }

            if ( _pq.getMaxScan() && _nscanned >= _pq.getMaxScan() ) {
                finish( true ); //?
                return;
            }

            _nscanned = _c->nscanned();
            if ( !matcher()->matches(_c->currKey(), _c->currLoc() , &_details ) ) {
                // not a match, continue onward
                if ( _details.loadedObject )
                    _nscannedObjects++;
            }
            else {
                _nscannedObjects++;
                DiskLoc cl = _c->currLoc();
                if ( _chunkManager && ! _chunkManager->belongsToMe( cl.obj() ) ) {
                    _nChunkSkips++;
                    // log() << "TEMP skipping un-owned chunk: " << _c->current() << endl;
                }
                else if( _c->getsetdup(cl) ) {
                    // dup
                }
                else {
                    // got a match.

                    if ( _inMemSort ) {
                        // note: no cursors for non-indexed, ordered results.  results must be fairly small.
                        _so->add( _pq.returnKey() ? _c->currKey() : _c->current(), _pq.showDiskLoc() ? &cl : 0 );
                    }
                    else if ( _ntoskip > 0 ) {
                        _ntoskip--;
                    }
                    else {
                        if ( _pq.isExplain() ) {
                            _n++;
                            if ( n() >= _pq.getNumToReturn() && !_pq.wantMore() ) {
                                // .limit() was used, show just that much.
                                finish( true ); //?
                                return;
                            }
                        }
                        else {
                            if ( _pq.returnKey() ) {
                                BSONObjBuilder bb( _buf );
                                bb.appendKeys( _c->indexKeyPattern() , _c->currKey() );
                                bb.done();
                            }
                            else if ( _keyFieldsOnly ) {
                                fillQueryResultFromObj( _buf , 0 , _keyFieldsOnly->hydrate( _c->currKey() ) );
                            }
                            else {
                                BSONObj js = _c->current();
                                assert( js.isValid() );

                                if ( _oplogReplay ) {
                                    BSONElement e = js["ts"];
                                    if ( e.type() == Date || e.type() == Timestamp )
                                        _slaveReadTill = e._opTime();
                                }

                                fillQueryResultFromObj( _buf , _pq.getFields() , js , (_pq.showDiskLoc() ? &cl : 0));
                            }
                            _n++;

                            if ( ! _c->supportGetMore() ) {
                                if ( _pq.enough( n() ) || _buf.len() >= MaxBytesToReturnToClientAtOnce ) {
                                    finish( true );
                                    return;
                                }
                            }
                            else if ( _pq.enoughForFirstBatch( n() , _buf.len() ) ) {
                                /* if only 1 requested, no cursor saved for efficiency...we assume it is findOne() */
                                if ( mayCreateCursor1 ) {
                                    _wouldSaveClientCursor = true;
                                    if ( _c->advance() ) {
                                        // more...so save a cursor
                                        _saveClientCursor = true;
                                    }
                                }
                                finish( true );
                                return;
                            }
                        }
                    }
                }
            }
            _c->advance();
        }

        // this plan won, so set data for response broadly
        void finish( bool stop ) {
            massert( 13638, "client cursor dropped during explain query yield", !_pq.isExplain() || _c.get() );

            if ( _pq.isExplain() ) {
                _n = _inMemSort ? _so->size() : _n;
            }
            else if ( _inMemSort ) {
                if( _so.get() )
                    _so->fill( _buf, _pq.getFields() , _n );
            }

            if ( _c.get() ) {
                _nscanned = _c->nscanned();

                if ( _pq.hasOption( QueryOption_CursorTailable ) && _pq.getNumToReturn() != 1 )
                    _c->setTailable();

                // If the tailing request succeeded.
                if ( _c->tailable() )
                    _saveClientCursor = true;
            }

            if ( _pq.isExplain() ) {
                _eb.noteScan( _c.get(), _nscanned, _nscannedObjects, _n, scanAndOrderRequired(),
                              _curop.elapsedMillis(), useHints && !_pq.getHint().eoo(), _nYields ,
                              _nChunkSkips, _keyFieldsOnly.get() > 0 );
            }
            else {
                if ( _buf.len() ) {
                    _response.appendData( _buf.buf(), _buf.len() );
                    _buf.decouple();
                }
            }

            if ( stop ) {
                setStop();
            }
            else {
                setComplete();
            }
        }

        void finishExplain( const BSONObj &suffix ) {
            BSONObj obj = _eb.finishWithSuffix( totalNscanned(), nscannedObjects(), n(), _curop.elapsedMillis(), suffix);
            fillQueryResultFromObj(_buf, 0, obj);
            _n = 1;
            _oldN = 0;
            _response.appendData( _buf.buf(), _buf.len() );
            _buf.decouple();
        }

        virtual bool mayRecordPlan() const {
            return ( _pq.getNumToReturn() != 1 ) && ( ( _n > _pq.getNumToReturn() / 2 ) || ( complete() && !stopRequested() ) );
        }

        virtual QueryOp *_createChild() const {
            if ( _pq.isExplain() ) {
                _eb.ensureStartScan();
            }
            UserQueryOp *ret = new UserQueryOp( _pq, _response, _eb, _curop );
            ret->_oldN = n();
            ret->_oldNscanned = totalNscanned();
            ret->_oldNscannedObjects = nscannedObjects();
            ret->_ntoskip = _ntoskip;
            return ret;
        }

        bool scanAndOrderRequired() const { return _inMemSort; }
        shared_ptr<Cursor> cursor() { return _c; }
        int n() const { return _oldN + _n; }
        long long totalNscanned() const { return _nscanned + _oldNscanned; }
        long long nscannedObjects() const { return _nscannedObjects + _oldNscannedObjects; }
        bool saveClientCursor() const { return _saveClientCursor; }
        bool wouldSaveClientCursor() const { return _wouldSaveClientCursor; }
        void finishForOplogReplay( ClientCursor * cc ) {
            if ( _oplogReplay && ! _slaveReadTill.isNull() )
                cc->slaveReadTill( _slaveReadTill );
        }

    private:
        BufBuilder _buf;
        const ParsedQuery& _pq;
        scoped_ptr<Projection::KeyOnly> _keyFieldsOnly;

        long long _ntoskip;
        long long _nscanned;
        long long _oldNscanned;
        long long _nscannedObjects;
        long long _oldNscannedObjects;
        int _n; // found so far
        int _oldN;

        int _nYields;
        int _nChunkSkips;

        MatchDetails _details;

        ShardChunkManagerPtr _chunkManager;

        bool _inMemSort;
        auto_ptr< ScanAndOrder > _so;

        shared_ptr<Cursor> _c;
        ClientCursor::CleanupPointer _cc;
        ClientCursor::YieldData _yieldData;

        bool _capped;
        bool _saveClientCursor;
        bool _wouldSaveClientCursor;
        bool _oplogReplay;
        auto_ptr< FindingStartCursor > _findingStartCursor;

        Message &_response;
        ExplainBuilder &_eb;
        CurOp &_curop;
        OpTime _slaveReadTill;
    };

    /* run a query -- includes checking for and running a Command \
       @return points to ns if exhaust mode. 0=normal mode
    */
    const char *runQuery(Message& m, QueryMessage& q, CurOp& curop, Message &result) {
        StringBuilder& ss = curop.debug().str;
        shared_ptr<ParsedQuery> pq_shared( new ParsedQuery(q) );
        ParsedQuery& pq( *pq_shared );
        int ntoskip = q.ntoskip;
        BSONObj jsobj = q.query;
        int queryOptions = q.queryOptions;
        const char *ns = q.ns;

        if( logLevel >= 2 )
            log() << "query: " << ns << jsobj << endl;

        ss << ns;
        {
            // only say ntoreturn if nonzero.
            int n = pq.getNumToReturn();
            if( n )
                ss << " ntoreturn:" << n;
        }
        curop.setQuery(jsobj);

        if ( pq.couldBeCommand() ) {
            BufBuilder bb;
            bb.skip(sizeof(QueryResult));
            BSONObjBuilder cmdResBuf;
            if ( runCommands(ns, jsobj, curop, bb, cmdResBuf, false, queryOptions) ) {
                ss << " command: ";
                jsobj.toString( ss );
                curop.markCommand();
                auto_ptr< QueryResult > qr;
                qr.reset( (QueryResult *) bb.buf() );
                bb.decouple();
                qr->setResultFlagsToOk();
                qr->len = bb.len();
                ss << " reslen:" << bb.len();
                qr->setOperation(opReply);
                qr->cursorId = 0;
                qr->startingFrom = 0;
                qr->nReturned = 1;
                result.setData( qr.release(), true );
            }
            else {
                uasserted(13530, "bad or malformed command request?");
            }
            return 0;
        }

        /* --- regular query --- */

        int n = 0;
        BSONElement hint = useHints ? pq.getHint() : BSONElement();
        bool explain = pq.isExplain();
        bool snapshot = pq.isSnapshot();
        BSONObj order = pq.getOrder();
        BSONObj query = pq.getFilter();

        /* The ElemIter will not be happy if this isn't really an object. So throw exception
           here when that is true.
           (Which may indicate bad data from client.)
        */
        if ( query.objsize() == 0 ) {
            out() << "Bad query object?\n jsobj:";
            out() << jsobj.toString() << "\n query:";
            out() << query.toString() << endl;
            uassert( 10110 , "bad query object", false);
        }

        /* --- read lock --- */

        mongolock lk(false);

        Client::Context ctx( ns , dbpath , &lk );

        replVerifyReadsOk(pq);

        if ( pq.hasOption( QueryOption_CursorTailable ) ) {
            NamespaceDetails *d = nsdetails( ns );
            uassert( 13051, "tailable cursor requested on non capped collection", d && d->capped );
            const BSONObj nat1 = BSON( "$natural" << 1 );
            if ( order.isEmpty() ) {
                order = nat1;
            }
            else {
                uassert( 13052, "only {$natural:1} order allowed for tailable cursor", order == nat1 );
            }
        }

        BSONObj snapshotHint; // put here to keep the data in scope
        if( snapshot ) {
            NamespaceDetails *d = nsdetails(ns);
            if ( d ) {
                int i = d->findIdIndex();
                if( i < 0 ) {
                    if ( strstr( ns , ".system." ) == 0 )
                        log() << "warning: no _id index on $snapshot query, ns:" << ns << endl;
                }
                else {
                    /* [dm] the name of an _id index tends to vary, so we build the hint the hard way here.
                       probably need a better way to specify "use the _id index" as a hint.  if someone is
                       in the query optimizer please fix this then!
                    */
                    BSONObjBuilder b;
                    b.append("$hint", d->idx(i).indexName());
                    snapshotHint = b.obj();
                    hint = snapshotHint.firstElement();
                }
            }
        }

        if ( ! (explain || pq.showDiskLoc()) && isSimpleIdQuery( query ) && !pq.hasOption( QueryOption_CursorTailable ) ) {
            bool nsFound = false;
            bool indexFound = false;

            BSONObj resObject;
            Client& c = cc();
            bool found = Helpers::findById( c, ns , query , resObject , &nsFound , &indexFound );
            if ( nsFound == false || indexFound == true ) {
                BufBuilder bb(sizeof(QueryResult)+resObject.objsize()+32);
                bb.skip(sizeof(QueryResult));

                ss << " idhack ";
                if ( found ) {
                    n = 1;
                    fillQueryResultFromObj( bb , pq.getFields() , resObject );
                }
                auto_ptr< QueryResult > qr;
                qr.reset( (QueryResult *) bb.buf() );
                bb.decouple();
                qr->setResultFlagsToOk();
                qr->len = bb.len();
                ss << " reslen:" << bb.len();
                qr->setOperation(opReply);
                qr->cursorId = 0;
                qr->startingFrom = 0;
                qr->nReturned = n;
                result.setData( qr.release(), true );
                return false;
            }
        }

        // regular, not QO bypass query

        BSONObj oldPlan;
        if ( explain && ! pq.hasIndexSpecifier() ) {
            MultiPlanScanner mps( ns, query, order );
            if ( mps.usingPrerecordedPlan() )
                oldPlan = mps.oldExplain();
        }
        auto_ptr< MultiPlanScanner > mps( new MultiPlanScanner( ns, query, order, &hint, !explain, pq.getMin(), pq.getMax(), false, true ) );
        BSONObj explainSuffix;
        if ( explain ) {
            BSONObjBuilder bb;
            if ( !oldPlan.isEmpty() )
                bb.append( "oldPlan", oldPlan.firstElement().embeddedObject().firstElement().embeddedObject() );
            explainSuffix = bb.obj();
        }
        ExplainBuilder eb;
        UserQueryOp original( pq, result, eb, curop );
        shared_ptr< UserQueryOp > o = mps->runOp( original );
        UserQueryOp &dqo = *o;
        if ( ! dqo.complete() )
            throw MsgAssertionException( dqo.exception() );
        if ( explain ) {
            dqo.finishExplain( explainSuffix );
        }
        n = dqo.n();
        long long nscanned = dqo.totalNscanned();
        if ( dqo.scanAndOrderRequired() )
            ss << " scanAndOrder ";

        shared_ptr<Cursor> cursor = dqo.cursor();
        if( logLevel >= 5 )
            log() << " used cursor: " << cursor.get() << endl;

        long long cursorid = 0;
        const char * exhaust = 0;
        if ( dqo.saveClientCursor() || ( dqo.wouldSaveClientCursor() && mps->mayRunMore() ) ) {
            ClientCursor *cc;
            bool moreClauses = mps->mayRunMore();
            if ( moreClauses ) {
                // this MultiCursor will use a dumb NoOp to advance(), so no need to specify mayYield
                shared_ptr< Cursor > multi( new MultiCursor( mps, cursor, dqo.matcher(), dqo ) );
                cc = new ClientCursor(queryOptions, multi, ns, jsobj.getOwned());
            }
            else {
                if( ! cursor->matcher() )
                    cursor->setMatcher( dqo.matcher() );
                cc = new ClientCursor( queryOptions, cursor, ns, jsobj.getOwned() );
            }
            cursorid = cc->cursorid();
            DEV tlog(2) << "query has more, cursorid: " << cursorid << endl;
            cc->setPos( n );
            cc->pq = pq_shared;
            cc->fields = pq.getFieldPtr();
            cc->originalMessage = m;
            cc->updateLocation();
            if ( !cc->ok() && cc->c()->tailable() )
                DEV tlog() << "query has no more but tailable, cursorid: " << cursorid << endl;
            if( queryOptions & QueryOption_Exhaust ) {
                exhaust = ns;
                ss << " exhaust ";
            }
            dqo.finishForOplogReplay(cc);
        }

        QueryResult *qr = (QueryResult *) result.header();
        qr->cursorId = cursorid;
        qr->setResultFlagsToOk();
        // qr->len is updated automatically by appendData()
        ss << " reslen:" << qr->len;
        qr->setOperation(opReply);
        qr->startingFrom = 0;
        qr->nReturned = n;

        int duration = curop.elapsedMillis();
        bool dbprofile = curop.shouldDBProfile( duration );
        if ( dbprofile || duration >= cmdLine.slowMS ) {
            ss << " nscanned:" << nscanned << ' ';
            if ( ntoskip )
                ss << " ntoskip:" << ntoskip;
            if ( dbprofile )
                ss << " \nquery: ";
            ss << jsobj.toString() << ' ';
        }
        ss << " nreturned:" << n;
        return exhaust;
    }

} // namespace mongo