author    | Antonin Kral <a.kral@bobek.cz> | 2010-03-25 19:21:32 +0100
committer | Antonin Kral <a.kral@bobek.cz> | 2010-03-25 19:21:32 +0100
commit    | 0ca01a91ae0a3562e54c226e7b9512feb2ea83d0 (patch)
tree      | 2b3886e435b0217d6afd63a213b04d32bb4b4f6f /db
parent    | a696359b248adef0cc8576fce3f473535e995136 (diff)
Imported Upstream version 1.4.0
Diffstat (limited to 'db')
83 files changed, 9695 insertions, 3499 deletions
diff --git a/db/background.h b/db/background.h
new file mode 100644
index 0000000..24ea1cb
--- /dev/null
+++ b/db/background.h
@@ -0,0 +1,56 @@
+/**
+* Copyright (C) 2010 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/* background.h
+
+   Concurrency coordination for administrative operations.
+*/
+
+#pragma once
+
+namespace mongo {
+
+    /* these are administrative operations / jobs
+       for a namespace running in the background, and that only one
+       at a time per namespace is permitted, and that if in progress,
+       you aren't allowed to do other NamespaceDetails major manipulations
+       (such as dropping ns or db) even in the foreground and must
+       instead uassert.
+
+       It's assumed this is not for super-high RPS things, so we don't do
+       anything special in the implementation here to be fast.
+    */
+    class BackgroundOperation : public boost::noncopyable {
+    public:
+        static bool inProgForDb(const char *db);
+        static bool inProgForNs(const char *ns);
+        static void assertNoBgOpInProgForDb(const char *db);
+        static void assertNoBgOpInProgForNs(const char *ns);
+        static void dump(stringstream&);
+
+        /* check for in progress before instantiating */
+        BackgroundOperation(const char *ns);
+
+        virtual ~BackgroundOperation();
+
+    private:
+        NamespaceString _ns;
+        static map<string, unsigned> dbsInProg;
+        static set<string> nsInProg;
+    };
+
+} // namespace mongo
+
diff --git a/db/btree.cpp b/db/btree.cpp
index 8b910f5..18f9e76 100644
--- a/db/btree.cpp
+++ b/db/btree.cpp
@@ -25,6 +25,7 @@
 #include "client.h"
 #include "dbhelpers.h"
 #include "curop.h"
+#include "stats/counters.h"
 
 namespace mongo {
 
@@ -41,6 +42,11 @@
     const int split_debug = 0;
     const int insert_debug = 0;
 
+    static void alreadyInIndex() {
+        // we don't use massert() here as that does logging and this is 'benign' - see catches in _indexRecord()
+        throw MsgAssertionException(10287, "btree: key+recloc already in index");
+    }
+
     /* BucketBasics --------------------------------------------------- */
 
     inline void BucketBasics::modified(const DiskLoc& thisLoc) {
@@ -356,9 +362,36 @@
         return false;
     }
 
+    /* @param self - don't complain about ourself already being in the index case.
+       @return true = there is a duplicate.
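To picture the contract stated above — a key only counts as a duplicate when it is already indexed under a *different* record location — here is a self-contained sketch using standard containers. A std::multimap stands in for the btree and a long long for DiskLoc, so none of this is the actual index code:

    #include <map>
    #include <string>
    #include <utility>
    #include <cassert>

    typedef long long RecordLoc;   // hypothetical stand-in for DiskLoc

    bool wouldCreateDup(const std::multimap<std::string, RecordLoc>& index,
                        const std::string& key, RecordLoc self) {
        typedef std::multimap<std::string, RecordLoc>::const_iterator It;
        std::pair<It, It> range = index.equal_range(key);
        for (It i = range.first; i != range.second; ++i)
            if (i->second != self)
                return true;       // same key under another record: real dup
        return false;              // absent, or only our own entry: benign
    }

    int main() {
        std::multimap<std::string, RecordLoc> index;
        index.insert(std::make_pair("a", 1LL));
        assert(!wouldCreateDup(index, "a", 1));  // re-inserting ourselves
        assert(wouldCreateDup(index, "a", 2));   // would be an E11000 dup
        assert(!wouldCreateDup(index, "b", 2));  // key not present at all
    }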
+    */
+    bool BtreeBucket::wouldCreateDup(
+        const IndexDetails& idx, DiskLoc thisLoc,
+        const BSONObj& key, BSONObj order,
+        DiskLoc self)
+    {
+        int pos;
+        bool found;
+        DiskLoc b = locate(idx, thisLoc, key, order, pos, found, minDiskLoc);
+
+        while ( !b.isNull() ) {
+            // we skip unused keys
+            BtreeBucket *bucket = b.btree();
+            _KeyNode& kn = bucket->k(pos);
+            if ( kn.isUsed() ) {
+                if( bucket->keyAt(pos).woEqual(key) )
+                    return kn.recordLoc != self;
+                break;
+            }
+            b = bucket->advance(b, pos, 1, "BtreeBucket::dupCheck");
+        }
+
+        return false;
+    }
+
     string BtreeBucket::dupKeyError( const IndexDetails& idx , const BSONObj& key ){
         stringstream ss;
-        ss << "E11000 duplicate key error";
+        ss << "E11000 duplicate key error ";
         ss << "index: " << idx.indexNamespace() << " ";
         ss << "dup key: " << key;
         return ss.str();
@@ -391,6 +424,9 @@
             }
         }
 #endif
+
+        globalIndexCounters.btree( (char*)this );
+
         /* binary search for this key */
         bool dupsChecked = false;
         int l=0;
@@ -407,12 +443,19 @@
                     // coding effort in here to make this particularly fast
                     if( !dupsChecked ) {
                         dupsChecked = true;
-                        if( idx.head.btree()->exists(idx, idx.head, key, order) )
-                            uasserted( ASSERT_ID_DUPKEY , dupKeyError( idx , key ) );
+                        if( idx.head.btree()->exists(idx, idx.head, key, order) ) {
+                            if( idx.head.btree()->wouldCreateDup(idx, idx.head, key, order, recordLoc) )
+                                uasserted( ASSERT_ID_DUPKEY , dupKeyError( idx , key ) );
+                            else
+                                alreadyInIndex();
+                        }
                     }
                 }
-                else
+                else {
+                    if( M.recordLoc == recordLoc )
+                        alreadyInIndex();
                     uasserted( ASSERT_ID_DUPKEY , dupKeyError( idx , key ) );
+                }
             }
 
         // dup keys allowed. use recordLoc as if it is part of the key
@@ -444,7 +487,7 @@
     }
 
     void BtreeBucket::delBucket(const DiskLoc& thisLoc, IndexDetails& id) {
-        ClientCursor::informAboutToDeleteBucket(thisLoc);
+        ClientCursor::informAboutToDeleteBucket(thisLoc); // slow...
        assert( !isHead() );
 
         BtreeBucket *p = parent.btreemod();
@@ -466,6 +509,10 @@
             assert(false);
         }
 found:
+        deallocBucket( thisLoc );
+    }
+
+    void BtreeBucket::deallocBucket(const DiskLoc &thisLoc) {
 #if 1
         /* as a temporary defensive measure, we zap the whole bucket, AND don't truly delete
           it (meaning it is ineligible for reuse).
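The "zap the whole bucket, and don't truly delete it" tactic this comment describes can be pictured in miniature: poison the freed block and keep it allocated, so any cursor still holding a pointer reads an obvious pattern instead of someone else's newly allocated data. The bucket size and poison byte below are invented for illustration:

    #include <cstring>

    const int BucketSize = 8192;              // hypothetical size
    struct Bucket { char data[BucketSize]; };

    // Poison the bucket and deliberately do NOT put it on a freelist,
    // so the slot can never be handed out again while stale references
    // to it might still exist.
    void deallocBucket(Bucket* b) {
        std::memset(b->data, 0xdb, sizeof(b->data));
    }

    int main() {
        static Bucket b;
        deallocBucket(&b);   // b stays allocated-but-dead from here on
    }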
@@ -807,13 +854,15 @@ found: return 0; } - out() << "_insert(): key already exists in index\n"; - out() << " " << idx.indexNamespace().c_str() << " thisLoc:" << thisLoc.toString() << '\n'; - out() << " " << key.toString() << '\n'; - out() << " " << "recordLoc:" << recordLoc.toString() << " pos:" << pos << endl; - out() << " old l r: " << childForPos(pos).toString() << ' ' << childForPos(pos+1).toString() << endl; - out() << " new l r: " << lChild.toString() << ' ' << rChild.toString() << endl; - massert( 10287 , "btree: key+recloc already in index", false); + DEV { + out() << "_insert(): key already exists in index (ok for background:true)\n"; + out() << " " << idx.indexNamespace().c_str() << " thisLoc:" << thisLoc.toString() << '\n'; + out() << " " << key.toString() << '\n'; + out() << " " << "recordLoc:" << recordLoc.toString() << " pos:" << pos << endl; + out() << " old l r: " << childForPos(pos).toString() << ' ' << childForPos(pos+1).toString() << endl; + out() << " new l r: " << lChild.toString() << ' ' << rChild.toString() << endl; + } + alreadyInIndex(); } DEBUGGING out() << "TEMP: key: " << key.toString() << endl; @@ -926,12 +975,11 @@ namespace mongo { b->k(1).setUnused(); b->dumpTree(id.head, order); - cout << "---\n"; b->bt_insert(id.head, A, key, order, false, id); b->dumpTree(id.head, order); - cout << "---\n";*/ + */ // this should assert. does it? (it might "accidentally" though, not asserting proves a problem, asserting proves nothing) b->bt_insert(id.head, C, key, order, false, id); @@ -1004,20 +1052,27 @@ namespace mongo { BSONObj k; DiskLoc r; x->popBack(r,k); - if( x->n == 0 ) - log() << "warning: empty bucket on BtreeBuild " << k.toString() << endl; + bool keepX = ( x->n != 0 ); + DiskLoc keepLoc = keepX ? xloc : x->nextChild; - if ( ! up->_pushBack(r, k, order, xloc) ){ + if ( ! 
up->_pushBack(r, k, order, keepLoc) ){ // current bucket full DiskLoc n = BtreeBucket::addBucket(idx); up->tempNext() = n; upLoc = n; up = upLoc.btreemod(); - up->pushBack(r, k, order, xloc); + up->pushBack(r, k, order, keepLoc); } - xloc = x->tempNext(); /* get next in chain at current level */ - x->parent = upLoc; + DiskLoc nextLoc = x->tempNext(); /* get next in chain at current level */ + if ( keepX ) { + x->parent = upLoc; + } else { + if ( !x->nextChild.isNull() ) + x->nextChild.btreemod()->parent = upLoc; + x->deallocBucket( xloc ); + } + xloc = nextLoc; } loc = upStart; @@ -20,7 +20,7 @@ #include "../stdafx.h" #include "jsobj.h" -#include "storage.h" +#include "diskloc.h" #include "pdfile.h" namespace mongo { @@ -28,8 +28,8 @@ namespace mongo { #pragma pack(1) struct _KeyNode { - DiskLoc prevChildBucket; - DiskLoc recordLoc; + DiskLoc prevChildBucket; // the lchild + DiskLoc recordLoc; // location of the record associated with the key short keyDataOfs() const { return (short) _kdo; } @@ -53,10 +53,10 @@ namespace mongo { */ recordLoc.GETOFS() |= 1; } - int isUnused() { + int isUnused() const { return recordLoc.getOfs() & 1; } - int isUsed() { + int isUsed() const { return !isUnused(); } }; @@ -85,13 +85,18 @@ namespace mongo { bool isHead() { return parent.isNull(); } void assertValid(const BSONObj &order, bool force = false); int fullValidate(const DiskLoc& thisLoc, const BSONObj &order); /* traverses everything */ - protected: - void modified(const DiskLoc& thisLoc); + KeyNode keyNode(int i) const { - assert( i < n ); + if ( i >= n ){ + massert( 13000 , (string)"invalid keyNode: " + BSON( "i" << i << "n" << n ).jsonString() , i < n ); + } return KeyNode(*this, k(i)); } + protected: + + void modified(const DiskLoc& thisLoc); + char * dataAt(short ofs) { return data + ofs; } @@ -151,6 +156,10 @@ namespace mongo { ss << " emptySize: " << emptySize << " topSize: " << topSize << endl; return ss.str(); } + + bool isUsed( int i ) const { + return k(i).isUsed(); + } protected: void _shape(int level, stringstream&); @@ -184,7 +193,13 @@ namespace mongo { */ bool exists(const IndexDetails& idx, DiskLoc thisLoc, const BSONObj& key, BSONObj order); + bool wouldCreateDup( + const IndexDetails& idx, DiskLoc thisLoc, + const BSONObj& key, BSONObj order, + DiskLoc self); + static DiskLoc addBucket(IndexDetails&); /* start a new index off, empty */ + void deallocBucket(const DiskLoc &thisLoc); // clear bucket memory, placeholder for deallocation static void renameIndexNamespace(const char *oldNs, const char *newNs); @@ -256,6 +271,7 @@ namespace mongo { virtual void noteLocation(); // updates keyAtKeyOfs... virtual void checkLocation(); + virtual bool supportGetMore() { return true; } /* used for multikey index traversal to avoid sending back dups. see Matcher::matches(). 
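A sketch of the dedup problem mentioned here: a multikey index can return the same record once per matching array element, so the cursor remembers which record locations it has already surrendered, in the spirit of getsetdup(DiskLoc). This is a toy model, not the real cursor code:

    #include <set>
    #include <iostream>

    typedef long long DiskLocT;   // hypothetical stand-in for DiskLoc

    struct DedupCursor {
        std::set<DiskLocT> seen;
        // returns true if loc was already returned; otherwise records it
        bool getsetdup(DiskLocT loc) { return !seen.insert(loc).second; }
    };

    int main() {
        DedupCursor c;
        DiskLocT hits[] = { 7, 9, 7, 7, 12 };  // record 7 matches 3 array keys
        for (int i = 0; i < 5; i++)
            if (!c.getsetdup(hits[i]))
                std::cout << "return record " << hits[i] << "\n"; // 7, 9, 12 once each
    }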
if a multikey index traversal: @@ -318,15 +334,20 @@ namespace mongo { return key.replaceFieldNames( indexDetails.keyPattern() ).clientReadable(); } - virtual BSONObj prettyStartKey() const { - return prettyKey( startKey ); - } - virtual BSONObj prettyEndKey() const { - return prettyKey( endKey ); + virtual BSONObj prettyIndexBounds() const { + BSONArrayBuilder ba; + if ( bounds_.size() == 0 ) { + ba << BSON_ARRAY( prettyKey( startKey ) << prettyKey( endKey ) ); + } else { + for( BoundList::const_iterator i = bounds_.begin(); i != bounds_.end(); ++i ) { + ba << BSON_ARRAY( prettyKey( i->first ) << prettyKey( i->second ) ); + } + } + return ba.arr(); } void forgetEndKey() { endKey = BSONObj(); } - + private: /* Our btrees may (rarely) have "unused" keys when items are deleted. Skip past them. @@ -362,6 +383,7 @@ namespace mongo { DiskLoc locAtKeyOfs; BoundList bounds_; unsigned boundIndex_; + const IndexSpec& _spec; }; #pragma pack() @@ -369,6 +391,9 @@ namespace mongo { inline bool IndexDetails::hasKey(const BSONObj& key) { return head.btree()->exists(*this, head, key, keyPattern()); } + inline bool IndexDetails::wouldCreateDup(const BSONObj& key, DiskLoc self) { + return head.btree()->wouldCreateDup(*this, head, key, keyPattern(), self); + } /* build btree from the bottom up */ /* _ TODO dropDups */ diff --git a/db/btreecursor.cpp b/db/btreecursor.cpp index bb477d6..ab15c44 100644 --- a/db/btreecursor.cpp +++ b/db/btreecursor.cpp @@ -36,7 +36,8 @@ namespace mongo { indexDetails( _id ), order( _id.keyPattern() ), direction( _direction ), - boundIndex_() + boundIndex_(), + _spec( _id.getSpec() ) { audit(); init(); @@ -51,7 +52,8 @@ namespace mongo { order( _id.keyPattern() ), direction( _direction ), bounds_( _bounds ), - boundIndex_() + boundIndex_(), + _spec( _id.getSpec() ) { assert( !bounds_.empty() ); audit(); @@ -74,6 +76,10 @@ namespace mongo { } void BtreeCursor::init() { + if ( _spec.getType() ){ + startKey = _spec.getType()->fixKey( startKey ); + endKey = _spec.getType()->fixKey( endKey ); + } bool found; bucket = indexDetails.head.btree()-> locate(indexDetails, indexDetails.head, startKey, order, keyOfs, found, direction > 0 ? minDiskLoc : maxDiskLoc, direction); @@ -88,7 +94,7 @@ namespace mongo { init(); } while ( !ok() && ++boundIndex_ < bounds_.size() ); } - + /* skip unused keys. */ void BtreeCursor::skipUnusedKeys() { int u = 0; diff --git a/db/client.cpp b/db/client.cpp index 68a0c9e..dc82a25 100644 --- a/db/client.cpp +++ b/db/client.cpp @@ -1,5 +1,5 @@ -// client.cpp
-
+// client.cpp + /** * Copyright (C) 2009 10gen Inc. * @@ -25,40 +25,41 @@ #include "client.h" #include "curop.h" #include "json.h" - +#include "security.h" + namespace mongo { - boost::mutex Client::clientsMutex; + mongo::mutex Client::clientsMutex; set<Client*> Client::clients; // always be in clientsMutex when manipulating this boost::thread_specific_ptr<Client> currentClient; Client::Client(const char *desc) : - _curOp(new CurOp()), - _database(0), _ns("")/*, _nsstr("")*/ - ,_shutdown(false), + _context(0), + _shutdown(false), _desc(desc), _god(0) - { - ai = new AuthenticationInfo(); - boostlock bl(clientsMutex); + { + _curOp = new CurOp( this ); + scoped_lock bl(clientsMutex); clients.insert(this); } Client::~Client() { delete _curOp; - delete ai; - ai = 0; _god = 0; - if ( !_shutdown ) { - cout << "ERROR: Client::shutdown not called!" << endl; - } + + if ( _context ) + cout << "ERROR: Client::~Client _context should be NULL: " << _desc << endl; + if ( !_shutdown ) + cout << "ERROR: Client::shutdown not called: " << _desc << endl; } bool Client::shutdown(){ _shutdown = true; - + if ( inShutdown() ) + return false; { - boostlock bl(clientsMutex); + scoped_lock bl(clientsMutex); clients.erase(this); } @@ -68,8 +69,10 @@ namespace mongo { didAnything = true; for ( list<string>::iterator i = _tempCollections.begin(); i!=_tempCollections.end(); i++ ){ string ns = *i; + Top::global.collectionDropped( ns ); + dblock l; - setClient( ns.c_str() ); + Client::Context ctx( ns ); if ( ! nsdetails( ns.c_str() ) ) continue; try { @@ -88,12 +91,158 @@ namespace mongo { } BSONObj CurOp::_tooBig = fromjson("{\"$msg\":\"query not recording (too large)\"}"); - WrappingInt CurOp::_nextOpNum; + AtomicUInt CurOp::_nextOpNum; - Client::Context::Context( string ns , Database * db ) - : _client( currentClient.get() ) { + Client::Context::Context( string ns , Database * db, bool doauth ) + : _client( currentClient.get() ) , _oldContext( _client->_context ) , + _path( dbpath ) , _lock(0) , _justCreated(false) { assert( db && db->isOk() ); - _client->setns( ns.c_str() , db ); + _ns = ns; + _db = db; + _client->_context = this; + if ( doauth ) + _auth(); + } + + void Client::Context::_finishInit( bool doauth ){ + int lockState = dbMutex.getState(); + assert( lockState ); + + _db = dbHolder.get( _ns , _path ); + if ( _db ){ + _justCreated = false; + } + else if ( dbMutex.getState() > 0 ){ + // already in a write lock + _db = dbHolder.getOrCreate( _ns , _path , _justCreated ); + assert( _db ); + } + else if ( dbMutex.getState() < -1 ){ + // nested read lock :( + assert( _lock ); + _lock->releaseAndWriteLock(); + _db = dbHolder.getOrCreate( _ns , _path , _justCreated ); + assert( _db ); + } + else { + // we have a read lock, but need to get a write lock for a bit + // we need to be in a write lock since we're going to create the DB object + // to do that, we're going to unlock, then get a write lock + // this is so that if this is the first query and its long doesn't block db + // we just have to check that the db wasn't closed in the interim where we unlock + for ( int x=0; x<2; x++ ){ + { + dbtemprelease unlock; + writelock lk( _ns ); + dbHolder.getOrCreate( _ns , _path , _justCreated ); + } + + _db = dbHolder.get( _ns , _path ); + + if ( _db ) + break; + + log() << "db was closed on us right after we opened it: " << _ns << endl; + } + + uassert( 13005 , "can't create db, keeps getting closed" , _db ); + } + + _client->_context = this; + _client->_curOp->enter( this ); + if ( doauth ) + _auth( lockState ); + } + + 
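The locking in _finishInit above deserves a gloss: a database can only be created under the write lock, so a reading thread drops its shared lock, creates the database under an exclusive lock, re-takes the shared lock, and then re-checks that the database wasn't closed during the unlocked window — hence the two-attempt loop and uassert 13005. A rough equivalent with standard primitives, where std::shared_mutex and a plain map stand in for dbMutex and dbHolder:

    #include <shared_mutex>
    #include <map>
    #include <string>
    #include <iostream>

    std::shared_mutex dbLock;           // stand-in for dbMutex
    std::map<std::string, int> dbs;     // stand-in for dbHolder

    // Caller holds dbLock shared. Returns the db, creating it if needed.
    int* getOrCreateDb(const std::string& name) {
        for (int attempt = 0; attempt < 2; ++attempt) {
            {   // give up the read lock, create under the write lock
                dbLock.unlock_shared();
                std::unique_lock<std::shared_mutex> w(dbLock);
                dbs.insert(std::make_pair(name, 0));  // no-op if present
            }   // write lock released here
            dbLock.lock_shared();                     // back to read lock
            std::map<std::string, int>::iterator it = dbs.find(name);
            if (it != dbs.end())
                return &it->second;                   // still there: done
            // closed between our unlock and relock; retry once
            std::cerr << "db was closed on us right after we opened it\n";
        }
        return 0;   // caller treats as an error (cf. uassert 13005)
    }

    int main() {
        dbLock.lock_shared();           // callers arrive read-locked
        int* db = getOrCreateDb("test");
        dbLock.unlock_shared();
        return db ? 0 : 1;
    }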
void Client::Context::_auth( int lockState ){ + if ( _client->_ai.isAuthorizedForLock( _db->name , lockState ) ) + return; + + // before we assert, do a little cleanup + _client->_context = _oldContext; // note: _oldContext may be null + + stringstream ss; + ss << "unauthorized for db [" << _db->name << "] lock type: " << lockState << endl; + massert( 10057 , ss.str() , 0 ); + } + + Client::Context::~Context() { + DEV assert( _client == currentClient.get() ); + _client->_curOp->leave( this ); + _client->_context = _oldContext; // note: _oldContext may be null + } + + string Client::toString() const { + stringstream ss; + if ( _curOp ) + ss << _curOp->infoNoauth().jsonString(); + return ss.str(); + } + + string sayClientState(){ + Client* c = currentClient.get(); + if ( ! c ) + return "no client"; + return c->toString(); + } + + void curopWaitingForLock( int type ){ + Client * c = currentClient.get(); + assert( c ); + CurOp * co = c->curop(); + if ( co ){ + co->waitingForLock( type ); + } + } + void curopGotLock(){ + Client * c = currentClient.get(); + assert(c); + CurOp * co = c->curop(); + if ( co ){ + co->gotLock(); + } + } + + BSONObj CurOp::infoNoauth() { + BSONObjBuilder b; + b.append("opid", _opNum); + bool a = _active && _start; + b.append("active", a); + if ( _lockType ) + b.append("lockType" , _lockType > 0 ? "write" : "read" ); + b.append("waitingForLock" , _waitingForLock ); + + if( a ){ + b.append("secs_running", elapsedSeconds() ); + } + + b.append( "op" , opToString( _op ) ); + + b.append("ns", _ns); + + if( haveQuery() ) { + b.append("query", query()); + } + // b.append("inLock", ?? + stringstream clientStr; + clientStr << inet_ntoa( _remote.sin_addr ) << ":" << ntohs( _remote.sin_port ); + b.append("client", clientStr.str()); + + if ( _client ) + b.append( "desc" , _client->desc() ); + + if ( ! _message.empty() ){ + if ( _progressMeter.isActive() ){ + StringBuilder buf(128); + buf << _message << " " << _progressMeter.toString(); + b.append( "msg" , buf.str() ); + } + else { + b.append( "msg" , _message ); + } + } + + return b.obj(); } } diff --git a/db/client.h b/db/client.h index 99092ca..ab43509 100644 --- a/db/client.h +++ b/db/client.h @@ -1,5 +1,5 @@ -// client.h
-
+// client.h + /** * Copyright (C) 2008 10gen Inc. * @@ -25,9 +25,10 @@ #pragma once #include "../stdafx.h" +#include "security.h" #include "namespace.h" #include "lasterror.h" -#include "../util/top.h" +#include "stats/top.h" namespace mongo { @@ -39,12 +40,9 @@ namespace mongo { extern boost::thread_specific_ptr<Client> currentClient; - bool setClient(const char *ns, const string& path=dbpath, mongolock *lock = 0); - - class Client : boost::noncopyable { public: - static boost::mutex clientsMutex; + static mongo::mutex clientsMutex; static set<Client*> clients; // always be in clientsMutex when manipulating this class GodScope { @@ -57,71 +55,125 @@ namespace mongo { /* Set database we want to use, then, restores when we finish (are out of scope) Note this is also helpful if an exception happens as the state if fixed up. */ - class Context { + class Context : boost::noncopyable{ Client * _client; - Database * _olddb; - string _oldns; + Context * _oldContext; + + string _path; + mongolock * _lock; + bool _justCreated; + + string _ns; + Database * _db; + + /** + * at this point _client, _oldContext and _ns have to be set + * _db should not have been touched + * this will set _db and create if needed + * will also set _client->_context to this + */ + void _finishInit( bool doauth=true); + + void _auth( int lockState = dbMutex.getState() ); public: - Context(const char *ns) - : _client( currentClient.get() ) { - _olddb = _client->_database; - _oldns = _client->_ns; - setClient(ns); - } - Context(string ns) - : _client( currentClient.get() ){ - _olddb = _client->_database; - _oldns = _client->_ns; - setClient(ns.c_str()); + Context(const string& ns, string path=dbpath, mongolock * lock = 0 , bool doauth=true ) + : _client( currentClient.get() ) , _oldContext( _client->_context ) , + _path( path ) , _lock( lock ) , + _ns( ns ){ + _finishInit( doauth ); } /* this version saves the context but doesn't yet set the new one: */ - Context() - : _client( currentClient.get() ) { - _olddb = _client->database(); - _oldns = _client->ns(); + Context() + : _client( currentClient.get() ) , _oldContext( _client->_context ), + _path( dbpath ) , _lock(0) , _justCreated(false){ + _client->_context = this; + clear(); } /** * if you are doing this after allowing a write there could be a race condition * if someone closes that db. 
this checks that the DB is still valid */ - Context( string ns , Database * db ); + Context( string ns , Database * db, bool doauth=true ); + + ~Context(); + + Client* getClient() const { return _client; } + + Database* db() const { + return _db; + } - ~Context() { - DEV assert( _client == currentClient.get() ); - _client->setns( _oldns.c_str(), _olddb ); + const char * ns() const { + return _ns.c_str(); + } + + bool justCreated() const { + return _justCreated; } - }; + bool equals( const string& ns , const string& path=dbpath ) const { + return _ns == ns && _path == path; + } + + bool inDB( const string& db , const string& path=dbpath ) const { + if ( _path != path ) + return false; + + if ( db == _ns ) + return true; + + string::size_type idx = _ns.find( db ); + if ( idx != 0 ) + return false; + + return _ns[db.size()] == '.'; + } + void clear(){ + _ns = ""; + _db = 0; + } + + /** + * call before unlocking, so clear any non-thread safe state + */ + void unlocked(){ + _db = 0; + } + + /** + * call after going back into the lock, will re-establish non-thread safe stuff + */ + void relocked(){ + _finishInit(); + } + + friend class CurOp; + }; + private: - CurOp * const _curOp; - Database *_database; - Namespace _ns; - //NamespaceString _nsstr; + CurOp * _curOp; + Context * _context; bool _shutdown; list<string> _tempCollections; const char *_desc; bool _god; + AuthenticationInfo _ai; + public: - AuthenticationInfo *ai; - Top top; + + AuthenticationInfo * getAuthenticationInfo(){ return &_ai; } + bool isAdmin() { return _ai.isAuthorized( "admin" ); } CurOp* curop() { return _curOp; } - Database* database() { - return _database; - } - const char *ns() { return _ns.buf; } - - void setns(const char *ns, Database *db) { - _database = db; - _ns = ns; - //_nsstr = ns; - } - void clearns() { setns("", 0); } - + + Context* getContext(){ return _context; } + Database* database() { return _context ? _context->db() : 0; } + const char *ns() { return _context->ns(); } + Client(const char *desc); ~Client(); @@ -143,6 +195,10 @@ namespace mongo { bool shutdown(); bool isGod() const { return _god; } + + friend class CurOp; + + string toString() const; }; inline Client& cc() { @@ -182,12 +238,15 @@ namespace mongo { dbMutex.unlock_shared(); dbMutex.lock(); - /* this is defensive; as we were unlocked for a moment above, - the Database object we reference could have been deleted: - */ - cc().clearns(); + if ( cc().getContext() ) + cc().getContext()->unlocked(); } } - + + string sayClientState(); + + inline bool haveClient(){ + return currentClient.get() > 0; + } }; diff --git a/db/clientcursor.cpp b/db/clientcursor.cpp index 0de0b2e..be0bd2f 100644 --- a/db/clientcursor.cpp +++ b/db/clientcursor.cpp @@ -36,7 +36,7 @@ namespace mongo { boost::recursive_mutex ClientCursor::ccmutex; unsigned ClientCursor::byLocSize() { - recursive_boostlock lock(ccmutex); + recursive_scoped_lock lock(ccmutex); return byLoc.size(); } @@ -63,7 +63,7 @@ namespace mongo { /* todo: this implementation is incomplete. we use it as a prefix for dropDatabase, which works fine as the prefix will end with '.'. however, when used with drop and - deleteIndexes, this could take out cursors that belong to something else -- if you + dropIndexes, this could take out cursors that belong to something else -- if you drop "foo", currently, this will kill cursors for "foobar". 
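The caveat in this comment — dropping "foo" would also kill cursors for "foobar" — is exactly what the dot-boundary test in the new Client::Context::inDB (client.h hunk above) avoids. As a standalone sketch (a hypothetical helper, not part of this patch):

    #include <string>
    #include <cassert>

    bool inDB(const std::string& ns, const std::string& db) {
        if (ns.compare(0, db.size(), db) != 0)
            return false;                  // not even a string prefix
        return ns.size() == db.size()      // exactly the db name, or
            || ns[db.size()] == '.';       // followed by the namespace dot
    }

    int main() {
        assert(inDB("foo.bar", "foo"));
        assert(!inDB("foobar.baz", "foo")); // the case the comment warns about
        assert(inDB("foo", "foo"));
    }

The same boundary logic is why dropDatabase is safe with the current prefix-based invalidate: its prefix already ends with '.'.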
*/ void ClientCursor::invalidate(const char *nsPrefix) { @@ -73,7 +73,7 @@ namespace mongo { assert( len > 0 && strchr(nsPrefix, '.') ); { - recursive_boostlock lock(ccmutex); + recursive_scoped_lock lock(ccmutex); for ( CCByLoc::iterator i = byLoc.begin(); i != byLoc.end(); ++i ) { ClientCursor *cc = i->second; @@ -88,7 +88,7 @@ namespace mongo { /* called every 4 seconds. millis is amount of idle time passed since the last call -- could be zero */ void ClientCursor::idleTimeReport(unsigned millis) { - recursive_boostlock lock(ccmutex); + recursive_scoped_lock lock(ccmutex); for ( CCByLoc::iterator i = byLoc.begin(); i != byLoc.end(); ) { CCByLoc::iterator j = i; i++; @@ -104,7 +104,7 @@ namespace mongo { note this is potentially slow */ void ClientCursor::informAboutToDeleteBucket(const DiskLoc& b) { - recursive_boostlock lock(ccmutex); + recursive_scoped_lock lock(ccmutex); RARELY if ( byLoc.size() > 70 ) { log() << "perf warning: byLoc.size=" << byLoc.size() << " in aboutToDeleteBucket\n"; } @@ -117,7 +117,7 @@ namespace mongo { /* must call this on a delete so we clean up the cursors. */ void ClientCursor::aboutToDelete(const DiskLoc& dl) { - recursive_boostlock lock(ccmutex); + recursive_scoped_lock lock(ccmutex); CCByLoc::iterator j = byLoc.lower_bound(dl); CCByLoc::iterator stop = byLoc.upper_bound(dl); @@ -170,7 +170,7 @@ namespace mongo { assert( pos != -2 ); { - recursive_boostlock lock(ccmutex); + recursive_scoped_lock lock(ccmutex); setLastLoc_inlock( DiskLoc() ); // removes us from bylocation multimap clientCursorsById.erase(cursorid); @@ -193,7 +193,7 @@ namespace mongo { return; } { - recursive_boostlock lock(ccmutex); + recursive_scoped_lock lock(ccmutex); setLastLoc_inlock(cl); c->noteLocation(); } @@ -217,7 +217,7 @@ namespace mongo { static bool inEmpty = false; if( test && !inEmpty ) { inEmpty = true; - log() << "TEST: manipulate collection during remove" << endl; + log() << "TEST: manipulate collection during cc:yield" << endl; if( test == 1 ) Helpers::emptyCollection(ns.c_str()); else if( test == 2 ) { @@ -267,8 +267,9 @@ namespace mongo { virtual void help( stringstream& help ) const { help << " example: { cursorInfo : 1 }"; } + virtual LockType locktype(){ return NONE; } bool run(const char *dbname, BSONObj& jsobj, string& errmsg, BSONObjBuilder& result, bool fromRepl ){ - recursive_boostlock lock(ClientCursor::ccmutex); + recursive_scoped_lock lock(ClientCursor::ccmutex); result.append("byLocation_size", unsigned( ClientCursor::byLoc.size() ) ); result.append("clientCursors_size", unsigned( ClientCursor::clientCursorsById.size() ) ); return true; diff --git a/db/clientcursor.h b/db/clientcursor.h index 03f20e9..42919e3 100644 --- a/db/clientcursor.h +++ b/db/clientcursor.h @@ -28,7 +28,7 @@ #include "cursor.h" #include "jsobj.h" #include "../util/message.h" -#include "storage.h" +#include "diskloc.h" #include "dbhelpers.h" #include "matcher.h" @@ -83,7 +83,7 @@ namespace mongo { _c = 0; } Pointer(long long cursorid) { - recursive_boostlock lock(ccmutex); + recursive_scoped_lock lock(ccmutex); _c = ClientCursor::find_inlock(cursorid, true); if( _c ) { if( _c->_pinValue >= 100 ) { @@ -105,8 +105,15 @@ namespace mongo { int pos; // # objects into the cursor so far BSONObj query; - ClientCursor() : _idleAgeMillis(0), _pinValue(0), _doingDeletes(false), pos(0) { - recursive_boostlock lock(ccmutex); + ClientCursor(auto_ptr<Cursor>& _c, const char *_ns, bool okToTimeout) : + _idleAgeMillis(0), _pinValue(0), + _doingDeletes(false), + ns(_ns), c(_c), + pos(0) + { + if( 
!okToTimeout ) + noTimeout(); + recursive_scoped_lock lock(ccmutex); cursorid = allocCursorId_inlock(); clientCursorsById.insert( make_pair(cursorid, this) ); } @@ -116,11 +123,11 @@ namespace mongo { return _lastLoc; } - auto_ptr< FieldMatcher > filter; // which fields query wants returned + shared_ptr< FieldMatcher > fields; // which fields query wants returned Message originalMessage; // this is effectively an auto ptr for data the matcher points to /* Get rid of cursors for namespaces that begin with nsprefix. - Used by drop, deleteIndexes, dropDatabase. + Used by drop, dropIndexes, dropDatabase. */ static void invalidate(const char *nsPrefix); @@ -130,7 +137,8 @@ namespace mongo { * we don't do herein as this->matcher (above) is only initialized for true queries/getmore. * (ie not set for remote/update) * @return if the cursor is still valid. - * if false is returned, then this ClientCursor should be considered deleted + * if false is returned, then this ClientCursor should be considered deleted - + * in fact, the whole database could be gone. */ bool yield(); private: @@ -147,16 +155,16 @@ namespace mongo { } public: static ClientCursor* find(CursorId id, bool warn = true) { - recursive_boostlock lock(ccmutex); + recursive_scoped_lock lock(ccmutex); ClientCursor *c = find_inlock(id, warn); // if this asserts, your code was not thread safe - you either need to set no timeout // for the cursor or keep a ClientCursor::Pointer in scope for it. - massert( 12521, "internal error: use of an unlocked ClientCursor", c->_pinValue ); + massert( 12521, "internal error: use of an unlocked ClientCursor", c == 0 || c->_pinValue ); return c; } static bool erase(CursorId id) { - recursive_boostlock lock(ccmutex); + recursive_scoped_lock lock(ccmutex); ClientCursor *cc = find_inlock(id); if ( cc ) { assert( cc->_pinValue < 100 ); // you can't still have an active ClientCursor::Pointer @@ -195,13 +203,13 @@ namespace mongo { } static void idleTimeReport(unsigned millis); - +private: // cursors normally timeout after an inactivy period to prevent excess memory use // setting this prevents timeout of the cursor in question. void noTimeout() { _pinValue++; } - +public: void setDoingDeletes( bool doingDeletes ){ _doingDeletes = doingDeletes; } diff --git a/db/cloner.cpp b/db/cloner.cpp index 862f37c..d300721 100644 --- a/db/cloner.cpp +++ b/db/cloner.cpp @@ -46,6 +46,7 @@ namespace mongo { snapshot - use $snapshot mode for copying collections. note this should not be used when it isn't required, as it will be slower. for example repairDatabase need not use it. */ + void setConnection( DBClientWithCommands *c ) { conn.reset( c ); } bool go(const char *masterHost, string& errmsg, const string& fromdb, bool logForRepl, bool slaveOk, bool useReplAuth, bool snapshot); bool startCloneCollection( const char *fromhost, const char *ns, const BSONObj &query, string& errmsg, bool logForRepl, bool copyIndexes, int logSizeMb, long long &cursorId ); bool finishCloneCollection( const char *fromhost, const char *ns, const BSONObj &query, long long cursorId, string &errmsg ); @@ -97,11 +98,11 @@ namespace mongo { list<BSONObj> storedForLater; - assert( c.get() ); + massert( 13055 , "socket error in Cloner:copy" , c.get() ); long long n = 0; time_t saveLast = time( 0 ); while ( 1 ) { - { + if( !c->moreInCurrentBatch() || n % 128 == 127 /*yield some*/ ) { dbtemprelease r; if ( !c->more() ) break; @@ -111,7 +112,7 @@ namespace mongo { /* assure object is valid. note this will slow us down a little. 
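Note the yield cadence introduced in the hunk above: Cloner::copy now holds the lock while draining the current batch and releases it between batches or every 128th document (n % 128 == 127), rather than on every document. The pattern, with a plain std::mutex standing in for the global db lock:

    #include <mutex>
    #include <vector>

    std::mutex dbLock;                   // stand-in for the global db lock

    void copyAll(const std::vector<int>& docs) {
        std::unique_lock<std::mutex> lk(dbLock);
        long long n = 0;
        for (size_t i = 0; i < docs.size(); ++i, ++n) {
            if (n % 128 == 127) {        // yield some: let other ops interleave
                lk.unlock();             // like dbtemprelease in the hunk
                lk.lock();
            }
            // ... insert docs[i] into the target collection here ...
        }
    }

    int main() {
        copyAll(std::vector<int>(1000, 42));
    }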
*/ if ( !tmp.valid() ) { stringstream ss; - ss << "skipping corrupt object from " << from_collection; + ss << "Cloner: skipping corrupt object from " << from_collection; BSONElement e = tmp.firstElement(); try { e.validate(); @@ -191,7 +192,9 @@ namespace mongo { auto_ptr<DBClientCursor> c; { - if ( !masterSameProcess ) { + if ( conn.get() ) { + // nothing to do + } else if ( !masterSameProcess ) { auto_ptr< DBClientConnection > c( new DBClientConnection() ); if ( !c->connect( masterHost, errmsg ) ) return false; @@ -215,7 +218,7 @@ namespace mongo { log(2) << "\t cloner got " << collection << endl; - BSONElement e = collection.findElement("name"); + BSONElement e = collection.getField("name"); if ( e.eoo() ) { string s = "bad system.namespaces object " + collection.toString(); massert( 10290 , s.c_str(), false); @@ -231,12 +234,11 @@ namespace mongo { continue; } } - else if( strchr(from_name, '$') ) { + if( strchr(from_name, '$') ) { // don't clone index namespaces -- we take care of those separately below. log(2) << "\t\t not cloning because has $ " << endl; continue; } - toClone.push_back( collection.getOwned() ); } } @@ -414,6 +416,7 @@ namespace mongo { virtual bool slaveOk() { return false; } + virtual LockType locktype(){ return WRITE; } virtual void help( stringstream &help ) const { help << "clone this database from an instance of the db on another host\n"; help << "example: { clone : \"host13\" }"; @@ -436,6 +439,7 @@ namespace mongo { virtual bool slaveOk() { return false; } + virtual LockType locktype(){ return WRITE; } CmdCloneCollection() : Command("cloneCollection") { } virtual void help( stringstream &help ) const { help << " example: { cloneCollection: <collection ns>, from: <hostname>, query: <query> }"; @@ -462,7 +466,7 @@ namespace mongo { /* replication note: we must logOp() not the command, but the cloned data -- if the slave were to clone it would get a different point-in-time and not match. */ - setClient( collection.c_str() ); + Client::Context ctx( collection ); log() << "cloneCollection. db:" << ns << " collection:" << collection << " from: " << fromhost << " query: " << query << " logSizeMb: " << logSizeMb << ( copyIndexes ? "" : ", not copying indexes" ) << endl; @@ -479,6 +483,7 @@ namespace mongo { virtual bool slaveOk() { return false; } + virtual LockType locktype(){ return WRITE; } CmdStartCloneCollection() : Command("startCloneCollection") { } virtual void help( stringstream &help ) const { help << " example: { startCloneCollection: <collection ns>, from: <hostname>, query: <query> }"; @@ -506,7 +511,7 @@ namespace mongo { /* replication note: we must logOp() not the command, but the cloned data -- if the slave were to clone it would get a different point-in-time and not match. */ - setClient( collection.c_str() ); + Client::Context ctx(collection); log() << "startCloneCollection. db:" << ns << " collection:" << collection << " from: " << fromhost << " query: " << query << endl; @@ -532,6 +537,7 @@ namespace mongo { virtual bool slaveOk() { return false; } + virtual LockType locktype(){ return WRITE; } CmdFinishCloneCollection() : Command("finishCloneCollection") { } virtual void help( stringstream &help ) const { help << " example: { finishCloneCollection: <finishToken> }"; @@ -562,7 +568,7 @@ namespace mongo { cursorId = cursorIdToken._numberLong(); } - setClient( collection.c_str() ); + Client::Context ctx( collection ); log() << "finishCloneCollection. 
db:" << ns << " collection:" << collection << " from: " << fromhost << " query: " << query << endl; @@ -571,8 +577,50 @@ namespace mongo { } } cmdfinishclonecollection; + thread_specific_ptr< DBClientConnection > authConn_; + /* Usage: + admindb.$cmd.findOne( { copydbgetnonce: 1, fromhost: <hostname> } ); + */ + class CmdCopyDbGetNonce : public Command { + public: + CmdCopyDbGetNonce() : Command("copydbgetnonce") { } + virtual bool adminOnly() { + return true; + } + virtual bool slaveOk() { + return false; + } + virtual LockType locktype(){ return WRITE; } + virtual void help( stringstream &help ) const { + help << "get a nonce for subsequent copy db request from secure server\n"; + help << "usage: {copydbgetnonce: 1, fromhost: <hostname>}"; + } + virtual bool run(const char *ns, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) { + string fromhost = cmdObj.getStringField("fromhost"); + if ( fromhost.empty() ) { + /* copy from self */ + stringstream ss; + ss << "localhost:" << cmdLine.port; + fromhost = ss.str(); + } + authConn_.reset( new DBClientConnection() ); + BSONObj ret; + { + dbtemprelease t; + if ( !authConn_->connect( fromhost, errmsg ) ) + return false; + if( !authConn_->runCommand( "admin", BSON( "getnonce" << 1 ), ret ) ) { + errmsg = "couldn't get nonce " + string( ret ); + return false; + } + } + result.appendElements( ret ); + return true; + } + } cmdcopydbgetnonce; + /* Usage: - admindb.$cmd.findOne( { copydb: 1, fromhost: <hostname>, fromdb: <db>, todb: <db> } ); + admindb.$cmd.findOne( { copydb: 1, fromhost: <hostname>, fromdb: <db>, todb: <db>[, username: <username>, nonce: <nonce>, key: <key>] } ); */ class CmdCopyDb : public Command { public: @@ -583,9 +631,10 @@ namespace mongo { virtual bool slaveOk() { return false; } + virtual LockType locktype(){ return WRITE; } virtual void help( stringstream &help ) const { - help << "copy a database from antoher host to this host\n"; - help << "usage: {copydb: 1, fromhost: <hostname>, fromdb: <db>, todb: <db>}"; + help << "copy a database from another host to this host\n"; + help << "usage: {copydb: 1, fromhost: <hostname>, fromdb: <db>, todb: <db>[, username: <username>, nonce: <nonce>, key: <key>]}"; } virtual bool run(const char *ns, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) { string fromhost = cmdObj.getStringField("fromhost"); @@ -601,9 +650,24 @@ namespace mongo { errmsg = "parms missing - {copydb: 1, fromhost: <hostname>, fromdb: <db>, todb: <db>}"; return false; } - setClient(todb.c_str()); - bool res = cloneFrom(fromhost.c_str(), errmsg, fromdb, /*logForReplication=*/!fromRepl, /*slaveok*/false, /*replauth*/false, /*snapshot*/true); - cc().clearns(); + Cloner c; + string username = cmdObj.getStringField( "username" ); + string nonce = cmdObj.getStringField( "nonce" ); + string key = cmdObj.getStringField( "key" ); + if ( !username.empty() && !nonce.empty() && !key.empty() ) { + uassert( 13008, "must call copydbgetnonce first", authConn_.get() ); + BSONObj ret; + { + dbtemprelease t; + if ( !authConn_->runCommand( fromdb, BSON( "authenticate" << 1 << "user" << username << "nonce" << nonce << "key" << key ), ret ) ) { + errmsg = "unable to login " + string( ret ); + return false; + } + } + c.setConnection( authConn_.release() ); + } + Client::Context ctx(todb); + bool res = c.go(fromhost.c_str(), errmsg, fromdb, /*logForReplication=*/!fromRepl, /*slaveok*/false, /*replauth*/false, /*snapshot*/true); return res; } } cmdcopydb; @@ -617,6 +681,7 @@ namespace mongo { 
virtual bool slaveOk() { return false; } + virtual LockType locktype(){ return WRITE; } virtual bool logTheOp() { return true; // can't log steps when doing fast rename within a db, so always log the op rather than individual steps comprising it. } @@ -631,16 +696,19 @@ namespace mongo { return false; } - setClient( source.c_str() ); - NamespaceDetails *nsd = nsdetails( source.c_str() ); - uassert( 10026 , "source namespace does not exist", nsd ); - bool capped = nsd->capped; + bool capped = false; long long size = 0; - if ( capped ) - for( DiskLoc i = nsd->firstExtent; !i.isNull(); i = i.ext()->xnext ) - size += i.ext()->length; + { + Client::Context ctx( source ); + NamespaceDetails *nsd = nsdetails( source.c_str() ); + uassert( 10026 , "source namespace does not exist", nsd ); + capped = nsd->capped; + if ( capped ) + for( DiskLoc i = nsd->firstExtent; !i.isNull(); i = i.ext()->xnext ) + size += i.ext()->length; + } - setClient( target.c_str() ); + Client::Context ctx( target ); if ( nsdetails( target.c_str() ) ){ uassert( 10027 , "target namespace exists", cmdObj["dropTarget"].trueValue() ); @@ -715,8 +783,10 @@ namespace mongo { theDataFileMgr.insert( targetIndexes.c_str(), n ); } - setClient( source.c_str() ); - dropCollection( source, errmsg, result ); + { + Client::Context ctx( source ); + dropCollection( source, errmsg, result ); + } return true; } } cmdrenamecollection; diff --git a/db/cmdline.cpp b/db/cmdline.cpp new file mode 100644 index 0000000..59eafdd --- /dev/null +++ b/db/cmdline.cpp @@ -0,0 +1,162 @@ +// cmdline.cpp + +/** +* Copyright (C) 2008 10gen Inc. +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +#include "stdafx.h" +#include "cmdline.h" +#include "commands.h" + +namespace po = boost::program_options; + +namespace mongo { + CmdLine cmdLine; + + void setupSignals(); + BSONArray argvArray; + + void CmdLine::addGlobalOptions( boost::program_options::options_description& general , + boost::program_options::options_description& hidden ){ + /* support for -vv -vvvv etc. */ + for (string s = "vv"; s.length() <= 12; s.append("v")) { + hidden.add_options()(s.c_str(), "verbose"); + } + + general.add_options() + ("help,h", "show this usage information") + ("version", "show version information") + ("config,f", po::value<string>(), "configuration file specifying additional options") + ("verbose,v", "be more verbose (include multiple times for more verbosity e.g. 
-vvvvv)") + ("quiet", "quieter output") + ("port", po::value<int>(&cmdLine.port), "specify port number") + ("logpath", po::value<string>() , "file to send all output to instead of stdout" ) + ("logappend" , "append to logpath instead of over-writing" ) +#ifndef _WIN32 + ("fork" , "fork server process" ) +#endif + ; + + } + + + bool CmdLine::store( int argc , char ** argv , + boost::program_options::options_description& visible, + boost::program_options::options_description& hidden, + boost::program_options::positional_options_description& positional, + boost::program_options::variables_map ¶ms ){ + + + /* don't allow guessing - creates ambiguities when some options are + * prefixes of others. allow long disguises and don't allow guessing + * to get away with our vvvvvvv trick. */ + int style = (((po::command_line_style::unix_style ^ + po::command_line_style::allow_guessing) | + po::command_line_style::allow_long_disguise) ^ + po::command_line_style::allow_sticky); + + + try { + + po::options_description all; + all.add( visible ); + all.add( hidden ); + + po::store( po::command_line_parser(argc, argv) + .options( all ) + .positional( positional ) + .style( style ) + .run(), + params ); + + if ( params.count("config") ){ + ifstream f( params["config"].as<string>().c_str() ); + if ( ! f.is_open() ){ + cout << "ERROR: could not read from config file" << endl << endl; + cout << visible << endl; + return false; + } + + po::store( po::parse_config_file( f , all ) , params ); + f.close(); + } + + po::notify(params); + } + catch (po::error &e) { + cout << "ERROR: " << e.what() << endl << endl; + cout << visible << endl; + return false; + } + + if (params.count("verbose")) { + logLevel = 1; + } + + for (string s = "vv"; s.length() <= 12; s.append("v")) { + if (params.count(s)) { + logLevel = s.length(); + } + } + + if (params.count("quiet")) { + cmdLine.quiet = true; + } + +#ifndef _WIN32 + if (params.count("fork")) { + if ( ! 
params.count( "logpath" ) ){ + cout << "--fork has to be used with --logpath" << endl; + ::exit(-1); + } + pid_t c = fork(); + if ( c ){ + cout << "forked process: " << c << endl; + ::exit(0); + } + setsid(); + setupSignals(); + } +#endif + if (params.count("logpath")) { + string lp = params["logpath"].as<string>(); + uassert( 10033 , "logpath has to be non-zero" , lp.size() ); + initLogging( lp , params.count( "logappend" ) ); + } + + { + BSONArrayBuilder b; + for (int i=0; i < argc; i++) + b << argv[i]; + argvArray = b.arr(); + } + + return true; + } + + class CmdGetCmdLineOpts : Command{ + public: + CmdGetCmdLineOpts(): Command("getCmdLineOpts") {} + virtual LockType locktype() { return NONE; } + virtual bool adminOnly() { return true; } + virtual bool slaveOk() { return true; } + + virtual bool run(const char *ns, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl){ + result.append("argv", argvArray); + return true; + } + + } cmdGetCmdLineOpts; +} diff --git a/db/cmdline.h b/db/cmdline.h index b071259..3e46c5e 100644 --- a/db/cmdline.h +++ b/db/cmdline.h @@ -16,6 +16,8 @@ #pragma once +#include "../stdafx.h" + namespace mongo { /* command line options @@ -23,6 +25,7 @@ namespace mongo { /* concurrency: OK/READ */ struct CmdLine { int port; // --port + bool rest; // --rest string source; // --source string only; // --only @@ -47,11 +50,25 @@ namespace mongo { }; CmdLine() : - port(DefaultDBPort), quiet(false), notablescan(false), prealloc(true), smallfiles(false), + port(DefaultDBPort), rest(false), quiet(false), notablescan(false), prealloc(true), smallfiles(false), quota(false), quotaFiles(8), cpu(false), oplogSize(0), defaultProfile(0), slowMS(100) { } + - }; + static void addGlobalOptions( boost::program_options::options_description& general , + boost::program_options::options_description& hidden ); + + /** + * @return true if should run program, false if should exit + */ + static bool store( int argc , char ** argv , + boost::program_options::options_description& visible, + boost::program_options::options_description& hidden, + boost::program_options::positional_options_description& positional, + boost::program_options::variables_map &output ); + }; + extern CmdLine cmdLine; + } diff --git a/db/commands.cpp b/db/commands.cpp index 3078ea1..83d7219 100644 --- a/db/commands.cpp +++ b/db/commands.cpp @@ -20,6 +20,8 @@ #include "stdafx.h" #include "jsobj.h" #include "commands.h" +#include "client.h" +#include "replset.h" namespace mongo { @@ -72,9 +74,14 @@ namespace mongo { ok = c->run(ns, jsobj, errmsg, anObjBuilder, false); } - anObjBuilder.append( "ok" , ok ? 1.0 : 0.0 ); + BSONObj tmp = anObjBuilder.asTempObj(); + bool have_ok = tmp.hasField("ok"); + bool have_errmsg = tmp.hasField("errmsg"); + + if (!have_ok) + anObjBuilder.append( "ok" , ok ? 1.0 : 0.0 ); - if ( !ok ) { + if ( !ok && !have_errmsg) { anObjBuilder.append("errmsg", errmsg); uassert_nothrow(errmsg.c_str()); } @@ -92,11 +99,12 @@ namespace mongo { } - bool Command::readOnly( const string& name ){ + Command::LockType Command::locktype( const string& name ){ Command * c = findCommand( name ); if ( ! 
c ) - return false; - return c->readOnly(); + return WRITE; + return c->locktype(); } + } // namespace mongo diff --git a/db/commands.h b/db/commands.h index 20fb98c..518dcb7 100644 --- a/db/commands.h +++ b/db/commands.h @@ -25,11 +25,15 @@ namespace mongo { class BSONObj; class BSONObjBuilder; class BufBuilder; - + class Client; + // db "commands" (sent via db.$cmd.findOne(...)) // subclass to make a command. class Command { public: + + enum LockType { READ = -1 , NONE = 0 , WRITE = 1 }; + string name; /* run the given command @@ -42,12 +46,12 @@ namespace mongo { */ virtual bool run(const char *ns, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) = 0; - /* true if a read lock is sufficient - note: logTheTop() MUST be false if readOnly + /* + note: logTheTop() MUST be false if READ + if NONE, can't use Client::Context setup + use with caution */ - virtual bool readOnly() { - return false; - } + virtual LockType locktype() = 0; /* Return true if only the admin ns has privileges to run this command. */ virtual bool adminOnly() { @@ -105,10 +109,11 @@ namespace mongo { public: static bool runAgainstRegistered(const char *ns, BSONObj& jsobj, BSONObjBuilder& anObjBuilder); - static bool readOnly( const string& name ); + static LockType locktype( const string& name ); static Command * findCommand( const string& name ); }; bool _runCommands(const char *ns, BSONObj& jsobj, BufBuilder &b, BSONObjBuilder& anObjBuilder, bool fromRepl, int queryOptions); + } // namespace mongo diff --git a/db/common.cpp b/db/common.cpp new file mode 100644 index 0000000..a199bd1 --- /dev/null +++ b/db/common.cpp @@ -0,0 +1,14 @@ +// common.cpp + +#include "stdafx.h" +#include "concurrency.h" + +/** + * this just has globals + */ +namespace mongo { + + /* we use new here so we don't have to worry about destructor orders at program shutdown */ + MongoMutex &dbMutex( *(new MongoMutex) ); + +} diff --git a/db/concurrency.h b/db/concurrency.h index daf09b6..de8f242 100644 --- a/db/concurrency.h +++ b/db/concurrency.h @@ -1,3 +1,19 @@ +/* + * Copyright (C) 2010 10gen Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License, version 3, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + /* concurrency.h mongod concurrency rules & notes will be placed here. 
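The LockType that commands now declare (READ = -1, NONE = 0, WRITE = 1, with unknown commands treated as WRITE) lets the dispatcher pick its lock before running anything. A sketch of that dispatch — the Command here is a toy, and a std::shared_mutex stands in for mongolock/readlock:

    #include <shared_mutex>

    enum LockType { READ = -1, NONE = 0, WRITE = 1 };
    std::shared_mutex dbLock;

    struct Command {
        virtual LockType locktype() = 0;
        virtual bool run() = 0;
        virtual ~Command() {}
    };

    bool execCommand(Command& c) {
        switch (c.locktype()) {
        case NONE:                      // e.g. cursorInfo: no lock at all
            return c.run();
        case READ: {                    // shared access is enough
            std::shared_lock<std::shared_mutex> r(dbLock);
            return c.run();
        }
        default: {                      // WRITE: exclusive access
            std::unique_lock<std::shared_mutex> w(dbLock);
            return c.run();
        }
        }
    }

    struct CursorInfoCmd : Command {    // toy command declaring NONE
        LockType locktype() { return NONE; }
        bool run() { return true; }
    };

    int main() {
        CursorInfoCmd c;
        return execCommand(c) ? 0 : 1;
    }

Commands like cursorInfo in the clientcursor.cpp hunk declare NONE precisely so they can run without touching the db lock.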
@@ -17,19 +33,34 @@ #include <boost/thread/shared_mutex.hpp> #undef assert #define assert xassert +#define HAVE_READLOCK #else -#warning built with boost version 1.34 or older limited concurrency +#warning built with boost version 1.34 or older - limited concurrency #endif namespace mongo { + inline bool readLockSupported(){ +#ifdef HAVE_READLOCK + return true; +#else + return false; +#endif + } + + string sayClientState(); + bool haveClient(); + + void curopWaitingForLock( int type ); + void curopGotLock(); + /* mutex time stats */ class MutexInfo { unsigned long long start, enter, timeLocked; // all in microseconds int locked; public: - MutexInfo() : locked(0) { + MutexInfo() : timeLocked(0) , locked(0) { start = curTimeMicros64(); } void entered() { @@ -51,9 +82,12 @@ namespace mongo { s = start; tl = timeLocked; } + unsigned long long getTimeLocked() const { + return timeLocked; + } }; -#if BOOST_VERSION >= 103500 +#ifdef HAVE_READLOCK //#if 0 class MongoMutex { MutexInfo _minfo; @@ -80,19 +114,25 @@ namespace mongo { void assertAtLeastReadLocked() { assert(atLeastReadLocked()); } void lock() { - DEV cout << "LOCK" << endl; + //DEV cout << "LOCK" << endl; + DEV assert( haveClient() ); + int s = _state.get(); if( s > 0 ) { _state.set(s+1); return; } - massert( 10293 , "internal error: locks are not upgradeable", s == 0 ); + massert( 10293 , (string)"internal error: locks are not upgradeable: " + sayClientState() , s == 0 ); _state.set(1); + + curopWaitingForLock( 1 ); _m.lock(); + curopGotLock(); + _minfo.entered(); } void unlock() { - DEV cout << "UNLOCK" << endl; + //DEV cout << "UNLOCK" << endl; int s = _state.get(); if( s > 1 ) { _state.set(s-1); @@ -103,7 +143,7 @@ namespace mongo { _releasedEarly.set(false); return; } - assert(false); // attempt to unlock when wasn't in a write lock + massert( 12599, "internal error: attempt to unlock when wasn't in a write lock", false); } _state.set(0); _minfo.leaving(); @@ -121,7 +161,7 @@ namespace mongo { } void lock_shared() { - DEV cout << " LOCKSHARED" << endl; + //DEV cout << " LOCKSHARED" << endl; int s = _state.get(); if( s ) { if( s > 0 ) { @@ -136,10 +176,29 @@ namespace mongo { } } _state.set(-1); + curopWaitingForLock( -1 ); _m.lock_shared(); + curopGotLock(); + } + + bool lock_shared_try( int millis ) { + int s = _state.get(); + if ( s ){ + // we already have a lock, so no need to try + lock_shared(); + return true; + } + + boost::system_time until = get_system_time(); + until += boost::posix_time::milliseconds(2); + bool got = _m.timed_lock_shared( until ); + if ( got ) + _state.set(-1); + return got; } + void unlock_shared() { - DEV cout << " UNLOCKSHARED" << endl; + //DEV cout << " UNLOCKSHARED" << endl; int s = _state.get(); if( s > 0 ) { assert( s > 1 ); /* we must have done a lock write first to have s > 1 */ @@ -154,6 +213,7 @@ namespace mongo { _state.set(0); _m.unlock_shared(); } + MutexInfo& info() { return _minfo; } }; #else @@ -165,7 +225,7 @@ namespace mongo { public: MongoMutex() { } void lock() { -#if BOOST_VERSION >= 103500 +#ifdef HAVE_READLOCK m.lock(); #else boost::detail::thread::lock_ops<boost::recursive_mutex>::lock(m); @@ -182,7 +242,7 @@ namespace mongo { void _unlock() { _minfo.leaving(); -#if BOOST_VERSION >= 103500 +#ifdef HAVE_READLOCK m.unlock(); #else boost::detail::thread::lock_ops<boost::recursive_mutex>::unlock(m); @@ -197,6 +257,18 @@ namespace mongo { } void lock_shared() { lock(); } + bool lock_shared_try( int millis ) { + while ( millis-- ){ + if ( getState() ){ + sleepmillis(1); + continue; + } 
+ lock_shared(); + return true; + } + return false; + } + void unlock_shared() { unlock(); } MutexInfo& info() { return _minfo; } void assertWriteLocked() { @@ -220,8 +292,10 @@ namespace mongo { dbMutex.lock(); } ~writelock() { - dbunlocking_write(); - dbMutex.unlock(); + DESTRUCTOR_GUARD( + dbunlocking_write(); + dbMutex.unlock(); + ); } }; @@ -230,11 +304,43 @@ namespace mongo { dbMutex.lock_shared(); } ~readlock() { - dbunlocking_read(); - dbMutex.unlock_shared(); + DESTRUCTOR_GUARD( + dbunlocking_read(); + dbMutex.unlock_shared(); + ); } + }; + + struct readlocktry { + readlocktry( const string&ns , int tryms ){ + _got = dbMutex.lock_shared_try( tryms ); + } + ~readlocktry() { + if ( _got ){ + dbunlocking_read(); + dbMutex.unlock_shared(); + } + } + bool got(){ + return _got; + } + bool _got; }; + struct atleastreadlock { + atleastreadlock( const string& ns ){ + _prev = dbMutex.getState(); + if ( _prev == 0 ) + dbMutex.lock_shared(); + } + ~atleastreadlock(){ + if ( _prev == 0 ) + dbMutex.unlock_shared(); + } + + int _prev; + }; + class mongolock { bool _writelock; public: @@ -246,14 +352,15 @@ namespace mongo { dbMutex.lock_shared(); } ~mongolock() { - if( _writelock ) { - dbunlocking_write(); - dbMutex.unlock(); - } - else { - dbunlocking_read(); - dbMutex.unlock_shared(); - } + DESTRUCTOR_GUARD( + if( _writelock ) { + dbunlocking_write(); + dbMutex.unlock(); + } else { + dbunlocking_read(); + dbMutex.unlock_shared(); + } + ); } /* this unlocks, does NOT upgrade. that works for our current usage */ void releaseAndWriteLock(); @@ -1,10 +1,27 @@ // curop.h +/* + * Copyright (C) 2010 10gen Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License, version 3, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + #pragma once #include "namespace.h" -#include "security.h" #include "client.h" +#include "../util/atomic_int.h" +#include "db.h" namespace mongo { @@ -20,19 +37,53 @@ namespace mongo { /* Current operation (for the current Client). an embedded member of Client class, and typically used from within the mutex there. 
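The MongoMutex machinery above is easier to follow as a pattern: the lock itself is global, but each thread tracks its own recursion depth — positive while write-locked, negative while read-locked, zero otherwise — which is also how the "locks are not upgradeable" massert (10293) catches a read-to-write upgrade attempt. A condensed sketch, with thread_local int standing in for ThreadLocalValue<int>:

    #include <shared_mutex>
    #include <cassert>

    std::shared_mutex m;
    thread_local int state = 0;  // >0 write depth, <0 read depth, 0 unlocked

    void lock() {                             // write lock, recursive
        if (state > 0) { ++state; return; }   // already write-locked: nest
        assert(state == 0 && "locks are not upgradeable"); // cf. massert 10293
        state = 1;
        m.lock();
    }

    void unlock() {
        if (state > 1) { --state; return; }   // leaving a nested level
        assert(state == 1);
        state = 0;
        m.unlock();
    }

    void lock_shared() {
        if (state != 0) {                     // nested under either mode
            state > 0 ? ++state : --state;
            return;
        }
        state = -1;
        m.lock_shared();
    }

    void unlock_shared() {
        if (state > 0) { assert(state > 1); --state; return; } // read inside write
        if (state < -1) { ++state; return; }  // leaving a nested read
        assert(state == -1);
        state = 0;
        m.unlock_shared();
    }

    int main() {
        lock_shared(); lock_shared();         // recursion is fine
        unlock_shared(); unlock_shared();
        lock(); unlock();
    }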
*/ class CurOp : boost::noncopyable { - static WrappingInt _nextOpNum; + static AtomicUInt _nextOpNum; static BSONObj _tooBig; // { $msg : "query not recording (too large)" } + + Client * _client; + CurOp * _wrapped; + + unsigned long long _start; + unsigned long long _checkpoint; + unsigned long long _end; bool _active; - Timer _timer; int _op; - WrappingInt _opNum; + bool _command; + int _lockType; // see concurrency.h for values + bool _waitingForLock; + int _dbprofile; // 0=off, 1=slow, 2=all + AtomicUInt _opNum; char _ns[Namespace::MaxNsLen+2]; - struct sockaddr_in client; - + struct sockaddr_in _remote; + char _queryBuf[256]; - bool haveQuery() const { return *((int *) _queryBuf) != 0; } + void resetQuery(int x=0) { *((int *)_queryBuf) = x; } + + OpDebug _debug; + + ThreadSafeString _message; + ProgressMeter _progressMeter; + + void _reset(){ + _command = false; + _lockType = 0; + _dbprofile = 0; + _end = 0; + _waitingForLock = false; + _message = ""; + _progressMeter.finished(); + } + + void setNS(const char *ns) { + strncpy(_ns, ns, Namespace::MaxNsLen); + } + + public: + + bool haveQuery() const { return *((int *) _queryBuf) != 0; } + BSONObj query() { if( *((int *) _queryBuf) == 1 ) { return _tooBig; @@ -41,37 +92,108 @@ namespace mongo { return o; } - OpDebug _debug; - public: - void reset( const sockaddr_in &_client) { + void ensureStarted(){ + if ( _start == 0 ) + _start = _checkpoint = curTimeMicros64(); + } + void enter( Client::Context * context ){ + ensureStarted(); + setNS( context->ns() ); + if ( context->_db && context->_db->profile > _dbprofile ) + _dbprofile = context->_db->profile; + } + + void leave( Client::Context * context ){ + unsigned long long now = curTimeMicros64(); + Top::global.record( _ns , _op , _lockType , now - _checkpoint , _command ); + _checkpoint = now; + } + + void reset( const sockaddr_in & remote, int op ) { + _reset(); + _start = _checkpoint = 0; _active = true; - _opNum = _nextOpNum.atomicIncrement(); - _timer.reset(); + _opNum = _nextOpNum++; _ns[0] = '?'; // just in case not set later _debug.reset(); resetQuery(); - client = _client; + _remote = remote; + _op = op; + } + + void markCommand(){ + _command = true; + } + + void waitingForLock( int type ){ + _waitingForLock = true; + if ( type > 0 ) + _lockType = 1; + else + _lockType = -1; + } + void gotLock(){ + _waitingForLock = false; } OpDebug& debug(){ return _debug; } + + int profileLevel() const { + return _dbprofile; + } - WrappingInt opNum() const { return _opNum; } - bool active() const { return _active; } + const char * getNS() const { + return _ns; + } - int elapsedMillis(){ return _timer.millis(); } + bool shouldDBProfile( int ms ) const { + if ( _dbprofile <= 0 ) + return false; + + return _dbprofile >= 2 || ms >= cmdLine.slowMS; + } + + AtomicUInt opNum() const { return _opNum; } + + /** if this op is running */ + bool active() const { return _active; } + + int getLockType() const { return _lockType; } + bool isWaitingForLock() const { return _waitingForLock; } + int getOp() const { return _op; } + /** micros */ - unsigned long long startTime(){ - return _timer.startTime(); + unsigned long long startTime() { + ensureStarted(); + return _start; } - void setActive(bool active) { _active = active; } - void setNS(const char *ns) { - strncpy(_ns, ns, Namespace::MaxNsLen); + void done() { + _active = false; + _end = curTimeMicros64(); + } + + unsigned long long totalTimeMicros() { + massert( 12601 , "CurOp not marked done yet" , ! 
_active ); + return _end - startTime(); + } + + int totalTimeMillis() { + return (int) (totalTimeMicros() / 1000); } - void setOp(int op) { _op = op; } + + int elapsedMillis() { + unsigned long long total = curTimeMicros64() - startTime(); + return (int) (total / 1000); + } + + int elapsedSeconds() { + return elapsedMillis() / 1000; + } + void setQuery(const BSONObj& query) { if( query.objsize() > (int) sizeof(_queryBuf) ) { resetQuery(1); // flag as too big and return @@ -80,9 +202,15 @@ namespace mongo { memcpy(_queryBuf, query.objdata(), query.objsize()); } - CurOp() { + CurOp( Client * client , CurOp * wrapped = 0 ) { + _client = client; + _wrapped = wrapped; + if ( _wrapped ){ + _client->_curOp = this; + } + _start = _checkpoint = 0; _active = false; -// opNum = 0; + _reset(); _op = 0; // These addresses should never be written to again. The zeroes are // placed here as a precaution because currentOp may be accessed @@ -90,10 +218,14 @@ namespace mongo { memset(_ns, 0, sizeof(_ns)); memset(_queryBuf, 0, sizeof(_queryBuf)); } + + ~CurOp(){ + if ( _wrapped ) + _client->_curOp = _wrapped; + } BSONObj info() { - AuthenticationInfo *ai = currentClient.get()->ai; - if( !ai->isAuthorized("admin") ) { + if( ! cc().getAuthenticationInfo()->isAuthorized("admin") ) { BSONObjBuilder b; b.append("err", "unauthorized"); return b.obj(); @@ -101,35 +233,30 @@ namespace mongo { return infoNoauth(); } - BSONObj infoNoauth() { - BSONObjBuilder b; - b.append("opid", _opNum); - b.append("active", _active); - if( _active ) - b.append("secs_running", _timer.seconds() ); - if( _op == 2004 ) - b.append("op", "query"); - else if( _op == 2005 ) - b.append("op", "getMore"); - else if( _op == 2001 ) - b.append("op", "update"); - else if( _op == 2002 ) - b.append("op", "insert"); - else if( _op == 2006 ) - b.append("op", "delete"); - else - b.append("op", _op); - b.append("ns", _ns); + BSONObj infoNoauth(); + + string getRemoteString(){ + stringstream ss; + ss << inet_ntoa( _remote.sin_addr ) << ":" << ntohs( _remote.sin_port ); + return ss.str(); + } - if( haveQuery() ) { - b.append("query", query()); + ProgressMeter& setMessage( const char * msg , long long progressMeterTotal = 0 , int secondsBetween = 3 ){ + _message = msg; + if ( progressMeterTotal ){ + assert( ! _progressMeter.isActive() ); + _progressMeter.reset( progressMeterTotal , secondsBetween ); + } + else { + _progressMeter.finished(); } - // b.append("inLock", ?? 
- stringstream clientStr; - clientStr << inet_ntoa( client.sin_addr ) << ":" << ntohs( client.sin_port ); - b.append("client", clientStr.str()); - return b.obj(); + return _progressMeter; } + + string getMessage() const { return _message; } + ProgressMeter getProgressMeter() { return _progressMeter; } + + friend class Client; }; /* 0 = ok @@ -137,12 +264,12 @@ namespace mongo { future: maybe use this as a "going away" thing on process termination with a higher flag value */ extern class KillCurrentOp { - enum { Off, On, All } state; - WrappingInt toKill; + enum { Off, On, All } state; + AtomicUInt toKill; public: void killAll() { state = All; } - void kill(WrappingInt i) { toKill = i; state = On; } - + void kill(AtomicUInt i) { toKill = i; state = On; } + void checkForInterrupt() { if( state != Off ) { if( state == All ) diff --git a/db/cursor.h b/db/cursor.h index 3868cca..69e5d67 100644 --- a/db/cursor.h +++ b/db/cursor.h @@ -19,7 +19,7 @@ #include "../stdafx.h" #include "jsobj.h" -#include "storage.h" +#include "diskloc.h" namespace mongo { @@ -76,6 +76,8 @@ namespace mongo { /* called before query getmore block is iterated */ virtual void checkLocation() { } + + virtual bool supportGetMore() = 0; virtual string toString() { return "abstract?"; @@ -91,10 +93,10 @@ namespace mongo { */ virtual bool getsetdup(DiskLoc loc) = 0; - virtual BSONObj prettyStartKey() const { return BSONObj(); } - virtual BSONObj prettyEndKey() const { return BSONObj(); } + virtual BSONObj prettyIndexBounds() const { return BSONObj(); } virtual bool capped() const { return false; } + }; // strategy object implementing direction of traversal. @@ -157,6 +159,8 @@ namespace mongo { return tailable_; } virtual bool getsetdup(DiskLoc loc) { return false; } + + virtual bool supportGetMore() { return true; } }; /* used for order { $natural: -1 } */ diff --git a/db/database.h b/db/database.h index 0fcf386..868af0b 100644 --- a/db/database.h +++ b/db/database.h @@ -36,7 +36,7 @@ namespace mongo { : name(nm), path(_path), namespaceIndex( path, name ) { { // check db name is valid - int L = strlen(nm); + size_t L = strlen(nm); uassert( 10028 , "db name is empty", L > 0 ); uassert( 10029 , "bad db name [1]", *nm != '.' ); uassert( 10030 , "bad db name [2]", nm[L-1] != '.' ); @@ -63,8 +63,8 @@ namespace mongo { ~Database() { magic = 0; btreeStore->closeFiles(name, path); - int n = files.size(); - for ( int i = 0; i < n; i++ ) + size_t n = files.size(); + for ( size_t i = 0; i < n; i++ ) delete files[i]; } @@ -79,12 +79,19 @@ namespace mongo { return ! namespaceIndex.allocated(); } - bool exists(int n) { + boost::filesystem::path fileName( int n ) { stringstream ss; ss << name << '.' << n; boost::filesystem::path fullName; - fullName = boost::filesystem::path(path) / ss.str(); - return boost::filesystem::exists(fullName); + fullName = boost::filesystem::path(path); + if ( directoryperdb ) + fullName /= name; + fullName /= ss.str(); + return fullName; + } + + bool exists(int n) { + return boost::filesystem::exists( fileName( n ) ); } void openAllFiles() { @@ -124,10 +131,7 @@ namespace mongo { p = files[n]; } if ( p == 0 ) { - stringstream ss; - ss << name << '.' 
<< n; - boost::filesystem::path fullName; - fullName = boost::filesystem::path(path) / ss.str(); + boost::filesystem::path fullName = fileName( n ); string fullNameString = fullName.string(); p = new MongoDataFile(n); int minSize = 0; @@ -29,6 +29,7 @@ #include "instance.h" #include "clientcursor.h" #include "pdfile.h" +#include "stats/counters.h" #if !defined(_WIN32) #include <sys/file.h> #endif @@ -40,6 +41,7 @@ #include "../scripting/engine.h" #include "module.h" #include "cmdline.h" +#include "stats/snapshots.h" namespace mongo { @@ -54,10 +56,11 @@ namespace mongo { extern string bind_ip; extern char *appsrvPath; - extern bool autoresync; extern int diagLogging; extern int lenForNewNsFiles; extern int lockFile; + + extern string repairpath; void setupSignals(); void closeAllSockets(); @@ -65,9 +68,14 @@ namespace mongo { void pairWith(const char *remoteEnd, const char *arb); void setRecCacheSize(unsigned MB); + void exitCleanly( ExitCode code ); + const char *ourgetns() { Client *c = currentClient.get(); - return c ? c->ns() : ""; + if ( ! c ) + return ""; + Client::Context* cc = c->getContext(); + return cc ? cc->ns() : ""; } struct MyStartupTests { @@ -80,7 +88,7 @@ namespace mongo { void testTheDb() { OpDebug debug; - setClient("sys.unittest.pdfile"); + Client::Context ctx("sys.unittest.pdfile"); /* this is not validly formatted, if you query this namespace bad things will happen */ theDataFileMgr.insert("sys.unittest.pdfile", (void *) "hello worldx", 13); @@ -99,8 +107,6 @@ namespace mongo { c->advance(); } out() << endl; - - cc().clearns(); } MessagingPort *connGrab = 0; @@ -137,13 +143,11 @@ namespace mongo { }; void webServerThread(); - void pdfileInit(); void listen(int port) { log() << mongodVersion() << endl; printGitVersion(); printSysInfo(); - pdfileInit(); //testTheDb(); log() << "waiting for connections on port " << port << endl; OurListener l(bind_ip, port); @@ -193,7 +197,7 @@ namespace mongo { try { - c.ai->isLocalHost = dbMsgPort.farEnd.isLocalHost(); + c.getAuthenticationInfo()->isLocalHost = dbMsgPort.farEnd.isLocalHost(); Message m; while ( 1 ) { @@ -206,6 +210,11 @@ namespace mongo { break; } + if ( inShutdown() ) { + log() << "got request after shutdown()" << endl; + break; + } + lastError.startRequest( m , le ); DbResponse dbresponse; @@ -236,6 +245,9 @@ namespace mongo { problem() << "SocketException in connThread, closing client connection" << endl; dbMsgPort.shutdown(); } + catch ( const ClockSkewException & ) { + exitCleanly( EXIT_CLOCK_SKEW ); + } catch ( std::exception &e ) { problem() << "Uncaught std::exception: " << e.what() << ", terminating" << endl; dbexit( EXIT_UNCAUGHT ); @@ -263,8 +275,10 @@ namespace mongo { // SockAddr db("172.16.0.179", MessagingPort::DBPort); MessagingPort p; - if ( !p.connect(db) ) + if ( !p.connect(db) ){ + out() << "msg couldn't connect" << endl; return; + } const int Loops = 1; for ( int q = 0; q < Loops; q++ ) { @@ -280,8 +294,9 @@ namespace mongo { Timer t; bool ok = p.call(send, response); double tm = ((double) t.micros()) + 1; - out() << " ****ok. response.data:" << ok << " time:" << tm / 1000.0 << "ms " << - ((double) len) * 8 / 1000000 / (tm/1000000) << "Mbps" << endl; + out() << " ****ok. 
response.data:" << ok << " time:" << tm / 1000.0 << "ms " + << "len: " << len << " data: " << response.data->_data << endl; + if ( q+1 < Loops ) { out() << "\t\tSLEEP 8 then sending again as a test" << endl; sleepsecs(8); @@ -327,15 +342,22 @@ namespace mongo { return repairDatabase( dbName.c_str(), errmsg ); } + extern bool checkNsFilesOnLoad; + void repairDatabases() { + Client::GodScope gs; log(1) << "enter repairDatabases" << endl; + + assert(checkNsFilesOnLoad); + checkNsFilesOnLoad = false; // we are mainly just checking the header - don't scan the whole .ns file for every db here. + dblock lk; vector< string > dbNames; getDatabaseNames( dbNames ); for ( vector< string >::iterator i = dbNames.begin(); i != dbNames.end(); ++i ) { string dbName = *i; log(1) << "\t" << dbName << endl; - assert( !setClient( dbName.c_str() ) ); + Client::Context ctx( dbName ); MongoDataFile *p = cc().database()->getFile( 0 ); MDFHeader *h = p->getHeader(); if ( !h->currentVersion() || forceRepair ) { @@ -369,6 +391,8 @@ namespace mongo { cc().shutdown(); dbexit( EXIT_CLEAN ); } + + checkNsFilesOnLoad = true; } void clearTmpFiles() { @@ -377,12 +401,13 @@ namespace mongo { i != boost::filesystem::directory_iterator(); ++i ) { string fileName = boost::filesystem::path(*i).leaf(); if ( boost::filesystem::is_directory( *i ) && - fileName.length() > 2 && fileName.substr( 0, 3 ) == "tmp" ) + fileName.length() && fileName[ 0 ] == '$' ) boost::filesystem::remove_all( *i ); } } - + void clearTmpCollections() { + Client::GodScope gs; vector< string > toDelete; DBDirectClient cli; auto_ptr< DBClientCursor > c = cli.query( "local.system.namespaces", Query( fromjson( "{name:/^local.temp./}" ) ) ); @@ -395,7 +420,7 @@ namespace mongo { cli.dropCollection( *i ); } } - + /** * does background async flushes of mmapped files */ @@ -403,15 +428,23 @@ namespace mongo { public: void run(){ log(1) << "will flush memory every: " << _sleepsecs << " seconds" << endl; + int time_flushing = 0; while ( ! inShutdown() ){ if ( _sleepsecs == 0 ){ // in case at some point we add an option to change at runtime sleepsecs(5); continue; } - sleepmillis( (int)(_sleepsecs * 1000) ); - MemoryMappedFile::flushAll( false ); - log(1) << "flushing mmmap" << endl; + + sleepmillis( (int)(std::max(0.0, (_sleepsecs * 1000) - time_flushing)) ); + + Date_t start = jsTime(); + MemoryMappedFile::flushAll( true ); + time_flushing = (int) (jsTime() - start); + + globalFlushCounters.flushed(time_flushing); + + log(1) << "flushing mmap took " << time_flushing << "ms" << endl; } } @@ -445,14 +478,21 @@ namespace mongo { bool is32bit = sizeof(int*) == 4; log() << "Mongo DB : starting : pid = " << pid << " port = " << cmdLine.port << " dbpath = " << dbpath - << " master = " << master << " slave = " << (int) slave << " " << ( is32bit ? "32" : "64" ) << "-bit " << endl; - + << " master = " << replSettings.master << " slave = " << (int) replSettings.slave << " " << ( is32bit ? 
"32" : "64" ) << "-bit " << endl; + DEV log() << " FULL DEBUG ENABLED " << endl; show_32_warning(); - stringstream ss; - ss << "dbpath (" << dbpath << ") does not exist"; - massert( 10296 , ss.str().c_str(), boost::filesystem::exists( dbpath ) ); - + { + stringstream ss; + ss << "dbpath (" << dbpath << ") does not exist"; + massert( 10296 , ss.str().c_str(), boost::filesystem::exists( dbpath ) ); + } + { + stringstream ss; + ss << "repairpath (" << repairpath << ") does not exist"; + massert( 12590 , ss.str().c_str(), boost::filesystem::exists( repairpath ) ); + } + acquirePathLock(); remove_all( dbpath + "/_tmp/" ); @@ -461,11 +501,10 @@ namespace mongo { BOOST_CHECK_EXCEPTION( clearTmpFiles() ); Client::initThread("initandlisten"); + _diaglog.init(); clearTmpCollections(); - _diaglog.init(); - Module::initAll(); #if 0 @@ -493,6 +532,7 @@ namespace mongo { /* this is for security on certain platforms (nonce generation) */ srand((unsigned) (curTimeMicros() ^ startupSrandTimer.micros())); + snapshotThread.go(); listen(listenPort); // listen() will return when exit code closes its socket. @@ -557,6 +597,7 @@ string arg_error_check(int argc, char* argv[]) { int main(int argc, char* argv[], char *envp[] ) { + static StaticObserver staticObserver; getcurns = ourgetns; po::options_description general_options("General options"); @@ -564,25 +605,17 @@ int main(int argc, char* argv[], char *envp[] ) po::options_description sharding_options("Sharding options"); po::options_description visible_options("Allowed options"); po::options_description hidden_options("Hidden options"); - po::options_description cmdline_options("Command line options"); po::positional_options_description positional_options; + CmdLine::addGlobalOptions( general_options , hidden_options ); + general_options.add_options() - ("help,h", "show this usage information") - ("version", "show version information") - ("config,f", po::value<string>(), "configuration file specifying additional options") - ("port", po::value<int>(&cmdLine.port)/*->default_value(CmdLine::DefaultDBPort)*/, "specify port number") ("bind_ip", po::value<string>(&bind_ip), "local ip address to bind listener - all local ips bound by default") - ("verbose,v", "be more verbose (include multiple times for more verbosity e.g. 
-vvvvv)") ("dbpath", po::value<string>()->default_value("/data/db/"), "directory for datafiles") - ("quiet", "quieter output") - ("logpath", po::value<string>() , "file to send all output to instead of stdout" ) - ("logappend" , "appnd to logpath instead of over-writing" ) -#ifndef _WIN32 - ("fork" , "fork server process" ) -#endif + ("directoryperdb", "each database will be stored in a separate directory") + ("repairpath", po::value<string>() , "root directory for repair files - defaults to dbpath" ) ("cpu", "periodically show cpu and iowait utilization") ("noauth", "run without security") ("auth", "run with security") @@ -593,6 +626,7 @@ int main(int argc, char* argv[], char *envp[] ) ("nocursors", "diagnostic/debugging option") ("nohints", "ignore query hints") ("nohttpinterface", "disable http interface") + ("rest","turn on simple rest api") ("noscripting", "disable scripting engine") ("noprealloc", "disable data file preallocation") ("smallfiles", "use a smaller default file size") @@ -620,8 +654,10 @@ int main(int argc, char* argv[], char *envp[] ) ("only", po::value<string>(), "when slave: specify a single database to replicate") ("pairwith", po::value<string>(), "address of server to pair with") ("arbiter", po::value<string>(), "address of arbiter server") + ("slavedelay", po::value<int>(), "specify delay (in seconds) to be used when applying master ops to slave") + ("fastsync", "indicate that this instance is starting from a dbpath snapshot of the repl peer") ("autoresync", "automatically resync if slave data is stale") - ("oplogSize", po::value<long>(), "size limit (in MB) for op log") + ("oplogSize", po::value<int>(), "size limit (in MB) for op log") ("opIdMem", po::value<long>(), "size limit (in bytes) for in memory storage of op ids") ; @@ -635,18 +671,12 @@ int main(int argc, char* argv[], char *envp[] ) ("cacheSize", po::value<long>(), "cache size (in MB) for rec store") ; - /* support for -vv -vvvv etc. */ - for (string s = "vv"; s.length() <= 10; s.append("v")) { - hidden_options.add_options()(s.c_str(), "verbose"); - } positional_options.add("command", 3); visible_options.add(general_options); visible_options.add(replication_options); visible_options.add(sharding_options); Module::addOptions( visible_options ); - cmdline_options.add(visible_options); - cmdline_options.add(hidden_options); setupSignals(); @@ -677,7 +707,7 @@ int main(int argc, char* argv[], char *envp[] ) bool removeService = false; bool startService = false; po::variables_map params; - + string error_message = arg_error_check(argc, argv); if (error_message != "") { cout << error_message << endl << endl; @@ -685,37 +715,9 @@ int main(int argc, char* argv[], char *envp[] ) return 0; } - /* don't allow guessing - creates ambiguities when some options are - * prefixes of others. allow long disguises and don't allow guessing - * to get away with our vvvvvvv trick. */ - int command_line_style = (((po::command_line_style::unix_style ^ - po::command_line_style::allow_guessing) | - po::command_line_style::allow_long_disguise) ^ - po::command_line_style::allow_sticky); - try { - po::store(po::command_line_parser(argc, argv).options(cmdline_options). - positional(positional_options). 
- style(command_line_style).run(), params); - - if (params.count("config")) { - ifstream config_file (params["config"].as<string>().c_str()); - if (config_file.is_open()) { - po::store(po::parse_config_file(config_file, cmdline_options), params); - config_file.close(); - } else { - cout << "ERROR: could not read from config file" << endl << endl; - cout << visible_options << endl; - return 0; - } - } - - po::notify(params); - } catch (po::error &e) { - cout << "ERROR: " << e.what() << endl << endl; - cout << visible_options << endl; + if ( ! CmdLine::store( argc , argv , visible_options , hidden_options , positional_options , params ) ) return 0; - } if (params.count("help")) { show_help_text(visible_options); @@ -727,16 +729,8 @@ int main(int argc, char* argv[], char *envp[] ) return 0; } dbpath = params["dbpath"].as<string>(); - if (params.count("quiet")) { - cmdLine.quiet = true; - } - if (params.count("verbose")) { - logLevel = 1; - } - for (string s = "vv"; s.length() <= 10; s.append("v")) { - if (params.count(s)) { - logLevel = s.length(); - } + if ( params.count("directoryperdb")) { + directoryperdb = true; } if (params.count("cpu")) { cmdLine.cpu = true; @@ -761,25 +755,11 @@ int main(int argc, char* argv[], char *envp[] ) /* casting away the const-ness here */ appsrvPath = (char*)(params["appsrvpath"].as<string>().c_str()); } -#ifndef _WIN32 - if (params.count("fork")) { - if ( ! params.count( "logpath" ) ){ - cout << "--fork has to be used with --logpath" << endl; - return -1; - } - pid_t c = fork(); - if ( c ){ - cout << "forked process: " << c << endl; - ::exit(0); - } - setsid(); - setupSignals(); - } -#endif - if (params.count("logpath")) { - string lp = params["logpath"].as<string>(); - uassert( 10033 , "logpath has to be non-zero" , lp.size() ); - initLogging( lp , params.count( "logappend" ) ); + if (params.count("repairpath")) { + repairpath = params["repairpath"].as<string>(); + uassert( 12589, "repairpath has to be non-zero", repairpath.size() ); + } else { + repairpath = dbpath; } if (params.count("nocursors")) { useCursors = false; @@ -790,6 +770,9 @@ int main(int argc, char* argv[], char *envp[] ) if (params.count("nohttpinterface")) { noHttpInterface = true; } + if (params.count("rest")) { + cmdLine.rest = true; + } if (params.count("noscripting")) { useJNI = false; } @@ -831,13 +814,19 @@ int main(int argc, char* argv[], char *envp[] ) startService = true; } if (params.count("master")) { - master = true; + replSettings.master = true; } if (params.count("slave")) { - slave = SimpleSlave; + replSettings.slave = SimpleSlave; + } + if (params.count("slavedelay")) { + replSettings.slavedelay = params["slavedelay"].as<int>(); + } + if (params.count("fastsync")) { + replSettings.fastsync = true; } if (params.count("autoresync")) { - autoresync = true; + replSettings.autoresync = true; } if (params.count("source")) { /* specifies what the source in local.sources should be */ @@ -864,7 +853,7 @@ int main(int argc, char* argv[], char *envp[] ) assert(lenForNewNsFiles > 0); } if (params.count("oplogSize")) { - long x = params["oplogSize"].as<long>(); + long x = params["oplogSize"].as<int>(); uassert( 10035 , "bad --oplogSize arg", x > 0); cmdLine.oplogSize = x * 1024 * 1024; assert(cmdLine.oplogSize > 0); @@ -872,8 +861,8 @@ int main(int argc, char* argv[], char *envp[] ) if (params.count("opIdMem")) { long x = params["opIdMem"].as<long>(); uassert( 10036 , "bad --opIdMem arg", x > 0); - opIdMem = x; - assert(opIdMem > 0); + replSettings.opIdMem = x; + 
assert(replSettings.opIdMem > 0); } if (params.count("cacheSize")) { long x = params["cacheSize"].as<long>(); @@ -974,13 +963,13 @@ namespace mongo { #undef out - void exitCleanly() { + void exitCleanly( ExitCode code ) { goingAway = true; killCurrentOp.killAll(); { dblock lk; log() << "now exiting" << endl; - dbexit( EXIT_KILL ); + dbexit( code ); } } @@ -1026,9 +1015,18 @@ namespace mongo { int x; sigwait( &asyncSignals, &x ); log() << "got kill or ctrl c signal " << x << " (" << strsignal( x ) << "), will terminate after current cmd ends" << endl; - exitCleanly(); + Client::initThread( "interruptThread" ); + exitCleanly( EXIT_KILL ); } + // this will be called in certain c++ error cases, for example if there are two active + // exceptions + void myterminate() { + rawOut( "terminate() called, printing stack:\n" ); + printStackTrace(); + abort(); + } + void setupSignals() { assert( signal(SIGSEGV, abruptQuit) != SIG_ERR ); assert( signal(SIGFPE, abruptQuit) != SIG_ERR ); @@ -1044,12 +1042,15 @@ namespace mongo { sigaddset( &asyncSignals, SIGTERM ); assert( pthread_sigmask( SIG_SETMASK, &asyncSignals, 0 ) == 0 ); boost::thread it( interruptThread ); + + set_terminate( myterminate ); } #else void ctrlCTerminate() { - log() << "got kill or ctrl c signal, will terminate after current cmd ends" << endl; - exitCleanly(); + log() << "got kill or ctrl-c signal, will terminate after current cmd ends" << endl; + Client::initThread( "ctrlCTerminate" ); + exitCleanly( EXIT_KILL ); } BOOL CtrlHandler( DWORD fdwCtrlType ) { @@ -1086,14 +1087,6 @@ BOOL CtrlHandler( DWORD fdwCtrlType ) } #endif -void temptestfoo() { - MongoMutex m; - m.lock(); -// m.lock_upgrade(); - m.lock_shared(); -} - - } // namespace mongo #include "recstore.h" @@ -18,7 +18,6 @@ #include "../stdafx.h" #include "../util/message.h" -#include "../util/top.h" #include "boost/version.hpp" #include "concurrency.h" #include "pdfile.h" @@ -47,16 +46,36 @@ namespace mongo { */ class DatabaseHolder { public: + typedef map<string,Database*> DBs; + typedef map<string,DBs> Paths; + DatabaseHolder() : _size(0){ } - Database * get( const string& ns , const string& path ){ + bool isLoaded( const string& ns , const string& path ) const { dbMutex.assertAtLeastReadLocked(); - map<string,Database*>& m = _paths[path]; + Paths::const_iterator x = _paths.find( path ); + if ( x == _paths.end() ) + return false; + const DBs& m = x->second; string db = _todb( ns ); - map<string,Database*>::iterator it = m.find(db); + DBs::const_iterator it = m.find(db); + return it != m.end(); + } + + + Database * get( const string& ns , const string& path ) const { + dbMutex.assertAtLeastReadLocked(); + Paths::const_iterator x = _paths.find( path ); + if ( x == _paths.end() ) + return 0; + const DBs& m = x->second; + + string db = _todb( ns ); + + DBs::const_iterator it = m.find(db); if ( it != m.end() ) return it->second; return 0; @@ -64,20 +83,42 @@ namespace mongo { void put( const string& ns , const string& path , Database * db ){ dbMutex.assertWriteLocked(); - map<string,Database*>& m = _paths[path]; + DBs& m = _paths[path]; Database*& d = m[_todb(ns)]; if ( ! 
d ) _size++; d = db; } + Database* getOrCreate( const string& ns , const string& path , bool& justCreated ){ + dbMutex.assertWriteLocked(); + DBs& m = _paths[path]; + + string dbname = _todb( ns ); + + Database* & db = m[dbname]; + if ( db ){ + justCreated = false; + return db; + } + + log(1) << "Accessing: " << dbname << " for the first time" << endl; + db = new Database( dbname.c_str() , justCreated , path ); + _size++; + return db; + } + + + + void erase( const string& ns , const string& path ){ dbMutex.assertWriteLocked(); - map<string,Database*>& m = _paths[path]; - _size -= m.erase( _todb( ns ) ); + DBs& m = _paths[path]; + _size -= (int)m.erase( _todb( ns ) ); } - bool closeAll( const string& path , BSONObjBuilder& result ); + /* force - force close even if something underway - use at shutdown */ + bool closeAll( const string& path , BSONObjBuilder& result, bool force ); int size(){ return _size; @@ -86,107 +127,68 @@ namespace mongo { /** * gets all unique db names, ignoring paths */ - void getAllShortNames( set<string>& all ) const{ + void getAllShortNames( set<string>& all ) const { dbMutex.assertAtLeastReadLocked(); - for ( map<string, map<string,Database*> >::const_iterator i=_paths.begin(); i!=_paths.end(); i++ ){ - map<string,Database*> m = i->second; - for( map<string,Database*>::const_iterator j=m.begin(); j!=m.end(); j++ ){ + for ( Paths::const_iterator i=_paths.begin(); i!=_paths.end(); i++ ){ + DBs m = i->second; + for( DBs::const_iterator j=m.begin(); j!=m.end(); j++ ){ all.insert( j->first ); } } } - + private: - string _todb( const string& ns ){ + string _todb( const string& ns ) const { size_t i = ns.find( '.' ); if ( i == string::npos ) return ns; return ns.substr( 0 , i ); } - map<string, map<string,Database*> > _paths; + Paths _paths; int _size; }; extern DatabaseHolder dbHolder; - /* returns true if the database ("database") did not exist, and it was created on this call - path - datafiles directory, if not the default, so we can differentiate between db's of the same - name in different places (for example temp ones on repair). 
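The DatabaseHolder above keys open Database objects first by dbpath and then by database name (Paths is a map of maps), which is what lets two databases of the same name coexist under different paths, e.g. a temp copy during repair. A toy sketch of the two-level lookup and the default-insert trick getOrCreate relies on, with a string* standing in for Database* (sketch only; intentionally leaks):

    #include <iostream>
    #include <map>
    #include <string>

    typedef std::map<std::string, std::string*> DBs;  // dbname -> Database* (stand-in)
    typedef std::map<std::string, DBs> Paths;         // dbpath -> DBs

    std::string* getOrCreate(Paths& paths, const std::string& path,
                             const std::string& dbname, bool& justCreated) {
        // operator[] default-inserts a null pointer on first access
        std::string*& db = paths[path][dbname];
        justCreated = (db == 0);
        if (justCreated)
            db = new std::string(dbname);             // stand-in for new Database(...)
        return db;
    }

    int main() {
        Paths paths;
        bool created;
        getOrCreate(paths, "/data/db", "test", created);
        std::cout << "first access created=" << created << std::endl;          // 1
        getOrCreate(paths, "/data/db/repair", "test", created);
        std::cout << "same name, other path created=" << created << std::endl; // 1
        getOrCreate(paths, "/data/db", "test", created);
        std::cout << "second access created=" << created << std::endl;         // 0
        return 0;
    }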
- */ - inline bool setClient(const char *ns, const string& path , mongolock *lock ) { - if( logLevel > 5 ) - log() << "setClient: " << ns << endl; - - dbMutex.assertAtLeastReadLocked(); - - Client& c = cc(); - c.top.clientStart( ns ); - - Database * db = dbHolder.get( ns , path ); - if ( db ){ - c.setns(ns, db ); - return false; - } - - if( lock ) - lock->releaseAndWriteLock(); - - assertInWriteLock(); - - char cl[256]; - nsToDatabase(ns, cl); - bool justCreated; - Database *newdb = new Database(cl, justCreated, path); - dbHolder.put(ns,path,newdb); - c.setns(ns, newdb); - - newdb->finishInit(); - - return justCreated; - } - // shared functionality for removing references to a database from this program instance // does not delete the files on disk void closeDatabase( const char *cl, const string& path = dbpath ); - + struct dbtemprelease { - string clientname; - string clientpath; - int locktype; + Client::Context * _context; + int _locktype; + dbtemprelease() { - Client& client = cc(); - Database *database = client.database(); - if ( database ) { - clientname = database->name; - clientpath = database->path; - } - client.top.clientStop(); - locktype = dbMutex.getState(); - assert( locktype ); - if ( locktype > 0 ) { - massert( 10298 , "can't temprelease nested write lock", locktype == 1); + _context = cc().getContext(); + _locktype = dbMutex.getState(); + assert( _locktype ); + + if ( _locktype > 0 ) { + massert( 10298 , "can't temprelease nested write lock", _locktype == 1); + if ( _context ) _context->unlocked(); dbMutex.unlock(); } else { - massert( 10299 , "can't temprelease nested read lock", locktype == -1); + massert( 10299 , "can't temprelease nested read lock", _locktype == -1); + if ( _context ) _context->unlocked(); dbMutex.unlock_shared(); } + } ~dbtemprelease() { - if ( locktype > 0 ) + if ( _locktype > 0 ) dbMutex.lock(); else dbMutex.lock_shared(); - if ( clientname.empty() ) - cc().setns("", 0); - else - setClient(clientname.c_str(), clientpath.c_str()); + + if ( _context ) _context->relocked(); } }; + /** only does a temp release if we're not nested and have a lock */ @@ -212,7 +214,6 @@ namespace mongo { extern TicketHolder connTicketHolder; - } // namespace mongo //#include "dbinfo.h" @@ -15,10 +15,7 @@ EndProject Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "tools", "tools", "{2B262D59-9DC7-4BF1-A431-1BD4966899A5}"
ProjectSection(SolutionItems) = preProject
..\tools\bridge.cpp = ..\tools\bridge.cpp
- ..\tools\export.cpp = ..\tools\export.cpp
- ..\tools\files.cpp = ..\tools\files.cpp
..\tools\sniffer.cpp = ..\tools\sniffer.cpp
- ..\tools\tool.cpp = ..\tools\tool.cpp
EndProjectSection
EndProject
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "mongos", "..\s\dbgrid.vcproj", "{E03717ED-69B4-4D21-BC55-DF6690B585C6}"
diff --git a/db/db.vcproj b/db/db.vcproj
index 6dc0aae..3ea7506 100644
--- a/db/db.vcproj
+++ b/db/db.vcproj
@@ -144,7 +144,7 @@ />
<Tool
Name="VCLinkerTool"
- AdditionalDependencies="ws2_32.lib"
+ AdditionalDependencies="ws2_32.lib psapi.lib"
LinkIncremental="1"
AdditionalLibraryDirectories=""c:\program files\boost\boost_1_35_0\lib""
GenerateDebugInformation="true"
@@ -350,48 +350,8 @@ >
</File>
<File
- RelativePath="..\..\js\js\Debug\js.lib"
+ RelativePath="..\..\js\src\js.lib"
>
- <FileConfiguration
- Name="Release|Win32"
- ExcludedFromBuild="true"
- >
- <Tool
- Name="VCCustomBuildTool"
- />
- </FileConfiguration>
- </File>
- <File
- RelativePath="..\..\js\js\Release\js.lib"
- >
- <FileConfiguration
- Name="Debug|Win32"
- ExcludedFromBuild="true"
- >
- <Tool
- Name="VCCustomBuildTool"
- />
- </FileConfiguration>
- <FileConfiguration
- Name="Debug Recstore|Win32"
- ExcludedFromBuild="true"
- >
- <Tool
- Name="VCCustomBuildTool"
- />
- </FileConfiguration>
- </File>
- <File
- RelativePath="C:\Program Files\Java\jdk\lib\jvm.lib"
- >
- <FileConfiguration
- Name="release_nojni|Win32"
- ExcludedFromBuild="true"
- >
- <Tool
- Name="VCCustomBuildTool"
- />
- </FileConfiguration>
</File>
<File
RelativePath="..\pcre-7.4\pcrecpp.cc"
@@ -1342,30 +1302,18 @@ >
</File>
<File
- RelativePath="..\client\quorum.cpp"
+ RelativePath="..\client\syncclusterconnection.cpp"
>
</File>
- <Filter
- Name="btree related"
- >
- <File
- RelativePath=".\btree.cpp"
- >
- </File>
- <File
- RelativePath=".\btree.h"
- >
- </File>
- <File
- RelativePath=".\btreecursor.cpp"
- >
- </File>
- </Filter>
</Filter>
<Filter
Name="db"
>
<File
+ RelativePath=".\background.h"
+ >
+ </File>
+ <File
RelativePath=".\client.h"
>
</File>
@@ -1374,6 +1322,10 @@ >
</File>
<File
+ RelativePath=".\cmdline.cpp"
+ >
+ </File>
+ <File
RelativePath=".\cmdline.h"
>
</File>
@@ -1414,6 +1366,14 @@ >
</File>
<File
+ RelativePath=".\diskloc.h"
+ >
+ </File>
+ <File
+ RelativePath=".\index.h"
+ >
+ </File>
+ <File
RelativePath=".\introspect.h"
>
</File>
@@ -1485,6 +1445,10 @@ RelativePath="..\stdafx.h"
>
</File>
+ <File
+ RelativePath=".\update.h"
+ >
+ </File>
<Filter
Name="cpp"
Filter="cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx"
@@ -1507,6 +1471,10 @@ >
</File>
<File
+ RelativePath=".\common.cpp"
+ >
+ </File>
+ <File
RelativePath=".\cursor.cpp"
>
</File>
@@ -1539,10 +1507,6 @@ >
</File>
<File
- RelativePath=".\dbstats.cpp"
- >
- </File>
- <File
RelativePath=".\dbwebserver.cpp"
>
</File>
@@ -1555,6 +1519,10 @@ >
</File>
<File
+ RelativePath=".\index_geo2d.cpp"
+ >
+ </File>
+ <File
RelativePath=".\instance.cpp"
>
</File>
@@ -1671,10 +1639,6 @@ >
</File>
<File
- RelativePath="..\util\top.cpp"
- >
- </File>
- <File
RelativePath=".\update.cpp"
>
</File>
@@ -1884,6 +1848,42 @@ />
</FileConfiguration>
</File>
+ <File
+ RelativePath="..\scripting\utils.cpp"
+ >
+ </File>
+ </Filter>
+ <Filter
+ Name="stats"
+ >
+ <File
+ RelativePath=".\stats\counters.cpp"
+ >
+ </File>
+ <File
+ RelativePath=".\stats\snapshots.cpp"
+ >
+ </File>
+ <File
+ RelativePath=".\stats\top.cpp"
+ >
+ </File>
+ </Filter>
+ <Filter
+ Name="btree related"
+ >
+ <File
+ RelativePath=".\btree.cpp"
+ >
+ </File>
+ <File
+ RelativePath=".\btree.h"
+ >
+ </File>
+ <File
+ RelativePath=".\btreecursor.cpp"
+ >
+ </File>
</Filter>
</Files>
<Globals>
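The project-file hunks above add the new db/stats sources (counters.cpp, snapshots.cpp, top.cpp) to the Windows build. These back the counters used elsewhere in this patch, e.g. globalFlushCounters.flushed(time_flushing) in the background flush thread and the indexCounters/backgroundFlushing sections of serverStatus below. A guessed sketch of the shape such a counter could take; the field names are illustrative, not the actual contents of stats/counters.h:

    // Illustrative only; the builder is abstracted behind a template so the
    // sketch stands alone instead of depending on the real BSONObjBuilder.
    class FlushCounters {
        long long _flushes;   // number of background flushes so far
        long long _totalMs;   // cumulative time spent flushing
        int _lastMs;          // duration of the most recent flush
    public:
        FlushCounters() : _flushes(0), _totalMs(0), _lastMs(0) {}

        void flushed(int ms) {          // called once per flush by the flush thread
            _flushes++;
            _totalMs += ms;
            _lastMs = ms;
        }

        template <class Builder>        // stand-in for BSONObjBuilder
        void append(Builder& b) const { // used when building serverStatus output
            b.appendNumber("flushes", _flushes);
            b.appendNumber("total_ms", _totalMs);
            b.appendNumber("average_ms",
                           _flushes ? (double)_totalMs / _flushes : 0.0);
            b.appendNumber("last_ms", _lastMs);
        }
    };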
diff --git a/db/dbcommands.cpp b/db/dbcommands.cpp index ff072a1..6d1aa5a 100644 --- a/db/dbcommands.cpp +++ b/db/dbcommands.cpp @@ -36,7 +36,8 @@ #include "security.h" #include "queryoptimizer.h" #include "../scripting/engine.h" -#include "dbstats.h" +#include "stats/counters.h" +#include "background.h" namespace mongo { @@ -56,13 +57,15 @@ namespace mongo { virtual bool slaveOk() { return true; } + virtual LockType locktype(){ return WRITE; } virtual void help( stringstream& help ) const { help << "shutdown the database. must be ran against admin db and either (1) ran from localhost or (2) authenticated.\n"; } CmdShutdown() : Command("shutdown") {} bool run(const char *ns, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) { + cc().shutdown(); log() << "terminating, shutdown command received" << endl; - dbexit( EXIT_CLEAN ); + dbexit( EXIT_CLEAN ); // this never returns return true; } } cmdShutdown; @@ -75,7 +78,7 @@ namespace mongo { */ class CmdResetError : public Command { public: - virtual bool readOnly() { return true; } + virtual LockType locktype(){ return NONE; } virtual bool requiresAuth() { return false; } virtual bool logTheOp() { return false; @@ -98,7 +101,7 @@ namespace mongo { /* for diagnostic / testing purposes. */ class CmdSleep : public Command { public: - virtual bool readOnly() { return true; } + virtual LockType locktype(){ return READ; } virtual bool adminOnly() { return true; } virtual bool logTheOp() { return false; @@ -118,7 +121,7 @@ namespace mongo { class CmdGetLastError : public Command { public: - virtual bool readOnly() { return true; } + virtual LockType locktype(){ return NONE; } virtual bool requiresAuth() { return false; } virtual bool logTheOp() { return false; @@ -155,6 +158,7 @@ namespace mongo { virtual bool slaveOk() { return true; } + virtual LockType locktype(){ return NONE; } CmdForceError() : Command("forceerror") {} bool run(const char *ns, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) { uassert( 10038 , "forced error", false); @@ -164,7 +168,7 @@ namespace mongo { class CmdGetPrevError : public Command { public: - virtual bool readOnly() { return true; } + virtual LockType locktype(){ return NONE; } virtual bool requiresAuth() { return false; } virtual bool logTheOp() { return false; @@ -199,6 +203,7 @@ namespace mongo { virtual bool slaveOk() { return true; } + virtual LockType locktype(){ return NONE; } CmdSwitchToClientErrors() : Command("switchtoclienterrors") {} bool run(const char *ns, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) { if ( lastError.getID() ){ @@ -223,9 +228,10 @@ namespace mongo { virtual bool slaveOk() { return false; } + virtual LockType locktype(){ return WRITE; } CmdDropDatabase() : Command("dropDatabase") {} bool run(const char *ns, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) { - BSONElement e = cmdObj.findElement(name); + BSONElement e = cmdObj.getField(name); log() << "dropDatabase " << ns << endl; int p = (int) e.number(); if ( p != 1 ) @@ -247,16 +253,17 @@ namespace mongo { virtual void help( stringstream& help ) const { help << "repair database. also compacts. 
note: slow."; } + virtual LockType locktype(){ return WRITE; } CmdRepairDatabase() : Command("repairDatabase") {} bool run(const char *ns, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) { - BSONElement e = cmdObj.findElement(name); + BSONElement e = cmdObj.getField(name); log() << "repairDatabase " << ns << endl; int p = (int) e.number(); if ( p != 1 ) return false; - e = cmdObj.findElement( "preserveClonedFilesOnFailure" ); + e = cmdObj.getField( "preserveClonedFilesOnFailure" ); bool preserveClonedFilesOnFailure = e.isBoolean() && e.boolean(); - e = cmdObj.findElement( "backupOriginalFiles" ); + e = cmdObj.getField( "backupOriginalFiles" ); bool backupOriginalFiles = e.isBoolean() && e.boolean(); return repairDatabase( ns, errmsg, preserveClonedFilesOnFailure, backupOriginalFiles ); } @@ -274,9 +281,10 @@ namespace mongo { virtual void help( stringstream& help ) const { help << "enable or disable performance profiling"; } + virtual LockType locktype(){ return WRITE; } CmdProfile() : Command("profile") {} bool run(const char *ns, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) { - BSONElement e = cmdObj.findElement(name); + BSONElement e = cmdObj.getField(name); result.append("was", (double) cc().database()->profile); int p = (int) e.number(); bool ok = false; @@ -302,9 +310,15 @@ namespace mongo { CmdServerStatus() : Command("serverStatus") { started = time(0); } + + virtual LockType locktype(){ return NONE; } + bool run(const char *ns, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) { + + bool authed = cc().getAuthenticationInfo()->isAuthorizedReads("admin"); result.append("uptime",(double) (time(0)-started)); + result.appendDate( "localTime" , jsTime() ); { BSONObjBuilder t; @@ -316,19 +330,19 @@ namespace mongo { double tl = (double) timeLocked; t.append("totalTime", tt); t.append("lockTime", tl); - t.append("ratio", tl/tt); + t.append("ratio", (tt ? 
tl/tt : 0)); result.append( "globalLock" , t.obj() ); } - { - + if ( authed ){ + BSONObjBuilder t( result.subobjStart( "mem" ) ); ProcessInfo p; if ( p.supported() ){ - t.append( "resident" , p.getResidentSize() ); - t.append( "virtual" , p.getVirtualMemorySize() ); + t.appendNumber( "resident" , p.getResidentSize() ); + t.appendNumber( "virtual" , p.getVirtualMemorySize() ); t.appendBool( "supported" , true ); } else { @@ -336,7 +350,7 @@ namespace mongo { t.appendBool( "supported" , false ); } - t.append( "mapped" , MemoryMappedFile::totalMappedLength() / ( 1024 * 1024 ) ); + t.appendNumber( "mapped" , MemoryMappedFile::totalMappedLength() / ( 1024 * 1024 ) ); t.done(); @@ -348,7 +362,8 @@ namespace mongo { bb.append( "available" , connTicketHolder.available() ); bb.done(); } - { + + if ( authed ){ BSONObjBuilder bb( result.subobjStart( "extra_info" ) ); bb.append("note", "fields vary by platform"); ProcessInfo p; @@ -356,8 +371,40 @@ namespace mongo { bb.done(); } + + { + BSONObjBuilder bb( result.subobjStart( "indexCounters" ) ); + globalIndexCounters.append( bb ); + bb.done(); + } + + { + BSONObjBuilder bb( result.subobjStart( "backgroundFlushing" ) ); + globalFlushCounters.append( bb ); + bb.done(); + } + + if ( anyReplEnabled() ){ + BSONObjBuilder bb( result.subobjStart( "repl" ) ); + appendReplicationInfo( bb , authed , cmdObj["repl"].numberInt() ); + bb.done(); + } + result.append( "opcounters" , globalOpCounters.getObj() ); + { + BSONObjBuilder asserts( result.subobjStart( "asserts" ) ); + asserts.append( "regular" , assertionCount.regular ); + asserts.append( "warning" , assertionCount.warning ); + asserts.append( "msg" , assertionCount.msg ); + asserts.append( "user" , assertionCount.user ); + asserts.append( "rollovers" , assertionCount.rollovers ); + asserts.done(); + } + + if ( ! 
authed ) + result.append( "note" , "run against admin for more info" ); + return true; } time_t started; @@ -372,6 +419,7 @@ namespace mongo { virtual void help( stringstream& help ) const { help << "check if any asserts have occurred on the server"; } + virtual LockType locktype(){ return WRITE; } CmdAssertInfo() : Command("assertinfo") {} bool run(const char *ns, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) { result.appendBool("dbasserted", lastAssert[0].isSet() || lastAssert[1].isSet() || lastAssert[2].isSet()); @@ -389,8 +437,10 @@ namespace mongo { virtual bool slaveOk() { return true; } + virtual LockType locktype(){ return NONE; } CmdGetOpTime() : Command("getoptime") { } bool run(const char *ns, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) { + writelock l( "" ); result.appendDate("optime", OpTime::now().asDate()); return true; } @@ -416,6 +466,7 @@ namespace mongo { bool adminOnly() { return true; } + virtual LockType locktype(){ return WRITE; } bool run(const char *ns, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool) { int was = _diaglog.setLevel( cmdObj.firstElement().numberInt() ); stringstream ss; @@ -451,7 +502,12 @@ namespace mongo { } } dbc_unittest; - bool deleteIndexes( NamespaceDetails *d, const char *ns, const char *name, string &errmsg, BSONObjBuilder &anObjBuilder, bool mayDeleteIdIndex ) { + void assureSysIndexesEmptied(const char *ns, IndexDetails *exceptForIdIndex); + int removeFromSysIndexes(const char *ns, const char *idxName); + + bool dropIndexes( NamespaceDetails *d, const char *ns, const char *name, string &errmsg, BSONObjBuilder &anObjBuilder, bool mayDeleteIdIndex ) { + + BackgroundOperation::assertNoBgOpInProgForNs(ns); d->aboutToDeleteAnIndex(); @@ -479,7 +535,10 @@ namespace mongo { } /* assuming here that id index is not multikey: */ d->multiKeyIndexBits = 0; - anObjBuilder.append("msg", "all indexes deleted for collection"); + assureSysIndexesEmptied(ns, idIndex); + anObjBuilder.append("msg", mayDeleteIdIndex ? + "indexes dropped for collection" : + "non-_id indexes dropped for collection"); } else { // delete just one index @@ -503,7 +562,11 @@ namespace mongo { for ( int i = x; i < d->nIndexes; i++ ) d->idx(i) = d->idx(i+1); } else { - log() << "deleteIndexes: " << name << " not found" << endl; + int n = removeFromSysIndexes(ns, name); // just in case an orphaned listing there - i.e. should have been repaired but wasn't + if( n ) { + log() << "info: removeFromSysIndexes cleaned up " << n << " entries" << endl; + } + log() << "dropIndexes: " << name << " not found" << endl; errmsg = "index not found"; return false; } @@ -524,8 +587,9 @@ namespace mongo { virtual bool adminOnly() { return false; } + virtual LockType locktype(){ return WRITE; } virtual bool run(const char *ns, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool) { - string nsToDrop = cc().database()->name + '.' + cmdObj.findElement(name).valuestr(); + string nsToDrop = cc().database()->name + '.' 
+ cmdObj.getField(name).valuestr(); NamespaceDetails *d = nsdetails(nsToDrop.c_str()); if ( !cmdLine.quiet ) log() << "CMD: drop " << nsToDrop << endl; @@ -542,14 +606,14 @@ namespace mongo { /* select count(*) */ class CmdCount : public Command { public: - virtual bool readOnly() { return true; } + virtual LockType locktype(){ return READ; } CmdCount() : Command("count") { } virtual bool logTheOp() { return false; } virtual bool slaveOk() { // ok on --slave setups, not ok for nonmaster of a repl pair (unless override) - return slave == SimpleSlave; + return replSettings.slave == SimpleSlave; } virtual bool slaveOverrideOk() { return true; @@ -558,7 +622,7 @@ namespace mongo { return false; } virtual bool run(const char *_ns, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool) { - string ns = cc().database()->name + '.' + cmdObj.findElement(name).valuestr(); + string ns = cc().database()->name + '.' + cmdObj.getField(name).valuestr(); string err; long long n = runCount(ns.c_str(), cmdObj, err); long long nn = n; @@ -591,11 +655,12 @@ namespace mongo { virtual bool adminOnly() { return false; } + virtual LockType locktype(){ return WRITE; } virtual void help( stringstream& help ) const { help << "create a collection"; } virtual bool run(const char *_ns, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool) { - string ns = cc().database()->name + '.' + cmdObj.findElement(name).valuestr(); + string ns = cc().database()->name + '.' + cmdObj.getField(name).valuestr(); string err; bool ok = userCreateNS(ns.c_str(), cmdObj, err, true); if ( !ok && !err.empty() ) @@ -604,7 +669,8 @@ namespace mongo { } } cmdCreate; - class CmdDeleteIndexes : public Command { + /* "dropIndexes" is now the preferred form - "deleteIndexes" deprecated */ + class CmdDropIndexes : public Command { public: virtual bool logTheOp() { return true; @@ -612,21 +678,34 @@ namespace mongo { virtual bool slaveOk() { return false; } + virtual LockType locktype(){ return WRITE; } virtual void help( stringstream& help ) const { - help << "delete indexes for a collection"; + help << "drop indexes for a collection"; } - CmdDeleteIndexes() : Command("deleteIndexes") { } + CmdDropIndexes(const char *cmdname = "dropIndexes") : Command(cmdname) { } bool run(const char *ns, BSONObj& jsobj, string& errmsg, BSONObjBuilder& anObjBuilder, bool /*fromRepl*/) { - /* note: temp implementation. space not reclaimed! */ - BSONElement e = jsobj.findElement(name.c_str()); + BSONElement e = jsobj.getField(name.c_str()); string toDeleteNs = cc().database()->name + '.' 
+ e.valuestr(); NamespaceDetails *d = nsdetails(toDeleteNs.c_str()); if ( !cmdLine.quiet ) - log() << "CMD: deleteIndexes " << toDeleteNs << endl; + log() << "CMD: dropIndexes " << toDeleteNs << endl; if ( d ) { - BSONElement f = jsobj.findElement("index"); + BSONElement f = jsobj.getField("index"); if ( f.type() == String ) { - return deleteIndexes( d, toDeleteNs.c_str(), f.valuestr(), errmsg, anObjBuilder, false ); + return dropIndexes( d, toDeleteNs.c_str(), f.valuestr(), errmsg, anObjBuilder, false ); + } + else if ( f.type() == Object ){ + int idxId = d->findIndexByKeyPattern( f.embeddedObject() ); + if ( idxId < 0 ){ + errmsg = "can't find index with key:"; + errmsg += f.embeddedObject(); + return false; + } + else { + IndexDetails& ii = d->idx( idxId ); + string iName = ii.indexName(); + return dropIndexes( d, toDeleteNs.c_str(), iName.c_str() , errmsg, anObjBuilder, false ); + } } else { errmsg = "invalid index name spec"; @@ -638,6 +717,10 @@ namespace mongo { return false; } } + } cmdDropIndexes; + class CmdDeleteIndexes : public CmdDropIndexes { + public: + CmdDeleteIndexes() : CmdDropIndexes("deleteIndexes") { } } cmdDeleteIndexes; class CmdReIndex : public Command { @@ -648,14 +731,17 @@ namespace mongo { virtual bool slaveOk() { return false; } + virtual LockType locktype(){ return WRITE; } virtual void help( stringstream& help ) const { help << "re-index a collection"; } CmdReIndex() : Command("reIndex") { } bool run(const char *ns, BSONObj& jsobj, string& errmsg, BSONObjBuilder& result, bool /*fromRepl*/) { + BackgroundOperation::assertNoBgOpInProgForNs(ns); + static DBDirectClient db; - BSONElement e = jsobj.findElement(name.c_str()); + BSONElement e = jsobj.getField(name.c_str()); string toDeleteNs = cc().database()->name + '.' + e.valuestr(); NamespaceDetails *d = nsdetails(toDeleteNs.c_str()); log() << "CMD: reIndex " << toDeleteNs << endl; @@ -675,9 +761,9 @@ namespace mongo { } - bool ok = deleteIndexes( d, toDeleteNs.c_str(), "*" , errmsg, result, true ); + bool ok = dropIndexes( d, toDeleteNs.c_str(), "*" , errmsg, result, true ); if ( ! ok ){ - errmsg = "deleteIndexes failed"; + errmsg = "dropIndexes failed"; return false; } @@ -693,8 +779,6 @@ namespace mongo { } } cmdReIndex; - - class CmdListDatabases : public Command { public: virtual bool logTheOp() { @@ -709,6 +793,7 @@ namespace mongo { virtual bool adminOnly() { return true; } + virtual LockType locktype(){ return WRITE; } CmdListDatabases() : Command("listDatabases") {} bool run(const char *ns, BSONObj& jsobj, string& errmsg, BSONObjBuilder& result, bool /*fromRepl*/) { vector< string > dbNames; @@ -722,8 +807,8 @@ namespace mongo { b.append( "name", i->c_str() ); boost::intmax_t size = dbSize( i->c_str() ); b.append( "sizeOnDisk", (double) size ); - setClient( i->c_str() ); - b.appendBool( "empty", cc().database()->isEmpty() ); + Client::Context ctx( *i ); + b.appendBool( "empty", ctx.db()->isEmpty() ); totalSize += size; dbInfos.push_back( b.obj() ); @@ -741,8 +826,8 @@ namespace mongo { BSONObjBuilder b; b << "name" << name << "sizeOnDisk" << double( 1 ); - setClient( name.c_str() ); - b.appendBool( "empty", cc().database()->isEmpty() ); + Client::Context ctx( name ); + b.appendBool( "empty", ctx.db()->isEmpty() ); dbInfos.push_back( b.obj() ); } @@ -753,13 +838,17 @@ namespace mongo { } } cmdListDatabases; + /* note an access to a database right after this will open it back up - so this is mainly + for diagnostic purposes. 
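Assuming the C++ driver of this era (the header path and connect() signature are assumptions), the reopen-on-next-access behavior described above looks like this from a client:

    #include "client/dbclient.h"   // assumed driver header location
    using namespace mongo;

    int main() {
        DBClientConnection conn;
        conn.connect("localhost");                                     // throws on failure
        BSONObj res;
        conn.runCommand("admin", BSON("closeAllDatabases" << 1), res); // every db closed
        conn.findOne("test.foo", Query());  // first access afterwards reopens "test"
        return 0;
    }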
+ */ class CmdCloseAllDatabases : public Command { public: virtual bool adminOnly() { return true; } virtual bool slaveOk() { return false; } + virtual LockType locktype(){ return WRITE; } CmdCloseAllDatabases() : Command( "closeAllDatabases" ) {} bool run(const char *ns, BSONObj& jsobj, string& errmsg, BSONObjBuilder& result, bool /*fromRepl*/) { - return dbHolder.closeAll( dbpath , result ); + return dbHolder.closeAll( dbpath , result, false ); } } cmdCloseAllDatabases; @@ -772,6 +861,7 @@ namespace mongo { virtual void help( stringstream& help ) const { help << " example: { filemd5 : ObjectId(aaaaaaa) , key : { ts : 1 } }"; } + virtual LockType locktype(){ return READ; } bool run(const char *dbname, BSONObj& jsobj, string& errmsg, BSONObjBuilder& result, bool fromRepl ){ static DBDirectClient db; @@ -831,6 +921,7 @@ namespace mongo { public: CmdMedianKey() : Command( "medianKey" ) {} virtual bool slaveOk() { return true; } + virtual LockType locktype(){ return READ; } virtual void help( stringstream &help ) const { help << " example: { medianKey:\"blog.posts\", keyPattern:{x:1}, min:{x:10}, max:{x:55} }\n" "NOTE: This command may take awhile to run"; @@ -840,6 +931,8 @@ namespace mongo { BSONObj min = jsobj.getObjectField( "min" ); BSONObj max = jsobj.getObjectField( "max" ); BSONObj keyPattern = jsobj.getObjectField( "keyPattern" ); + + Client::Context ctx( ns ); IndexDetails *id = cmdIndexDetailsForRange( ns, errmsg, min, max, keyPattern ); if ( id == 0 ) @@ -872,6 +965,7 @@ namespace mongo { public: CmdDatasize() : Command( "datasize" ) {} virtual bool slaveOk() { return true; } + virtual LockType locktype(){ return READ; } virtual void help( stringstream &help ) const { help << "\ndetermine data size for a set of data in a certain range" @@ -885,9 +979,10 @@ namespace mongo { BSONObj max = jsobj.getObjectField( "max" ); BSONObj keyPattern = jsobj.getObjectField( "keyPattern" ); + Client::Context ctx( ns ); + auto_ptr< Cursor > c; if ( min.isEmpty() && max.isEmpty() ) { - setClient( ns ); c = theDataFileMgr.findAll( ns ); } else if ( min.isEmpty() || max.isEmpty() ) { errmsg = "only one of min or max specified"; @@ -923,19 +1018,40 @@ namespace mongo { } } cmdDatasize; + namespace { + long long getIndexSizeForCollection(string db, string ns, BSONObjBuilder* details=NULL, int scale = 1 ){ + DBDirectClient client; + auto_ptr<DBClientCursor> indexes = + client.query(db + ".system.indexes", QUERY( "ns" << ns)); + + long long totalSize = 0; + while (indexes->more()){ + BSONObj index = indexes->nextSafe(); + NamespaceDetails * nsd = nsdetails( (ns + ".$" + index["name"].valuestrsafe()).c_str() ); + if (!nsd) + continue; // nothing to do here + totalSize += nsd->datasize; + if (details) + details->appendNumber(index["name"].valuestrsafe(), nsd->datasize / scale ); + } + return totalSize; + } + } + class CollectionStats : public Command { public: CollectionStats() : Command( "collstats" ) {} virtual bool slaveOk() { return true; } + virtual LockType locktype(){ return READ; } virtual void help( stringstream &help ) const { help << " example: { collstats:\"blog.posts\" } "; } - bool run(const char *dbname, BSONObj& jsobj, string& errmsg, BSONObjBuilder& result, bool fromRepl ){ - string ns = dbname; - if ( ns.find( "." ) != string::npos ) - ns = ns.substr( 0 , ns.find( "." ) ); - ns += "."; - ns += jsobj.firstElement().valuestr(); + bool run(const char *dbname_c, BSONObj& jsobj, string& errmsg, BSONObjBuilder& result, bool fromRepl ){ + string dbname = dbname_c; + if ( dbname.find( "." 
) != string::npos ) + dbname = dbname.substr( 0 , dbname.find( "." ) ); + + string ns = dbname + "." + jsobj.firstElement().valuestr(); NamespaceDetails * nsd = nsdetails( ns.c_str() ); if ( ! nsd ){ @@ -944,12 +1060,25 @@ namespace mongo { } result.append( "ns" , ns.c_str() ); - - result.append( "count" , nsd->nrecords ); - result.append( "size" , nsd->datasize ); - result.append( "storageSize" , nsd->storageSize() ); + + int scale = 1; + if ( jsobj["scale"].isNumber() ) + scale = jsobj["scale"].numberInt(); + + result.appendNumber( "count" , nsd->nrecords ); + result.appendNumber( "size" , nsd->datasize / scale ); + int numExtents; + result.appendNumber( "storageSize" , nsd->storageSize( &numExtents ) / scale ); + result.append( "numExtents" , numExtents ); result.append( "nindexes" , nsd->nIndexes ); + result.append( "lastExtentSize" , nsd->lastExtentSize / scale ); + result.append( "paddingFactor" , nsd->paddingFactor ); + result.append( "flags" , nsd->flags ); + BSONObjBuilder indexSizes; + result.appendNumber( "totalIndexSize" , getIndexSizeForCollection(dbname, ns, &indexSizes, scale) / scale ); + result.append("indexSizes", indexSizes.obj()); + if ( nsd->capped ){ result.append( "capped" , nsd->capped ); result.append( "max" , nsd->max ); @@ -959,11 +1088,70 @@ namespace mongo { } } cmdCollectionStatis; + + class DBStats : public Command { + public: + DBStats() : Command( "dbstats" ) {} + virtual bool slaveOk() { return true; } + virtual LockType locktype(){ return READ; } + virtual void help( stringstream &help ) const { + help << " example: { dbstats:1 } "; + } + bool run(const char *dbname_c, BSONObj& jsobj, string& errmsg, BSONObjBuilder& result, bool fromRepl ){ + string dbname = dbname_c; + if ( dbname.find( "." ) != string::npos ) + dbname = dbname.substr( 0 , dbname.find( "." ) ); + + DBDirectClient client; + const list<string> collections = client.getCollectionNames(dbname); + + long long ncollections = 0; + long long objects = 0; + long long size = 0; + long long storageSize = 0; + long long numExtents = 0; + long long indexes = 0; + long long indexSize = 0; + + for (list<string>::const_iterator it = collections.begin(); it != collections.end(); ++it){ + const string ns = *it; + + NamespaceDetails * nsd = nsdetails( ns.c_str() ); + if ( ! nsd ){ + // should this assert here? 
+ continue; + } + + ncollections += 1; + objects += nsd->nrecords; + size += nsd->datasize; + + int temp; + storageSize += nsd->storageSize( &temp ); + numExtents += temp; + + indexes += nsd->nIndexes; + indexSize += getIndexSizeForCollection(dbname, ns); + } + + result.appendNumber( "collections" , ncollections ); + result.appendNumber( "objects" , objects ); + result.appendNumber( "dataSize" , size ); + result.appendNumber( "storageSize" , storageSize); + result.appendNumber( "numExtents" , numExtents ); + result.appendNumber( "indexes" , indexes ); + result.appendNumber( "indexSize" , indexSize ); + + return true; + } + } cmdDBStats; + class CmdBuildInfo : public Command { public: CmdBuildInfo() : Command( "buildinfo" ) {} virtual bool slaveOk() { return true; } virtual bool adminOnly() { return true; } + virtual LockType locktype(){ return NONE; } virtual void help( stringstream &help ) const { help << "example: { buildinfo:1 }"; } @@ -974,10 +1162,12 @@ namespace mongo { } } cmdBuildInfo; + /* convertToCapped seems to use this */ class CmdCloneCollectionAsCapped : public Command { public: CmdCloneCollectionAsCapped() : Command( "cloneCollectionAsCapped" ) {} virtual bool slaveOk() { return false; } + virtual LockType locktype(){ return WRITE; } virtual void help( stringstream &help ) const { help << "example: { cloneCollectionAsCapped:<fromName>, toCollection:<toName>, size:<sizeInBytes> }"; } @@ -996,15 +1186,13 @@ namespace mongo { string fromNs = string( realDbName ) + "." + from; string toNs = string( realDbName ) + "." + to; - massert( 10300 , "source collection " + fromNs + " does not exist", !setClient( fromNs.c_str() ) ); NamespaceDetails *nsd = nsdetails( fromNs.c_str() ); massert( 10301 , "source collection " + fromNs + " does not exist", nsd ); - long long excessSize = nsd->datasize - size * 2; + long long excessSize = nsd->datasize - size * 2; // datasize and extentSize can't be compared exactly, so add some padding to 'size' DiskLoc extent = nsd->firstExtent; - for( ; excessSize > 0 && extent != nsd->lastExtent; extent = extent.ext()->xnext ) { + for( ; excessSize > extent.ext()->length && extent != nsd->lastExtent; extent = extent.ext()->xnext ) { excessSize -= extent.ext()->length; - if ( excessSize > 0 ) - log( 2 ) << "cloneCollectionAsCapped skipping extent of size " << extent.ext()->length << endl; + log( 2 ) << "cloneCollectionAsCapped skipping extent of size " << extent.ext()->length << endl; log( 6 ) << "excessSize: " << excessSize << endl; } DiskLoc startLoc = extent.ext()->firstRecord; @@ -1012,15 +1200,13 @@ namespace mongo { CursorId id; { auto_ptr< Cursor > c = theDataFileMgr.findAll( fromNs.c_str(), startLoc ); - ClientCursor *cc = new ClientCursor(); - cc->c = c; - cc->ns = fromNs; + ClientCursor *cc = new ClientCursor(c, fromNs.c_str(), true); cc->matcher.reset( new CoveredIndexMatcher( BSONObj(), fromjson( "{$natural:1}" ) ) ); id = cc->cursorid; } DBDirectClient client; - setClient( toNs.c_str() ); + Client::Context ctx( toNs ); BSONObjBuilder spec; spec.appendBool( "capped", true ); spec.append( "size", double( size ) ); @@ -1037,14 +1223,22 @@ namespace mongo { } } cmdCloneCollectionAsCapped; + /* jan2010: + Converts the given collection to a capped collection w/ the specified size. + This command is not highly used, and is not currently supported with sharded + environments. 
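Invoked from a driver (reusing the connection from the previous sketch; the collection name and size here are illustrative), the command takes the source collection and a target size in bytes:

    BSONObj info;
    bool ok = conn.runCommand("test",
                              BSON("convertToCapped" << "events"
                                                     << "size" << 100000),
                              info);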
+ */ class CmdConvertToCapped : public Command { public: CmdConvertToCapped() : Command( "convertToCapped" ) {} virtual bool slaveOk() { return false; } + virtual LockType locktype(){ return WRITE; } virtual void help( stringstream &help ) const { help << "example: { convertToCapped:<fromCollectionName>, size:<sizeInBytes> }"; } bool run(const char *dbname, BSONObj& jsobj, string& errmsg, BSONObjBuilder& result, bool fromRepl ){ + BackgroundOperation::assertNoBgOpInProgForDb(dbname); + string from = jsobj.getStringField( "convertToCapped" ); long long size = (long long)jsobj.getField( "size" ).number(); @@ -1086,6 +1280,7 @@ namespace mongo { class GroupCommand : public Command { public: GroupCommand() : Command("group"){} + virtual LockType locktype(){ return READ; } virtual bool slaveOk() { return true; } virtual void help( stringstream &help ) const { help << "see http://www.mongodb.org/display/DOCS/Aggregation"; @@ -1260,7 +1455,7 @@ namespace mongo { public: DistinctCommand() : Command("distinct"){} virtual bool slaveOk() { return true; } - + virtual LockType locktype(){ return READ; } virtual void help( stringstream &help ) const { help << "{ distinct : 'collection name' , key : 'a.b' }"; } @@ -1268,7 +1463,7 @@ namespace mongo { bool run(const char *dbname, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl ){ static DBDirectClient db; - string ns = cc().database()->name + '.' + cmdObj.findElement(name).valuestr(); + string ns = cc().database()->name + '.' + cmdObj.getField(name).valuestr(); string key = cmdObj["key"].valuestrsafe(); BSONObj keyPattern = BSON( key << 1 ); @@ -1319,6 +1514,7 @@ namespace mongo { virtual bool slaveOk() { return false; } + virtual LockType locktype(){ return WRITE; } virtual bool run(const char *dbname, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool) { static DBDirectClient db; @@ -1355,23 +1551,232 @@ namespace mongo { } } cmdFindAndModify; - bool commandIsReadOnly(BSONObj& _cmdobj) { - BSONObj jsobj; - { - BSONElement e = _cmdobj.firstElement(); - if ( e.type() == Object && string("query") == e.fieldName() ) { - jsobj = e.embeddedObject(); + /* Returns client's uri */ + class CmdWhatsMyUri : public Command { + public: + CmdWhatsMyUri() : Command("whatsmyuri") { } + virtual bool logTheOp() { + return false; // the modification will be logged directly + } + virtual bool slaveOk() { + return true; + } + virtual LockType locktype(){ return NONE; } + virtual bool requiresAuth() { + return false; + } + virtual void help( stringstream &help ) const { + help << "{whatsmyuri:1}"; + } + virtual bool run(const char *dbname, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool) { + BSONObj info = cc().curop()->infoNoauth(); + result << "you" << info[ "client" ]; + return true; + } + } cmdWhatsMyUri; + + /* For testing only, not for general use */ + class GodInsert : public Command { + public: + GodInsert() : Command( "godinsert" ) { } + virtual bool logTheOp() { + return true; + } + virtual bool slaveOk() { + return false; + } + virtual LockType locktype() { return WRITE; } + virtual bool requiresAuth() { + return true; + } + virtual void help( stringstream &help ) const { + help << "[for testing only]"; + } + virtual bool run(const char *dbname, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool) { + string coll = cmdObj[ "godinsert" ].valuestrsafe(); + uassert( 13049, "godinsert must specify a collection", !coll.empty() ); + string ns = nsToDatabase( dbname ) + "." 
+ coll; + BSONObj obj = cmdObj[ "obj" ].embeddedObjectUserCheck(); + DiskLoc loc = theDataFileMgr.insert( ns.c_str(), obj, true ); + return true; + } + } cmdGodInsert; + + class DBHashCmd : public Command { + public: + DBHashCmd() : Command( "dbhash" ){} + virtual bool slaveOk() { return true; } + virtual LockType locktype() { return READ; } + virtual bool run(const char * badns, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool){ + string dbname = nsToDatabase( badns ); + + list<string> colls = _db.getCollectionNames( dbname ); + colls.sort(); + + result.appendNumber( "numCollections" , (long long)colls.size() ); + + md5_state_t globalState; + md5_init(&globalState); + + BSONObjBuilder bb( result.subobjStart( "collections" ) ); + for ( list<string>::iterator i=colls.begin(); i != colls.end(); i++ ){ + string c = *i; + if ( c.find( ".system.profil" ) != string::npos ) + continue; + + auto_ptr<Cursor> cursor; + + NamespaceDetails * nsd = nsdetails( c.c_str() ); + int idNum = nsd->findIdIndex(); + if ( idNum >= 0 ){ + cursor.reset( new BtreeCursor( nsd , idNum , nsd->idx( idNum ) , BSONObj() , BSONObj() , false , 1 ) ); + } + else if ( c.find( ".system." ) != string::npos ){ + continue; + } + else if ( nsd->capped ){ + cursor = findTableScan( c.c_str() , BSONObj() ); + } + else { + bb.done(); + errmsg = (string)"can't find _id index for: " + c; + return 0; + } + + md5_state_t st; + md5_init(&st); + + long long n = 0; + while ( cursor->ok() ){ + BSONObj c = cursor->current(); + md5_append( &st , (const md5_byte_t*)c.objdata() , c.objsize() ); + n++; + cursor->advance(); + } + md5digest d; + md5_finish(&st, d); + string hash = digestToString( d ); + + bb.append( c.c_str() + ( dbname.size() + 1 ) , hash ); + + md5_append( &globalState , (const md5_byte_t*)hash.c_str() , hash.size() ); } - else { - jsobj = _cmdobj; + bb.done(); + + md5digest d; + md5_finish(&globalState, d); + string hash = digestToString( d ); + + result.append( "md5" , hash ); + + return 1; + } + + DBDirectClient _db; + } dbhashCmd; + + /** + * this handles + - auth + - locking + - context + then calls run() + */ + bool execCommand( Command * c , + Client& client , int queryOptions , + const char *ns, BSONObj& cmdObj , + BSONObjBuilder& result, + bool fromRepl ){ + + string dbname = nsToDatabase( ns ); + + AuthenticationInfo *ai = client.getAuthenticationInfo(); + + if( c->adminOnly() && c->localHostOnlyIfNoAuth( cmdObj ) && noauth && !ai->isLocalHost ) { + result.append( "errmsg" , + "unauthorized: this command must run from localhost when running db without auth" ); + log() << "command denied: " << cmdObj.toString() << endl; + return false; + } + + + if ( c->adminOnly() && ! fromRepl && dbname != "admin" ) { + result.append( "errmsg" , "access denied" ); + log() << "command denied: " << cmdObj.toString() << endl; + return false; + } + + if ( cmdObj["help"].trueValue() ){ + stringstream ss; + ss << "help for: " << c->name << " "; + c->help( ss ); + result.append( "help" , ss.str() ); + result.append( "lockType" , c->locktype() ); + return true; + } + + bool canRunHere = + isMaster( dbname.c_str() ) || + c->slaveOk() || + ( c->slaveOverrideOk() && ( queryOptions & QueryOption_SlaveOk ) ) || + fromRepl; + + if ( ! canRunHere ){ + result.append( "errmsg" , "not master" ); + return false; + } + + if ( c->locktype() == Command::NONE ){ + // we also trust that this won't crash + string errmsg; + int ok = c->run( ns , cmdObj , errmsg , result , fromRepl ); + if ( ! 
ok ) + result.append( "errmsg" , errmsg ); + return ok; + } + + bool needWriteLock = c->locktype() == Command::WRITE; + + if ( ! c->requiresAuth() && + ( ai->isAuthorizedReads( dbname ) && + ! ai->isAuthorized( dbname ) ) ){ + // this means that they can read, but not write + // so only get a read lock + needWriteLock = false; + } + + if ( ! needWriteLock ){ + assert( ! c->logTheOp() ); + } + + mongolock lk( needWriteLock ); + Client::Context ctx( ns , dbpath , &lk , c->requiresAuth() ); + + if ( c->adminOnly() ) + log( 2 ) << "command: " << cmdObj << endl; + + try { + string errmsg; + if ( ! c->run(ns, cmdObj, errmsg, result, fromRepl ) ){ + result.append( "errmsg" , errmsg ); + return false; } } - BSONElement e = jsobj.firstElement(); - if ( ! e.type() ) + catch ( AssertionException& e ){ + stringstream ss; + ss << "assertion: " << e.what(); + result.append( "errmsg" , ss.str() ); return false; - return Command::readOnly( e.fieldName() ); + } + + if ( c->logTheOp() && ! fromRepl ){ + logOp("c", ns, cmdObj); + } + + return true; } + /* TODO make these all command objects -- legacy stuff here usage: @@ -1380,9 +1785,11 @@ namespace mongo { returns true if ran a cmd */ bool _runCommands(const char *ns, BSONObj& _cmdobj, BufBuilder &b, BSONObjBuilder& anObjBuilder, bool fromRepl, int queryOptions) { + string dbname = nsToDatabase( ns ); + if( logLevel >= 1 ) log() << "run command " << ns << ' ' << _cmdobj << endl; - + const char *p = strchr(ns, '.'); if ( !p ) return false; if ( strcmp(p, ".$cmd") != 0 ) return false; @@ -1398,59 +1805,14 @@ namespace mongo { } } + Client& client = cc(); bool ok = false; BSONElement e = jsobj.firstElement(); - + Command * c = e.type() ? Command::findCommand( e.fieldName() ) : 0; if ( c ){ - string errmsg; - AuthenticationInfo *ai = currentClient.get()->ai; - uassert( 10045 , "unauthorized", ai->isAuthorized(cc().database()->name.c_str()) || !c->requiresAuth()); - - bool admin = c->adminOnly(); - - if( admin && c->localHostOnlyIfNoAuth(jsobj) && noauth && !ai->isLocalHost ) { - ok = false; - errmsg = "unauthorized: this command must run from localhost when running db without auth"; - log() << "command denied: " << jsobj.toString() << endl; - } - else if ( admin && !fromRepl && strncmp(ns, "admin", 5) != 0 ) { - ok = false; - errmsg = "access denied"; - log() << "command denied: " << jsobj.toString() << endl; - } - else if ( isMaster() || - c->slaveOk() || - ( c->slaveOverrideOk() && ( queryOptions & QueryOption_SlaveOk ) ) || - fromRepl ){ - if ( jsobj.getBoolField( "help" ) ) { - stringstream help; - help << "help for: " << e.fieldName() << " "; - c->help( help ); - anObjBuilder.append( "help" , help.str() ); - } - else { - if( admin ) - log( 2 ) << "command: " << jsobj << endl; - try { - ok = c->run(ns, jsobj, errmsg, anObjBuilder, fromRepl); - } - catch ( AssertionException& e ){ - ok = false; - errmsg = "assertion: "; - errmsg += e.what(); - } - if ( ok && c->logTheOp() && !fromRepl ) - logOp("c", ns, jsobj); - } - } - else { - ok = false; - errmsg = "not master"; - } - if ( !ok ) - anObjBuilder.append("errmsg", errmsg); + ok = execCommand( c , client , queryOptions , ns , jsobj , anObjBuilder , fromRepl ); } else { anObjBuilder.append("errmsg", "no such cmd"); diff --git a/db/dbcommands_admin.cpp b/db/dbcommands_admin.cpp index 91052bf..7265002 100644 --- a/db/dbcommands_admin.cpp +++ b/db/dbcommands_admin.cpp @@ -31,15 +31,36 @@ #include "btree.h" #include "curop.h" #include "../util/background.h" +#include "../scripting/engine.h" namespace mongo { + 
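With execCommand in place, a command object only declares what it needs -- slaveOk(), adminOnly(), requiresAuth() and, newly, locktype() -- and the dispatcher handles authentication, the help:1 reply, master/slave routing, lock acquisition, Client::Context setup and, via logTheOp(), the oplog write. A minimal sketch of a command written against the refactored interface (the name "echoTest" and its behavior are illustrative only, not part of this patch):

    class EchoTestCmd : public Command {
    public:
        EchoTestCmd() : Command( "echoTest" ) {}
        virtual bool slaveOk() { return true; }      // may run on non-masters
        virtual LockType locktype(){ return NONE; }  // dispatcher takes no db lock
        virtual void help( stringstream &help ) const {
            help << "example: { echoTest:1 }";       // served by execCommand on help:1
        }
        bool run(const char *ns, BSONObj& cmdObj, string& errmsg,
                 BSONObjBuilder& result, bool fromRepl){
            result.appendAs( cmdObj.firstElement() , "echo" );
            return true; // on false, execCommand appends errmsg for us
        }
    } echoTestCmd;
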
class FeaturesCmd : public Command { + public: + FeaturesCmd() : Command( "features" ){} + + virtual bool slaveOk(){ return true; } + virtual bool readOnly(){ return true; } + virtual LockType locktype(){ return READ; } + virtual bool run(const char *ns, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl){ + result.append( "readlock" , readLockSupported() ); + if ( globalScriptEngine ){ + BSONObjBuilder bb( result.subobjStart( "js" ) ); + result.append( "utf8" , globalScriptEngine->utf8Ok() ); + bb.done(); + } + return true; + } + + } featuresCmd; + class CleanCmd : public Command { public: CleanCmd() : Command( "clean" ){} virtual bool slaveOk(){ return true; } - + virtual LockType locktype(){ return WRITE; } + bool run(const char *nsRaw, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl ){ string dropns = cc().database()->name + "." + cmdObj.firstElement().valuestrsafe(); @@ -70,6 +91,7 @@ namespace mongo { return true; } + virtual LockType locktype(){ return WRITE; } //{ validate: "collectionnamewithoutthedbpart" [, scandata: <bool>] } */ bool run(const char *nsRaw, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl ){ @@ -159,7 +181,7 @@ namespace mongo { nlen += r->netLength(); c->advance(); } - if ( d->capped ) { + if ( d->capped && !d->capLooped() ) { ss << " capped outOfOrder:" << outOfOrder; if ( outOfOrder > 1 ) { valid = false; @@ -252,7 +274,7 @@ namespace mongo { extern bool unlockRequested; extern unsigned lockedForWriting; - extern boost::mutex lockedForWritingMutex; + extern mongo::mutex lockedForWritingMutex; /* class UnlockCommand : public Command { @@ -283,8 +305,10 @@ namespace mongo { class LockDBJob : public BackgroundJob { protected: void run() { + Client::initThread("fsyncjob"); + Client& c = cc(); { - boostlock lk(lockedForWritingMutex); + scoped_lock lk(lockedForWritingMutex); lockedForWriting++; } readlock lk(""); @@ -299,9 +323,10 @@ namespace mongo { sleepmillis(20); } { - boostlock lk(lockedForWritingMutex); + scoped_lock lk(lockedForWritingMutex); lockedForWriting--; } + c.shutdown(); } public: bool& _ready; @@ -312,7 +337,7 @@ namespace mongo { }; public: FSyncCommand() : Command( "fsync" ){} - + virtual LockType locktype(){ return WRITE; } virtual bool slaveOk(){ return true; } virtual bool adminOnly(){ return true; } /*virtual bool localHostOnlyIfNoAuth(const BSONObj& cmdObj) { @@ -351,6 +376,18 @@ namespace mongo { } } fsyncCmd; - + + class LogRotateCmd : public Command { + public: + LogRotateCmd() : Command( "logRotate" ){} + virtual LockType locktype(){ return NONE; } + virtual bool slaveOk(){ return true; } + virtual bool adminOnly(){ return true; } + virtual bool run(const char *ns, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) { + rotateLogs(); + return 1; + } + + } logRotateCmd; } diff --git a/db/dbeval.cpp b/db/dbeval.cpp index e729135..a3be894 100644 --- a/db/dbeval.cpp +++ b/db/dbeval.cpp @@ -73,7 +73,7 @@ namespace mongo { BSONObj args; { - BSONElement argsElement = cmd.findElement("args"); + BSONElement argsElement = cmd.getField("args"); if ( argsElement.type() == Array ) { args = argsElement.embeddedObject(); if ( edebug ) { @@ -111,8 +111,16 @@ namespace mongo { virtual bool slaveOk() { return false; } + // We need at least read only access to run db.eval - auth for eval'd writes will be checked + // as they are requested. 
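+        // (with requiresAuth() false the generic auth assertion is skipped, so
+        //  run() re-asserts read access itself -- see uassert 12598 below)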
+ virtual bool requiresAuth() { + return false; + } + virtual LockType locktype(){ return WRITE; } CmdEval() : Command("$eval") { } bool run(const char *ns, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) { + AuthenticationInfo *ai = cc().getAuthenticationInfo(); + uassert( 12598 , "$eval reads unauthorized", ai->isAuthorizedReads(cc().database()->name.c_str())); return dbEval(ns, cmdObj, result, errmsg); } } cmdeval; diff --git a/db/dbmessage.h b/db/dbmessage.h index 54a2ac3..ba5cf94 100644 --- a/db/dbmessage.h +++ b/db/dbmessage.h @@ -16,7 +16,7 @@ #pragma once -#include "storage.h" +#include "diskloc.h" #include "jsobj.h" #include "namespace.h" #include "../util/message.h" @@ -133,8 +133,10 @@ namespace mongo { return nextjsobj != 0; } BSONObj nextJsObj() { - if ( nextjsobj == data ) + if ( nextjsobj == data ) { nextjsobj += strlen(data) + 1; // skip namespace + massert( 13066 , "Message contains no documents", theEnd > nextjsobj ); + } massert( 10304 , "Remaining data too small for BSON object", theEnd - nextjsobj > 3 ); BSONObj js(nextjsobj); massert( 10305 , "Invalid object size", js.objsize() > 3 ); @@ -180,7 +182,7 @@ namespace mongo { int ntoreturn; int queryOptions; BSONObj query; - auto_ptr< FieldMatcher > fields; + BSONObj fields; /* parses the message into the above fields */ QueryMessage(DbMessage& d) { @@ -189,11 +191,7 @@ namespace mongo { ntoreturn = d.pullInt(); query = d.nextJsObj(); if ( d.moreJSObjs() ) { - BSONObj o = d.nextJsObj(); - if (!o.isEmpty()){ - fields = auto_ptr< FieldMatcher >(new FieldMatcher() ); - fields->add( o ); - } + fields = d.nextJsObj(); } queryOptions = d.msg().data->dataAsInt(); } @@ -222,9 +220,8 @@ namespace mongo { qr->startingFrom = startingFrom; qr->nReturned = nReturned; b.decouple(); - Message *resp = new Message(); - resp->setData(qr, true); // transport will free - p->reply(requestMsg, *resp, requestMsg.data->id); + Message resp(qr, true); + p->reply(requestMsg, resp, requestMsg.data->id); } } // namespace mongo diff --git a/db/dbstats.cpp b/db/dbstats.cpp deleted file mode 100644 index 902b57b..0000000 --- a/db/dbstats.cpp +++ /dev/null @@ -1,43 +0,0 @@ -// dbstats.cpp - -#include "stdafx.h" -#include "dbstats.h" - -namespace mongo { - - OpCounters::OpCounters(){ - int zero = 0; - - BSONObjBuilder b; - b.append( "insert" , zero ); - b.append( "query" , zero ); - b.append( "update" , zero ); - b.append( "delete" , zero ); - b.append( "getmore" , zero ); - _obj = b.obj(); - - _insert = (int*)_obj["insert"].value(); - _query = (int*)_obj["query"].value(); - _update = (int*)_obj["update"].value(); - _delete = (int*)_obj["delete"].value(); - _getmore = (int*)_obj["getmore"].value(); - } - - void OpCounters::gotOp( int op ){ - switch ( op ){ - case dbInsert: gotInsert(); break; - case dbQuery: gotQuery(); break; - case dbUpdate: gotUpdate(); break; - case dbDelete: gotDelete(); break; - case dbGetMore: gotGetMore(); break; - case dbKillCursors: - case opReply: - case dbMsg: - break; - default: log() << "OpCounters::gotOp unknown op: " << op << endl; - } - } - - - OpCounters globalOpCounters; -} diff --git a/db/dbstats.h b/db/dbstats.h deleted file mode 100644 index c7d6340..0000000 --- a/db/dbstats.h +++ /dev/null @@ -1,44 +0,0 @@ -// dbstats.h - -#include "../stdafx.h" -#include "jsobj.h" -#include "../util/message.h" - -namespace mongo { - - /** - * for storing operation counters - * note: not thread safe. 
ok with that for speed - */ - class OpCounters { - public: - - OpCounters(); - - int * getInsert(){ return _insert; } - int * getQuery(){ return _query; } - int * getUpdate(){ return _update; } - int * getDelete(){ return _delete; } - int * getGetGore(){ return _getmore; } - - void gotInsert(){ _insert[0]++; } - void gotQuery(){ _query[0]++; } - void gotUpdate(){ _update[0]++; } - void gotDelete(){ _delete[0]++; } - void gotGetMore(){ _getmore[0]++; } - - void gotOp( int op ); - - BSONObj& getObj(){ return _obj; } - private: - BSONObj _obj; - int * _insert; - int * _query; - int * _update; - int * _delete; - int * _getmore; - }; - - extern OpCounters globalOpCounters; - -} diff --git a/db/dbwebserver.cpp b/db/dbwebserver.cpp index 0e1483c..75d3a92 100644 --- a/db/dbwebserver.cpp +++ b/db/dbwebserver.cpp @@ -27,6 +27,9 @@ #include "replset.h" #include "instance.h" #include "security.h" +#include "stats/snapshots.h" +#include "background.h" +#include "commands.h" #include <pcrecpp.h> #include <boost/date_time/posix_time/posix_time.hpp> @@ -61,48 +64,6 @@ namespace mongo { } unsigned long long start, timeLocked; }; - Timing tlast; - const int NStats = 32; - string lockStats[NStats]; - unsigned q = 0; - - void statsThread() { - /*cout << "TEMP disabled statsthread" << endl; - if( 1 ) - return;*/ - Client::initThread("stats"); - unsigned long long timeLastPass = 0; - while ( 1 ) { - { - /* todo: do we even need readlock here? if so for what? */ - readlock lk(""); - Top::completeSnapshot(); - q = (q+1)%NStats; - Timing timing; - dbMutex.info().getTimingInfo(timing.start, timing.timeLocked); - unsigned long long now = curTimeMicros64(); - if ( timeLastPass ) { - unsigned long long dt = now - timeLastPass; - unsigned long long dlocked = timing.timeLocked - tlast.timeLocked; - { - stringstream ss; - ss << dt / 1000 << '\t'; - ss << dlocked / 1000 << '\t'; - if ( dt ) - ss << (dlocked*100)/dt << '%'; - string s = ss.str(); - if ( cmdLine.cpu ) - log() << "cpu: " << s << endl; - lockStats[q] = s; - ClientCursor::idleTimeReport( (unsigned) ((dt - dlocked)/1000) ); - } - } - timeLastPass = now; - tlast = timing; - } - sleepsecs(4); - } - } bool _bold; string bold(bool x) { @@ -118,14 +79,11 @@ namespace mongo { // caller locks void doLockedStuff(stringstream& ss) { ss << "# databases: " << dbHolder.size() << '\n'; - if ( cc().database() ) { - ss << "curclient: " << cc().database()->name; // TODO: isn't this useless? 
- ss << '\n'; - } + ss << bold(ClientCursor::byLocSize()>10000) << "Cursors byLoc.size(): " << ClientCursor::byLocSize() << bold() << '\n'; ss << "\n<b>replication</b>\n"; - ss << "master: " << master << '\n'; - ss << "slave: " << slave << '\n'; + ss << "master: " << replSettings.master << '\n'; + ss << "slave: " << replSettings.slave << '\n'; if ( replPair ) { ss << "replpair:\n"; ss << replPair->getInfo(); @@ -135,26 +93,76 @@ namespace mongo { ss << "initialSyncCompleted: " << seemCaughtUp; if ( !seemCaughtUp ) ss << "</b>"; ss << '\n'; - - ss << "\n<b>DBTOP</b>\n"; - ss << "<table border=1><tr align='left'><th>Namespace</th><th>%</th><th>Reads</th><th>Writes</th><th>Calls</th><th>Time</th>"; - vector< Top::Usage > usage; - Top::usage( usage ); - for( vector< Top::Usage >::iterator i = usage.begin(); i != usage.end(); ++i ) - ss << setprecision( 2 ) << fixed << "<tr><td>" << i->ns << "</td><td>" << i->pct << "</td><td>" - << i->reads << "</td><td>" << i->writes << "</td><td>" << i->calls << "</td><td>" << i->time << "</td></tr>\n"; - ss << "</table>"; - ss << "\n<b>dt\ttlocked</b>\n"; - unsigned i = q; - while ( 1 ) { - ss << lockStats[i] << '\n'; - i = (i-1)%NStats; - if ( i == q ) - break; + auto_ptr<SnapshotDelta> delta = statsSnapshots.computeDelta(); + if ( delta.get() ){ + ss << "\n<b>DBTOP (occurences|percent of elapsed)</b>\n"; + ss << "<table border=1>"; + ss << "<tr align='left'>"; + ss << "<th>NS</th>" + "<th colspan=2>total</th>" + "<th colspan=2>Reads</th>" + "<th colspan=2>Writes</th>" + "<th colspan=2>Queries</th>" + "<th colspan=2>GetMores</th>" + "<th colspan=2>Inserts</th>" + "<th colspan=2>Updates</th>" + "<th colspan=2>Removes</th>"; + ss << "</tr>"; + + display( ss , (double) delta->elapsed() , "GLOBAL" , delta->globalUsageDiff() ); + + Top::UsageMap usage = delta->collectionUsageDiff(); + for ( Top::UsageMap::iterator i=usage.begin(); i != usage.end(); i++ ){ + display( ss , (double) delta->elapsed() , i->first , i->second ); + } + + ss << "</table>"; } + + statsSnapshots.outputLockInfoHTML( ss ); + + BackgroundOperation::dump(ss); } + void display( stringstream& ss , double elapsed , const Top::UsageData& usage ){ + ss << "<td>"; + ss << usage.count; + ss << "</td><td>"; + double per = 100 * ((double)usage.time)/elapsed; + ss << setprecision(2) << fixed << per << "%"; + ss << "</td>"; + } + + void display( stringstream& ss , double elapsed , const string& ns , const Top::CollectionData& data ){ + if ( ns != "GLOBAL" && data.total.count == 0 ) + return; + ss << "<tr><th>" << ns << "</th>"; + + display( ss , elapsed , data.total ); + + display( ss , elapsed , data.readLock ); + display( ss , elapsed , data.writeLock ); + + display( ss , elapsed , data.queries ); + display( ss , elapsed , data.getmore ); + display( ss , elapsed , data.insert ); + display( ss , elapsed , data.update ); + display( ss , elapsed , data.remove ); + + ss << "</tr>"; + } + + void tablecell( stringstream& ss , bool b ){ + ss << "<td>" << (b ? 
"<b>X</b>" : "") << "</td>"; + } + + + template< typename T> + void tablecell( stringstream& ss , const T& t ){ + ss << "<td>" << t << "</td>"; + } + void doUnlockedStuff(stringstream& ss) { /* this is in the header already ss << "port: " << port << '\n'; */ ss << mongodVersion() << "\n"; @@ -178,21 +186,51 @@ namespace mongo { ss << "\nreplInfo: " << replInfo << "\n\n"; ss << "Clients:\n"; - ss << "<table border=1><tr align='left'><th>Thread</th><th>Current op</th>\n"; + ss << "<table border=1>"; + ss << "<tr align='left'>" + << "<th>Thread</th>" + + << "<th>OpId</th>" + << "<th>Active</th>" + << "<th>LockType</th>" + << "<th>Waiting</th>" + << "<th>SecsRunning</th>" + << "<th>Op</th>" + << "<th>NameSpace</th>" + << "<th>Query</th>" + << "<th>client</th>" + << "<th>msg</th>" + << "<th>progress</th>" + + << "</tr>\n"; { - boostlock bl(Client::clientsMutex); + scoped_lock bl(Client::clientsMutex); for( set<Client*>::iterator i = Client::clients.begin(); i != Client::clients.end(); i++ ) { Client *c = *i; CurOp& co = *(c->curop()); - ss << "<tr><td>" << c->desc() << "</td><td"; - BSONObj info = co.infoNoauth(); - /* - if( info.getIntField("inLock") > 0 ) - ss << "style='color:red'"; - else if( info.getIntField("inLock") < 0 ) - ss << "style='color:green'"; - */ - ss << ">" << info << "</td></tr>\n"; + ss << "<tr><td>" << c->desc() << "</td>"; + + tablecell( ss , co.opNum() ); + tablecell( ss , co.active() ); + tablecell( ss , co.getLockType() ); + tablecell( ss , co.isWaitingForLock() ); + if ( co.active() ) + tablecell( ss , co.elapsedSeconds() ); + else + tablecell( ss , "" ); + tablecell( ss , co.getOp() ); + tablecell( ss , co.getNS() ); + if ( co.haveQuery() ) + tablecell( ss , co.query() ); + else + tablecell( ss , "" ); + tablecell( ss , co.getRemoteString() ); + + tablecell( ss , co.getMessage() ); + tablecell( ss , co.getProgressMeter().toString() ); + + + ss << "</tr>"; } } ss << "</table>\n"; @@ -203,7 +241,7 @@ namespace mongo { if ( from.localhost() ) return true; - if ( db.findOne( "admin.system.users" , BSONObj() ).isEmpty() ) + if ( db.findOne( "admin.system.users" , BSONObj() , 0 , QueryOption_SlaveOk ).isEmpty() ) return true; string auth = getHeader( rq , "Authorization" ); @@ -270,6 +308,23 @@ namespace mongo { //out() << "url [" << url << "]" << endl; if ( url.size() > 1 ) { + + if ( url.find( "/_status" ) == 0 ){ + if ( ! allowed( rq , headers, from ) ){ + responseCode = 401; + responseMsg = "not allowed\n"; + return; + } + generateServerStatus( url , responseMsg ); + responseCode = 200; + return; + } + + if ( ! cmdLine.rest ){ + responseCode = 403; + responseMsg = "rest is not enabled. use --rest to turn on"; + return; + } if ( ! 
allowed( rq , headers, from ) ){ responseCode = 401; responseMsg = "not allowed\n"; @@ -294,23 +349,18 @@ namespace mongo { doUnlockedStuff(ss); - int n = 2000; - Timer t; - while ( 1 ) { - if ( !dbMutex.info().isLocked() ) { - { - readlock lk(""); - ss << "time to get dblock: " << t.millis() << "ms\n"; - doLockedStuff(ss); - } - break; + { + Timer t; + readlocktry lk( "" , 2000 ); + if ( lk.got() ){ + ss << "time to get dblock: " << t.millis() << "ms\n"; + doLockedStuff(ss); } - sleepmillis(1); - if ( --n < 0 ) { + else { ss << "\n<b>timed out getting dblock</b>\n"; - break; } } + ss << "</pre></body></html>"; responseMsg = ss.str(); @@ -323,6 +373,51 @@ namespace mongo { } } + void generateServerStatus( string url , string& responseMsg ){ + static vector<string> commands; + if ( commands.size() == 0 ){ + commands.push_back( "serverStatus" ); + commands.push_back( "buildinfo" ); + } + + BSONObj params; + if ( url.find( "?" ) != string::npos ) { + parseParams( params , url.substr( url.find( "?" ) + 1 ) ); + } + + BSONObjBuilder buf(1024); + + for ( unsigned i=0; i<commands.size(); i++ ){ + string cmd = commands[i]; + + Command * c = Command::findCommand( cmd ); + assert( c ); + assert( c->locktype() == 0 ); + + BSONObj co; + { + BSONObjBuilder b; + b.append( cmd.c_str() , 1 ); + + if ( cmd == "serverStatus" && params["repl"].type() ){ + b.append( "repl" , atoi( params["repl"].valuestr() ) ); + } + + co = b.obj(); + } + + string errmsg; + + BSONObjBuilder sub; + if ( ! c->run( "admin.$cmd" , co , errmsg , sub , false ) ) + buf.append( cmd.c_str() , errmsg ); + else + buf.append( cmd.c_str() , sub.obj() ); + } + + responseMsg = buf.obj().jsonString(); + } + void handleRESTRequest( const char *rq, // the full request string url, string& responseMsg, @@ -341,7 +436,7 @@ namespace mongo { string coll = url.substr( first + 1 ); string action = ""; - map<string,string> params; + BSONObj params; if ( coll.find( "?" ) != string::npos ) { parseParams( params , coll.substr( coll.find( "?" ) + 1 ) ); coll = coll.substr( 0 , coll.find( "?" ) ); @@ -361,7 +456,7 @@ namespace mongo { if ( coll[i] == '/' ) coll[i] = '.'; - string fullns = dbname + "." + coll; + string fullns = urlDecode(dbname + "." + coll); headers.push_back( (string)"x-action: " + action ); headers.push_back( (string)"x-ns: " + fullns ); @@ -387,26 +482,29 @@ namespace mongo { responseMsg = ss.str(); } - void handleRESTQuery( string ns , string action , map<string,string> & params , int & responseCode , stringstream & out ) { + void handleRESTQuery( string ns , string action , BSONObj & params , int & responseCode , stringstream & out ) { Timer t; int skip = _getOption( params["skip"] , 0 ); int num = _getOption( params["limit"] , _getOption( params["count" ] , 1000 ) ); // count is old, limit is new int one = 0; - if ( params["one"].size() > 0 && tolower( params["one"][0] ) == 't' ) { + if ( params["one"].type() == String && tolower( params["one"].valuestr()[0] ) == 't' ) { num = 1; one = 1; } BSONObjBuilder queryBuilder; - for ( map<string,string>::iterator i = params.begin(); i != params.end(); i++ ) { - if ( ! i->first.find( "filter_" ) == 0 ) + BSONObjIterator i(params); + while ( i.more() ){ + BSONElement e = i.next(); + string name = e.fieldName(); + if ( ! 
name.find( "filter_" ) == 0 ) continue; - const char * field = i->first.substr( 7 ).c_str(); - const char * val = i->second.c_str(); + const char * field = name.substr( 7 ).c_str(); + const char * val = e.valuestr(); char * temp; @@ -454,7 +552,7 @@ namespace mongo { } // TODO Generate id and revision per couch POST spec - void handlePost( string ns, const char *body, map<string,string> & params, int & responseCode, stringstream & out ) { + void handlePost( string ns, const char *body, BSONObj& params, int & responseCode, stringstream & out ) { try { BSONObj obj = fromjson( body ); db.insert( ns.c_str(), obj ); @@ -468,10 +566,12 @@ namespace mongo { out << "{ \"ok\" : true }"; } - int _getOption( string val , int def ) { - if ( val.size() == 0 ) - return def; - return atoi( val.c_str() ); + int _getOption( BSONElement e , int def ) { + if ( e.isNumber() ) + return e.numberInt(); + if ( e.type() == String ) + return atoi( e.valuestr() ); + return def; } private: @@ -481,7 +581,6 @@ namespace mongo { DBDirectClient DbWebServer::db; void webServerThread() { - boost::thread thr(statsThread); Client::initThread("websvr"); DbWebServer mini; int p = cmdLine.port + 1000; diff --git a/db/storage.h b/db/diskloc.h index cc29e60..cc29e60 100644 --- a/db/storage.h +++ b/db/diskloc.h diff --git a/db/driverHelpers.cpp b/db/driverHelpers.cpp new file mode 100644 index 0000000..c2d1b9d --- /dev/null +++ b/db/driverHelpers.cpp @@ -0,0 +1,63 @@ +// driverHelpers.cpp + +/** + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License, version 3, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ */ + +/** + this file has dbcommands that are for drivers + mostly helpers +*/ + + +#include "stdafx.h" +#include "jsobj.h" +#include "pdfile.h" +#include "namespace.h" +#include "commands.h" +#include "cmdline.h" +#include "btree.h" +#include "curop.h" +#include "../util/background.h" +#include "../scripting/engine.h" + +namespace mongo { + + class BasicDriverHelper : public Command { + public: + BasicDriverHelper( const char * name ) : Command( name ){} + + virtual LockType locktype(){ return NONE; } + virtual bool slaveOk(){ return true; } + virtual bool slaveOverrideOk(){ return true; } + + }; + + class ObjectIdTest : public BasicDriverHelper { + public: + ObjectIdTest() : BasicDriverHelper( "driverOIDTest" ){} + virtual bool run(const char *ns, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl){ + if ( cmdObj.firstElement().type() != jstOID ){ + errmsg = "not oid"; + return false; + } + + const OID& oid = cmdObj.firstElement().__oid(); + result.append( "oid" , oid ); + result.append( "str" , oid.str() ); + + return true; + } + } driverObjectIdTest; +} diff --git a/db/extsort.cpp b/db/extsort.cpp index 08b343a..a0b9f7a 100644 --- a/db/extsort.cpp +++ b/db/extsort.cpp @@ -27,11 +27,12 @@ namespace mongo { + BSONObj BSONObjExternalSorter::extSortOrder; unsigned long long BSONObjExternalSorter::_compares = 0; BSONObjExternalSorter::BSONObjExternalSorter( const BSONObj & order , long maxFileSize ) : _order( order.getOwned() ) , _maxFilesize( maxFileSize ) , - _cur(0), _curSizeSoFar(0), _sorted(0){ + _arraySize(1000000), _cur(0), _curSizeSoFar(0), _sorted(0){ stringstream rootpath; rootpath << dbpath; @@ -56,13 +57,21 @@ namespace mongo { wassert( removed == 1 + _files.size() ); } + void BSONObjExternalSorter::_sortInMem(){ + // extSortComp needs to use glbals + // qsort_r only seems available on bsd, which is what i really want to use + dblock l; + extSortOrder = _order; + _cur->sort( BSONObjExternalSorter::extSortComp ); + } + void BSONObjExternalSorter::sort(){ uassert( 10048 , "already sorted" , ! _sorted ); - + _sorted = true; if ( _cur && _files.size() == 0 ){ - _cur->sort( MyCmp( _order ) ); + _sortInMem(); log(1) << "\t\t not using file. size:" << _curSizeSoFar << " _compares:" << _compares << endl; return; } @@ -85,16 +94,20 @@ namespace mongo { uassert( 10049 , "sorted already" , ! _sorted ); if ( ! _cur ){ - _cur = new InMemory(); + _cur = new InMemory( _arraySize ); } - _cur->push_back( pair<BSONObj,DiskLoc>( o.getOwned() , loc ) ); - + Data& d = _cur->getNext(); + d.first = o.getOwned(); + d.second = loc; + long size = o.objsize(); - _curSizeSoFar += size + sizeof( DiskLoc ); + _curSizeSoFar += size + sizeof( DiskLoc ) + sizeof( BSONObj ); - if ( _curSizeSoFar > _maxFilesize ) + if ( _cur->hasSpace() == false || _curSizeSoFar > _maxFilesize ){ finishMap(); + log(1) << "finishing map" << endl; + } } @@ -105,7 +118,7 @@ namespace mongo { if ( _cur->size() == 0 ) return; - _cur->sort( MyCmp( _order ) ); + _sortInMem(); stringstream ss; ss << _root.string() << "/file." 
<< _files.size(); @@ -113,10 +126,10 @@ namespace mongo { ofstream out; out.open( file.c_str() , ios_base::out | ios_base::binary ); - uassert( 10051 , (string)"couldn't open file: " + file , out.good() ); + ASSERT_STREAM_GOOD( 10051 , (string)"couldn't open file: " + file , out ); int num = 0; - for ( InMemory::iterator i=_cur->begin(); i != _cur->end(); i++ ){ + for ( InMemory::iterator i=_cur->begin(); i != _cur->end(); ++i ){ Data p = *i; out.write( p.first.objdata() , p.first.objsize() ); out.write( (char*)(&p.second) , sizeof( DiskLoc ) ); @@ -169,10 +182,12 @@ namespace mongo { return false; } - pair<BSONObj,DiskLoc> BSONObjExternalSorter::Iterator::next(){ + BSONObjExternalSorter::Data BSONObjExternalSorter::Iterator::next(){ if ( _in ){ - return *(_it++); + Data& d = *_it; + ++_it; + return d; } Data best; @@ -204,7 +219,7 @@ namespace mongo { BSONObjExternalSorter::FileIterator::FileIterator( string file ){ long length; - _buf = (char*)_file.map( file.c_str() , length ); + _buf = (char*)_file.map( file.c_str() , length , MemoryMappedFile::SEQUENTIAL ); massert( 10308 , "mmap failed" , _buf ); assert( (unsigned long)length == file_size( file ) ); _end = _buf + length; @@ -216,7 +231,7 @@ namespace mongo { return _buf < _end; } - pair<BSONObj,DiskLoc> BSONObjExternalSorter::FileIterator::next(){ + BSONObjExternalSorter::Data BSONObjExternalSorter::FileIterator::next(){ BSONObj o( _buf ); _buf += o.objsize(); DiskLoc * l = (DiskLoc*)_buf; diff --git a/db/extsort.h b/db/extsort.h index 5bfa86f..60ee423 100644 --- a/db/extsort.h +++ b/db/extsort.h @@ -22,9 +22,11 @@ #include "jsobj.h" #include "namespace.h" #include "curop.h" +#include "../util/array.h" namespace mongo { + /** for sorting by BSONObj and attaching a value */ @@ -32,8 +34,21 @@ namespace mongo { public: typedef pair<BSONObj,DiskLoc> Data; - + private: + static BSONObj extSortOrder; + + static int extSortComp( const void *lv, const void *rv ){ + RARELY killCurrentOp.checkForInterrupt(); + _compares++; + Data * l = (Data*)lv; + Data * r = (Data*)rv; + int cmp = l->first.woCompare( r->first , extSortOrder ); + if ( cmp ) + return cmp; + return l->second.compare( r->second ); + }; + class FileIterator : boost::noncopyable { public: FileIterator( string file ); @@ -57,13 +72,14 @@ namespace mongo { return x < 0; return l.second.compare( r.second ) < 0; }; + private: BSONObj _order; }; - - public: - typedef list<Data> InMemory; + public: + + typedef FastArray<Data> InMemory; class Iterator : boost::noncopyable { public: @@ -102,8 +118,17 @@ namespace mongo { int numFiles(){ return _files.size(); } + + long getCurSizeSoFar(){ return _curSizeSoFar; } + + void hintNumObjects( long long numObjects ){ + if ( numObjects < _arraySize ) + _arraySize = (int)(numObjects + 100); + } private: + + void _sortInMem(); void sort( string file ); void finishMap(); @@ -112,6 +137,7 @@ namespace mongo { long _maxFilesize; path _root; + int _arraySize; InMemory * _cur; long _curSizeSoFar; diff --git a/db/flushtest.cpp b/db/flushtest.cpp index a301e0e..00cebcf 100644 --- a/db/flushtest.cpp +++ b/db/flushtest.cpp @@ -1,3 +1,19 @@ +/* + * Copyright (C) 2010 10gen Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License, version 3, + * as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + #include "stdafx.h" #include <stdio.h> #include "../util/goodies.h" diff --git a/db/index.cpp b/db/index.cpp index fab6918..5ec2658 100644 --- a/db/index.cpp +++ b/db/index.cpp @@ -21,22 +21,80 @@ #include "index.h" #include "btree.h" #include "query.h" +#include "background.h" namespace mongo { + map<string,IndexPlugin*> * IndexPlugin::_plugins; + + IndexType::IndexType( const IndexPlugin * plugin , const IndexSpec * spec ) + : _plugin( plugin ) , _spec( spec ){ + + } + + IndexType::~IndexType(){ + } + + const BSONObj& IndexType::keyPattern() const { + return _spec->keyPattern; + } + + IndexPlugin::IndexPlugin( const string& name ) + : _name( name ){ + if ( ! _plugins ) + _plugins = new map<string,IndexPlugin*>(); + (*_plugins)[name] = this; + } + + int IndexType::compare( const BSONObj& l , const BSONObj& r ) const { + return l.woCompare( r , _spec->keyPattern ); + } + + + int removeFromSysIndexes(const char *ns, const char *idxName) { + string system_indexes = cc().database()->name + ".system.indexes"; + BSONObjBuilder b; + b.append("ns", ns); + b.append("name", idxName); // e.g.: { name: "ts_1", ns: "foo.coll" } + BSONObj cond = b.done(); + return (int) deleteObjects(system_indexes.c_str(), cond, false, false, true); + } + + /* this is just an attempt to clean up old orphaned stuff on a delete all indexes + call. repair database is the clean solution, but this gives one a lighter weight + partial option. see dropIndexes() + */ + void assureSysIndexesEmptied(const char *ns, IndexDetails *idIndex) { + string system_indexes = cc().database()->name + ".system.indexes"; + BSONObjBuilder b; + b.append("ns", ns); + if( idIndex ) { + b.append("name", BSON( "$ne" << idIndex->indexName().c_str() )); + } + BSONObj cond = b.done(); + int n = (int) deleteObjects(system_indexes.c_str(), cond, false, false, true); + if( n ) { + log() << "info: assureSysIndexesEmptied cleaned up " << n << " entries" << endl; + } + } + + const IndexSpec& IndexDetails::getSpec() const { + scoped_lock lk(NamespaceDetailsTransient::_qcMutex); + return NamespaceDetailsTransient::get_inlock( info.obj()["ns"].valuestr() ).getIndexSpec( this ); + } + /* delete this index. does NOT clean up the system catalog (system.indexes or system.namespaces) -- only NamespaceIndex. */ void IndexDetails::kill_idx() { string ns = indexNamespace(); // e.g. foo.coll.$ts_1 + + string pns = parentNS(); // note we need a copy, as parentNS() won't work after the drop() below // clean up parent namespace index cache - NamespaceDetailsTransient::get_w( parentNS().c_str() ).deletedIndex(); + NamespaceDetailsTransient::get_w( pns.c_str() ).deletedIndex(); - BSONObjBuilder b; - b.append("name", indexName().c_str()); - b.append("ns", parentNS().c_str()); - BSONObj cond = b.done(); // e.g.: { name: "ts_1", ns: "foo.coll" } + string name = indexName(); /* important to catch exception here so we can finish cleanup below. */ try { @@ -48,22 +106,44 @@ namespace mongo { head.setInvalid(); info.setInvalid(); - // clean up in system.indexes. we do this last on purpose. note we have - // to make the cond object before the drop() above though. 
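The inline catalog cleanup being deleted here moves into removeFromSysIndexes(), defined at the top of this hunk: it deletes the single { ns, name } document from <db>.system.indexes and returns the number removed. A hypothetical call site, mirroring the wassert kill_idx now performs:

    // after dropping the on-disk btree for foo.coll's "ts_1" index,
    // purge its catalog entry; exactly one document should match
    int n = removeFromSysIndexes( "foo.coll" , "ts_1" );
    wassert( n == 1 );
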
- string system_indexes = cc().database()->name + ".system.indexes"; - int n = deleteObjects(system_indexes.c_str(), cond, false, false, true); + // clean up in system.indexes. we do this last on purpose. + int n = removeFromSysIndexes(pns.c_str(), name.c_str()); wassert( n == 1 ); } + + void IndexSpec::reset( const IndexDetails * details ){ + _details = details; + reset( details->info ); + } + + void IndexSpec::reset( const DiskLoc& loc ){ + info = loc.obj(); + keyPattern = info["key"].embeddedObjectUserCheck(); + if ( keyPattern.objsize() == 0 ) { + out() << info.toString() << endl; + assert(false); + } + _init(); + } + void IndexSpec::_init(){ - assert( keys.objsize() ); + assert( keyPattern.objsize() ); - BSONObjIterator i( keys ); + string pluginName = ""; + + BSONObjIterator i( keyPattern ); BSONObjBuilder nullKeyB; while( i.more() ) { - _fieldNames.push_back( i.next().fieldName() ); + BSONElement e = i.next(); + _fieldNames.push_back( e.fieldName() ); _fixed.push_back( BSONElement() ); nullKeyB.appendNull( "" ); + if ( e.type() == String ){ + uassert( 13007 , "can only have 1 index plugin / bad index key pattern" , pluginName.size() == 0 ); + pluginName = e.valuestr(); + } + } _nullKey = nullKeyB.obj(); @@ -72,10 +152,25 @@ namespace mongo { b.appendNull( "" ); _nullObj = b.obj(); _nullElt = _nullObj.firstElement(); + + if ( pluginName.size() ){ + IndexPlugin * plugin = IndexPlugin::get( pluginName ); + if ( ! plugin ){ + log() << "warning: can't find plugin [" << pluginName << "]" << endl; + } + else { + _indexType.reset( plugin->generate( this ) ); + } + } + _finishedInit = true; } - + void IndexSpec::getKeys( const BSONObj &obj, BSONObjSetDefaultOrder &keys ) const { + if ( _indexType.get() ){ + _indexType->getKeys( obj , keys ); + return; + } vector<const char*> fieldNames( _fieldNames ); vector<BSONElement> fixed( _fixed ); _getKeys( fieldNames , fixed , obj, keys ); @@ -115,7 +210,7 @@ namespace mongo { if ( allFound ) { if ( arrElt.eoo() ) { // no terminal array element to expand - BSONObjBuilder b; + BSONObjBuilder b(_sizeTracker); for( vector< BSONElement >::iterator i = fixed.begin(); i != fixed.end(); ++i ) b.appendAs( *i, "" ); keys.insert( b.obj() ); @@ -125,7 +220,7 @@ namespace mongo { BSONObjIterator i( arrElt.embeddedObject() ); if ( i.more() ){ while( i.more() ) { - BSONObjBuilder b; + BSONObjBuilder b(_sizeTracker); for( unsigned j = 0; j < fixed.size(); ++j ) { if ( j == arrIdx ) b.appendAs( i.next(), "" ); @@ -137,7 +232,7 @@ namespace mongo { } else if ( fixed.size() > 1 ){ // x : [] - need to insert undefined - BSONObjBuilder b; + BSONObjBuilder b(_sizeTracker); for( unsigned j = 0; j < fixed.size(); ++j ) { if ( j == arrIdx ) b.appendUndefined( "" ); @@ -165,7 +260,7 @@ namespace mongo { Keys will be left empty if key not found in the object. 
*/ void IndexDetails::getKeysFromObject( const BSONObj& obj, BSONObjSetDefaultOrder& keys) const { - NamespaceDetailsTransient::get_w( info.obj()["ns"].valuestr() ).getIndexSpec( this ).getKeys( obj, keys ); + getSpec().getKeys( obj, keys ); } void setDifference(BSONObjSetDefaultOrder &l, BSONObjSetDefaultOrder &r, vector<BSONObj*> &diff) { @@ -185,27 +280,27 @@ namespace mongo { } void getIndexChanges(vector<IndexChanges>& v, NamespaceDetails& d, BSONObj newObj, BSONObj oldObj) { - v.resize(d.nIndexes); + int z = d.nIndexesBeingBuilt(); + v.resize(z); NamespaceDetails::IndexIterator i = d.ii(); - while( i.more() ) { - int j = i.pos(); - IndexDetails& idx = i.next(); + for( int i = 0; i < z; i++ ) { + IndexDetails& idx = d.idx(i); BSONObj idxKey = idx.info.obj().getObjectField("key"); // eg { ts : 1 } - IndexChanges& ch = v[j]; + IndexChanges& ch = v[i]; idx.getKeysFromObject(oldObj, ch.oldkeys); idx.getKeysFromObject(newObj, ch.newkeys); if( ch.newkeys.size() > 1 ) - d.setIndexIsMultikey(j); + d.setIndexIsMultikey(i); setDifference(ch.oldkeys, ch.newkeys, ch.removed); setDifference(ch.newkeys, ch.oldkeys, ch.added); } } - void dupCheck(vector<IndexChanges>& v, NamespaceDetails& d) { - NamespaceDetails::IndexIterator i = d.ii(); - while( i.more() ) { - int j = i.pos(); - v[j].dupCheck(i.next()); + void dupCheck(vector<IndexChanges>& v, NamespaceDetails& d, DiskLoc curObjLoc) { + int z = d.nIndexesBeingBuilt(); + for( int i = 0; i < z; i++ ) { + IndexDetails& idx = d.idx(i); + v[i].dupCheck(idx, curObjLoc); } } @@ -248,6 +343,12 @@ namespace mongo { uassert(10097, "bad table to index name on add index attempt", cc().database()->name == nsToDatabase(sourceNS.c_str())); + /* we can't build a new index for the ns if a build is already in progress in the background - + EVEN IF this is a foreground build. + */ + uassert(12588, "cannot add index with a background operation in progress", + !BackgroundOperation::inProgForNs(sourceNS.c_str())); + BSONObj key = io.getObjectField("key"); uassert(12524, "index key pattern too large", key.objsize() <= 2048); if( !validKeyPattern(key) ) { @@ -303,4 +404,40 @@ namespace mongo { return true; } + bool anyElementNamesMatch( const BSONObj& a , const BSONObj& b ){ + BSONObjIterator x(a); + while ( x.more() ){ + BSONElement e = x.next(); + BSONObjIterator y(b); + while ( y.more() ){ + BSONElement f = y.next(); + FieldCompareResult res = compareDottedFieldNames( e.fieldName() , f.fieldName() ); + if ( res == SAME || res == LEFT_SUBFIELD || res == RIGHT_SUBFIELD ) + return true; + } + } + return false; + } + + IndexSuitability IndexSpec::suitability( const BSONObj& query , const BSONObj& order ) const { + if ( _indexType.get() ) + return _indexType->suitability( query , order ); + return _suitability( query , order ); + } + + IndexSuitability IndexSpec::_suitability( const BSONObj& query , const BSONObj& order ) const { + // TODO: optimize + if ( anyElementNamesMatch( keyPattern , query ) == 0 && anyElementNamesMatch( keyPattern , order ) == 0 ) + return USELESS; + return HELPFUL; + } + + IndexSuitability IndexType::suitability( const BSONObj& query , const BSONObj& order ) const { + return _spec->_suitability( query , order ); + } + + bool IndexType::scanAndOrderRequired( const BSONObj& query , const BSONObj& order ) const { + return ! 
order.isEmpty(); + } + } @@ -19,46 +19,136 @@ #pragma once #include "../stdafx.h" +#include "diskloc.h" +#include "jsobj.h" +#include <map> namespace mongo { + + class IndexSpec; + class IndexType; // TODO: this name sucks + class IndexPlugin; + class IndexDetails; + + enum IndexSuitability { USELESS = 0 , HELPFUL = 1 , OPTIMAL = 2 }; + + /** + * this represents an instance of a index plugin + * done this way so parsing, etc... can be cached + * so if there is a FTS IndexPlugin, for each index using FTS + * there will be 1 of these, and it can have things pre-parsed, etc... + */ + class IndexType : boost::noncopyable { + public: + IndexType( const IndexPlugin * plugin , const IndexSpec * spec ); + virtual ~IndexType(); + + virtual void getKeys( const BSONObj &obj, BSONObjSetDefaultOrder &keys ) const = 0; + virtual auto_ptr<Cursor> newCursor( const BSONObj& query , const BSONObj& order , int numWanted ) const = 0; + + /** optional op : changes query to match what's in the index */ + virtual BSONObj fixKey( const BSONObj& in ) { return in; } + + /** optional op : compare 2 objects with regards to this index */ + virtual int compare( const BSONObj& l , const BSONObj& r ) const; + + /** @return plugin */ + const IndexPlugin * getPlugin() const { return _plugin; } + + const BSONObj& keyPattern() const; + + virtual IndexSuitability suitability( const BSONObj& query , const BSONObj& order ) const ; + + virtual bool scanAndOrderRequired( const BSONObj& query , const BSONObj& order ) const ; + + protected: + const IndexPlugin * _plugin; + const IndexSpec * _spec; + }; + /** + * this represents a plugin + * a plugin could be something like full text search, sparse index, etc... + * 1 of these exists per type of index per server + * 1 IndexType is created per index using this plugin + */ + class IndexPlugin : boost::noncopyable { + public: + IndexPlugin( const string& name ); + virtual ~IndexPlugin(){} + + virtual IndexType* generate( const IndexSpec * spec ) const = 0; + + static IndexPlugin* get( const string& name ){ + if ( ! 
_plugins ) + return 0; + map<string,IndexPlugin*>::iterator i = _plugins->find( name ); + if ( i == _plugins->end() ) + return 0; + return i->second; + } + + string getName() const { return _name; } + private: + string _name; + static map<string,IndexPlugin*> * _plugins; + }; + + /* precomputed details about an index, used for inserting keys on updates + stored/cached in NamespaceDetailsTransient, or can be used standalone + */ class IndexSpec { public: - BSONObj keys; - BSONObj meta; + BSONObj keyPattern; // e.g., { name : 1 } + BSONObj info; // this is the same as IndexDetails::info.obj() - IndexSpec(){ + IndexSpec() + : _details(0) , _finishedInit(false){ } IndexSpec( const BSONObj& k , const BSONObj& m = BSONObj() ) - : keys(k) , meta(m){ + : keyPattern(k) , info(m) , _details(0) , _finishedInit(false){ _init(); } - + /** - this is a DickLock of an IndexDetails info + this is a DiscLoc of an IndexDetails info should have a key field */ IndexSpec( const DiskLoc& loc ){ reset( loc ); } - void reset( const DiskLoc& loc ){ - meta = loc.obj(); - keys = meta["key"].embeddedObjectUserCheck(); - if ( keys.objsize() == 0 ) { - out() << meta.toString() << endl; - assert(false); - - } - _init(); - } + void reset( const DiskLoc& loc ); + void reset( const IndexDetails * details ); void getKeys( const BSONObj &obj, BSONObjSetDefaultOrder &keys ) const; - private: + BSONElement missingField() const { return _nullElt; } + + string getTypeName() const { + if ( _indexType.get() ) + return _indexType->getPlugin()->getName(); + return ""; + } + + IndexType* getType() const { + return _indexType.get(); + } + + const IndexDetails * getDetails() const { + return _details; + } + + IndexSuitability suitability( const BSONObj& query , const BSONObj& order ) const ; + + protected: + + IndexSuitability _suitability( const BSONObj& query , const BSONObj& order ) const ; void _getKeys( vector<const char*> fieldNames , vector<BSONElement> fixed , const BSONObj &obj, BSONObjSetDefaultOrder &keys ) const; + + BSONSizeTracker _sizeTracker; vector<const char*> _fieldNames; vector<BSONElement> _fixed; @@ -67,14 +157,23 @@ namespace mongo { BSONObj _nullObj; BSONElement _nullElt; + shared_ptr<IndexType> _indexType; + + const IndexDetails * _details; + void _init(); + + public: + bool _finishedInit; + + friend class IndexType; }; /* Details about a particular index. There is one of these effectively for each object in system.namespaces (although this also includes the head pointer, which is not in that collection). - ** MemoryMapped Record ** + ** MemoryMapped Record ** (i.e., this is on disk data) */ class IndexDetails { public: @@ -117,6 +216,7 @@ namespace mongo { /* true if the specified key is in the index */ bool hasKey(const BSONObj& key); + bool wouldCreateDup(const BSONObj& key, DiskLoc self); // returns name of this index's storage area // database.table.$index @@ -172,6 +272,8 @@ namespace mongo { (system.indexes or system.namespaces) -- only NamespaceIndex. */ void kill_idx(); + + const IndexSpec& getSpec() const; operator string() const { return info.obj().toString(); @@ -184,15 +286,20 @@ namespace mongo { vector<BSONObj*> removed; // these keys were removed as part of the change vector<BSONObj*> added; // these keys were added as part of the change - void dupCheck(IndexDetails& idx) { + /** @curObjLoc - the object we want to add's location. if it is already in the + index, that is allowed here (for bg indexing case). 
+ */ + void dupCheck(IndexDetails& idx, DiskLoc curObjLoc) { if( added.empty() || !idx.unique() ) return; - for( vector<BSONObj*>::iterator i = added.begin(); i != added.end(); i++ ) - uassert( 11001 , "E11001 duplicate key on update", !idx.hasKey(**i)); + for( vector<BSONObj*>::iterator i = added.begin(); i != added.end(); i++ ) { + bool dup = idx.wouldCreateDup(**i, curObjLoc); + uassert( 11001 , "E11001 duplicate key on update", !dup); + } } }; class NamespaceDetails; void getIndexChanges(vector<IndexChanges>& v, NamespaceDetails& d, BSONObj newObj, BSONObj oldObj); - void dupCheck(vector<IndexChanges>& v, NamespaceDetails& d); + void dupCheck(vector<IndexChanges>& v, NamespaceDetails& d, DiskLoc curObjLoc); } // namespace mongo diff --git a/db/index_geo2d.cpp b/db/index_geo2d.cpp new file mode 100644 index 0000000..4730c29 --- /dev/null +++ b/db/index_geo2d.cpp @@ -0,0 +1,1675 @@ +// geo2d.cpp + +/** +* Copyright (C) 2008 10gen Inc. +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +#include "stdafx.h" +#include "namespace.h" +#include "jsobj.h" +#include "index.h" +#include "../util/unittest.h" +#include "commands.h" +#include "pdfile.h" +#include "btree.h" +#include "curop.h" +#include "matcher.h" + +//#define GEODEBUG(x) cout << x << endl; +#define GEODEBUG(x) + +namespace mongo { + + const string GEO2DNAME = "2d"; + + class GeoBitSets { + public: + GeoBitSets(){ + for ( int i=0; i<32; i++ ){ + masks32[i] = ( 1 << ( 31 - i ) ); + } + for ( int i=0; i<64; i++ ){ + masks64[i] = ( 1LL << ( 63 - i ) ); + } + } + int masks32[32]; + long long masks64[64]; + } geoBitSets; + + + class GeoHash { + public: + GeoHash() + : _hash(0),_bits(0){ + } + + GeoHash( const char * hash ){ + init( hash ); + } + + GeoHash( const string& hash ){ + init( hash ); + } + + GeoHash( const BSONElement& e , unsigned bits=32 ){ + _bits = bits; + if ( e.type() == BinData ){ + int len = 0; + _copy( (char*)&_hash , e.binData( len ) ); + assert( len == 8 ); + _bits = bits; + } + else { + cout << "GeoHash cons e : " << e << endl; + uassert(13047,"wrong type for geo index. 
if you're using a pre-release version, need to rebuild index",0); + } + _fix(); + } + + GeoHash( unsigned x , unsigned y , unsigned bits=32){ + init( x , y , bits ); + } + + GeoHash( const GeoHash& old ){ + _hash = old._hash; + _bits = old._bits; + } + + GeoHash( long long hash , unsigned bits ) + : _hash( hash ) , _bits( bits ){ + _fix(); + } + + void init( unsigned x , unsigned y , unsigned bits ){ + assert( bits <= 32 ); + _hash = 0; + _bits = bits; + for ( unsigned i=0; i<bits; i++ ){ + if ( isBitSet( x , i ) ) _hash |= geoBitSets.masks64[i*2]; + if ( isBitSet( y , i ) ) _hash |= geoBitSets.masks64[(i*2)+1]; + } + } + + void unhash( unsigned& x , unsigned& y ) const { + x = 0; + y = 0; + for ( unsigned i=0; i<_bits; i++ ){ + if ( getBitX(i) ) + x |= geoBitSets.masks32[i]; + if ( getBitY(i) ) + y |= geoBitSets.masks32[i]; + } + } + + /** + * @param 0 = high + */ + static bool isBitSet( unsigned val , unsigned bit ){ + return geoBitSets.masks32[bit] & val; + } + + GeoHash up() const { + return GeoHash( _hash , _bits - 1 ); + } + + bool hasPrefix( const GeoHash& other ) const { + assert( other._bits <= _bits ); + if ( other._bits == 0 ) + return true; + long long x = other._hash ^ _hash; + x = x >> (64-(other._bits*2)); + return x == 0; + } + + + string toString() const { + StringBuilder buf( _bits * 2 ); + for ( unsigned x=0; x<_bits*2; x++ ) + buf.append( _hash & geoBitSets.masks64[x] ? "1" : "0" ); + return buf.str(); + } + + string toStringHex1() const { + stringstream ss; + ss << hex << _hash; + return ss.str(); + } + + void init( const string& s ){ + _hash = 0; + _bits = s.size() / 2; + for ( unsigned pos=0; pos<s.size(); pos++ ) + if ( s[pos] == '1' ) + setBit( pos , 1 ); + } + + void setBit( unsigned pos , bool one ){ + assert( pos < _bits * 2 ); + if ( one ) + _hash |= geoBitSets.masks64[pos]; + else if ( _hash & geoBitSets.masks64[pos] ) + _hash &= ~geoBitSets.masks64[pos]; + } + + bool getBit( unsigned pos ) const { + return _hash & geoBitSets.masks64[pos]; + } + + bool getBitX( unsigned pos ) const { + assert( pos < 32 ); + return getBit( pos * 2 ); + } + + bool getBitY( unsigned pos ) const { + assert( pos < 32 ); + return getBit( ( pos * 2 ) + 1 ); + } + + BSONObj wrap() const { + BSONObjBuilder b(20); + append( b , "" ); + BSONObj o = b.obj(); + assert( o.objsize() == 20 ); + return o; + } + + bool constrains() const { + return _bits > 0; + } + + void move( int x , int y ){ + assert( _bits ); + _move( 0 , x ); + _move( 1 , y ); + } + + void _move( unsigned offset , int d ){ + if ( d == 0 ) + return; + assert( d <= 1 && d>= -1 ); // TEMP + + bool from, to; + if ( d > 0 ){ + from = 0; + to = 1; + } + else { + from = 1; + to = 0; + } + + unsigned pos = ( _bits * 2 ) - 1; + if ( offset == 0 ) + pos--; + while ( true ){ + if ( getBit(pos) == from ){ + setBit( pos , to ); + return; + } + + if ( pos < 2 ){ + // overflow + for ( ; pos < ( _bits * 2 ) ; pos += 2 ){ + setBit( pos , from ); + } + return; + } + + setBit( pos , from ); + pos -= 2; + } + + assert(0); + } + + GeoHash& operator=(const GeoHash& h) { + _hash = h._hash; + _bits = h._bits; + return *this; + } + + bool operator==(const GeoHash& h ){ + return _hash == h._hash && _bits == h._bits; + } + + GeoHash& operator+=( const char * s ) { + unsigned pos = _bits * 2; + _bits += strlen(s) / 2; + assert( _bits <= 32 ); + while ( s[0] ){ + if ( s[0] == '1' ) + setBit( pos , 1 ); + pos++; + s++; + } + + return *this; + } + + GeoHash operator+( const char * s ) const { + GeoHash n = *this; + n+=s; + return n; + } + + void _fix(){ 
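+            // the 2*_bits significant bits (x and y interleaved, x in the even
+            // slots counting from the MSB) are kept left-aligned in the 64-bit
+            // word; any stray bits to their right would break operator== and
+            // get persisted by append(), so they are cleared here.
+            // fast path: shifting the used bits out leaves 0 iff the tail is
+            // already clean; otherwise AND with a mask of the top 2*_bits bits.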
+        void _fix(){
+            if ( ( _hash << ( _bits * 2 ) ) == 0 )
+                return;
+            long long mask = 0;
+            for ( unsigned i=0; i<_bits*2; i++ )
+                mask |= geoBitSets.masks64[i];
+            _hash &= mask;
+        }
+
+        void append( BSONObjBuilder& b , const char * name ) const {
+            char buf[8];
+            _copy( buf , (char*)&_hash );
+            b.appendBinData( name , 8 , bdtCustom , buf );
+        }
+
+        long long getHash() const {
+            return _hash;
+        }
+
+        GeoHash commonPrefix( const GeoHash& other ) const {
+            unsigned i=0;
+            for ( ; i<_bits && i<other._bits; i++ ){
+                if ( getBitX( i ) == other.getBitX( i ) &&
+                     getBitY( i ) == other.getBitY( i ) )
+                    continue;
+                break;
+            }
+            return GeoHash(_hash,i);
+        }
+    private:
+
+        void _copy( char * dst , const char * src ) const {
+            for ( unsigned a=0; a<8; a++ ){
+                dst[a] = src[7-a];
+            }
+        }
+
+        long long _hash;
+        unsigned _bits; // bits per field, so 1 to 32
+    };
+
+    ostream& operator<<( ostream &s, const GeoHash &h ){
+        s << h.toString();
+        return s;
+    } // end GeoHash
+
+    class Geo2dType : public IndexType {
+    public:
+        Geo2dType( const IndexPlugin * plugin , const IndexSpec* spec )
+            : IndexType( plugin , spec ){
+
+            BSONObjBuilder orderBuilder;
+
+            BSONObjIterator i( spec->keyPattern );
+            while ( i.more() ){
+                BSONElement e = i.next();
+                if ( e.type() == String && GEO2DNAME == e.valuestr() ){
+                    uassert( 13022 , "can't have 2 geo fields" , _geo.size() == 0 );
+                    uassert( 13023 , "2d has to be first in index" , _other.size() == 0 );
+                    _geo = e.fieldName();
+                }
+                else {
+                    _other.push_back( e.fieldName() );
+                }
+                orderBuilder.append( "" , 1 );
+            }
+
+            uassert( 13024 , "no geo field specified" , _geo.size() );
+
+            _bits = _configval( spec , "bits" , 26 ); // for lat/long, ~ 1ft
+
+            uassert( 13028 , "can't have more than 32 bits in geo index" , _bits <= 32 );
+
+            _max = _configval( spec , "max" , 180 );
+            _min = _configval( spec , "min" , -180 );
+
+            _scaling = (1024*1024*1024*4.0)/(_max-_min);
+
+            _order = orderBuilder.obj();
+        }
+
+        int _configval( const IndexSpec* spec , const string& name , int def ){
+            BSONElement e = spec->info[name];
+            if ( e.isNumber() )
+                return e.numberInt();
+            return def;
+        }
+
+        ~Geo2dType(){
+        }
+
+        virtual BSONObj fixKey( const BSONObj& in ) {
+            if ( in.firstElement().type() == BinData )
+                return in;
+
+            BSONObjBuilder b(in.objsize()+16);
+
+            if ( in.firstElement().isABSONObj() )
+                _hash( in.firstElement().embeddedObject() ).append( b , "" );
+            else if ( in.firstElement().type() == String )
+                GeoHash( in.firstElement().valuestr() ).append( b , "" );
+            else if ( in.firstElement().type() == RegEx )
+                GeoHash( in.firstElement().regex() ).append( b , "" );
+            else
+                return in;
+
+            BSONObjIterator i(in);
+            i.next();
+            while ( i.more() )
+                b.append( i.next() );
+            return b.obj();
+        }
+
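+        // Key generation sketch (document and field names are illustrative):
+        // for { loc : { x : 73.01 , y : 41.35 } , cat : "cafe" } indexed with
+        // { loc : "2d" , cat : 1 }, getKeys() below emits a single key of the
+        // form { "" : <8-byte geohash BinData> , "" : "cafe" }.
+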
+        virtual void getKeys( const BSONObj &obj, BSONObjSetDefaultOrder &keys ) const {
+            BSONElement geo = obj.getFieldDotted(_geo.c_str());
+            if ( geo.eoo() )
+                return;
+
+            BSONObjBuilder b(64);
+
+            if ( ! geo.isABSONObj() )
+                return;
+
+            BSONObj embed = geo.embeddedObject();
+            if ( embed.isEmpty() )
+                return;
+
+            _hash( embed ).append( b , "" );
+
+            for ( size_t i=0; i<_other.size(); i++ ){
+                BSONElement e = obj[_other[i]];
+                if ( e.eoo() )
+                    e = _spec->missingField();
+                b.appendAs( e , "" );
+            }
+            keys.insert( b.obj() );
+        }
+
+        GeoHash _tohash( const BSONElement& e ) const {
+            if ( e.isABSONObj() )
+                return _hash( e.embeddedObject() );
+
+            return GeoHash( e , _bits );
+        }
+
+        GeoHash _hash( const BSONObj& o ) const {
+            BSONObjIterator i(o);
+            uassert( 13067 , "geo field is empty" , i.more() );
+            BSONElement x = i.next();
+            uassert( 13068 , "geo field only has 1 element" , i.more() );
+            BSONElement y = i.next();
+
+            uassert( 13026 , "geo values have to be numbers" , x.isNumber() && y.isNumber() );
+
+            return _hash( x.number() , y.number() );
+        }
+
+        GeoHash _hash( double x , double y ) const {
+            return GeoHash( _convert(x), _convert(y) , _bits );
+        }
+
+        BSONObj _unhash( const GeoHash& h ) const {
+            unsigned x , y;
+            h.unhash( x , y );
+            BSONObjBuilder b;
+            b.append( "x" , _unconvert( x ) );
+            b.append( "y" , _unconvert( y ) );
+            return b.obj();
+        }
+
+        unsigned _convert( double in ) const {
+            uassert( 13027 , "point not in range" , in <= _max && in >= _min );
+            in -= _min;
+            assert( in >= 0 );
+            return (unsigned)(in * _scaling);
+        }
+
+        double _unconvert( unsigned in ) const {
+            double x = in;
+            x /= _scaling;
+            x += _min;
+            return x;
+        }
+
+        void _unconvert( const GeoHash& h , double& x , double& y ) const {
+            unsigned a,b;
+            h.unhash(a,b);
+            x = _unconvert( a );
+            y = _unconvert( b );
+        }
+
+        double distance( const GeoHash& a , const GeoHash& b ) const {
+            double ax,ay,bx,by;
+            _unconvert( a , ax , ay );
+            _unconvert( b , bx , by );
+
+            double dx = bx - ax;
+            double dy = by - ay;
+
+            return sqrt( ( dx * dx ) + ( dy * dy ) );
+        }
+
+        double size( const GeoHash& a ) const {
+            GeoHash b = a;
+            b.move( 1 , 1 );
+            return distance( a , b );
+        }
+
+        const IndexDetails* getDetails() const {
+            return _spec->getDetails();
+        }
+
+        virtual auto_ptr<Cursor> newCursor( const BSONObj& query , const BSONObj& order , int numWanted ) const;
+
+        virtual IndexSuitability suitability( const BSONObj& query , const BSONObj& order ) const {
+            BSONElement e = query.getFieldDotted(_geo.c_str());
+            switch ( e.type() ){
+            case Object: {
+                BSONObj sub = e.embeddedObject();
+                switch ( sub.firstElement().getGtLtOp() ){
+                case BSONObj::opNEAR:
+                case BSONObj::opWITHIN:
+                    return OPTIMAL;
+                default:;
+                }
+            }
+            case Array:
+                return HELPFUL;
+            default:
+                return USELESS;
+            }
+        }
+
+        string _geo;
+        vector<string> _other;
+
+        unsigned _bits;
+        int _max;
+        int _min;
+        double _scaling;
+
+        BSONObj _order;
+    };
+
+    class Point {
+    public:
+
+        Point( const Geo2dType * g , const GeoHash& hash ){
+            g->_unconvert( hash , _x , _y );
+        }
+
+        Point( double x , double y )
+            : _x( x ) , _y( y ){
+        }
+
+        Point() : _x(0),_y(0){
+        }
+
+        GeoHash hash( const Geo2dType * g ){
+            return g->_hash( _x , _y );
+        }
+
+        string toString() const {
+            StringBuilder buf(32);
+            buf << "(" << _x << "," << _y << ")";
+            return buf.str();
+        }
+
+        double _x;
+        double _y;
+    };
+
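+    // Conversion sketch (default bounds assumed): with min=-180 and max=180,
+    // _scaling is 2^32/360, so _convert(0) maps the origin to 0x80000000, the
+    // middle of the unsigned range; _unconvert() reverses it, losing only
+    // sub-cell precision.  A Box below is an axis-aligned rectangle; built
+    // from a GeoHash it covers that cell, e.g. Box(5,5,2) spans
+    // "(5,5) -->> (7,7)" (see the unit test further down).
+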
+    class Box {
+    public:
+
+        Box( const Geo2dType * g , const GeoHash& hash )
+            : _min( g , hash ) ,
+              _max( _min._x + g->size( hash ) , _min._y + g->size( hash ) ){
+        }
+
+        Box( double x , double y , double size )
+            : _min( x , y ) ,
+              _max( x + size , y + size ){
+        }
+
+        Box( Point min , Point max )
+            : _min( min ) , _max( max ){
+        }
+
+        Box(){}
+
+        string toString() const {
+            StringBuilder buf(64);
+            buf << _min.toString() << " -->> " << _max.toString();
+            return buf.str();
+        }
+
+        operator string() const {
+            return toString();
+        }
+
+        bool between( double min , double max , double val , double fudge=0) const {
+            return val + fudge >= min && val <= max + fudge;
+        }
+
+        bool mid( double amin , double amax , double bmin , double bmax , bool min , double& res ) const {
+            assert( amin < amax );
+            assert( bmin < bmax );
+
+            if ( amin < bmin ){
+                if ( amax < bmin )
+                    return false;
+                res = min ? bmin : amax;
+                return true;
+            }
+            if ( amin > bmax )
+                return false;
+            res = min ? amin : bmax;
+            return true;
+        }
+
+        double intersects( const Box& other ) const {
+
+            Point boundMin(0,0);
+            Point boundMax(0,0);
+
+            if ( mid( _min._x , _max._x , other._min._x , other._max._x , true , boundMin._x ) == false ||
+                 mid( _min._x , _max._x , other._min._x , other._max._x , false , boundMax._x ) == false ||
+                 mid( _min._y , _max._y , other._min._y , other._max._y , true , boundMin._y ) == false ||
+                 mid( _min._y , _max._y , other._min._y , other._max._y , false , boundMax._y ) == false )
+                return 0;
+
+            Box intersection( boundMin , boundMax );
+
+            return intersection.area() / ( ( area() + other.area() ) / 2 );
+        }
+
+        double area() const {
+            return ( _max._x - _min._x ) * ( _max._y - _min._y );
+        }
+
+        Point center() const {
+            return Point( ( _min._x + _max._x ) / 2 ,
+                          ( _min._y + _max._y ) / 2 );
+        }
+
+        bool inside( Point p , double fudge = 0 ){
+            bool res = inside( p._x , p._y , fudge );
+            //cout << "is : " << p.toString() << " in " << toString() << " = " << res << endl;
+            return res;
+        }
+
+        bool inside( double x , double y , double fudge = 0 ){
+            return
+                between( _min._x , _max._x , x , fudge ) &&
+                between( _min._y , _max._y , y , fudge );
+        }
+
+        Point _min;
+        Point _max;
+    };
+
+    class Geo2dPlugin : public IndexPlugin {
+    public:
+        Geo2dPlugin() : IndexPlugin( GEO2DNAME ){
+        }
+
+        virtual IndexType* generate( const IndexSpec* spec ) const {
+            return new Geo2dType( this , spec );
+        }
+    } geo2dplugin;
+
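+    // Usage sketch (shell syntax, illustrative): once the plugin is
+    // registered, a 2d index is declared like any other index, e.g.
+    //     db.places.ensureIndex( { loc : "2d" } )
+    // optionally passing { bits : 26 , min : -180 , max : 180 } as options.
+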
+    struct GeoUnitTest : public UnitTest {
+
+        int round( double d ){
+            return (int)(.5+(d*1000));
+        }
+
+#define GEOHEQ(a,b) if ( a.toString() != b ){ cout << "[" << a.toString() << "] != [" << b << "]" << endl; assert( a == b ); }
+
+        void run(){
+            assert( ! GeoHash::isBitSet( 0 , 0 ) );
+            assert( ! GeoHash::isBitSet( 0 , 31 ) );
+            assert( GeoHash::isBitSet( 1 , 31 ) );
+
+            IndexSpec i( BSON( "loc" << "2d" ) );
+            Geo2dType g( &geo2dplugin , &i );
+            {
+                double x = 73.01212;
+                double y = 41.352964;
+                BSONObj in = BSON( "x" << x << "y" << y );
+                GeoHash h = g._hash( in );
+                BSONObj out = g._unhash( h );
+                assert( round(x) == round( out["x"].number() ) );
+                assert( round(y) == round( out["y"].number() ) );
+                assert( round( in["x"].number() ) == round( out["x"].number() ) );
+                assert( round( in["y"].number() ) == round( out["y"].number() ) );
+            }
+
+            {
+                double x = -73.01212;
+                double y = 41.352964;
+                BSONObj in = BSON( "x" << x << "y" << y );
+                GeoHash h = g._hash( in );
+                BSONObj out = g._unhash( h );
+                assert( round(x) == round( out["x"].number() ) );
+                assert( round(y) == round( out["y"].number() ) );
+                assert( round( in["x"].number() ) == round( out["x"].number() ) );
+                assert( round( in["y"].number() ) == round( out["y"].number() ) );
+            }
+
+            {
+                GeoHash h( "0000" );
+                h.move( 0 , 1 );
+                GEOHEQ( h , "0001" );
+                h.move( 0 , -1 );
+                GEOHEQ( h , "0000" );
+
+                h.init( "0001" );
+                h.move( 0 , 1 );
+                GEOHEQ( h , "0100" );
+                h.move( 0 , -1 );
+                GEOHEQ( h , "0001" );
+
+                h.init( "0000" );
+                h.move( 1 , 0 );
+                GEOHEQ( h , "0010" );
+            }
+
+            {
+                Box b( 5 , 5 , 2 );
+                assert( "(5,5) -->> (7,7)" == b.toString() );
+            }
+
+            {
+                GeoHash a = g._hash( 1 , 1 );
+                GeoHash b = g._hash( 4 , 5 );
+                assert( 5 == (int)(g.distance( a , b ) ) );
+                a = g._hash( 50 , 50 );
+                b = g._hash( 42 , 44 );
+                assert( round(10) == round(g.distance( a , b )) );
+            }
+
+            {
+                GeoHash x("0000");
+                assert( 0 == x.getHash() );
+                x.init( 0 , 1 , 32 );
+                GEOHEQ( x , "0000000000000000000000000000000000000000000000000000000000000001" )
+
+                assert( GeoHash( "1100").hasPrefix( GeoHash( "11" ) ) );
+                assert( ! GeoHash( "1000").hasPrefix( GeoHash( "11" ) ) );
+            }
+
+            {
+                GeoHash x("1010");
+                GEOHEQ( x , "1010" );
+                GeoHash y = x + "01";
+                GEOHEQ( y , "101001" );
+            }
+
+            {
+                GeoHash a = g._hash( 5 , 5 );
+                GeoHash b = g._hash( 5 , 7 );
+                GeoHash c = g._hash( 100 , 100 );
+                /*
+                cout << "a: " << a << endl;
+                cout << "b: " << b << endl;
+                cout << "c: " << c << endl;
+
+                cout << "a: " << a.toStringHex1() << endl;
+                cout << "b: " << b.toStringHex1() << endl;
+                cout << "c: " << c.toStringHex1() << endl;
+                */
+                BSONObj oa = a.wrap();
+                BSONObj ob = b.wrap();
+                BSONObj oc = c.wrap();
+                /*
+                cout << "a: " << oa.hexDump() << endl;
+                cout << "b: " << ob.hexDump() << endl;
+                cout << "c: " << oc.hexDump() << endl;
+                */
+                assert( oa.woCompare( ob ) < 0 );
+                assert( oa.woCompare( oc ) < 0 );
+            }
+
+            {
+                GeoHash x( "000000" );
+                x.move( -1 , 0 );
+                GEOHEQ( x , "101010" );
+                x.move( 1 , -1 );
+                GEOHEQ( x , "010101" );
+                x.move( 0 , 1 );
+                GEOHEQ( x , "000000" );
+            }
+
+            {
+                GeoHash prefix( "110011000000" );
+                GeoHash entry( "1100110000011100000111000001110000011100000111000001000000000000" );
+                assert( ! entry.hasPrefix( prefix ) );
+
+                entry = "1100110000001100000111000001110000011100000111000001000000000000";
+                assert( entry.toString().find( prefix.toString() ) == 0 );
+                assert( entry.hasPrefix( GeoHash( "1100" ) ) );
+                assert( entry.hasPrefix( prefix ) );
+            }
+
+            {
+                GeoHash a = g._hash( 50 , 50 );
+                GeoHash b = g._hash( 48 , 54 );
+                assert( round( 4.47214 ) == round( g.distance( a , b ) ) );
+            }
+
+            {
+                Box b( Point( 29.762283 , -95.364271 ) , Point( 29.764283000000002 , -95.36227099999999 ) );
+                assert( b.inside( 29.763 , -95.363 ) );
+                assert( ! b.inside( 32.9570255 , -96.1082497 ) );
+                assert( ! b.inside( 32.9570255 , -96.1082497 , .01 ) );
+            }
+
+            {
+                GeoHash a( "11001111" );
+                assert( GeoHash( "11" ) == a.commonPrefix( "11" ) );
+                assert( GeoHash( "11" ) == a.commonPrefix( "11110000" ) );
+            }
+        }
+    } geoUnitTest;
+
+    class GeoPoint {
+    public:
+        GeoPoint(){
+        }
+
+        GeoPoint( const KeyNode& node , double distance )
+            : _key( node.key ) , _loc( node.recordLoc ) , _o( node.recordLoc.obj() ) , _distance( distance ){
+        }
+
+        GeoPoint( const BSONObj& key , DiskLoc loc , double distance )
+            : _key(key) , _loc(loc) , _o( loc.obj() ) , _distance( distance ){
+        }
+
+        bool operator<( const GeoPoint& other ) const {
+            return _distance < other._distance;
+        }
+
+        bool isEmpty() const {
+            return _o.isEmpty();
+        }
+
+        BSONObj _key;
+        DiskLoc _loc;
+        BSONObj _o;
+        double _distance;
+    };
+
+    class GeoAccumulator {
+    public:
+        GeoAccumulator( const Geo2dType * g , const BSONObj& filter )
+            : _g(g) , _lookedAt(0) , _objectsLoaded(0) , _found(0) {
+            if ( ! filter.isEmpty() ){
+                _matcher.reset( new CoveredIndexMatcher( filter , g->keyPattern() ) );
+            }
+        }
+
+        virtual ~GeoAccumulator(){
+        }
+
+        virtual void add( const KeyNode& node ){
+            // when looking at other boxes, don't want to look at some object twice
+            if ( _seen.count( node.recordLoc ) ){
+                GEODEBUG( "\t\t\t\t already seen : " << node.recordLoc.obj()["_id"] );
+                return;
+            }
+            _seen.insert( node.recordLoc );
+            _lookedAt++;
+
+            // distance check
+            double d = 0;
+            if ( ! checkDistance( GeoHash( node.key.firstElement() ) , d ) ){
+                GEODEBUG( "\t\t\t\t bad distance : " << node.recordLoc.obj() << "\t" << d );
+                return;
+            }
+
+            // matcher
+            MatchDetails details;
+            if ( _matcher.get() ){
+                bool good = _matcher->matches( node.key , node.recordLoc , &details );
+                if ( details.loadedObject )
+                    _objectsLoaded++;
+
+                if ( ! good ){
+                    GEODEBUG( "\t\t\t\t didn't match : " << node.recordLoc.obj()["_id"] );
+                    return;
+                }
+            }
+
+            if ( ! details.loadedObject ) // dont double count
+                _objectsLoaded++;
+
+            addSpecific( node , d );
+            _found++;
+        }
+
+        virtual void addSpecific( const KeyNode& node , double d ) = 0;
+        virtual bool checkDistance( const GeoHash& node , double& d ) = 0;
+
+        long long found() const {
+            return _found;
+        }
+
+        const Geo2dType * _g;
+        set<DiskLoc> _seen;
+        auto_ptr<CoveredIndexMatcher> _matcher;
+
+        long long _lookedAt;
+        long long _objectsLoaded;
+        long long _found;
+    };
+
+    class GeoHopper : public GeoAccumulator {
+    public:
+        typedef multiset<GeoPoint> Holder;
+
+        GeoHopper( const Geo2dType * g , unsigned max , const GeoHash& n , const BSONObj& filter = BSONObj() )
+            : GeoAccumulator( g , filter ) , _max( max ) , _near( n ) {
+        }
+
+        virtual bool checkDistance( const GeoHash& h , double& d ){
+            d = _g->distance( _near , h );
+            bool good = _points.size() < _max || d < farthest();
+            GEODEBUG( "\t\t\t\t\t\t\t checkDistance " << _near << "\t" << h << "\t" << d
+                      << " ok: " << good << " farthest: " << farthest() );
+            return good;
+        }
+
+        virtual void addSpecific( const KeyNode& node , double d ){
+            GEODEBUG( "\t\t" << GeoHash( node.key.firstElement() ) << "\t" << node.recordLoc.obj() << "\t" << d );
+            _points.insert( GeoPoint( node.key , node.recordLoc , d ) );
+            if ( _points.size() > _max ){
+                _points.erase( --_points.end() );
+            }
+        }
+
+        double farthest(){
+            if ( _points.size() == 0 )
+                return -1;
+
+            Holder::iterator i = _points.end();
+            i--;
+            return i->_distance;
+        }
+
+        unsigned _max;
+        GeoHash _near;
+        Holder _points;
+    };
+
+    struct BtreeLocation {
+        int pos;
+        bool found;
+        DiskLoc bucket;
+
+        BSONObj key(){
+            if ( bucket.isNull() )
+                return BSONObj();
+            return bucket.btree()->keyNode( pos ).key;
+        }
+
+        bool hasPrefix( const GeoHash& hash ){
+            BSONElement e = key().firstElement();
+            if ( e.eoo() )
+                return false;
+            return GeoHash( e ).hasPrefix( hash );
+        }
+
+        bool advance( int direction , int& totalFound , GeoAccumulator* all ){
+
+            if ( bucket.isNull() )
+                return false;
+            bucket = bucket.btree()->advance( bucket , pos , direction , "btreelocation" );
+
+            return checkCur( totalFound , all );
+        }
+
+        bool checkCur( int& totalFound , GeoAccumulator* all ){
+            if ( bucket.isNull() )
+                return false;
+
+            if ( bucket.btree()->isUsed(pos) ){
+                totalFound++;
+                all->add( bucket.btree()->keyNode( pos ) );
+            }
+            else {
+                GEODEBUG( "\t\t\t\t not used: " << key() );
+            }
+
+            return true;
+        }
+
+        string toString(){
+            stringstream ss;
+            ss << "bucket: " << bucket.toString() << " pos: " << pos << " found: " << found;
+            return ss.str();
+        }
+
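+        // A BtreeLocation is a lightweight cursor position in the geo index;
+        // a min/max pair is walked outward in both key directions while the
+        // keys still share the current prefix, i.e. stay in the current cell.
+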
+        static bool initial( const IndexDetails& id , const Geo2dType * spec ,
+                             BtreeLocation& min , BtreeLocation& max ,
+                             GeoHash start ,
+                             int & found , GeoAccumulator * hopper ){
+
+            min.bucket = id.head.btree()->locate( id , id.head , start.wrap() ,
+                                                  spec->_order , min.pos , min.found , minDiskLoc );
+            min.checkCur( found , hopper );
+            max = min;
+
+            if ( min.bucket.isNull() ){
+                min.bucket = id.head.btree()->locate( id , id.head , start.wrap() ,
+                                                      spec->_order , min.pos , min.found , minDiskLoc , -1 );
+                min.checkCur( found , hopper );
+            }
+
+            return ! min.bucket.isNull() || ! max.bucket.isNull();
+        }
+    };
+
+    class GeoSearch {
+    public:
+        GeoSearch( const Geo2dType * g , const GeoHash& n , int numWanted=100 , BSONObj filter=BSONObj() )
+            : _spec( g ) , _n( n ) , _start( n ) ,
+              _numWanted( numWanted ) , _filter( filter ) ,
+              _hopper( new GeoHopper( g , numWanted , n , filter ) )
+        {
+            assert( g->getDetails() );
+            _nscanned = 0;
+            _found = 0;
+        }
+
+        void exec(){
+            const IndexDetails& id = *_spec->getDetails();
+
+            BtreeBucket * head = id.head.btree();
+            assert( head );
+            /*
+             * Search algorithm
+             * 1) use geohash prefix to find X items
+             * 2) compute max distance from want to an item
+             * 3) find optimal set of boxes that complete circle
+             * 4) use regular btree cursors to scan those boxes
+             */
+
+            GeoHopper * hopper = _hopper.get();
+
+            _prefix = _start;
+            { // 1 regular geo hash algorithm
+
+                BtreeLocation min,max;
+                if ( ! BtreeLocation::initial( id , _spec , min , max , _n , _found , hopper ) )
+                    return;
+
+                while ( _hopper->found() < _numWanted ){
+                    GEODEBUG( _prefix << "\t" << _found << "\t DESC" );
+                    while ( min.hasPrefix( _prefix ) && min.advance( -1 , _found , hopper ) )
+                        _nscanned++;
+                    GEODEBUG( _prefix << "\t" << _found << "\t ASC" );
+                    while ( max.hasPrefix( _prefix ) && max.advance( 1 , _found , hopper ) )
+                        _nscanned++;
+                    if ( ! _prefix.constrains() )
+                        break;
+                    _prefix = _prefix.up();
+                }
+            }
+            GEODEBUG( "done part 1" );
+            if ( _found && _prefix.constrains() ){
+                // 2
+                Point center( _spec , _n );
+                double boxSize = _spec->size( _prefix );
+                double farthest = hopper->farthest();
+                if ( farthest > boxSize )
+                    boxSize = farthest;
+                Box want( center._x - ( boxSize / 2 ) , center._y - ( boxSize / 2 ) , boxSize );
+                while ( _spec->size( _prefix ) < boxSize )
+                    _prefix = _prefix.up();
+                log(1) << "want: " << want << " found:" << _found << " hash size:" << _spec->size( _prefix ) << endl;
+
+                for ( int x=-1; x<=1; x++ ){
+                    for ( int y=-1; y<=1; y++ ){
+                        GeoHash toscan = _prefix;
+                        toscan.move( x , y );
+
+                        // 3 & 4
+                        doBox( id , want , toscan );
+                    }
+                }
+            }
+            GEODEBUG( "done search" )
+        }
+
+        void doBox( const IndexDetails& id , const Box& want , const GeoHash& toscan , int depth = 0 ){
+            Box testBox( _spec , toscan );
+            if ( logLevel > 0 ) log(1) << "\t doBox: " << testBox << "\t" << toscan.toString() << endl;
+
+            double intPer = testBox.intersects( want );
+
+            if ( intPer <= 0 )
+                return;
+
+            if ( intPer < .5 && depth < 3 ){
+                doBox( id , want , toscan + "00" , depth + 1);
+                doBox( id , want , toscan + "01" , depth + 1);
+                doBox( id , want , toscan + "10" , depth + 1);
+                doBox( id , want , toscan + "11" , depth + 1);
+                return;
+            }
+
+            BtreeLocation loc;
+            loc.bucket = id.head.btree()->locate( id , id.head , toscan.wrap() , _spec->_order ,
+                                                  loc.pos , loc.found , minDiskLoc );
+            loc.checkCur( _found , _hopper.get() );
+            while ( loc.hasPrefix( toscan ) && loc.advance( 1 , _found , _hopper.get() ) )
+                _nscanned++;
+        }
+
+        const Geo2dType * _spec;
+
+        GeoHash _n;
+        GeoHash _start;
+        GeoHash _prefix;
+        int _numWanted;
+        BSONObj _filter;
+        shared_ptr<GeoHopper> _hopper;
+
+        long long _nscanned;
+        int _found;
+    };
+
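+    // Expansion sketch: exec() starts at the full-precision cell containing
+    // the query point, scans it in both key directions, then repeatedly calls
+    // _prefix.up() to coarsen the cell until numWanted points are in the
+    // hopper; the surrounding 3x3 block of cells is then scanned (subdividing
+    // via doBox) to pick up neighbors the prefix walk may have missed.
+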
+    class GeoCursorBase : public Cursor {
+    public:
+        GeoCursorBase( const Geo2dType * spec )
+            : _spec( spec ), _id( _spec->getDetails() ){
+        }
+
+        virtual DiskLoc refLoc(){ return DiskLoc(); }
+
+        virtual BSONObj indexKeyPattern() {
+            return _spec->keyPattern();
+        }
+
+        virtual void noteLocation() {
+            assert(0);
+        }
+
+        /* called before query getmore block is iterated */
+        virtual void checkLocation() {
+            assert(0);
+        }
+
+        virtual bool supportGetMore() { return false; }
+
+        virtual bool getsetdup(DiskLoc loc){
+            return false;
+        }
+
+        const Geo2dType * _spec;
+        const IndexDetails * _id;
+    };
+
+    class GeoSearchCursor : public GeoCursorBase {
+    public:
+        GeoSearchCursor( shared_ptr<GeoSearch> s )
+            : GeoCursorBase( s->_spec ) ,
+              _s( s ) , _cur( s->_hopper->_points.begin() ) , _end( s->_hopper->_points.end() ) {
+        }
+
+        virtual ~GeoSearchCursor() {}
+
+        virtual bool ok(){
+            return _cur != _end;
+        }
+
+        virtual Record* _current(){ assert(ok()); return _cur->_loc.rec(); }
+        virtual BSONObj current(){ assert(ok()); return _cur->_o; }
+        virtual DiskLoc currLoc(){ assert(ok()); return _cur->_loc; }
+        virtual bool advance(){ _cur++; return ok(); }
+        virtual BSONObj currKey() const { return _cur->_key; }
+
+        virtual string toString() {
+            return "GeoSearchCursor";
+        }
+
+        virtual BSONObj prettyStartKey() const {
+            return BSON( _s->_spec->_geo << _s->_prefix.toString() );
+        }
+        virtual BSONObj prettyEndKey() const {
+            GeoHash temp = _s->_prefix;
+            temp.move( 1 , 1 );
+            return BSON( _s->_spec->_geo << temp.toString() );
+        }
+
+        shared_ptr<GeoSearch> _s;
+        GeoHopper::Holder::iterator _cur;
+        GeoHopper::Holder::iterator _end;
+    };
+
+    class GeoBrowse : public GeoCursorBase , public GeoAccumulator {
+    public:
+        GeoBrowse( const Geo2dType * g , string type , BSONObj filter = BSONObj() )
+            : GeoCursorBase( g ) , GeoAccumulator( g , filter ) ,
+              _type( type ) , _filter( filter ) , _firstCall(true) {
+        }
+
+        virtual string toString() {
+            return (string)"GeoBrowse-" + _type;
+        }
+
+        virtual bool ok(){
+            if ( _firstCall ){
+                fillStack();
+                _firstCall = false;
+            }
+            if ( ! _cur.isEmpty() || _stack.size() )
+                return true;
+
+            while ( moreToDo() ){
+                fillStack();
+                if ( ! _cur.isEmpty() )
+                    return true;
+            }
+
+            return false;
+        }
+
+        virtual bool advance(){
+            _cur._o = BSONObj();
+
+            if ( _stack.size() ){
+                _cur = _stack.front();
+                _stack.pop_front();
+                return true;
+            }
+
+            if ( ! moreToDo() )
+                return false;
+
+            while ( _cur.isEmpty() && moreToDo() )
+                fillStack();
+            return ! _cur.isEmpty();
+        }
+
+        virtual Record* _current(){ assert(ok()); return _cur._loc.rec(); }
+        virtual BSONObj current(){ assert(ok()); return _cur._o; }
+        virtual DiskLoc currLoc(){ assert(ok()); return _cur._loc; }
+        virtual BSONObj currKey() const { return _cur._key; }
+
+        virtual bool moreToDo() = 0;
+        virtual void fillStack() = 0;
+
+        virtual void addSpecific( const KeyNode& node , double d ){
+            if ( _cur.isEmpty() )
+                _cur = GeoPoint( node , d );
+            else
+                _stack.push_back( GeoPoint( node , d ) );
+        }
+
+        string _type;
+        BSONObj _filter;
+        list<GeoPoint> _stack;
+
+        GeoPoint _cur;
+        bool _firstCall;
+    };
+
+    class GeoCircleBrowse : public GeoBrowse {
+    public:
+
+        enum State {
+            START ,
+            DOING_EXPAND ,
+            DOING_AROUND ,
+            DONE
+        } _state;
+
+        GeoCircleBrowse( const Geo2dType * g , const BSONObj& circle , BSONObj filter = BSONObj() )
+            : GeoBrowse( g , "circle" , filter ){
+
+            uassert( 13060 , "$center needs 2 fields (middle,max distance)" , circle.nFields() == 2 );
+            BSONObjIterator i(circle);
+            _start = g->_tohash( i.next() );
+            _prefix = _start;
+            _maxDistance = i.next().numberDouble();
+            uassert( 13061 , "need a max distance > 0 " , _maxDistance > 0 );
+
+            _state = START;
+            _found = 0;
+
+            ok();
+        }
+
+        virtual bool moreToDo(){
+            return _state != DONE;
+        }
+
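+        // Query shape handled here (illustrative):
+        //     db.places.find( { loc : { $within : { $center : [ [ 50 , 50 ] , 10 ] } } } )
+        // i.e. all points within distance 10 of (50,50), in index units.
+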
+        virtual void fillStack(){
+            if ( _state == START ){
+                if ( ! BtreeLocation::initial( *_id , _spec , _min , _max ,
+                                               _prefix , _found , this ) ){
+                    _state = DONE;
+                    return;
+                }
+                _state = DOING_EXPAND;
+            }
+
+            if ( _state == DOING_EXPAND ){
+                GEODEBUG( "circle prefix [" << _prefix << "]" );
+                while ( _min.hasPrefix( _prefix ) && _min.advance( -1 , _found , this ) );
+                while ( _max.hasPrefix( _prefix ) && _max.advance( 1 , _found , this ) );
+
+                if ( ! _prefix.constrains() ){
+                    GEODEBUG( "\t exhausted the btree" );
+                    _state = DONE;
+                    return;
+                }
+
+                if ( _g->distance( _prefix , _start ) > _maxDistance ){
+                    GEODEBUG( "\tpast circle bounds" );
+                    GeoHash tr = _prefix;
+                    tr.move( 1 , 1 );
+                    if ( _g->distance( tr , _start ) > _maxDistance )
+                        _state = DOING_AROUND;
+                    else
+                        _prefix = _prefix.up();
+                }
+                else
+                    _prefix = _prefix.up();
+                return;
+            }
+
+            if ( _state == DOING_AROUND ){
+                _state = DONE;
+                return;
+            }
+        }
+
+        virtual bool checkDistance( const GeoHash& h , double& d ){
+            d = _g->distance( _start , h );
+            GEODEBUG( "\t " << h << "\t" << d );
+            return d <= ( _maxDistance + .01 );
+        }
+
+        GeoHash _start;
+        double _maxDistance;
+
+        int _found;
+
+        GeoHash _prefix;
+        BtreeLocation _min;
+        BtreeLocation _max;
+    };
+
+    class GeoBoxBrowse : public GeoBrowse {
+    public:
+
+        enum State {
+            START ,
+            DOING_EXPAND ,
+            DONE
+        } _state;
+
+        GeoBoxBrowse( const Geo2dType * g , const BSONObj& box , BSONObj filter = BSONObj() )
+            : GeoBrowse( g , "box" , filter ){
+
+            uassert( 13063 , "$box needs 2 fields (bottomLeft,topRight)" , box.nFields() == 2 );
+            BSONObjIterator i(box);
+            _bl = g->_tohash( i.next() );
+            _tr = g->_tohash( i.next() );
+
+            _want._min = Point( _g , _bl );
+            _want._max = Point( _g , _tr );
+
+            uassert( 13064 , "need an area > 0 " , _want.area() > 0 );
+
+            _state = START;
+            _found = 0;
+
+            Point center = _want.center();
+            _prefix = _g->_hash( center._x , center._y );
+
+            GEODEBUG( "center : " << center.toString() << "\t" << _prefix );
+
+            {
+                GeoHash a(0LL,32);
+                GeoHash b(0LL,32);
+                b.move(1,1);
+                _fudge = _g->distance(a,b);
+            }
+
+            ok();
+        }
+
+        virtual bool moreToDo(){
+            return _state != DONE;
+        }
+
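+        // Query shape handled here (illustrative):
+        //     db.places.find( { loc : { $within : { $box : [ [ 0 , 0 ] , [ 10 , 10 ] ] } } } )
+        // with the corners given as bottom-left then top-right.
+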
+        virtual void fillStack(){
+            if ( _state == START ){
+
+                if ( ! BtreeLocation::initial( *_id , _spec , _min , _max ,
+                                               _prefix , _found , this ) ){
+                    _state = DONE;
+                    return;
+                }
+                _state = DOING_EXPAND;
+            }
+
+            if ( _state == DOING_EXPAND ){
+                int started = _found;
+                while ( started == _found || _state == DONE ){
+                    GEODEBUG( "box prefix [" << _prefix << "]" );
+                    while ( _min.hasPrefix( _prefix ) && _min.advance( -1 , _found , this ) );
+                    while ( _max.hasPrefix( _prefix ) && _max.advance( 1 , _found , this ) );
+
+                    if ( _state == DONE )
+                        return;
+
+                    if ( ! _prefix.constrains() ){
+                        GEODEBUG( "box exhausted" );
+                        _state = DONE;
+                        return;
+                    }
+
+                    Box cur( _g , _prefix );
+                    if ( cur._min._x + _fudge < _want._min._x &&
+                         cur._min._y + _fudge < _want._min._y &&
+                         cur._max._x - _fudge > _want._max._x &&
+                         cur._max._y - _fudge > _want._max._y ){
+
+                        _state = DONE;
+                        GeoHash temp = _prefix.commonPrefix( cur._max.hash( _g ) );
+
+                        GEODEBUG( "box done : " << cur.toString() << " prefix:" << _prefix << " common:" << temp );
+
+                        if ( temp == _prefix )
+                            return;
+                        _prefix = temp;
+                        GEODEBUG( "\t one more loop" );
+                        continue;
+                    }
+                    else {
+                        _prefix = _prefix.up();
+                    }
+                }
+                return;
+            }
+        }
+
+        virtual bool checkDistance( const GeoHash& h , double& d ){
+            bool res = _want.inside( Point( _g , h ) , _fudge );
+            GEODEBUG( "\t want : " << _want.toString()
+                      << " point: " << Point( _g , h ).toString()
+                      << " in : " << res );
+            return res;
+        }
+
+        GeoHash _bl;
+        GeoHash _tr;
+        Box _want;
+
+        int _found;
+
+        GeoHash _prefix;
+        BtreeLocation _min;
+        BtreeLocation _max;
+
+        double _fudge;
+    };
+
+    auto_ptr<Cursor> Geo2dType::newCursor( const BSONObj& query , const BSONObj& order , int numWanted ) const {
+        if ( numWanted < 0 )
+            numWanted = numWanted * -1;
+        else if ( numWanted == 0 )
+            numWanted = 100;
+
+        BSONObjIterator i(query);
+        while ( i.more() ){
+            BSONElement e = i.next();
+
+            if ( _geo != e.fieldName() )
+                continue;
+
+            if ( e.type() != Object )
+                continue;
+
+            switch ( e.embeddedObject().firstElement().getGtLtOp() ){
+            case BSONObj::opNEAR: {
+                e = e.embeddedObject().firstElement();
+                shared_ptr<GeoSearch> s( new GeoSearch( this , _tohash(e) , numWanted , query ) );
+                s->exec();
+                auto_ptr<Cursor> c;
+                c.reset( new GeoSearchCursor( s ) );
+                return c;
+            }
+            case BSONObj::opWITHIN: {
+                e = e.embeddedObject().firstElement();
+                uassert( 13057 , "$within has to take an object or array" , e.isABSONObj() );
+                e = e.embeddedObject().firstElement();
+                string type = e.fieldName();
+                if ( type == "$center" ){
+                    uassert( 13059 , "$center has to take an object or array" , e.isABSONObj() );
+                    auto_ptr<Cursor> c;
+                    c.reset( new GeoCircleBrowse( this , e.embeddedObjectUserCheck() , query ) );
+                    return c;
+                }
+                else if ( type == "$box" ){
+                    uassert( 13065 , "$box has to take an object or array" , e.isABSONObj() );
+                    auto_ptr<Cursor> c;
+                    c.reset( new GeoBoxBrowse( this , e.embeddedObjectUserCheck() , query ) );
+                    return c;
+                }
+                throw UserException( 13058 , (string)"unknown $within type: " + type );
+            }
+            default:
+                break;
+            }
+        }
+
+        throw UserException( 13042 , (string)"missing geo field (" + _geo + ") in : " + query.toString() );
+    }
+
+    // ------
+    // commands
+    // ------
+
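+    // Command usage (illustrative): the geoNear command below is invoked as
+    //     db.runCommand( { geoNear : "places" , near : [ 50 , 50 ] , num : 10 } )
+    // and returns the closest documents with per-result distances plus scan
+    // statistics; "query" and "distanceMultiplier" are optional (see below).
+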
+    class Geo2dFindNearCmd : public Command {
+    public:
+        Geo2dFindNearCmd() : Command( "geoNear" ){}
+        virtual LockType locktype(){ return READ; }
+        bool slaveOk() { return true; }
+        bool slaveOverrideOk() { return true; }
+        bool run(const char * stupidns, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl){
+            string ns = nsToDatabase( stupidns ) + "." + cmdObj.firstElement().valuestr();
+
+            NamespaceDetails * d = nsdetails( ns.c_str() );
+            if ( ! d ){
+                errmsg = "can't find ns";
+                return false;
+            }
+
+            int geoIdx = -1;
+            {
+                NamespaceDetails::IndexIterator ii = d->ii();
+                while ( ii.more() ){
+                    IndexDetails& id = ii.next();
+                    if ( id.getSpec().getTypeName() == GEO2DNAME ){
+                        if ( geoIdx >= 0 ){
+                            errmsg = "2 geo indexes :(";
+                            return false;
+                        }
+                        geoIdx = ii.pos() - 1;
+                    }
+                }
+            }
+
+            if ( geoIdx < 0 ){
+                errmsg = "no geo index :(";
+                return false;
+            }
+
+            result.append( "ns" , ns );
+
+            IndexDetails& id = d->idx( geoIdx );
+            Geo2dType * g = (Geo2dType*)id.getSpec().getType();
+            assert( &id == g->getDetails() );
+
+            int numWanted = 100;
+            if ( cmdObj["num"].isNumber() )
+                numWanted = cmdObj["num"].numberInt();
+
+            uassert(13046, "'near' param missing/invalid", !cmdObj["near"].eoo());
+            const GeoHash n = g->_tohash( cmdObj["near"] );
+            result.append( "near" , n.toString() );
+
+            BSONObj filter;
+            if ( cmdObj["query"].type() == Object )
+                filter = cmdObj["query"].embeddedObject();
+
+            GeoSearch gs( g , n , numWanted , filter );
+
+            if ( cmdObj["start"].type() == String){
+                GeoHash start = (string) cmdObj["start"].valuestr();
+                gs._start = start;
+            }
+
+            gs.exec();
+
+            double distanceMultiplier = 1;
+            if ( cmdObj["distanceMultiplier"].isNumber() )
+                distanceMultiplier = cmdObj["distanceMultiplier"].number();
+
+            double totalDistance = 0;
+
+            BSONObjBuilder arr( result.subarrayStart( "results" ) );
+            int x = 0;
+            for ( GeoHopper::Holder::iterator i=gs._hopper->_points.begin(); i!=gs._hopper->_points.end(); i++ ){
+                const GeoPoint& p = *i;
+
+                double dis = distanceMultiplier * p._distance;
+                totalDistance += dis;
+
+                BSONObjBuilder bb( arr.subobjStart( BSONObjBuilder::numStr( x++ ).c_str() ) );
+                bb.append( "dis" , dis );
+                bb.append( "obj" , p._o );
+                bb.done();
+            }
+            arr.done();
+
+            BSONObjBuilder stats( result.subobjStart( "stats" ) );
+            stats.append( "time" , cc().curop()->elapsedMillis() );
+            stats.appendNumber( "btreelocs" , gs._nscanned );
+            stats.appendNumber( "nscanned" , gs._hopper->_lookedAt );
+            stats.appendNumber( "objectsLoaded" , gs._hopper->_objectsLoaded );
+            stats.append( "avgDistance" , totalDistance / x );
+            stats.done();
+
+            return true;
+        }
+
+    } geo2dFindNearCmd;
+
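+    // geoWalk below is a diagnostic command: it walks up to 100000 entries of
+    // the collection's single 2d index and dumps each key's hash, stored
+    // point and _id to stdout, which is useful when debugging key ordering.
+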
+    class GeoWalkCmd : public Command {
+    public:
+        GeoWalkCmd() : Command( "geoWalk" ){}
+        virtual LockType locktype(){ return READ; }
+        bool slaveOk() { return true; }
+        bool slaveOverrideOk() { return true; }
+        bool run(const char * stupidns, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl){
+            string ns = nsToDatabase( stupidns ) + "." + cmdObj.firstElement().valuestr();
+
+            NamespaceDetails * d = nsdetails( ns.c_str() );
+            if ( ! d ){
+                errmsg = "can't find ns";
+                return false;
+            }
+
+            int geoIdx = -1;
+            {
+                NamespaceDetails::IndexIterator ii = d->ii();
+                while ( ii.more() ){
+                    IndexDetails& id = ii.next();
+                    if ( id.getSpec().getTypeName() == GEO2DNAME ){
+                        if ( geoIdx >= 0 ){
+                            errmsg = "2 geo indexes :(";
+                            return false;
+                        }
+                        geoIdx = ii.pos() - 1;
+                    }
+                }
+            }
+
+            if ( geoIdx < 0 ){
+                errmsg = "no geo index :(";
+                return false;
+            }
+
+            IndexDetails& id = d->idx( geoIdx );
+            Geo2dType * g = (Geo2dType*)id.getSpec().getType();
+            assert( &id == g->getDetails() );
+
+            int max = 100000;
+
+            BtreeCursor c( d , geoIdx , id , BSONObj() , BSONObj() , true , 1 );
+            while ( c.ok() && max-- ){
+                GeoHash h( c.currKey().firstElement() );
+                int len;
+                cout << "\t" << h.toString()
+                     << "\t" << c.current()[g->_geo]
+                     << "\t" << hex << h.getHash()
+                     << "\t" << hex << ((long long*)c.currKey().firstElement().binData(len))[0]
+                     << "\t" << c.current()["_id"]
+                     << endl;
+                c.advance();
+            }
+
+            return true;
+        }
+
+    } geoWalkCmd;
+
+}
diff --git a/db/instance.cpp b/db/instance.cpp
index e8515c4..909911e 100644
--- a/db/instance.cpp
+++ b/db/instance.cpp
@@ -35,7 +35,8 @@
 #if !defined(_WIN32)
 #include <sys/file.h>
 #endif
-#include "dbstats.h"
+#include "stats/counters.h"
+#include "background.h"
 
 namespace mongo {
 
@@ -45,19 +46,9 @@ namespace mongo {
     void receivedInsert(Message& m, CurOp& op);
     bool receivedGetMore(DbResponse& dbresponse, Message& m, CurOp& curop );
 
-    CmdLine cmdLine;
-
     int nloggedsome = 0;
 #define LOGSOME if( ++nloggedsome < 1000 || nloggedsome % 100 == 0 )
 
-    SlaveTypes slave = NotSlave;
-    bool master = false; // true means keep an op log
-    bool autoresync = false;
-
-    /* we use new here so we don't have to worry about destructor orders at program shutdown */
-    MongoMutex &dbMutex( *(new MongoMutex) );
-// MutexInfo dbMutexInfo;
-
     string dbExecCommand;
 
     string bind_ip = "";
@@ -66,8 +57,6 @@ namespace mongo {
 
     DiagLog _diaglog;
 
-    int opIdMem = 100000000;
-
     bool useCursors = true;
     bool useHints = true;
 
@@ -87,25 +76,30 @@ namespace mongo {
     // see FSyncCommand:
     unsigned lockedForWriting;
-    boost::mutex lockedForWritingMutex;
+    mongo::mutex lockedForWritingMutex;
     bool unlockRequested = false;
 
     void inProgCmd( Message &m, DbResponse &dbresponse ) {
         BSONObjBuilder b;
-        AuthenticationInfo *ai = cc().ai;
-        if( !ai->isAuthorized("admin") ) {
+
+        if( ! cc().isAdmin() ){
            BSONObjBuilder b;
            b.append("err", "unauthorized");
        }
        else {
+            DbMessage d(m);
+            QueryMessage q(d);
+            bool all = q.query["$all"].trueValue();
            vector<BSONObj> vals;
            {
-                boostlock bl(Client::clientsMutex);
+                Client& me = cc();
+                scoped_lock bl(Client::clientsMutex);
                for( set<Client*>::iterator i = Client::clients.begin(); i != Client::clients.end(); i++ ) {
                    Client *c = *i;
+                    if ( c == &me )
+                        continue;
                    CurOp& co = *(c->curop());
-                    if( co.active() )
+                    if( all || co.active() )
                        vals.push_back( co.infoNoauth() );
                }
            }
@@ -116,14 +110,13 @@ namespace mongo {
                b.append("info", "use command {unlock:0} to terminate the fsync write/snapshot lock");
            }
        }
-
+
        replyToQuery(0, m, dbresponse, b.obj());
    }
 
    void killOp( Message &m, DbResponse &dbresponse ) {
        BSONObj obj;
-        AuthenticationInfo *ai = currentClient.get()->ai;
-        if( !ai->isAuthorized("admin") ) {
+        if( ! cc().isAdmin() ){
            obj = fromjson("{\"err\":\"unauthorized\"}");
        }
        /*else if( !dbMutexInfo.isLocked() )
@@ -146,8 +139,7 @@ namespace mongo {
 
    void unlockFsync(const char *ns, Message& m, DbResponse &dbresponse) {
        BSONObj obj;
-        AuthenticationInfo *ai = currentClient.get()->ai;
-        if( !ai->isAuthorized("admin") || strncmp(ns, "admin.", 6) != 0 ) {
+        if( ! cc().isAdmin() || strncmp(ns, "admin.", 6) != 0 ) {
            obj = fromjson("{\"err\":\"unauthorized\"}");
        }
        else {
@@ -163,10 +155,7 @@ namespace mongo {
        replyToQuery(0, m, dbresponse, obj);
    }
 
-    static bool receivedQuery(DbResponse& dbresponse, Message& m,
-                              CurOp& op, bool logit,
-                              mongolock& lock
-                              ) {
+    static bool receivedQuery(Client& c, DbResponse& dbresponse, Message& m ){
        bool ok = true;
        MSGID responseTo = m.data->id;
 
@@ -174,26 +163,9 @@ namespace mongo {
        QueryMessage q(d);
        QueryResult* msgdata;
-        Client& c = cc();
-
+        CurOp& op = *(c.curop());
+
        try {
-            if (q.fields.get() && q.fields->errmsg)
-                uassert( 10053 , q.fields->errmsg, false);
-
-            /* note these are logged BEFORE authentication -- which is sort of ok */
-            if ( _diaglog.level && logit ) {
-                if ( strstr(q.ns, ".$cmd") ) {
-                    /* $cmd queries are "commands" and usually best treated as write operations */
-                    OPWRITE;
-                }
-                else {
-                    OPREAD;
-                }
-            }
-
-            setClient( q.ns, dbpath, &lock );
-            c.top.setRead();
-            c.curop()->setNS(q.ns);
            msgdata = runQuery(m, q, op ).release();
        }
        catch ( AssertionException& e ) {
@@ -230,32 +202,25 @@ namespace mongo {
        resp->setData(msgdata, true); // transport will free
        dbresponse.response = resp;
        dbresponse.responseTo = responseTo;
-        Database *database = c.database();
-        if ( database ) {
-            if ( database->profile )
-                op.debug().str << " bytes:" << resp->data->dataLen();
-        }
-        else {
-            if ( strstr(q.ns, "$cmd") == 0 ) // (this condition is normal for $cmd dropDatabase)
-                log() << "ERROR: receiveQuery: database is null; ns=" << q.ns << endl;
+
+        if ( op.shouldDBProfile( 0 ) ){
+            op.debug().str << " bytes:" << resp->data->dataLen();
        }
 
        return ok;
    }
 
-    bool commandIsReadOnly(BSONObj& _cmdobj);
-
    // Returns false when request includes 'end'
    bool assembleResponse( Message &m, DbResponse &dbresponse, const sockaddr_in &client ) {
-        bool writeLock = true;
-
        // before we lock...
        int op = m.data->operation();
-        globalOpCounters.gotOp( op );
+        bool isCommand = false;
        const char *ns = m.data->_data + 4;
        if ( op == dbQuery ) {
            if( strstr(ns, ".$cmd") ) {
+                isCommand = true;
+                OPWRITE;
                if( strstr(ns, ".$cmd.sys.") ) {
                    if( strstr(ns, "$cmd.sys.inprog") ) {
                        inProgCmd(m, dbresponse);
@@ -270,17 +235,21 @@ namespace mongo {
                        return true;
                    }
                }
-                DbMessage d( m );
-                QueryMessage q( d );
-                writeLock = !commandIsReadOnly(q.query);
+
+            }
+            else {
+                OPREAD;
            }
-        else
-            writeLock = false;
        }
        else if( op == dbGetMore ) {
-            writeLock = false;
+            OPREAD;
+        }
+        else {
+            OPWRITE;
        }
+
+        globalOpCounters.gotOp( op , isCommand );
+
        if ( handlePossibleShardedMessage( m , dbresponse ) ){
            /* important to do this before we lock so if a message has to be forwarded, doesn't block for that
@@ -289,161 +258,115 @@ namespace mongo {
        }
 
        Client& c = cc();
-        c.clearns();
        auto_ptr<CurOp> nestedOp;
        CurOp* currentOpP = c.curop();
        if ( currentOpP->active() ){
-            nestedOp.reset( new CurOp() );
+            nestedOp.reset( new CurOp( &c , currentOpP ) );
            currentOpP = nestedOp.get();
        }
        CurOp& currentOp = *currentOpP;
-        currentOp.reset(client);
-        currentOp.setOp(op);
+        currentOp.reset(client,op);
        OpDebug& debug = currentOp.debug();
        StringBuilder& ss = debug.str;
+        ss << opToString( op ) << " ";
 
        int logThreshold = cmdLine.slowMS;
        bool log = logLevel >= 1;
-
-        Timer t( currentOp.startTime() );
-
-        mongolock lk(writeLock);
-
-#if 0
-        /* use this if you only want to process operations for a particular namespace.
-           maybe add to cmd line parms or something fancier.
-        */
-        DbMessage ddd(m);
-        if ( strncmp(ddd.getns(), "clusterstock", 12) != 0 ) {
-            static int q;
-            if ( ++q < 20 )
-                out() << "TEMP skip " << ddd.getns() << endl;
-            goto skip;
-        }
-#endif
-
+
        if ( op == dbQuery ) {
-            // receivedQuery() does its own authorization processing.
-            if ( ! receivedQuery(dbresponse, m, currentOp, true, lk) )
+            if ( ! receivedQuery(c , dbresponse, m ) )
                log = true;
        }
        else if ( op == dbGetMore ) {
-            // does its own authorization processing.
-            OPREAD;
            DEV log = true;
-            ss << "getmore ";
            if ( ! receivedGetMore(dbresponse, m, currentOp) )
                log = true;
        }
        else if ( op == dbMsg ) {
-            /* deprecated / rarely used.  intended for connection diagnostics. */
-            ss << "msg ";
+            // deprecated - replaced by commands
            char *p = m.data->_data;
            int len = strlen(p);
            if ( len > 400 )
                out() << curTimeMillis() % 10000 <<
-                " long msg received, len:" << len <<
-                " ends with: " << p + len - 10 << endl;
-            bool end = false; //strcmp("end", p) == 0;
+                    " long msg received, len:" << len <<
+                    " ends with: " << p + len - 10 << endl;
+
            Message *resp = new Message();
-            resp->setData(opReply, "i am fine");
+            if ( strcmp( "end" , p ) == 0 )
+                resp->setData( opReply , "dbMsg end no longer supported" );
+            else
+                resp->setData( opReply , "i am fine - dbMsg deprecated");
+
            dbresponse.response = resp;
            dbresponse.responseTo = m.data->id;
-            //dbMsgPort.reply(m, resp);
-            if ( end )
-                return false;
        }
        else {
            const char *ns = m.data->_data + 4;
            char cl[256];
            nsToDatabase(ns, cl);
-            currentOp.setNS(ns);
-            AuthenticationInfo *ai = currentClient.get()->ai;
-            if( !ai->isAuthorized(cl) ) {
+            if( ! c.getAuthenticationInfo()->isAuthorized(cl) ) {
                uassert_nothrow("unauthorized");
            }
-            else if ( op == dbInsert ) {
-                OPWRITE;
-                try {
-                    ss << "insert ";
-                    receivedInsert(m, currentOp);
-                }
-                catch ( AssertionException& e ) {
-                    LOGSOME problem() << " Caught Assertion insert, continuing\n";
-                    ss << " exception " << e.toString();
-                    log = true;
-                }
-            }
-            else if ( op == dbUpdate ) {
-                OPWRITE;
-                try {
-                    ss << "update ";
-                    receivedUpdate(m, currentOp);
-                }
-                catch ( AssertionException& e ) {
-                    LOGSOME problem() << " Caught Assertion update, continuing" << endl;
-                    ss << " exception " << e.toString();
-                    log = true;
-                }
-            }
-            else if ( op == dbDelete ) {
-                OPWRITE;
-                try {
-                    ss << "remove ";
-                    receivedDelete(m, currentOp);
-                }
-                catch ( AssertionException& e ) {
-                    LOGSOME problem() << " Caught Assertion receivedDelete, continuing" << endl;
-                    ss << " exception " << e.toString();
-                    log = true;
-                }
-            }
-            else if ( op == dbKillCursors ) {
-                OPREAD;
+            else {
                try {
-                    logThreshold = 10;
-                    ss << "killcursors ";
-                    receivedKillCursors(m);
+                    if ( op == dbInsert ) {
+                        receivedInsert(m, currentOp);
+                    }
+                    else if ( op == dbUpdate ) {
+                        receivedUpdate(m, currentOp);
+                    }
+                    else if ( op == dbDelete ) {
+                        receivedDelete(m, currentOp);
+                    }
+                    else if ( op == dbKillCursors ) {
+                        currentOp.ensureStarted();
+                        logThreshold = 10;
+                        ss << "killcursors ";
+                        receivedKillCursors(m);
+                    }
+                    else {
+                        out() << "    operation isn't supported: " << op << endl;
+                        currentOp.done();
+                        log = true;
+                    }
                }
                catch ( AssertionException& e ) {
-                    problem() << " Caught Assertion in kill cursors, continuing" << endl;
+                    problem() << " Caught Assertion in " << opToString(op) << " , continuing" << endl;
                    ss << " exception " + e.toString();
                    log = true;
                }
            }
-            else {
-                out() << "    operation isn't supported: " << op << endl;
-                currentOp.setActive(false);
-                assert(false);
-            }
        }
-        int ms = t.millis();
+        currentOp.ensureStarted();
+        currentOp.done();
+        int ms = currentOp.totalTimeMillis();
+
+        log = log || (logLevel >= 2 && ++ctr % 512 == 0);
        DEV log = true;
        if ( log || ms > logThreshold ) {
            ss << ' ' << ms << "ms";
            mongo::log() << ss.str() << endl;
        }
-        Database *database = c.database();
-        if ( database && database->profile >= 1 ) {
-            if ( database->profile >= 2 || ms >= cmdLine.slowMS ) {
-                // performance profiling is on
-                if ( dbMutex.getState() > 1 || dbMutex.getState() < -1 ){
-                    out() << "warning: not profiling because recursive lock" << endl;
+
+        if ( currentOp.shouldDBProfile( ms ) ){
+            // performance profiling is on
+            if ( dbMutex.getState() < 0 ){
+                mongo::log(1) << "note: not profiling because recursive read lock" << endl;
+            }
+            else {
+                mongolock lk(true);
+                if ( dbHolder.isLoaded( nsToDatabase( currentOp.getNS() ) , dbpath ) ){
+                    Client::Context c( currentOp.getNS() );
+                    profile(ss.str().c_str(), ms);
                }
                else {
-                    string old_ns = c.ns();
-                    Database * old_db = c.database();
-                    lk.releaseAndWriteLock();
-                    Client::Context c( old_ns , old_db );
-                    profile(ss.str().c_str(), ms);
+                    mongo::log() << "note: not profiling because db went away - probably a close on: " << currentOp.getNS() << endl;
                }
            }
        }
-        currentOp.setActive(false);
 
        return true;
    } /* assembleResponse() */
 
@@ -452,7 +375,7 @@ namespace mongo {
        int *x = (int *) m.data->_data;
        x++; // reserved
        int n = *x++;
-        assert( n >= 1 );
+        uassert( 13004 , "sent 0 cursors to kill" , n >= 1 );
        if ( n > 2000 ) {
            problem() << "Assertion failure, receivedKillCursors, n=" << n << endl;
            assert( n < 30000 );
@@ -460,29 +383,34 @@ namespace mongo {
        killCursors(n, (long long *) x);
    }
 
-    /* cl - database name
+    /* db - database name
       path - db directory
    */
-    void closeDatabase( const char *cl, const string& path ) {
-        Database *database = cc().database();
-        assert( database );
-        assert( database->name == cl );
-        /*
-        if ( string("local") != cl ) {
-            DBInfo i(cl);
-            i.dbDropped();
-        }*/
+    void closeDatabase( const char *db, const string& path ) {
+        assertInWriteLock();
+
+        Client::Context * ctx = cc().getContext();
+        assert( ctx );
+        assert( ctx->inDB( db , path ) );
+        Database *database = ctx->db();
+        assert( database->name == db );
+
+        replCheckCloseDatabase( database );
+
+        if( BackgroundOperation::inProgForDb(db) ) {
+            log() << "warning: bg op in prog during close db? " << db << endl;
+        }
 
        /* important: kill all open cursors on the database */
-        string prefix(cl);
+        string prefix(db);
        prefix += '.';
        ClientCursor::invalidate(prefix.c_str());
        NamespaceDetailsTransient::clearForPrefix( prefix.c_str() );
 
-        dbHolder.erase( cl, path );
+        dbHolder.erase( db, path );
        delete database; // closes files
-        cc().clearns();
+        ctx->clear();
    }
 
    void receivedUpdate(Message& m, CurOp& op) {
@@ -490,9 +418,6 @@ namespace mongo {
        const char *ns = d.getns();
        assert(*ns);
        uassert( 10054 , "not master", isMasterNs( ns ) );
-        setClient(ns);
-        Client& client = cc();
-        client.top.setWrite();
        op.debug().str << ns << ' ';
        int flags = d.pullInt();
        BSONObj query = d.nextJsObj();
@@ -507,13 +432,18 @@ namespace mongo {
        bool multi = flags & UpdateOption_Multi;
        {
            string s = query.toString();
-            /* todo: we shouldn't do all this ss stuff when we don't need it, it will slow us down. */
+            /* todo: we shouldn't do all this ss stuff when we don't need it, it will slow us down.
+               instead, let's just store the query BSON in the debug object, and it can toString()
+               lazily
+            */
            op.debug().str << " query: " << s;
-            CurOp& currentOp = *client.curop();
-            currentOp.setQuery(query);
+            op.setQuery(query);
        }
+
+        mongolock lk(1);
+        Client::Context ctx( ns );
+
        UpdateResult res = updateObjects(ns, toupdate, query, upsert, multi, true, op.debug() );
-        /* TODO FIX: recordUpdate should take a long int for parm #2 */
        recordUpdate( res.existing , (int) res.num ); // for getlasterror
    }
 
@@ -522,9 +452,6 @@ namespace mongo {
        const char *ns = d.getns();
        assert(*ns);
        uassert( 10056 , "not master", isMasterNs( ns ) );
-        setClient(ns);
-        Client& client = cc();
-        client.top.setWrite();
        int flags = d.pullInt();
        bool justOne = flags & 1;
        assert( d.moreJSObjs() );
        BSONObj pattern = d.nextJsObj();
        {
            string s = pattern.toString();
            op.debug().str << " query: " << s;
-            CurOp& currentOp = *client.curop();
-            currentOp.setQuery(pattern);
+            op.setQuery(pattern);
        }
-        int n = deleteObjects(ns, pattern, justOne, true);
-        recordDelete( n );
+
+        writelock lk(ns);
+        Client::Context ctx(ns);
+
+        long long n = deleteObjects(ns, pattern, justOne, true);
+        recordDelete( (int) n );
    }
 
    QueryResult* emptyMoreResult(long long);
 
    bool receivedGetMore(DbResponse& dbresponse, Message& m, CurOp& curop ) {
+        StringBuilder& ss = curop.debug().str;
        bool ok = true;
+
        DbMessage d(m);
+
        const char *ns = d.getns();
-        StringBuilder& ss = curop.debug().str;
-        ss << ns;
-        setClient(ns);
-        cc().top.setRead();
        int ntoreturn = d.pullInt();
        long long cursorid = d.pullInt64();
-        ss << " cid:" << cursorid;
-        ss << " ntoreturn:" << ntoreturn;
+
+        ss << ns << " cid:" << cursorid << " ntoreturn:" << ntoreturn;
+
        QueryResult* msgdata;
        try {
-            AuthenticationInfo *ai = currentClient.get()->ai;
-            uassert( 10057 , "unauthorized", ai->isAuthorized(cc().database()->name.c_str()));
+            mongolock lk(false);
+            Client::Context ctx(ns);
            msgdata = getMore(ns, ntoreturn, cursorid, curop);
        }
        catch ( AssertionException& e ) {
-            ss << " exception " + e.toString();
+            ss << " exception " << e.toString();
            msgdata = emptyMoreResult(cursorid);
            ok = false;
        }
@@ -570,7 +500,7 @@ namespace mongo {
        ss << " nreturned:" << msgdata->nReturned;
        dbresponse.response = resp;
        dbresponse.responseTo = m.data->id;
-        //dbMsgPort.reply(m, resp);
+
        return ok;
    }
 
@@ -579,10 +509,10 @@ namespace mongo {
        const char *ns = d.getns();
        assert(*ns);
        uassert( 10058 , "not master", isMasterNs( ns ) );
-        setClient(ns);
-        cc().top.setWrite();
        op.debug().str << ns;
-
+
+        writelock lk(ns);
+        Client::Context ctx(ns);
+
        while ( d.moreJSObjs() ) {
            BSONObj js = d.nextJsObj();
            uassert( 10059 , "object to insert too large", js.objsize() <= MaxBSONObjectSize);
@@ -610,14 +540,21 @@ namespace mongo {
        boost::filesystem::path path( dbpath );
        for ( boost::filesystem::directory_iterator i( path );
              i != boost::filesystem::directory_iterator(); ++i ) {
-            string fileName = boost::filesystem::path(*i).leaf();
-            if ( fileName.length() > 3 && fileName.substr( fileName.length() - 3, 3 ) == ".ns" )
-                names.push_back( fileName.substr( 0, fileName.length() - 3 ) );
+            if ( directoryperdb ) {
+                boost::filesystem::path p = *i;
+                string dbName = p.leaf();
+                p /= ( dbName + ".ns" );
+                if ( boost::filesystem::exists( p ) )
+                    names.push_back( dbName );
+            } else {
+                string fileName = boost::filesystem::path(*i).leaf();
+                if ( fileName.length() > 3 && fileName.substr( fileName.length() - 3, 3 ) == ".ns" )
+                    names.push_back( fileName.substr( 0, fileName.length() - 3 ) );
+            }
        }
    }
 
    bool DBDirectClient::call( Message &toSend, Message &response, bool assertOk ) {
-        SavedContext c;
        if ( lastError._get() )
            lastError.startRequest( toSend, lastError._get() );
        DbResponse dbResponse;
@@ -628,7 +565,6 @@ namespace mongo {
    }
 
    void DBDirectClient::say( Message &toSend ) {
-        SavedContext c;
        if ( lastError._get() )
            lastError.startRequest( toSend, lastError._get() );
        DbResponse dbResponse;
@@ -646,15 +582,13 @@ namespace mongo {
    }
 
-    DBDirectClient::AlwaysAuthorized DBDirectClient::SavedContext::always;
-
    DBClientBase * createDirectClient(){
        return new DBDirectClient();
    }
 
    void recCacheCloseAll();
 
-    boost::mutex &exitMutex( *( new boost::mutex ) );
+    mongo::mutex exitMutex;
    int numExitCalls = 0;
    void shutdown();
 
@@ -680,8 +614,9 @@ namespace mongo {
    /* not using log() herein in case we are already locked */
    void dbexit( ExitCode rc, const char *why) {
+        Client * c = currentClient.get();
        {
-            boostlock lk( exitMutex );
+            scoped_lock lk( exitMutex );
            if ( numExitCalls++ > 0 ) {
                if ( numExitCalls > 5 ){
                    // this means something horrible has happened
@@ -690,6 +625,7 @@ namespace mongo {
                stringstream ss;
                ss << "dbexit: " << why << "; exiting immediately" << endl;
                tryToOutputFatal( ss.str() );
+                if ( c ) c->shutdown();
                ::exit( rc );
            }
        }
@@ -706,12 +642,12 @@ namespace mongo {
        }
 
        tryToOutputFatal( "dbexit: really exiting now\n" );
+        if ( c ) c->shutdown();
        ::exit(rc);
    }
 
    void shutdown() {
-
        log() << "\t shutdown: going to close listening sockets..." << endl;
 
        ListeningSockets::get()->closeAll();
 
@@ -751,10 +687,29 @@ namespace mongo {
    void acquirePathLock() {
#if !defined(_WIN32) && !defined(__sunos__)
        string name = ( boost::filesystem::path( dbpath ) / "mongod.lock" ).native_file_string();
-        lockFile = open( name.c_str(), O_RDWR | O_CREAT | O_TRUNC, S_IRWXU | S_IRWXG | S_IRWXO );
-        massert( 10309 , "Unable to create / open lock file for dbpath: " + name, lockFile > 0 );
-        massert( 10310 , "Unable to acquire lock for dbpath: " + name, flock( lockFile, LOCK_EX | LOCK_NB ) == 0 );
+
+        bool oldFile = false;
+
+        if ( boost::filesystem::exists( name ) && boost::filesystem::file_size( name ) > 0 ){
+            oldFile = true;
+        }
+
+        lockFile = open( name.c_str(), O_RDWR | O_CREAT | O_TRUNC, S_IRWXU | S_IRWXG | S_IRWXO );
+        uassert( 10309 , "Unable to create / open lock file for dbpath: " + name, lockFile > 0 );
+        uassert( 10310 , "Unable to acquire lock for dbpath: " + name, flock( lockFile, LOCK_EX | LOCK_NB ) == 0 );
+
+        if ( oldFile ){
+            // we check this here because we want to see if we can get the lock
+            // if we can't, then its probably just another mongod running
+            cout << "************** \n"
+                 << "old lock file: " << name << ".  probably means unclean shutdown\n"
+                 << "recommend removing file and running --repair\n"
+                 << "see: http://dochub.mongodb.org/core/repair for more information\n"
+                 << "*************" << endl;
+            uassert( 12596 , "old lock file" , 0 );
+        }
+
        stringstream ss;
        ss << getpid() << endl;
        string s = ss.str();
diff --git a/db/instance.h b/db/instance.h
index b2b2c94..b545a78 100644
--- a/db/instance.h
+++ b/db/instance.h
@@ -38,12 +38,14 @@ namespace mongo {
           7 = log a few reads, and all writes.
        */
        int level;
+        mongo::mutex mutex;
+
        DiagLog() : f(0) , level(0) { }
        void init() {
            if ( ! f && level ){
                log() << "diagLogging = " << level << endl;
                stringstream ss;
-                ss << "diaglog." << hex << time(0);
+                ss << dbpath << "/diaglog." << hex << time(0);
                string name = ss.str();
                f = new ofstream(name.c_str(), ios::out | ios::binary);
                if ( ! f->good() ) {
@@ -62,17 +64,26 @@ namespace mongo {
            return old;
        }
        void flush() {
-            if ( level ) f->flush();
+            if ( level ){
+                scoped_lock lk(mutex);
+                f->flush();
+            }
        }
        void write(char *data,int len) {
-            if ( level & 1 ) f->write(data,len);
+            if ( level & 1 ){
+                scoped_lock lk(mutex);
+                f->write(data,len);
+            }
        }
        void readop(char *data, int len) {
            if ( level & 2 ) {
                bool log = (level & 4) == 0;
                OCCASIONALLY log = true;
-                if ( log )
+                if ( log ){
+                    scoped_lock lk(mutex);
+                    assert( f );
                    f->write(data,len);
+                }
            }
        }
    };
@@ -124,53 +135,6 @@ namespace mongo {
            // don't need to piggy back when connected locally
            return say( toSend );
        }
-        class AlwaysAuthorized : public AuthenticationInfo {
-            virtual bool isAuthorized( const char *dbname ) {
-                return true;
-            }
-        };
-
-        /* TODO: this looks bad that auth is set to always.  is that really always safe? */
-        class SavedContext {
-        public:
-            SavedContext() {
-                _save = dbMutex.atLeastReadLocked();
-
-                Client *c = currentClient.get();
-                oldAuth = c->ai;
-                // careful, don't want to free this:
-                c->ai = &always;
-
-                /* it only makes sense to manipulate a pointer - c->database() - if locked.
-                   thus the _saved flag.
-                */
-                if( _save ) {
-                    if ( c->database() ) {
-                        dbMutex.assertAtLeastReadLocked();
-                        _oldName = c->database()->name;
-                    }
-                }
-            }
-            ~SavedContext() {
-                Client *c = currentClient.get();
-                c->ai = oldAuth;
-                if( _save ) {
-                    if ( !_oldName.empty() ) {
-                        dbMutex.assertAtLeastReadLocked();
-                        setClient( _oldName.c_str() );
-                    }
-                }
-                else {
-                    // defensive
-                    cc().clearns();
-                }
-            }
-        private:
-            bool _save;
-            static AlwaysAuthorized always;
-            AuthenticationInfo *oldAuth;
-            string _oldName;
-        };
    };
 
    extern int lockFile;
diff --git a/db/introspect.cpp b/db/introspect.cpp
index 9cb477d..a041d48 100644
--- a/db/introspect.cpp
+++ b/db/introspect.cpp
@@ -26,8 +26,7 @@
 
namespace mongo {
 
-    void profile(const char *str,
-                 int millis)
+    void profile( const char *str, int millis)
    {
        BSONObjBuilder b;
        b.appendDate("ts", jsTime());
diff --git a/db/jsobj.cpp b/db/jsobj.cpp
index 1a299a5..9f9a684 100644
--- a/db/jsobj.cpp
+++ b/db/jsobj.cpp
@@ -20,6 +20,7 @@
 #include "stdafx.h"
 #include "jsobj.h"
 #include "nonce.h"
+#include "../util/atomic_int.h"
 #include "../util/goodies.h"
 #include "../util/base64.h"
 #include "../util/md5.hpp"
@@ -30,6 +31,7 @@
 #include "jsobjmanipulator.h"
 #include "../util/optime.h"
 #include <boost/static_assert.hpp>
+#include <boost/any.hpp>
 #undef assert
 #define assert xassert
 
@@ -50,12 +52,6 @@ namespace mongo {
    }
    IDLabeler GENOID;
 
-    BSONObjBuilder& operator<<(BSONObjBuilder& b, IDLabeler& id) {
-        OID oid;
-        oid.init();
-        b.appendOID("_id", &oid);
-        return b;
-    }
 
    DateNowLabeler DATENOW;
 
@@ -156,7 +152,7 @@ namespace mongo {
        return s.str();
    }
 
-    string escape( string s ) {
+    string escape( string s , bool escape_slash=false) {
        stringstream ret;
        for ( string::iterator i = s.begin(); i != s.end(); ++i ) {
            switch ( *i ) {
@@ -167,7 +163,7 @@ namespace mongo {
                ret << "\\\\";
                break;
            case '/':
-                ret << "\\/";
+                ret << (escape_slash ? "\\/" : "/");
                break;
            case '\b':
                ret << "\\b";
                break;
@@ -306,17 +302,13 @@ namespace mongo {
            s << " )";
            break;
        case RegEx:
-            if ( format == Strict )
-                s << "{ \"$regex\" : \"";
-            else
-                s << "/";
-            s << escape( regex() );
-            if ( format == Strict )
+            if ( format == Strict ){
+                s << "{ \"$regex\" : \"" << escape( regex() );
                s << "\", \"$options\" : \"" << regexFlags() << "\" }";
-            else {
-                s << "/";
+            } else {
+                s << "/" << escape( regex() , true ) << "/";
                // FIXME Worry about alpha order?
- for ( const char *f = regexFlags(); *f; ++f ) + for ( const char *f = regexFlags(); *f; ++f ){ switch ( *f ) { case 'g': case 'i': @@ -325,6 +317,7 @@ namespace mongo { default: break; } + } } break; @@ -413,7 +406,8 @@ namespace mongo { default: { stringstream ss; ss << "BSONElement: bad type " << (int) type(); - massert( 10320 , ss.str().c_str(),false); + string msg = ss.str(); + massert( 10320 , msg.c_str(),false); } } totalSize = x + fieldNameSize() + 1; // BSONType @@ -434,8 +428,12 @@ namespace mongo { else if ( fn[3] == 'e' && fn[4] == 0 ) return BSONObj::LTE; } } - else if ( fn[1] == 'n' && fn[2] == 'e' && fn[3] == 0) - return BSONObj::NE; + else if ( fn[1] == 'n' && fn[2] == 'e' ){ + if ( fn[3] == 0 ) + return BSONObj::NE; + if ( fn[3] == 'a' && fn[4] == 'r' && fn[5] == 0 ) + return BSONObj::opNEAR; + } else if ( fn[1] == 'm' && fn[2] == 'o' && fn[3] == 'd' && fn[4] == 0 ) return BSONObj::opMOD; else if ( fn[1] == 't' && fn[2] == 'y' && fn[3] == 'p' && fn[4] == 'e' && fn[5] == 0 ) @@ -458,6 +456,8 @@ namespace mongo { return BSONObj::opREGEX; else if ( fn[1] == 'o' && fn[2] == 'p' && fn[3] == 't' && fn[4] == 'i' && fn[5] == 'o' && fn[6] == 'n' && fn[7] == 's' && fn[8] == 0 ) return BSONObj::opOPTIONS; + else if ( fn[1] == 'w' && fn[2] == 'i' && fn[3] == 't' && fn[4] == 'h' && fn[5] == 'i' && fn[6] == 'n' && fn[7] == 0 ) + return BSONObj::opWITHIN; } return def; } @@ -541,13 +541,18 @@ namespace mongo { case Object: case Array: return l.embeddedObject().woCompare( r.embeddedObject() ); - case DBRef: - case BinData: { + case DBRef: { int lsz = l.valuesize(); int rsz = r.valuesize(); if ( lsz - rsz != 0 ) return lsz - rsz; return memcmp(l.value(), r.value(), lsz); } + case BinData: { + int lsz = l.objsize(); // our bin data size in bytes, not including the subtype byte + int rsz = r.objsize(); + if ( lsz - rsz != 0 ) return lsz - rsz; + return memcmp(l.value()+4, r.value()+4, lsz+1); + } case RegEx: { int c = strcmp(l.regex(), r.regex()); @@ -576,31 +581,35 @@ namespace mongo { void BSONElement::validate() const { switch( type() ) { - case DBRef: - case Code: - case Symbol: - case String: - massert( 10321 , "Invalid dbref/code/string/symbol size", - valuestrsize() > 0 && - valuestrsize() - 1 == strnlen( valuestr(), valuestrsize() ) ); - break; - case CodeWScope: { - int totalSize = *( int * )( value() ); - massert( 10322 , "Invalid CodeWScope size", totalSize >= 8 ); - int strSizeWNull = *( int * )( value() + 4 ); - massert( 10323 , "Invalid CodeWScope string size", totalSize >= strSizeWNull + 4 + 4 ); - massert( 10324 , "Invalid CodeWScope string size", - strSizeWNull > 0 && - strSizeWNull - 1 == strnlen( codeWScopeCode(), strSizeWNull ) ); - massert( 10325 , "Invalid CodeWScope size", totalSize >= strSizeWNull + 4 + 4 + 4 ); - int objSize = *( int * )( value() + 4 + 4 + strSizeWNull ); - massert( 10326 , "Invalid CodeWScope object size", totalSize == 4 + 4 + strSizeWNull + objSize ); - // Subobject validation handled elsewhere. - } - case Object: - // We expect Object size validation to be handled elsewhere. 
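
Both the removed checks here and their replacements in the next hunk walk the same CodeWScope binary layout: an int32 total size, an int32 code-string size including the trailing NUL, the code bytes, then a scope object carrying its own leading int32 size. A standalone sketch of those invariants under that layout assumption (codeWScopeLooksValid is illustrative, not part of the tree):

    #include <cstring>

    // Assumed layout: [int32 totalSize][int32 strSizeWNull][code ... '\0'][BSON scope, objSize bytes]
    bool codeWScopeLooksValid( const char * value ){
        int totalSize; std::memcpy( &totalSize, value, 4 );
        int strSizeWNull; std::memcpy( &strSizeWNull, value + 4, 4 );
        if ( totalSize < 8 || strSizeWNull <= 0 )
            return false;
        if ( totalSize < strSizeWNull + 4 + 4 + 4 ) // must leave room for the scope's own size field
            return false;
        const char * code = value + 8;
        int n = 0;
        while ( n < strSizeWNull && code[n] != '\0' ) // bounded scan, like the strnlen above
            n++;
        if ( n != strSizeWNull - 1 ) // the NUL must sit exactly at the end of the code string
            return false;
        int objSize; std::memcpy( &objSize, value + 8 + strSizeWNull, 4 );
        return totalSize == 4 + 4 + strSizeWNull + objSize;
    }

    int main(){
        char buf[17];
        int strSize = 4; // "x=1" plus NUL
        int total = 4 + 4 + strSize + 5; // 5 = empty scope object (int32 size + EOO byte)
        std::memcpy( buf, &total, 4 );
        std::memcpy( buf + 4, &strSize, 4 );
        std::memcpy( buf + 8, "x=1", 4 );
        int objSize = 5;
        std::memcpy( buf + 8 + strSize, &objSize, 4 );
        buf[16] = 0; // EOO terminator of the empty scope
        return codeWScopeLooksValid( buf ) ? 0 : 1;
    }
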
- default: - break; + case DBRef: + case Code: + case Symbol: + case String: { + int x = valuestrsize(); + if ( x > 0 && valuestr()[x-1] == 0 ) + return; + StringBuilder buf; + buf << "Invalid dbref/code/string/symbol size: " << x << " strnlen:" << strnlen( valuestr() , x ); + massert( 10321 , buf.str() , 0 ); + break; + } + case CodeWScope: { + int totalSize = *( int * )( value() ); + massert( 10322 , "Invalid CodeWScope size", totalSize >= 8 ); + int strSizeWNull = *( int * )( value() + 4 ); + massert( 10323 , "Invalid CodeWScope string size", totalSize >= strSizeWNull + 4 + 4 ); + massert( 10324 , "Invalid CodeWScope string size", + strSizeWNull > 0 && + strSizeWNull - 1 == strnlen( codeWScopeCode(), strSizeWNull ) ); + massert( 10325 , "Invalid CodeWScope size", totalSize >= strSizeWNull + 4 + 4 + 4 ); + int objSize = *( int * )( value() + 4 + 4 + strSizeWNull ); + massert( 10326 , "Invalid CodeWScope object size", totalSize == 4 + 4 + strSizeWNull + objSize ); + // Subobject validation handled elsewhere. + } + case Object: + // We expect Object size validation to be handled elsewhere. + default: + break; } } @@ -653,7 +662,7 @@ namespace mongo { const string& c = l.substr( lstart , lend - lstart ); const string& d = r.substr( rstart , rend - rstart ); - int x = c.compare( d ); + int x = lexNumCmp( c.c_str(), d.c_str() ); if ( x < 0 ) return LEFT_BEFORE; @@ -766,9 +775,18 @@ namespace mongo { if ( r.eoo() ) return 1; - int x = l.woCompare( r, considerFieldName ); - if ( ordered && o.number() < 0 ) - x = -x; + int x; +/* + if( ordered && o.type() == String && strcmp(o.valuestr(), "ascii-proto") == 0 && + l.type() == String && r.type() == String ) { + // note: no negative support yet, as this is just sort of a POC + x = _stricmp(l.valuestr(), r.valuestr()); + } + else*/ { + x = l.woCompare( r, considerFieldName ); + if ( ordered && o.number() < 0 ) + x = -x; + } if ( x != 0 ) return x; } @@ -809,18 +827,6 @@ namespace mongo { } - BSONElement BSONObj::getField(const char *name) const { - BSONObjIterator i(*this); - while ( i.moreWithEOO() ) { - BSONElement e = i.next(); - if ( e.eoo() ) - break; - if ( strcmp(e.fieldName(), name) == 0 ) - return e; - } - return nullElement; - } - /* return has eoo() true if no match supports "." notation to reach into embedded objects */ @@ -838,49 +844,62 @@ namespace mongo { return e; } - /* jul09 : 'deep' and this function will be going away in the future - kept only for backward compatibility of datafiles for now. 
*/
- void trueDat( bool *deep ) {
- if( deep )
- *deep = true;
- }
+ void BSONObj::getFieldsDotted(const char *name, BSONElementSet &ret ) const {
+ BSONObjIterator i(*this);
+ while ( i.more() ){
+ BSONElement e = i.next();
+ FieldCompareResult cmp = compareDottedFieldNames( name , e.fieldName() );
+ switch ( cmp ){

- void BSONObj::getFieldsDotted(const char *name, BSONElementSet &ret, bool *deep ) const {
- BSONElement e = getField( name );
- if ( e.eoo() ) {
- const char *p = strchr(name, '.');
- if ( p ) {
- string left(name, p-name);
- BSONElement e = getField( left );
- if ( e.type() == Array ) {
- trueDat( deep );
- BSONObjIterator i( e.embeddedObject() );
- while( i.moreWithEOO() ) {
- BSONElement f = i.next();
- if ( f.eoo() )
- break;
+ case LEFT_BEFORE:
+ case RIGHT_BEFORE:
+ break;
+
+ case RIGHT_SUBFIELD:
+ assert(0);
+ break;
+
+ case LEFT_SUBFIELD: {
+ const char * next = name + strlen( e.fieldName() ) + 1;
+ bool allDigits = false;
+ if ( isdigit( *next ) ){
+ const char * temp = next + 1;
+ while ( isdigit( *temp ) )
+ temp++;
+ allDigits = *temp == '.';
+ }
+
+ if ( e.type() == Object || allDigits ){
+ e.embeddedObject().getFieldsDotted( next , ret );
+ }
+ else if ( e.type() == Array ){
+ BSONObjIterator j( e.embeddedObject() );
+ while ( j.more() ){
+ BSONElement f = j.next();
if ( f.type() == Object )
- f.embeddedObject().getFieldsDotted(p+1, ret);
+ f.embeddedObject().getFieldsDotted( next , ret );
}
- } else if ( e.type() == Object ) {
- e.embeddedObject().getFieldsDotted(p+1, ret);
}
+ else {
+ // intentionally left blank, this means no match
+ }
+ return;
}
- } else {
- if ( e.type() == Array ) {
- trueDat( deep );
- BSONObjIterator i( e.embeddedObject() );
- while( i.moreWithEOO() ) {
- BSONElement f = i.next();
- if ( f.eoo() )
- break;
- ret.insert( f );
+
+ case SAME: {
+ if ( e.type() == Array ){
+ BSONObjIterator j( e.embeddedObject() );
+ while ( j.more() )
+ ret.insert( j.next() );
}
- } else {
- ret.insert( e );
+ else {
+ ret.insert( e );
+ }
+ return;
+ }
+
}
}
- if ( ret.empty() && deep )
- *deep = false;
}
BSONElement BSONObj::getFieldDottedOrArray(const char *&name) const {
@@ -1141,7 +1160,10 @@ namespace mongo {
if ( strchr( name , '.' ) ||
strchr( name , '$' ) ){
- return false;
+ return
+ strcmp( name , "$ref" ) == 0 ||
+ strcmp( name , "$id" ) == 0
+ ;
}
if ( e.mayEncapsulate() ){
@@ -1410,7 +1432,7 @@ namespace mongo {
}
void OID::init() {
- static WrappingInt inc = (unsigned) security.getNonce();
+ static AtomicUInt inc = (unsigned) security.getNonce();
unsigned t = (unsigned) time(0);
char *T = (char *) &t;
data[0] = T[3];
@@ -1420,7 +1442,7 @@ namespace mongo {
(unsigned&) data[4] = _machine;
- int new_inc = inc.atomicIncrement();
+ int new_inc = inc++;
T = (char *) &new_inc;
char * raw = (char*)&b;
raw[0] = T[3];
@@ -1464,7 +1486,7 @@ namespace mongo {
Labeler::Label SIZE( "$size" );
void BSONElementManipulator::initTimestamp() {
- massert( 10332 , "Expected CurrentTime type", element_.type() == Timestamp );
+ massert( 10332 , "Expected CurrentTime type", _element.type() == Timestamp );
unsigned long long &timestamp = *( reinterpret_cast< unsigned long long* >( value() ) );
if ( timestamp == 0 )
timestamp = OpTime::now().asDate();
@@ -1610,12 +1632,23 @@ namespace mongo {
}
+ void BSONObjBuilder::appendKeys( const BSONObj& keyPattern , const BSONObj& values ){
+ BSONObjIterator i(keyPattern);
+ BSONObjIterator j(values);
+
+ while ( i.more() && j.more() ){
+ appendAs( j.next() , i.next().fieldName() );
+ }
+
+ assert( ! 
j.more() ); + } int BSONElementFieldSorter( const void * a , const void * b ){ const char * x = *((const char**)a); const char * y = *((const char**)b); x++; y++; - return strcmp( x , y ); + return lexNumCmp( x , y ); } BSONObjIteratorSorted::BSONObjIteratorSorted( const BSONObj& o ){ @@ -480,7 +480,7 @@ namespace mongo { BSONObj embeddedObject() const; /* uasserts if not an object */ - BSONObj embeddedObjectUserCheck(); + BSONObj embeddedObjectUserCheck() const; BSONObj codeWScopeObject() const; @@ -509,7 +509,7 @@ namespace mongo { BinDataType binDataType() const { // BinData: <int len> <byte subtype> <byte[len] data> assert( type() == BinData ); - char c = (value() + 4)[0]; + unsigned char c = (value() + 4)[0]; return (BinDataType)c; } @@ -574,9 +574,25 @@ namespace mongo { /** True if this element may contain subobjects. */ bool mayEncapsulate() const { - return type() == Object || - type() == Array || - type() == CodeWScope; + switch ( type() ){ + case Object: + case Array: + case CodeWScope: + return true; + default: + return false; + } + } + + /** True if this element can be a BSONObj */ + bool isABSONObj() const { + switch( type() ){ + case Object: + case Array: + return true; + default: + return false; + } } Date_t timestampTime() const{ @@ -625,7 +641,7 @@ namespace mongo { mutable int fieldNameSize_; // cached value int fieldNameSize() const { if ( fieldNameSize_ == -1 ) - fieldNameSize_ = strlen( fieldName() ) + 1; + fieldNameSize_ = (int)strlen( fieldName() ) + 1; return fieldNameSize_; } mutable int totalSize; /* caches the computed size */ @@ -635,7 +651,7 @@ namespace mongo { struct BSONElementCmpWithoutField { bool operator()( const BSONElement &l, const BSONElement &r ) const { - return l.woCompare( r, false ); + return l.woCompare( r, false ) < 0; } }; @@ -700,6 +716,11 @@ namespace mongo { if ( ! isValid() ){ stringstream ss; ss << "Invalid BSONObj spec size: " << objsize(); + try { + BSONElement e = firstElement(); + ss << " first element:" << e.toString() << " "; + } + catch ( ... ){} string s = ss.str(); massert( 10334 , s , 0 ); } @@ -759,7 +780,7 @@ namespace mongo { BSONElement getFieldDotted(const char *name) const; /** Like getFieldDotted(), but expands multikey arrays and returns all matching objects */ - void getFieldsDotted(const char *name, BSONElementSet &ret, bool *deep = 0) const; + void getFieldsDotted(const char *name, BSONElementSet &ret ) const; /** Like getFieldDotted(), but returns first array encountered while traversing the dotted fields of name. The name variable is updated to represent field names with respect to the returned element. */ @@ -768,14 +789,14 @@ namespace mongo { /** Get the field of the specified name. eoo() is true on the returned element if not found. */ - BSONElement getField(const string name) const { - return getField( name.c_str() ); - }; + BSONElement getField(const char *name) const; /** Get the field of the specified name. eoo() is true on the returned element if not found. */ - BSONElement getField(const char *name) const; /* return has eoo() true if no match */ + BSONElement getField(const string name) const { + return getField( name.c_str() ); + }; /** Get the field of the specified name. eoo() is true on the returned element if not found. @@ -902,13 +923,9 @@ namespace mongo { return BSONElement(objdata() + 4); } - /** @return element with fieldname "name". returnvalue.eoo() is true if not found */ - BSONElement findElement(const char *name) const; - - /** @return element with fieldname "name". 
returnvalue.eoo() is true if not found */
- BSONElement findElement(string name) const {
- return findElement(name.c_str());
- }
+ /** use getField() instead. */
+ //BSONElement getField(const char *name) const;
+ //BSONElement getField(string name) const {
/** @return true if field exists in the object */
bool hasElement(const char *name) const;
@@ -976,7 +993,9 @@ namespace mongo {
opTYPE = 0x0F,
opREGEX = 0x10,
opOPTIONS = 0x11,
- opELEM_MATCH = 0x12
+ opELEM_MATCH = 0x12,
+ opNEAR = 0x13,
+ opWITHIN = 0x14,
};
};
ostream& operator<<( ostream &s, const BSONObj &o );
@@ -1028,7 +1047,7 @@ namespace mongo {
BSON( "a" << GT << 23.4 << NE << 30 << "b" << 2 ) produces the object
{ a: { \$gt: 23.4, \$ne: 30 }, b: 2 }.
*/
-#define BSON(x) (( mongo::BSONObjBuilder() << x ).obj())
+#define BSON(x) (( mongo::BSONObjBuilder(64) << x ).obj())
/**
Use BSON_ARRAY macro like BSON macro, but without keys
@@ -1042,7 +1061,6 @@ namespace mongo {
cout << BSON( GENOID << "z" << 3 ); // { _id : ..., z : 3 }
*/
extern struct IDLabeler { } GENOID;
- BSONObjBuilder& operator<<(BSONObjBuilder& b, IDLabeler& id);
/* Utility class to add a Date element with the current time
Example:
@@ -1107,20 +1125,63 @@ namespace mongo {
};
/**
+ used in conjunction with BSONObjBuilder, allows for proper buffer sizing to prevent excessive memory usage
+ */
+ class BSONSizeTracker {
+ public:
+#define BSONSizeTrackerSize 10
+
+ BSONSizeTracker(){
+ _pos = 0;
+ for ( int i=0; i<BSONSizeTrackerSize; i++ )
+ _sizes[i] = 512; // this is the default, so just be consistent
+ }
+
+ ~BSONSizeTracker(){
+ }
+
+ void got( int size ){
+ _sizes[_pos++] = size;
+ if ( _pos >= BSONSizeTrackerSize )
+ _pos = 0;
+ }
+
+ /**
+ * right now choosing largest size
+ */
+ int getSize() const {
+ int x = 16; // sane min
+ for ( int i=0; i<BSONSizeTrackerSize; i++ ){
+ if ( _sizes[i] > x )
+ x = _sizes[i];
+ }
+ return x;
+ }
+
+ private:
+ int _pos;
+ int _sizes[BSONSizeTrackerSize];
+ };
+
+ /**
utility for creating a BSONObj
*/
class BSONObjBuilder : boost::noncopyable {
public:
/** @param initsize this is just a hint as to the final size of the object */
- BSONObjBuilder(int initsize=512) : b(buf_), buf_(initsize), offset_( 0 ), s_( this ) {
+ BSONObjBuilder(int initsize=512) : b(buf_), buf_(initsize), offset_( 0 ), s_( this ) , _tracker(0) {
b.skip(4); /*leave room for size field*/
}
/** @param baseBuilder construct a BSONObjBuilder using an existing BufBuilder */
- BSONObjBuilder( BufBuilder &baseBuilder ) : b( baseBuilder ), buf_( 0 ), offset_( baseBuilder.len() ), s_( this ) {
+ BSONObjBuilder( BufBuilder &baseBuilder ) : b( baseBuilder ), buf_( 0 ), offset_( baseBuilder.len() ), s_( this ) , _tracker(0) {
b.skip( 4 );
}
+ BSONObjBuilder( const BSONSizeTracker & tracker ) : b(buf_) , buf_(tracker.getSize() ), offset_(0), s_( this ) , _tracker( (BSONSizeTracker*)(&tracker) ){
+ b.skip( 4 );
+ }
+
/** add all the fields from the object specified to this object */
BSONObjBuilder& appendElements(BSONObj x);
@@ -1188,6 +1249,13 @@ namespace mongo {
b.append((char) (val?1:0));
}
+ /** Append a boolean element */
+ void append(const char *fieldName, bool val) {
+ b.append((char) Bool);
+ b.append(fieldName);
+ b.append((char) (val?1:0));
+ }
+
/** Append a 32 bit integer element */
void append(const char *fieldName, int n) {
b.append((char) NumberInt);
@@ -1214,7 +1282,42 @@ namespace mongo {
append( fieldName.c_str() , n );
}
+ /** appends a number. 
if n < max(int)/2 then uses int, otherwise long long */ + void appendIntOrLL( const string& fieldName , long long n ){ + long long x = n; + if ( x < 0 ) + x = x * -1; + if ( x < ( numeric_limits<int>::max() / 2 ) ) + append( fieldName.c_str() , (int)n ); + else + append( fieldName.c_str() , n ); + } + + + /** + * appendNumber is a series of method for appending the smallest sensible type + * mostly for JS + */ + void appendNumber( const string& fieldName , int n ){ + append( fieldName.c_str() , n ); + } + void appendNumber( const string& fieldName , double d ){ + append( fieldName.c_str() , d ); + } + + void appendNumber( const string& fieldName , long long l ){ + static long long maxInt = (int)pow( 2.0 , 30.0 ); + static long long maxDouble = (long long)pow( 2.0 , 40.0 ); + + if ( l < maxInt ) + append( fieldName.c_str() , (int)l ); + else if ( l < maxDouble ) + append( fieldName.c_str() , (double)l ); + else + append( fieldName.c_str() , l ); + } + /** Append a double element */ BSONObjBuilder& append(const char *fieldName, double n) { b.append((char) NumberDouble); @@ -1451,6 +1554,16 @@ namespace mongo { return BSONObj(_done()); } + /** Peek at what is in the builder, but leave the builder ready for more appends. + The returned object is only valid until the next modification or destruction of the builder. + Intended use case: append a field if not already there. + */ + BSONObj asTempObj() { + BSONObj temp(_done()); + b.setlen(b.len()-1); //next append should overwrite the EOO + return temp; + } + /* assume ownership of the buffer - you must then free it (with free()) */ char* decouple(int& l) { char *x = _done(); @@ -1463,6 +1576,7 @@ namespace mongo { b.decouple(); // post done() call version. be sure jsobj frees... } + void appendKeys( const BSONObj& keyPattern , const BSONObj& values ); private: static const string numStrs[100]; // cache of 0 to 99 inclusive @@ -1482,6 +1596,14 @@ namespace mongo { return s_; } + /** Stream oriented way to add field names and values. 
*/ + BSONObjBuilder& operator<<( IDLabeler ) { + OID oid; + oid.init(); + appendOID("_id", &oid); + return *this; + } + // prevent implicit string conversions which would allow bad things like BSON( BSON( "foo" << 1 ) << 2 ) struct ForceExplicitString { ForceExplicitString( const string &str ) : str_( str ) {} @@ -1509,12 +1631,15 @@ namespace mongo { b.append( fieldName ); b.append( (void *) arr.objdata(), arr.objsize() ); } - + char* _done() { s_.endField(); b.append((char) EOO); char *data = b.buf() + offset_; - *((int*)data) = b.len() - offset_; + int size = b.len() - offset_; + *((int*)data) = size; + if ( _tracker ) + _tracker->got( size ); return data; } @@ -1522,34 +1647,88 @@ namespace mongo { BufBuilder buf_; int offset_; BSONObjBuilderValueStream s_; + BSONSizeTracker * _tracker; }; class BSONArrayBuilder : boost::noncopyable{ public: - BSONArrayBuilder() :i(0), b() {} + BSONArrayBuilder() : _i(0), _b() {} + BSONArrayBuilder( BufBuilder &b ) : _i(0), _b(b) {} template <typename T> BSONArrayBuilder& append(const T& x){ - b.append(num().c_str(), x); + _b.append(num().c_str(), x); return *this; } BSONArrayBuilder& append(const BSONElement& e){ - b.appendAs(e, num().c_str()); + _b.appendAs(e, num().c_str()); return *this; } - + template <typename T> BSONArrayBuilder& operator<<(const T& x){ return append(x); } + + void appendNull() { + _b.appendNull(num().c_str()); + } - BSONArray arr(){ return BSONArray(b.obj()); } + BSONArray arr(){ return BSONArray(_b.obj()); } + + BSONObj done() { return _b.done(); } + + template <typename T> + BSONArrayBuilder& append(const char *name, const T& x){ + fill( name ); + append( x ); + return *this; + } + + BufBuilder &subobjStart( const char *name ) { + fill( name ); + return _b.subobjStart( num().c_str() ); + } + BufBuilder &subarrayStart( const char *name ) { + fill( name ); + return _b.subarrayStart( num().c_str() ); + } + + void appendArray( const char *name, BSONObj subObj ) { + fill( name ); + _b.appendArray( num().c_str(), subObj ); + } + + void appendAs( const BSONElement &e, const char *name ) { + fill( name ); + append( e ); + } + private: - string num(){ return b.numStr(i++); } - int i; - BSONObjBuilder b; + void fill( const char *name ) { + char *r; + int n = strtol( name, &r, 10 ); + uassert( 13048, "can't append to array using string field name", !*r ); + while( _i < n ) + append( nullElt() ); + } + + static BSONElement nullElt() { + static BSONObj n = nullObj(); + return n.firstElement(); + } + + static BSONObj nullObj() { + BSONObjBuilder b; + b.appendNull( "" ); + return b.obj(); + } + + string num(){ return _b.numStr(_i++); } + int _i; + BSONObjBuilder _b; }; @@ -1584,8 +1763,8 @@ namespace mongo { /** @return the next element in the object. For the final element, element.eoo() will be true. */ BSONElement next( bool checkEnd = false ) { assert( pos < theend ); - BSONElement e( pos, checkEnd ? theend - pos : -1 ); - pos += e.size( checkEnd ? theend - pos : -1 ); + BSONElement e( pos, checkEnd ? (int)(theend - pos) : -1 ); + pos += e.size( checkEnd ? 
(int)(theend - pos) : -1 ); return e; } private: @@ -1653,13 +1832,13 @@ namespace mongo { #define CHECK_OBJECT( o , msg ) #endif - inline BSONObj BSONElement::embeddedObjectUserCheck() { - uassert( 10065 , "invalid parameter: expected an object", type()==Object || type()==Array ); + inline BSONObj BSONElement::embeddedObjectUserCheck() const { + uassert( 10065 , "invalid parameter: expected an object", isABSONObj() ); return BSONObj(value()); } inline BSONObj BSONElement::embeddedObject() const { - assert( type()==Object || type()==Array ); + assert( isABSONObj() ); return BSONObj(value()); } @@ -1701,14 +1880,12 @@ namespace mongo { return false; } - inline BSONElement BSONObj::findElement(const char *name) const { - if ( !isEmpty() ) { - BSONObjIterator it(*this); - while ( it.moreWithEOO() ) { - BSONElement e = it.next(); - if ( strcmp(name, e.fieldName()) == 0 ) - return e; - } + inline BSONElement BSONObj::getField(const char *name) const { + BSONObjIterator i(*this); + while ( i.more() ) { + BSONElement e = i.next(); + if ( strcmp(e.fieldName(), name) == 0 ) + return e; } return BSONElement(); } @@ -1729,7 +1906,7 @@ namespace mongo { } inline bool BSONObj::getObjectID(BSONElement& e) const { - BSONElement f = findElement("_id"); + BSONElement f = getField("_id"); if( !f.eoo() ) { e = f; return true; @@ -1845,7 +2022,7 @@ namespace mongo { ~BSONObjIteratorSorted(){ assert( _fields ); - delete _fields; + delete[] _fields; _fields = 0; } diff --git a/db/jsobjmanipulator.h b/db/jsobjmanipulator.h index d534d08..1771bff 100644 --- a/db/jsobjmanipulator.h +++ b/db/jsobjmanipulator.h @@ -22,57 +22,63 @@ namespace mongo { -/** Manipulate the binary representation of a BSONElement in-place. - Careful, this casts away const. - */ -class BSONElementManipulator { -public: - BSONElementManipulator( const BSONElement &element ) : - element_( element ) { - assert( !element_.eoo() ); - } - /** Replace a Timestamp type with a Date type initialized to - OpTime::now().asDate() - */ - void initTimestamp(); - - /** Change the value, in place, of the number. */ - void setNumber(double d) { - if ( element_.type() == NumberDouble ) *reinterpret_cast< double * >( value() ) = d; - else if ( element_.type() == NumberInt ) *reinterpret_cast< int * >( value() ) = (int) d; - } - void setLong(long long n) { - if( element_.type() == NumberLong ) *reinterpret_cast< long long * >( value() ) = n; - } + /** Manipulate the binary representation of a BSONElement in-place. + Careful, this casts away const. + */ + class BSONElementManipulator { + public: + BSONElementManipulator( const BSONElement &element ) : + _element( element ) { + assert( !_element.eoo() ); + } + /** Replace a Timestamp type with a Date type initialized to + OpTime::now().asDate() + */ + void initTimestamp(); + + /** Change the value, in place, of the number. 
*/ + void setNumber(double d) { + if ( _element.type() == NumberDouble ) *reinterpret_cast< double * >( value() ) = d; + else if ( _element.type() == NumberInt ) *reinterpret_cast< int * >( value() ) = (int) d; + } + void setLong(long long n) { + if( _element.type() == NumberLong ) *reinterpret_cast< long long * >( value() ) = n; + } + void setInt(int n) { + assert( _element.type() == NumberInt ); + *reinterpret_cast< int * >( value() ) = n; + } - /** Replace the type and value of the element with the type and value of e, - preserving the original fieldName */ - void replaceTypeAndValue( const BSONElement &e ) { - *data() = e.type(); - memcpy( value(), e.value(), e.valuesize() ); - } - - static void lookForTimestamps( const BSONObj& obj ){ - // If have a Timestamp field as the first or second element, - // update it to a Date field set to OpTime::now().asDate(). The - // replacement policy is a work in progress. - BSONObjIterator i( obj ); - for( int j = 0; i.moreWithEOO() && j < 2; ++j ) { - BSONElement e = i.next(); - if ( e.eoo() ) - break; - if ( e.type() == Timestamp ){ - BSONElementManipulator( e ).initTimestamp(); - break; + /** Replace the type and value of the element with the type and value of e, + preserving the original fieldName */ + void replaceTypeAndValue( const BSONElement &e ) { + *data() = e.type(); + memcpy( value(), e.value(), e.valuesize() ); + } + + static void lookForTimestamps( const BSONObj& obj ){ + // If have a Timestamp field as the first or second element, + // update it to a Date field set to OpTime::now().asDate(). The + // replacement policy is a work in progress. + + BSONObjIterator i( obj ); + for( int j = 0; i.moreWithEOO() && j < 2; ++j ) { + BSONElement e = i.next(); + if ( e.eoo() ) + break; + if ( e.type() == Timestamp ){ + BSONElementManipulator( e ).initTimestamp(); + break; + } } } - } -private: - char *data() { return nonConst( element_.rawdata() ); } - char *value() { return nonConst( element_.value() ); } - static char *nonConst( const char *s ) { return const_cast< char * >( s ); } - const BSONElement element_; -}; + private: + char *data() { return nonConst( _element.rawdata() ); } + char *value() { return nonConst( _element.value() ); } + static char *nonConst( const char *s ) { return const_cast< char * >( s ); } + + const BSONElement _element; + }; } // namespace mongo diff --git a/db/json.cpp b/db/json.cpp index b55ddb1..7645b6b 100644 --- a/db/json.cpp +++ b/db/json.cpp @@ -20,6 +20,7 @@ #include "json.h" #include "../util/builder.h" #include "../util/base64.h" +#include "../util/hex.h" using namespace boost::spirit; @@ -167,27 +168,11 @@ namespace mongo { ObjectBuilder &b; }; - namespace hex { - int val( char c ) { - if ( '0' <= c && c <= '9' ) - return c - '0'; - if ( 'a' <= c && c <= 'f' ) - return c - 'a' + 10; - if ( 'A' <= c && c <= 'F' ) - return c - 'A' + 10; - assert( false ); - return 0xff; - } - char val( const char *c ) { - return ( val( c[ 0 ] ) << 4 ) | val( c[ 1 ] ); - } - } // namespace hex - struct chU { chU( ObjectBuilder &_b ) : b( _b ) {} void operator() ( const char *start, const char *end ) const { - unsigned char first = hex::val( start ); - unsigned char second = hex::val( start + 2 ); + unsigned char first = fromHex( start ); + unsigned char second = fromHex( start + 2 ); if ( first == 0 && second < 0x80 ) b.ss << second; else if ( first < 0x08 ) { @@ -315,7 +300,7 @@ namespace mongo { OID oid; char *oidP = (char *)( &oid ); for ( int i = 0; i < 12; ++i ) - oidP[ i ] = hex::val( s + ( i * 2 ) ); + oidP[ i ] = 
fromHex( s + ( i * 2 ) ); return oid; } @@ -356,7 +341,7 @@ namespace mongo { struct binDataType { binDataType( ObjectBuilder &_b ) : b( _b ) {} void operator() ( const char *start, const char *end ) const { - b.binDataType = BinDataType( hex::val( start ) ); + b.binDataType = BinDataType( fromHex( start ) ); } ObjectBuilder &b; }; diff --git a/db/lasterror.cpp b/db/lasterror.cpp index e8b1fcf..9fefcfa 100644 --- a/db/lasterror.cpp +++ b/db/lasterror.cpp @@ -28,7 +28,7 @@ namespace mongo { LastError LastError::noError; LastErrorHolder lastError; - boost::mutex LastErrorHolder::_idsmutex; + mongo::mutex LastErrorHolder::_idsmutex; void LastError::appendSelf( BSONObjBuilder &b ) { if ( !valid ) { @@ -75,7 +75,7 @@ namespace mongo { if ( id == 0 ) return _tl.get(); - boostlock lock(_idsmutex); + scoped_lock lock(_idsmutex); map<int,Status>::iterator i = _ids.find( id ); if ( i == _ids.end() ){ if ( ! create ) @@ -95,7 +95,7 @@ namespace mongo { } void LastErrorHolder::remove( int id ){ - boostlock lock(_idsmutex); + scoped_lock lock(_idsmutex); map<int,Status>::iterator i = _ids.find( id ); if ( i == _ids.end() ) return; @@ -121,7 +121,7 @@ namespace mongo { return; } - boostlock lock(_idsmutex); + scoped_lock lock(_idsmutex); Status & status = _ids[id]; status.time = time(0); status.lerr = le; diff --git a/db/lasterror.h b/db/lasterror.h index 8f687bb..78160eb 100644 --- a/db/lasterror.h +++ b/db/lasterror.h @@ -30,7 +30,7 @@ namespace mongo { string msg; enum UpdatedExistingType { NotUpdate, True, False } updatedExisting; /* todo: nObjects should be 64 bit */ - int nObjects; + long long nObjects; int nPrev; bool valid; bool overridenById; @@ -40,12 +40,12 @@ namespace mongo { code = _code; msg = _msg; } - void recordUpdate( bool _updatedExisting, int nChanged ) { + void recordUpdate( bool _updatedExisting, long long nChanged ) { reset( true ); nObjects = nChanged; updatedExisting = _updatedExisting ? 
True : False; } - void recordDelete( int nDeleted ) { + void recordDelete( long long nDeleted ) { reset( true ); nObjects = nDeleted; } @@ -100,7 +100,7 @@ namespace mongo { time_t time; LastError *lerr; }; - static boost::mutex _idsmutex; + static mongo::mutex _idsmutex; map<int,Status> _ids; } lastError; diff --git a/db/matcher.cpp b/db/matcher.cpp index d71b7ef..8c904e3 100644 --- a/db/matcher.cpp +++ b/db/matcher.cpp @@ -22,16 +22,35 @@ #include "matcher.h" #include "../util/goodies.h" #include "../util/unittest.h" -#include "storage.h" +#include "diskloc.h" #include "../scripting/engine.h" #include "db.h" #include "client.h" +#include "pdfile.h" + +namespace { + inline pcrecpp::RE_Options flags2options(const char* flags){ + pcrecpp::RE_Options options; + options.set_utf8(true); + while ( flags && *flags ) { + if ( *flags == 'i' ) + options.set_caseless(true); + else if ( *flags == 'm' ) + options.set_multiline(true); + else if ( *flags == 'x' ) + options.set_extended(true); + flags++; + } + return options; + } +} + +//#define DEBUGMATCHER(x) cout << x << endl; +#define DEBUGMATCHER(x) + namespace mongo { - //#include "minilex.h" - //MiniLex minilex; - class Where { public: Where() { @@ -66,52 +85,61 @@ namespace mongo { where = 0; } - ElementMatcher::ElementMatcher( BSONElement _e , int _op ) : toMatch( _e ) , compareOp( _op ) { + ElementMatcher::ElementMatcher( BSONElement _e , int _op, bool _isNot ) : toMatch( _e ) , compareOp( _op ), isNot( _isNot ) { if ( _op == BSONObj::opMOD ){ - BSONObj o = _e.embeddedObject().firstElement().embeddedObject(); + BSONObj o = _e.embeddedObject(); mod = o["0"].numberInt(); modm = o["1"].numberInt(); uassert( 10073 , "mod can't be 0" , mod ); } else if ( _op == BSONObj::opTYPE ){ - type = (BSONType)(_e.embeddedObject().firstElement().numberInt()); + type = (BSONType)(_e.numberInt()); } else if ( _op == BSONObj::opELEM_MATCH ){ - BSONElement m = toMatch.embeddedObjectUserCheck().firstElement(); + BSONElement m = _e; uassert( 12517 , "$elemMatch needs an Object" , m.type() == Object ); subMatcher.reset( new Matcher( m.embeddedObject() ) ); } } - - ElementMatcher::~ElementMatcher(){ - } - - - -} // namespace mongo - -#include "pdfile.h" - -namespace { - inline pcrecpp::RE_Options flags2options(const char* flags){ - pcrecpp::RE_Options options; - options.set_utf8(true); - while ( flags && *flags ) { - if ( *flags == 'i' ) - options.set_caseless(true); - else if ( *flags == 'm' ) - options.set_multiline(true); - else if ( *flags == 'x' ) - options.set_extended(true); - flags++; + ElementMatcher::ElementMatcher( BSONElement _e , int _op , const BSONObj& array, bool _isNot ) + : toMatch( _e ) , compareOp( _op ), isNot( _isNot ) { + + myset.reset( new set<BSONElement,element_lt>() ); + + BSONObjIterator i( array ); + while ( i.more() ) { + BSONElement ie = i.next(); + if ( _op == BSONObj::opALL && ie.type() == Object && ie.embeddedObject().firstElement().getGtLtOp() == BSONObj::opELEM_MATCH ){ + shared_ptr<Matcher> s; + s.reset( new Matcher( ie.embeddedObject().firstElement().embeddedObjectUserCheck() ) ); + allMatchers.push_back( s ); + } else if ( ie.type() == RegEx ) { + if ( !myregex.get() ) { + myregex.reset( new vector< RegexMatcher >() ); + } + myregex->push_back( RegexMatcher() ); + RegexMatcher &rm = myregex->back(); + rm.re.reset( new pcrecpp::RE( ie.regex(), flags2options( ie.regexFlags() ) ) ); + rm.fieldName = 0; // no need for field name + rm.regex = ie.regex(); + rm.flags = ie.regexFlags(); + rm.isNot = false; + bool purePrefix; + string 
prefix = simpleRegex(rm.regex, rm.flags, &purePrefix); + if (purePrefix) + rm.prefix = prefix; + } else { + myset->insert(ie); + } } - return options; + + if ( allMatchers.size() ){ + uassert( 13020 , "with $all, can't mix $elemMatch and others" , myset->size() == 0 && !myregex.get()); + } + } -} - -namespace mongo { CoveredIndexMatcher::CoveredIndexMatcher(const BSONObj &jsobj, const BSONObj &indexKeyPattern) : _keyMatcher(jsobj.filterFieldsUndotted(indexKeyPattern, true), @@ -120,13 +148,18 @@ namespace mongo { { _needRecord = ! ( _docMatcher.keyMatch() && - _keyMatcher.jsobj.nFields() == _docMatcher.jsobj.nFields() + _keyMatcher.jsobj.nFields() == _docMatcher.jsobj.nFields() && + ! _keyMatcher.hasType( BSONObj::opEXISTS ) ); + } - bool CoveredIndexMatcher::matches(const BSONObj &key, const DiskLoc &recLoc ) { + bool CoveredIndexMatcher::matches(const BSONObj &key, const DiskLoc &recLoc , MatchDetails * details ) { + if ( details ) + details->reset(); + if ( _keyMatcher.keyMatch() ) { - if ( !_keyMatcher.matches(key) ) { + if ( !_keyMatcher.matches(key, details ) ){ return false; } } @@ -135,14 +168,128 @@ namespace mongo { return true; } - return _docMatcher.matches(recLoc.rec()); + if ( details ) + details->loadedObject = true; + + return _docMatcher.matches(recLoc.rec() , details ); } + void Matcher::addRegex(const char *fieldName, const char *regex, const char *flags, bool isNot){ + + if ( nRegex >= 4 ) { + out() << "ERROR: too many regexes in query" << endl; + } + else { + RegexMatcher& rm = regexs[nRegex]; + rm.re.reset( new pcrecpp::RE(regex, flags2options(flags)) ); + rm.fieldName = fieldName; + rm.regex = regex; + rm.flags = flags; + rm.isNot = isNot; + nRegex++; + + if (!isNot){ //TODO something smarter + bool purePrefix; + string prefix = simpleRegex(regex, flags, &purePrefix); + if (purePrefix) + rm.prefix = prefix; + } + } + } + + bool Matcher::addOp( const BSONElement &e, const BSONElement &fe, bool isNot, const char *& regex, const char *&flags ) { + const char *fn = fe.fieldName(); + int op = fe.getGtLtOp( -1 ); + if ( op == -1 ){ + if ( !isNot && fn[1] == 'r' && fn[2] == 'e' && fn[3] == 'f' && fn[4] == 0 ){ + return false; // { $ref : xxx } - treat as normal object + } + uassert( 10068 , (string)"invalid operator: " + fn , op != -1 ); + } + + switch ( op ){ + case BSONObj::GT: + case BSONObj::GTE: + case BSONObj::LT: + case BSONObj::LTE:{ + shared_ptr< BSONObjBuilder > b( new BSONObjBuilder() ); + _builders.push_back( b ); + b->appendAs(fe, e.fieldName()); + addBasic(b->done().firstElement(), op, isNot); + break; + } + case BSONObj::NE:{ + haveNeg = true; + shared_ptr< BSONObjBuilder > b( new BSONObjBuilder() ); + _builders.push_back( b ); + b->appendAs(fe, e.fieldName()); + addBasic(b->done().firstElement(), BSONObj::NE, isNot); + break; + } + case BSONObj::opALL: + all = true; + case BSONObj::opIN: + basics.push_back( ElementMatcher( e , op , fe.embeddedObject(), isNot ) ); + break; + case BSONObj::NIN: + haveNeg = true; + basics.push_back( ElementMatcher( e , op , fe.embeddedObject(), isNot ) ); + break; + case BSONObj::opMOD: + case BSONObj::opTYPE: + case BSONObj::opELEM_MATCH: { + shared_ptr< BSONObjBuilder > b( new BSONObjBuilder() ); + _builders.push_back( b ); + b->appendAs(fe, e.fieldName()); + // these are types where ElementMatcher has all the info + basics.push_back( ElementMatcher( b->done().firstElement() , op, isNot ) ); + break; + } + case BSONObj::opSIZE:{ + shared_ptr< BSONObjBuilder > b( new BSONObjBuilder() ); + _builders.push_back( b ); + 
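
The addRegex code above caches a literal prefix whenever simpleRegex (defined elsewhere in the tree) reports a pure-prefix pattern, i.e. an anchored literal like /^foo/ with no metacharacters and no case folding; matching such a pattern then reduces to a string prefix compare with no PCRE call on the hot path. A sketch of that equivalence (prefixMatches is illustrative only):

    #include <cstring>
    #include <string>

    // /^foo/ applied to a string is the same test as "starts with foo".
    static bool prefixMatches( const std::string& prefix , const char * candidate ){
        return std::strncmp( candidate , prefix.c_str() , prefix.size() ) == 0;
    }

    int main(){
        // prefix "foo": "foobar" matches, "fob" does not - no regex engine involved
        return ( prefixMatches( "foo" , "foobar" ) && ! prefixMatches( "foo" , "fob" ) ) ? 0 : 1;
    }
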
b->appendAs(fe, e.fieldName()); + addBasic(b->done().firstElement(), BSONObj::opSIZE, isNot); + haveSize = true; + break; + } + case BSONObj::opEXISTS:{ + shared_ptr< BSONObjBuilder > b( new BSONObjBuilder() ); + _builders.push_back( b ); + b->appendAs(fe, e.fieldName()); + addBasic(b->done().firstElement(), BSONObj::opEXISTS, isNot); + break; + } + case BSONObj::opREGEX:{ + uassert( 13032, "can't use $not with $regex, use BSON regex type instead", !isNot ); + if ( fe.type() == RegEx ){ + regex = fe.regex(); + flags = fe.regexFlags(); + } + else { + regex = fe.valuestrsafe(); + } + break; + } + case BSONObj::opOPTIONS:{ + uassert( 13029, "can't use $not with $options, use BSON regex type instead", !isNot ); + flags = fe.valuestrsafe(); + break; + } + case BSONObj::opNEAR: + case BSONObj::opWITHIN: + break; + default: + uassert( 10069 , (string)"BUG - can't operator for: " + fn , 0 ); + } + return true; + } + /* _jsobj - the query pattern */ Matcher::Matcher(const BSONObj &_jsobj, const BSONObj &constrainIndexKey) : - where(0), jsobj(_jsobj), haveSize(), all(), hasArray(0), _atomic(false), nRegex(0) { + where(0), jsobj(_jsobj), haveSize(), all(), hasArray(0), haveNeg(), _atomic(false), nRegex(0) { BSONObjIterator i(jsobj); while ( i.more() ) { @@ -171,15 +318,7 @@ namespace mongo { } if ( e.type() == RegEx ) { - if ( nRegex >= 4 ) { - out() << "ERROR: too many regexes in query" << endl; - } - else { - RegexMatcher& rm = regexs[nRegex]; - rm.re = new pcrecpp::RE(e.regex(), flags2options(e.regexFlags())); - rm.fieldName = e.fieldName(); - nRegex++; - } + addRegex( e.fieldName(), e.regex(), e.regexFlags() ); continue; } @@ -200,75 +339,31 @@ namespace mongo { const char *fn = fe.fieldName(); if ( fn[0] == '$' && fn[1] ) { - int op = fe.getGtLtOp( -1 ); - - if ( op == -1 ){ - if ( fn[1] == 'r' && fn[2] == 'e' && fn[3] == 'f' && fn[4] == 0 ){ - break; // { $ref : xxx } - treat as normal object - } - uassert( 10068 , (string)"invalid operator: " + fn , op != -1 ); - } - isOperator = true; - switch ( op ){ - case BSONObj::GT: - case BSONObj::GTE: - case BSONObj::LT: - case BSONObj::LTE:{ - shared_ptr< BSONObjBuilder > b( new BSONObjBuilder() ); - _builders.push_back( b ); - b->appendAs(fe, e.fieldName()); - addBasic(b->done().firstElement(), op); - isOperator = true; - break; - } - case BSONObj::NE:{ - shared_ptr< BSONObjBuilder > b( new BSONObjBuilder() ); - _builders.push_back( b ); - b->appendAs(fe, e.fieldName()); - addBasic(b->done().firstElement(), BSONObj::NE); - break; - } - case BSONObj::opALL: - all = true; - case BSONObj::opIN: - case BSONObj::NIN: - basics.push_back( ElementMatcher( e , op , fe.embeddedObject() ) ); - break; - case BSONObj::opMOD: - case BSONObj::opTYPE: - case BSONObj::opELEM_MATCH: - // these are types where ElementMatcher has all the info - basics.push_back( ElementMatcher( e , op ) ); - break; - case BSONObj::opSIZE:{ - shared_ptr< BSONObjBuilder > b( new BSONObjBuilder() ); - _builders.push_back( b ); - b->appendAs(fe, e.fieldName()); - addBasic(b->done().firstElement(), BSONObj::opSIZE); - haveSize = true; - break; - } - case BSONObj::opEXISTS:{ - shared_ptr< BSONObjBuilder > b( new BSONObjBuilder() ); - _builders.push_back( b ); - b->appendAs(fe, e.fieldName()); - addBasic(b->done().firstElement(), BSONObj::opEXISTS); - break; - } - case BSONObj::opREGEX:{ - regex = fe.valuestrsafe(); - break; - } - case BSONObj::opOPTIONS:{ - flags = fe.valuestrsafe(); - break; - } - default: - uassert( 10069 , (string)"BUG - can't operator for: " + fn , 0 ); + if ( fn[1] == 
'n' && fn[2] == 'o' && fn[3] == 't' && fn[4] == 0 ) { + haveNeg = true; + switch( fe.type() ) { + case Object: { + BSONObjIterator k( fe.embeddedObject() ); + uassert( 13030, "$not cannot be empty", k.more() ); + while( k.more() ) { + addOp( e, k.next(), true, regex, flags ); + } + break; + } + case RegEx: + addRegex( e.fieldName(), fe.regex(), fe.regexFlags(), true ); + break; + default: + uassert( 13031, "invalid use of $not", false ); + } + } else { + if ( !addOp( e, fe, false, regex, flags ) ) { + isOperator = false; + break; + } } - } else { isOperator = false; @@ -276,14 +371,7 @@ namespace mongo { } } if (regex){ - if ( nRegex >= 4 ) { - out() << "ERROR: too many regexes in query" << endl; - } else { - RegexMatcher& rm = regexs[nRegex]; - rm.re = new pcrecpp::RE(regex, flags2options(flags)); - rm.fieldName = e.fieldName(); - nRegex++; - } + addRegex(e.fieldName(), regex, flags); } if ( isOperator ) continue; @@ -298,21 +386,46 @@ namespace mongo { } // normal, simple case e.g. { a : "foo" } - addBasic(e, BSONObj::Equality); + addBasic(e, BSONObj::Equality, false); } constrainIndexKey_ = constrainIndexKey; } - + + inline bool regexMatches(const RegexMatcher& rm, const BSONElement& e) { + switch (e.type()){ + case String: + case Symbol: + if (rm.prefix.empty()) + return rm.re->PartialMatch(e.valuestr()); + else + return !strncmp(e.valuestr(), rm.prefix.c_str(), rm.prefix.size()); + case RegEx: + return !strcmp(rm.regex, e.regex()) && !strcmp(rm.flags, e.regexFlags()); + default: + return false; + } + } + inline int Matcher::valuesMatch(const BSONElement& l, const BSONElement& r, int op, const ElementMatcher& bm) { assert( op != BSONObj::NE && op != BSONObj::NIN ); - if ( op == BSONObj::Equality ) + if ( op == BSONObj::Equality ) { return l.valuesEqual(r); + } if ( op == BSONObj::opIN ) { // { $in : [1,2,3] } - return bm.myset->count(l); + int count = bm.myset->count(l); + if ( count ) + return count; + if ( bm.myregex.get() ) { + for( vector<RegexMatcher>::const_iterator i = bm.myregex->begin(); i != bm.myregex->end(); ++i ) { + if ( regexMatches( *i, l ) ) { + return true; + } + } + } } if ( op == BSONObj::opSIZE ) { @@ -350,8 +463,8 @@ namespace mongo { return (op & z); } - int Matcher::matchesNe(const char *fieldName, const BSONElement &toMatch, const BSONObj &obj, const ElementMatcher& bm ) { - int ret = matchesDotted( fieldName, toMatch, obj, BSONObj::Equality, bm ); + int Matcher::matchesNe(const char *fieldName, const BSONElement &toMatch, const BSONObj &obj, const ElementMatcher& bm , MatchDetails * details ) { + int ret = matchesDotted( fieldName, toMatch, obj, BSONObj::Equality, bm , false , details ); if ( bm.toMatch.type() != jstNULL ) return ( ret <= 0 ) ? 1 : 0; else @@ -383,16 +496,44 @@ namespace mongo { 0 missing element 1 match */ - int Matcher::matchesDotted(const char *fieldName, const BSONElement& toMatch, const BSONObj& obj, int compareOp, const ElementMatcher& bm , bool isArr) { - + int Matcher::matchesDotted(const char *fieldName, const BSONElement& toMatch, const BSONObj& obj, int compareOp, const ElementMatcher& em , bool isArr, MatchDetails * details ) { + DEBUGMATCHER( "\t matchesDotted : " << fieldName << " hasDetails: " << ( details ? 
"yes" : "no" ) ); if ( compareOp == BSONObj::opALL ) { - if ( bm.myset->size() == 0 ) + + if ( em.allMatchers.size() ){ + BSONElement e = obj.getFieldDotted( fieldName ); + uassert( 13021 , "$all/$elemMatch needs to be applied to array" , e.type() == Array ); + + for ( unsigned i=0; i<em.allMatchers.size(); i++ ){ + bool found = false; + BSONObjIterator x( e.embeddedObject() ); + while ( x.more() ){ + BSONElement f = x.next(); + + if ( f.type() != Object ) + continue; + if ( em.allMatchers[i]->matches( f.embeddedObject() ) ){ + found = true; + break; + } + } + + if ( ! found ) + return -1; + } + + return 1; + } + + if ( em.myset->size() == 0 && !em.myregex.get() ) return -1; // is this desired? + BSONObjSetDefaultOrder actualKeys; IndexSpec( BSON( fieldName << 1 ) ).getKeys( obj, actualKeys ); if ( actualKeys.size() == 0 ) return 0; - for( set< BSONElement, element_lt >::const_iterator i = bm.myset->begin(); i != bm.myset->end(); ++i ) { + + for( set< BSONElement, element_lt >::const_iterator i = em.myset->begin(); i != em.myset->end(); ++i ) { // ignore nulls if ( i->type() == jstNULL ) continue; @@ -402,17 +543,44 @@ namespace mongo { if ( !actualKeys.count( b.done() ) ) return -1; } - return 1; - } + if ( !em.myregex.get() ) + return 1; + + for( vector< RegexMatcher >::const_iterator i = em.myregex->begin(); i != em.myregex->end(); ++i ) { + bool match = false; + for( BSONObjSetDefaultOrder::const_iterator j = actualKeys.begin(); j != actualKeys.end(); ++j ) { + if ( regexMatches( *i, j->firstElement() ) ) { + match = true; + break; + } + } + if ( !match ) + return -1; + } + + return 1; + } // end opALL + if ( compareOp == BSONObj::NE ) - return matchesNe( fieldName, toMatch, obj, bm ); + return matchesNe( fieldName, toMatch, obj, em , details ); if ( compareOp == BSONObj::NIN ) { - for( set<BSONElement,element_lt>::const_iterator i = bm.myset->begin(); i != bm.myset->end(); ++i ) { - int ret = matchesNe( fieldName, *i, obj, bm ); + for( set<BSONElement,element_lt>::const_iterator i = em.myset->begin(); i != em.myset->end(); ++i ) { + int ret = matchesNe( fieldName, *i, obj, em , details ); if ( ret != 1 ) return ret; } + if ( em.myregex.get() ) { + BSONElementSet s; + obj.getFieldsDotted( fieldName, s ); + for( vector<RegexMatcher>::const_iterator i = em.myregex->begin(); i != em.myregex->end(); ++i ) { + for( BSONElementSet::const_iterator j = s.begin(); j != s.end(); ++j ) { + if ( regexMatches( *i, *j ) ) { + return -1; + } + } + } + } return 1; } @@ -420,49 +588,73 @@ namespace mongo { bool indexed = !constrainIndexKey_.isEmpty(); if ( indexed ) { e = obj.getFieldUsingIndexNames(fieldName, constrainIndexKey_); - assert( !e.eoo() ); + if( e.eoo() ){ + cout << "obj: " << obj << endl; + cout << "fieldName: " << fieldName << endl; + cout << "constrainIndexKey_: " << constrainIndexKey_ << endl; + assert( !e.eoo() ); + } } else { + + const char *p = strchr(fieldName, '.'); + if ( p ) { + string left(fieldName, p-fieldName); + + BSONElement se = obj.getField(left.c_str()); + if ( se.eoo() ) + ; + else if ( se.type() != Object && se.type() != Array ) + ; + else { + BSONObj eo = se.embeddedObject(); + return matchesDotted(p+1, toMatch, eo, compareOp, em, se.type() == Array , details ); + } + } + if ( isArr ) { + DEBUGMATCHER( "\t\t isArr 1 : obj : " << obj ); BSONObjIterator ai(obj); bool found = false; while ( ai.moreWithEOO() ) { BSONElement z = ai.next(); + + if( strcmp(z.fieldName(),fieldName) == 0 && valuesMatch(z, toMatch, compareOp, em) ) { + // "field.<n>" array notation was used + 
if ( details ) + details->elemMatchKey = z.fieldName(); + return 1; + } + if ( z.type() == Object ) { BSONObj eo = z.embeddedObject(); - int cmp = matchesDotted(fieldName, toMatch, eo, compareOp, bm, false); + int cmp = matchesDotted(fieldName, toMatch, eo, compareOp, em, false, details ); if ( cmp > 0 ) { + if ( details ) + details->elemMatchKey = z.fieldName(); return 1; - } else if ( cmp < 0 ) { + } + else if ( cmp < 0 ) { found = true; } } } - return found ? -1 : retMissing( bm ); + return found ? -1 : retMissing( em ); } - const char *p = strchr(fieldName, '.'); - if ( p ) { - string left(fieldName, p-fieldName); - BSONElement se = obj.getField(left.c_str()); - if ( se.eoo() ) - return retMissing( bm ); - if ( se.type() != Object && se.type() != Array ) - return retMissing( bm ); - - BSONObj eo = se.embeddedObject(); - return matchesDotted(p+1, toMatch, eo, compareOp, bm, se.type() == Array); - } else { + if( p ) { + return retMissing( em ); + } + else { e = obj.getField(fieldName); } } if ( compareOp == BSONObj::opEXISTS ) { - return ( e.eoo() ^ toMatch.boolean() ) ? 1 : -1; + return ( e.eoo() ^ ( toMatch.boolean() ^ em.isNot ) ) ? 1 : -1; } else if ( ( e.type() != Array || indexed || compareOp == BSONObj::opSIZE ) && - valuesMatch(e, toMatch, compareOp, bm ) ) { + valuesMatch(e, toMatch, compareOp, em ) ) { return 1; } else if ( e.type() == Array && compareOp != BSONObj::opSIZE ) { - BSONObjIterator ai(e.embeddedObject()); while ( ai.moreWithEOO() ) { @@ -470,18 +662,23 @@ namespace mongo { if ( compareOp == BSONObj::opELEM_MATCH ){ // SERVER-377 - if ( z.type() == Object && bm.subMatcher->matches( z.embeddedObject() ) ) + if ( z.type() == Object && em.subMatcher->matches( z.embeddedObject() ) ){ + if ( details ) + details->elemMatchKey = z.fieldName(); return 1; + } } else { - if ( valuesMatch( z, toMatch, compareOp, bm) ) { + if ( valuesMatch( z, toMatch, compareOp, em) ) { + if ( details ) + details->elemMatchKey = z.fieldName(); return 1; } } } - if ( compareOp == BSONObj::Equality && e.woCompare( toMatch ) == 0 ){ + if ( compareOp == BSONObj::Equality && e.woCompare( toMatch , false ) == 0 ){ // match an entire array to itself return 1; } @@ -496,27 +693,9 @@ namespace mongo { extern int dump; - inline bool regexMatches(RegexMatcher& rm, const BSONElement& e) { - char buf[64]; - const char *p = buf; - if ( e.type() == String || e.type() == Symbol ) - p = e.valuestr(); - else if ( e.isNumber() ) { - sprintf(buf, "%f", e.number()); - } - else if ( e.type() == Date ) { - Date_t d = e.date(); - time_t t = (d.millis/1000); - time_t_to_String(t, buf); - } - else - return false; - return rm.re->PartialMatch(p); - } - /* See if an object matches the query. */ - bool Matcher::matches(const BSONObj& jsobj ) { + bool Matcher::matches(const BSONObj& jsobj , MatchDetails * details ) { /* assuming there is usually only one thing to match. if more this could be slow sometimes. */ @@ -525,17 +704,21 @@ namespace mongo { ElementMatcher& bm = basics[i]; BSONElement& m = bm.toMatch; // -1=mismatch. 0=missing element. 
1=match - int cmp = matchesDotted(m.fieldName(), m, jsobj, bm.compareOp, bm ); + int cmp = matchesDotted(m.fieldName(), m, jsobj, bm.compareOp, bm , false , details ); + if ( bm.compareOp != BSONObj::opEXISTS && bm.isNot ) + cmp = -cmp; if ( cmp < 0 ) return false; if ( cmp == 0 ) { /* missing is ok iff we were looking for null */ if ( m.type() == jstNULL || m.type() == Undefined ) { - if ( bm.compareOp == BSONObj::NE ) { + if ( ( bm.compareOp == BSONObj::NE ) ^ bm.isNot ) { return false; } } else { - return false; + if ( !bm.isNot ) { + return false; + } } } } @@ -554,7 +737,7 @@ namespace mongo { for( BSONElementSet::const_iterator i = s.begin(); i != s.end(); ++i ) if ( regexMatches(rm, *i) ) match = true; - if ( !match ) + if ( !match ^ rm.isNot ) return false; } @@ -590,6 +773,13 @@ namespace mongo { return true; } + bool Matcher::hasType( BSONObj::MatchType type ) const { + for ( unsigned i=0; i<basics.size() ; i++ ) + if ( basics[i].compareOp == type ) + return true; + return false; + } + struct JSObj1 js1; #pragma pack(1) diff --git a/db/matcher.h b/db/matcher.h index f1609f9..3839b68 100644 --- a/db/matcher.h +++ b/db/matcher.h @@ -31,13 +31,12 @@ namespace mongo { class RegexMatcher { public: const char *fieldName; - pcrecpp::RE *re; - RegexMatcher() { - re = 0; - } - ~RegexMatcher() { - delete re; - } + const char *regex; + const char *flags; + string prefix; + shared_ptr< pcrecpp::RE > re; + bool isNot; + RegexMatcher() : isNot() {} }; struct element_lt @@ -58,24 +57,17 @@ namespace mongo { ElementMatcher() { } - ElementMatcher( BSONElement _e , int _op ); + ElementMatcher( BSONElement _e , int _op, bool _isNot ); - ElementMatcher( BSONElement _e , int _op , const BSONObj& array ) : toMatch( _e ) , compareOp( _op ) { - - myset.reset( new set<BSONElement,element_lt>() ); - - BSONObjIterator i( array ); - while ( i.more() ) { - BSONElement ie = i.next(); - myset->insert(ie); - } - } + ElementMatcher( BSONElement _e , int _op , const BSONObj& array, bool _isNot ); - ~ElementMatcher(); + ~ElementMatcher() { } BSONElement toMatch; int compareOp; + bool isNot; shared_ptr< set<BSONElement,element_lt> > myset; + shared_ptr< vector<RegexMatcher> > myregex; // these are for specific operators int mod; @@ -83,12 +75,34 @@ namespace mongo { BSONType type; shared_ptr<Matcher> subMatcher; + + vector< shared_ptr<Matcher> > allMatchers; }; -// SQL where clause equivalent - class Where; + class Where; // used for $where javascript eval class DiskLoc; + struct MatchDetails { + MatchDetails(){ + reset(); + } + + void reset(){ + loadedObject = false; + elemMatchKey = 0; + } + + string toString() const { + stringstream ss; + ss << "loadedObject: " << loadedObject << " "; + ss << "elemMatchKey: " << ( elemMatchKey ? elemMatchKey : "NULL" ) << " "; + return ss.str(); + } + + bool loadedObject; + const char * elemMatchKey; // warning, this may go out of scope if matched object does + }; + /* Match BSON objects against a query pattern. e.g. 
@@ -107,12 +121,12 @@ namespace mongo { int matchesDotted( const char *fieldName, const BSONElement& toMatch, const BSONObj& obj, - int compareOp, const ElementMatcher& bm, bool isArr = false); + int compareOp, const ElementMatcher& bm, bool isArr , MatchDetails * details ); int matchesNe( const char *fieldName, const BSONElement &toMatch, const BSONObj &obj, - const ElementMatcher&bm); + const ElementMatcher&bm, MatchDetails * details ); public: static int opDirection(int op) { @@ -125,30 +139,34 @@ namespace mongo { ~Matcher(); - bool matches(const BSONObj& j); + bool matches(const BSONObj& j, MatchDetails * details = 0 ); - bool keyMatch() const { return !all && !haveSize && !hasArray; } + bool keyMatch() const { return !all && !haveSize && !hasArray && !haveNeg; } bool atomic() const { return _atomic; } + bool hasType( BSONObj::MatchType type ) const; private: - void addBasic(const BSONElement &e, int c) { + void addBasic(const BSONElement &e, int c, bool isNot) { // TODO May want to selectively ignore these element types based on op type. if ( e.type() == MinKey || e.type() == MaxKey ) return; - basics.push_back( ElementMatcher( e , c ) ); + basics.push_back( ElementMatcher( e , c, isNot ) ); } + void addRegex(const char *fieldName, const char *regex, const char *flags, bool isNot = false); + bool addOp( const BSONElement &e, const BSONElement &fe, bool isNot, const char *& regex, const char *&flags ); + int valuesMatch(const BSONElement& l, const BSONElement& r, int op, const ElementMatcher& bm); Where *where; // set if query uses $where BSONObj jsobj; // the query pattern. e.g., { name: "joe" } BSONObj constrainIndexKey_; vector<ElementMatcher> basics; -// int n; // # of basicmatcher items bool haveSize; bool all; bool hasArray; + bool haveNeg; /* $atomic - if true, a multi document operation (some removes, updates) should be done atomically. in that case, we do not yield - @@ -171,7 +189,7 @@ namespace mongo { public: CoveredIndexMatcher(const BSONObj &pattern, const BSONObj &indexKeyPattern); bool matches(const BSONObj &o){ return _docMatcher.matches( o ); } - bool matches(const BSONObj &key, const DiskLoc &recLoc); + bool matches(const BSONObj &key, const DiskLoc &recLoc , MatchDetails * details = 0 ); bool needRecord(){ return _needRecord; } Matcher& docMatcher() { return _docMatcher; } diff --git a/db/module.cpp b/db/module.cpp index d218fe6..78f8f79 100644 --- a/db/module.cpp +++ b/db/module.cpp @@ -1,4 +1,20 @@ // module.cpp +/* + * Copyright (C) 2010 10gen Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License, version 3, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + #include "stdafx.h" #include "module.h" diff --git a/db/modules/mms.cpp b/db/modules/mms.cpp index 9c00e60..248a4e4 100644 --- a/db/modules/mms.cpp +++ b/db/modules/mms.cpp @@ -1,4 +1,20 @@ // mms.cpp +/* + * Copyright (C) 2010 10gen Inc. 
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
#include "stdafx.h"
#include "../db.h"
@@ -6,6 +22,7 @@
#include "../module.h"
#include "../../util/httpclient.h"
#include "../../util/background.h"
+#include "../commands.h"
namespace po = boost::program_options;
@@ -13,24 +30,26 @@ namespace mongo {
/** Mongo Monitoring Service
if enabled, this runs in the background and pings mms
- */
+ */
class MMS : public BackgroundJob , Module {
public:
MMS()
- : Module( "mms" ) , _baseurl( "http://mms.10gen.com/ping/" ) ,
+ : Module( "mms" ) , _baseurl( "" ) ,
_secsToSleep(1) , _token( "" ) , _name( "" ) {
add_options()
+ ( "mms-url" , po::value<string>()->default_value("http://mms.10gen.com/ping") , "url for mongo monitoring server" )
( "mms-token" , po::value<string>() , "account token for mongo monitoring server" )
- ( "mms-name" , po::value<string>() , "server name mongo monitoring server" )
- ( "mms-interval" , po::value<int>()->default_value(30) , "ping interval for mongo monitoring server" )
+ ( "mms-name" , po::value<string>() , "server name for mongo monitoring server" )
+ ( "mms-interval" , po::value<int>()->default_value(30) , "ping interval (in seconds) for mongo monitoring server" )
;
}
~MMS(){}
-
+
void config( program_options::variables_map& params ){
+ _baseurl = params["mms-url"].as<string>();
if ( params.count( "mms-token" ) ){
_token = params["mms-token"].as<string>();
}
@@ -41,87 +60,94 @@ namespace mongo {
}
void run(){
- if ( _token.size() == 0 && _name.size() == 0 ){
- log(1) << "mms not configured" << endl;
- return;
- }
-
- if ( _token.size() == 0 ){
- log() << "no token for mms - not running" << endl;
- return;
- }
-
- if ( _name.size() == 0 ){
- log() << "no name for mms - not running" << endl;
- return;
- }
-
- log() << "mms monitor staring... token:" << _token << " name:" << _name << " interval: " << _secsToSleep << endl;
-
- unsigned long long lastTime = 0;
- unsigned long long lastLockTime = 0;
-
- while ( ! 
inShutdown() ){ - sleepsecs( _secsToSleep ); - - stringstream url; - url << _baseurl << _token << "?"; - url << "monitor_name=" << _name << "&"; - url << "version=" << versionString << "&"; - url << "git_hash=" << gitVersion() << "&"; + if ( _token.size() == 0 && _name.size() == 0 ){ + log(1) << "mms not configured" << endl; + return; + } - { //percent_locked - unsigned long long time = curTimeMicros64(); - unsigned long long start , lock; - dbMutex.info().getTimingInfo( start , lock ); - if ( lastTime ){ - double timeDiff = (double) (time - lastTime); - double lockDiff = (double) (lock - lastLockTime); - url << "percent_locked=" << (int)ceil( 100 * ( lockDiff / timeDiff ) ) << "&"; - } - lastTime = time; - lastLockTime = lock; + if ( _token.size() == 0 ){ + log() << "no token for mms - not running" << endl; + return; } - - vector< string > dbNames; - getDatabaseNames( dbNames ); - boost::intmax_t totalSize = 0; - for ( vector< string >::iterator i = dbNames.begin(); i != dbNames.end(); ++i ) { - boost::intmax_t size = dbSize( i->c_str() ); - totalSize += size; + + if ( _name.size() == 0 ){ + log() << "no name for mms - not running" << endl; + return; } - url << "data_size=" << totalSize / ( 1024 * 1024 ) << "&"; - + log() << "mms monitor starting... token:" << _token << " name:" << _name << " interval: " << _secsToSleep << endl; + Client::initThread( "mms" ); + Client& c = cc(); - /* TODO: - message_operations - update_operations - insert_operations - get_more_operations - delete_operations - kill_cursors_operations - */ - - log(1) << "mms url: " << url.str() << endl; + // TODO: using direct client is bad, but easy for now - try { - HttpClient c; - map<string,string> headers; - stringstream ss; - int rc = c.get( url.str() , headers , ss ); - log(1) << "\t response code: " << rc << endl; - if ( rc != 200 ){ - log() << "mms error response code:" << rc << endl; - log(1) << "mms error body:" << ss.str() << endl; + while ( ! inShutdown() ){ + sleepsecs( _secsToSleep ); + + try { + stringstream url; + url << _baseurl << "?" + << "token=" << _token << "&" + << "name=" << _name << "&" + << "ts=" << time(0) + ; + + BSONObjBuilder bb; + // duplicated so the post has everything + bb.append( "token" , _token ); + bb.append( "name" , _name ); + bb.appendDate( "ts" , jsTime() ); + + // any commands + _add( bb , "buildinfo" ); + _add( bb , "serverStatus" ); + + BSONObj postData = bb.obj(); + + log(1) << "mms url: " << url.str() << "\n\t post: " << postData << endl;; + + HttpClient c; + HttpClient::Result r; + int rc = c.post( url.str() , postData.jsonString() , &r ); + log(1) << "\t response code: " << rc << endl; + if ( rc != 200 ){ + log() << "mms error response code:" << rc << endl; + log(1) << "mms error body:" << r.getEntireResponse() << endl; + } + } + catch ( std::exception& e ){ + log() << "mms exception: " << e.what() << endl; } } - catch ( std::exception& e ){ - log() << "mms get exception: " << e.what() << endl; - } + + c.shutdown(); } + + void _add( BSONObjBuilder& postData , const char* cmd ){ + Command * c = Command::findCommand( cmd ); + if ( ! c ){ + log() << "MMS can't find command: " << cmd << endl; + postData.append( cmd , "can't find command" ); + return; + } + + if ( c->locktype() ){ + log() << "MMS can only use noLocking commands not: " << cmd << endl; + postData.append( cmd , "not noLocking" ); + return; + } + + BSONObj co = BSON( cmd << 1 ); + + string errmsg; + BSONObjBuilder sub; + if ( !
c->run( "admin.$cmd" , co , errmsg , sub , false ) ) + postData.append( cmd , errmsg ); + else + postData.append( cmd , sub.obj() ); } + void init(){ go(); } @@ -135,8 +161,8 @@ namespace mongo { string _token; string _name; - - } /* mms */; + + } /*mms*/ ; } @@ -28,6 +28,8 @@ namespace mongo { namespace mr { + typedef vector<BSONObj> BSONList; + class MyCmp { public: MyCmp(){} @@ -38,48 +40,76 @@ namespace mongo { typedef pair<BSONObj,BSONObj> Data; //typedef list< Data > InMemory; - typedef map< BSONObj,list<BSONObj>,MyCmp > InMemory; + typedef map< BSONObj,BSONList,MyCmp > InMemory; - BSONObj reduceValues( list<BSONObj>& values , Scope * s , ScriptingFunction reduce , bool final , ScriptingFunction finalize ){ + BSONObj reduceValues( BSONList& values , Scope * s , ScriptingFunction reduce , bool final , ScriptingFunction finalize ){ uassert( 10074 , "need values" , values.size() ); int sizeEstimate = ( values.size() * values.begin()->getField( "value" ).size() ) + 128; BSONObj key; BSONObjBuilder reduceArgs( sizeEstimate ); - - BSONObjBuilder valueBuilder( sizeEstimate ); - int n = 0; - for ( list<BSONObj>::iterator i=values.begin(); i!=values.end(); i++){ - BSONObj o = *i; - BSONObjIterator j(o); + BSONArrayBuilder * valueBuilder = 0; + + int sizeSoFar = 0; + unsigned n = 0; + for ( ; n<values.size(); n++ ){ + BSONObjIterator j(values[n]); BSONElement keyE = j.next(); if ( n == 0 ){ reduceArgs.append( keyE ); - BSONObjBuilder temp; - temp.append( keyE ); - key = temp.obj(); + key = keyE.wrap(); + valueBuilder = new BSONArrayBuilder( reduceArgs.subarrayStart( "values" ) ); + sizeSoFar = 5 + keyE.size(); } - valueBuilder.appendAs( j.next() , BSONObjBuilder::numStr( n++ ).c_str() ); + + BSONElement ee = j.next(); + + uassert( 13070 , "value too large to reduce" , ee.size() < ( 2 * 1024 * 1024 ) ); + + if ( sizeSoFar + ee.size() > ( 4 * 1024 * 1024 ) ){ + assert( n > 1 ); // if not, inf. loop + break; + } + + valueBuilder->append( ee ); + sizeSoFar += ee.size(); } - - reduceArgs.appendArray( "values" , valueBuilder.obj() ); + assert(valueBuilder); + valueBuilder->done(); + delete valueBuilder; BSONObj args = reduceArgs.obj(); - + s->invokeSafe( reduce , args ); if ( s->type( "return" ) == Array ){ uassert( 10075 , "reduce -> multiple not supported yet",0); return BSONObj(); } + + int endSizeEstimate = key.objsize() + ( args.objsize() / values.size() ); + + if ( n < values.size() ){ + BSONList x; + for ( ; n < values.size(); n++ ){ + x.push_back( values[n] ); + } + BSONObjBuilder temp( endSizeEstimate ); + temp.append( key.firstElement() ); + s->append( temp , "1" , "return" ); + x.push_back( temp.obj() ); + return reduceValues( x , s , reduce , final , finalize ); + } + + if ( finalize ){ - BSONObjBuilder b; + BSONObjBuilder b(endSizeEstimate); b.appendAs( key.firstElement() , "_id" ); s->append( b , "value" , "return" ); s->invokeSafe( finalize , b.obj() ); } - BSONObjBuilder b; + BSONObjBuilder b(endSizeEstimate); b.appendAs( key.firstElement() , final ? "_id" : "0" ); s->append( b , final ? "value" : "1" , "return" ); return b.obj(); @@ -108,8 +138,12 @@ namespace mongo { if ( !
keeptemp && markAsTemp ) cc().addTempCollection( tempLong ); - if ( cmdObj["out"].type() == String ) + replicate = keeptemp; + + if ( cmdObj["out"].type() == String ){ finalShort = cmdObj["out"].valuestr(); + replicate = true; + } else finalShort = tempShort; @@ -123,8 +157,10 @@ namespace mongo { if ( cmdObj["finalize"].type() ){ finalizeCode = cmdObj["finalize"].ascode(); } + checkCodeWScope( "map" , cmdObj ); + checkCodeWScope( "reduce" , cmdObj ); + checkCodeWScope( "finalize" , cmdObj ); - if ( cmdObj["mapparams"].type() == Array ){ mapparams = cmdObj["mapparams"].embeddedObjectUserCheck(); } @@ -151,6 +187,14 @@ namespace mongo { } } + void checkCodeWScope( const char * field , const BSONObj& o ){ + BSONElement e = o[field]; + if ( e.type() != CodeWScope ) + return; + BSONObj x = e.codeWScopeObject(); + uassert( 13035 , (string)"can't use CodeWScope with map/reduce function: " + field , x.isEmpty() ); + } + /** @return number objects in collection */ @@ -171,6 +215,7 @@ namespace mongo { // options bool verbose; bool keeptemp; + bool replicate; // query options @@ -224,12 +269,13 @@ namespace mongo { db.dropCollection( setup.incLong ); writelock l( setup.incLong ); + Client::Context ctx( setup.incLong ); string err; assert( userCreateNS( setup.incLong.c_str() , BSON( "autoIndexId" << 0 ) , err , false ) ); } - void finalReduce( list<BSONObj>& values ){ + void finalReduce( BSONList& values ){ if ( values.size() == 0 ) return; @@ -237,7 +283,11 @@ namespace mongo { BSONObj res = reduceValues( values , scope.get() , reduce , 1 , finalize ); writelock l( setup.tempLong ); - theDataFileMgr.insertAndLog( setup.tempLong.c_str() , res , false ); + Client::Context ctx( setup.incLong ); + if ( setup.replicate ) + theDataFileMgr.insertAndLog( setup.tempLong.c_str() , res , false ); + else + theDataFileMgr.insert( setup.tempLong.c_str() , res , false ); } @@ -272,7 +322,7 @@ namespace mongo { for ( InMemory::iterator i=old->begin(); i!=old->end(); i++ ){ BSONObj key = i->first; - list<BSONObj>& all = i->second; + BSONList& all = i->second; if ( all.size() == 1 ){ // this key has low cardinality, so just write to db @@ -291,13 +341,14 @@ namespace mongo { void dump(){ writelock l(_state.setup.incLong); + Client::Context ctx(_state.setup.incLong); for ( InMemory::iterator i=_temp->begin(); i!=_temp->end(); i++ ){ - list<BSONObj>& all = i->second; + BSONList& all = i->second; if ( all.size() < 1 ) continue; - for ( list<BSONObj>::iterator j=all.begin(); j!=all.end(); j++ ) + for ( BSONList::iterator j=all.begin(); j!=all.end(); j++ ) write( *j ); } _temp->clear(); @@ -306,7 +357,7 @@ namespace mongo { } void insert( const BSONObj& a ){ - list<BSONObj>& all = (*_temp)[a]; + BSONList& all = (*_temp)[a]; all.push_back( a ); _size += a.objsize() + 16; } @@ -343,7 +394,8 @@ namespace mongo { boost::thread_specific_ptr<MRTL> _tlmr; BSONObj fast_emit( const BSONObj& args ){ - uassert( 10077 , "fast_emit takes 2 args" , args.nFields() == 2 ); + uassert( 10077 , "fast_emit takes 2 args" , args.nFields() == 2 ); + uassert( 13069 , "an emit can't be more than 2mb" , args.objsize() < ( 2 * 1024 * 1024 ) ); _tlmr->insert( args ); _tlmr->numEmits++; return BSONObj(); @@ -357,11 +409,14 @@ namespace mongo { virtual void help( stringstream &help ) const { help << "see http://www.mongodb.org/display/DOCS/MapReduce"; } - + virtual LockType locktype(){ return WRITE; } // TODO, READ? 
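The reworked reduceValues() above never hands the JS reduce function more than roughly 4MB of values in one call (and fast_emit rejects any single emit over 2MB): it batches values up to the cap, reduces the batch, then folds the partial result back in with the unconsumed values and recurses. A minimal sketch of that pattern, assuming a hypothetical Value type and a reduceOnce() stand-in for the scripting-engine call:

#include <cassert>
#include <cstddef>
#include <vector>

struct Value { std::size_t size; };                  // stand-in for one BSON "value" element
typedef std::vector<Value> ValueList;

// stand-in for invoking the user's JS reduce on one batch; the real code
// calls s->invokeSafe( reduce , args )
static Value reduceOnce( const Value& key , const ValueList& vals ) {
    Value out; out.size = vals.empty() ? key.size : vals[0].size;
    return out;
}

static Value reduceInChunks( const Value& key , const ValueList& values ) {
    const std::size_t maxBatchBytes = 4 * 1024 * 1024;   // mirrors the 4MB cap above
    ValueList batch;
    std::size_t bytes = 0 , n = 0;
    for ( ; n < values.size(); n++ ) {
        assert( values[n].size < 2 * 1024 * 1024 );      // mirrors the per-value 2MB uassert
        if ( bytes + values[n].size > maxBatchBytes ) {
            assert( n > 1 );                             // otherwise we would recurse forever
            break;
        }
        batch.push_back( values[n] );
        bytes += values[n].size;
    }
    Value partial = reduceOnce( key , batch );
    if ( n == values.size() )
        return partial;                                  // everything fit in one pass
    ValueList rest( values.begin() + n , values.end() );
    rest.push_back( partial );                           // re-reduce remainder plus partial result
    return reduceInChunks( key , rest );
}

This batching is also why a map/reduce reduce function has to be written so it can accept its own output as one of its input values.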
bool run(const char *dbname, BSONObj& cmd, string& errmsg, BSONObjBuilder& result, bool fromRepl ){ Timer t; Client::GodScope cg; - MRSetup mr( cc().database()->name , cmd ); + Client& client = cc(); + CurOp * op = client.curop(); + + MRSetup mr( client.database()->name , cmd ); log(1) << "mr ns: " << mr.ns << endl; @@ -385,7 +440,7 @@ namespace mongo { MRTL * mrtl = new MRTL( state ); _tlmr.reset( mrtl ); - ProgressMeter pm( db.count( mr.ns , mr.filter ) ); + ProgressMeter & pm = op->setMessage( "m/r: (1/3) emit phase" , db.count( mr.ns , mr.filter ) ); auto_ptr<DBClientCursor> cursor = db.query( mr.ns , mr.q ); long long mapTime = 0; Timer mt; @@ -405,6 +460,7 @@ namespace mongo { Timer t; mrtl->checkSize(); inReduce += t.micros(); + killCurrentOp.checkForInterrupt(); dbtemprelease temprlease; } pm.hit(); @@ -412,9 +468,10 @@ namespace mongo { if ( mr.limit && num >= mr.limit ) break; } + pm.finished(); - countsBuilder.append( "input" , num ); - countsBuilder.append( "emit" , mrtl->numEmits ); + countsBuilder.appendNumber( "input" , num ); + countsBuilder.appendNumber( "emit" , mrtl->numEmits ); if ( mrtl->numEmits ) shouldHaveData = true; @@ -422,7 +479,7 @@ namespace mongo { timingBuilder.append( "emitLoop" , t.millis() ); // final reduce - + op->setMessage( "m/r: (2/3) final reduce in memory" ); mrtl->reduceInMemory(); mrtl->dump(); @@ -430,16 +487,22 @@ namespace mongo { db.ensureIndex( mr.incLong , sortKey ); BSONObj prev; - list<BSONObj> all; + BSONList all; - ProgressMeter fpm( db.count( mr.incLong ) ); + assert( userCreateNS( mr.tempLong.c_str() , BSONObj() , errmsg , mr.replicate ) ); + + pm = op->setMessage( "m/r: (3/3) final reduce to collection" , db.count( mr.incLong ) ); cursor = db.query( mr.incLong, Query().sort( sortKey ) ); while ( cursor->more() ){ BSONObj o = cursor->next().getOwned(); - + pm.hit(); + if ( o.woSortOrder( prev , sortKey ) == 0 ){ all.push_back( o ); + if ( pm.hits() % 1000 == 0 ){ + dbtemprelease tl; + } continue; } @@ -448,12 +511,11 @@ namespace mongo { all.clear(); prev = o; all.push_back( o ); - fpm.hit(); + killCurrentOp.checkForInterrupt(); dbtemprelease tl; } - state.finalReduce( all ); - + pm.finished(); _tlmr.reset( 0 ); } catch ( ... 
){ @@ -471,7 +533,7 @@ namespace mongo { result.append( "result" , mr.finalShort ); result.append( "timeMillis" , t.millis() ); - countsBuilder.append( "output" , finalCount ); + countsBuilder.appendNumber( "output" , finalCount ); if ( mr.verbose ) result.append( "timing" , timingBuilder.obj() ); result.append( "counts" , countsBuilder.obj() ); @@ -493,11 +555,12 @@ namespace mongo { public: MapReduceFinishCommand() : Command( "mapreduce.shardedfinish" ){} virtual bool slaveOk() { return true; } - + + virtual LockType locktype(){ return WRITE; } bool run(const char *ns, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool){ - dbtemprelease temprlease; // we don't touch the db directly - - string dbname = cc().database()->name; + string dbname = cc().database()->name; // this has to come before dbtemprelease + dbtemprelease temprelease; // we don't touch the db directly + string shardedOutputCollection = cmdObj["shardedOutputCollection"].valuestrsafe(); MRSetup mr( dbname , cmdObj.firstElement().embeddedObjectUserCheck() , false ); @@ -540,14 +603,14 @@ namespace mongo { if ( mr.finalizeCode.size() ) finalizeFunction = s->createFunction( mr.finalizeCode.c_str() ); - list<BSONObj> values; + BSONList values; result.append( "result" , mr.finalShort ); DBDirectClient db; while ( cursor.more() ){ - BSONObj t = cursor.next(); + BSONObj t = cursor.next().getOwned(); if ( values.size() == 0 ){ values.push_back( t ); diff --git a/db/namespace.cpp b/db/namespace.cpp index ecd5f64..210efb6 100644 --- a/db/namespace.cpp +++ b/db/namespace.cpp @@ -47,11 +47,43 @@ namespace mongo { } boost::filesystem::path NamespaceIndex::path() const { - return boost::filesystem::path( dir_ ) / ( database_ + ".ns" ); + boost::filesystem::path ret( dir_ ); + if ( directoryperdb ) + ret /= database_; + ret /= ( database_ + ".ns" ); + return ret; } + void NamespaceIndex::maybeMkdir() const { + if ( !directoryperdb ) + return; + boost::filesystem::path dir( dir_ ); + dir /= database_; + if ( !boost::filesystem::exists( dir ) ) + BOOST_CHECK_EXCEPTION( boost::filesystem::create_directory( dir ) ); + } + int lenForNewNsFiles = 16 * 1024 * 1024; + void NamespaceDetails::onLoad(const Namespace& k) { + if( k.isExtra() ) { + /* overflow storage for indexes - so don't treat as a NamespaceDetails object. */ + return; + } + + assertInWriteLock(); + if( backgroundIndexBuildInProgress ) { + log() << "backgroundIndexBuildInProgress was " << backgroundIndexBuildInProgress << " for " << k << ", indicating an abnormal db shutdown" << endl; + backgroundIndexBuildInProgress = 0; + } + } + + static void callback(const Namespace& k, NamespaceDetails& v) { + v.onLoad(k); + } + + bool checkNsFilesOnLoad = true; + void NamespaceIndex::init() { if ( ht ) return; @@ -82,6 +114,7 @@ namespace mongo { else { // use lenForNewNsFiles, we are making a new database massert( 10343 , "bad lenForNewNsFiles", lenForNewNsFiles >= 1024*1024 ); + maybeMkdir(); long l = lenForNewNsFiles; p = f.map(pathString.c_str(), l); if( p ) { @@ -95,6 +128,8 @@ namespace mongo { dbexit( EXIT_FS ); } ht = new HashTable<Namespace,NamespaceDetails>(p, len, "namespace index"); + if( checkNsFilesOnLoad ) + ht->iterAll(callback); } void NamespaceDetails::addDeletedRec(DeletedRecord *d, DiskLoc dloc) { @@ -446,9 +481,14 @@ namespace mongo { // signal done allocating new extents. 
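The NamespaceIndex::path() / maybeMkdir() change a little further up is where the new --directoryperdb layout is decided: with the option set, each database's files live in their own subdirectory under the dbpath. A small self-contained sketch of the same path computation (boost::filesystem, as in the source; nsFilePath is an illustrative name):

#include <boost/filesystem.hpp>
#include <iostream>
#include <string>

namespace fs = boost::filesystem;

// mirrors NamespaceIndex::path(); directoryPerDb plays the role of the
// global directoryperdb flag defined in pdfile.cpp
fs::path nsFilePath( const std::string& dir , const std::string& database , bool directoryPerDb ) {
    fs::path ret( dir );
    if ( directoryPerDb )
        ret /= database;          // extra per-database directory level
    ret /= database + ".ns";      // the namespace file itself
    return ret;
}

int main() {
    std::cout << nsFilePath( "/data/db" , "test" , false ).string() << std::endl; // /data/db/test.ns
    std::cout << nsFilePath( "/data/db" , "test" , true ).string() << std::endl;  // /data/db/test/test.ns
    return 0;
}

maybeMkdir() then only has to create the per-database directory before the .ns file is memory-mapped, which is why it is called just ahead of f.map() in init().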
if ( !deletedList[ 1 ].isValid() ) deletedList[ 1 ] = DiskLoc(); - + assert( len < 400000000 ); int passes = 0; + int maxPasses = ( len / 30 ) + 2; // 30 is about the smallest entry that could go in the oplog + if ( maxPasses < 5000 ){ + // this is for backwards safety since 5000 was the old value + maxPasses = 5000; + } DiskLoc loc; // delete records until we have room and the max # objects limit achieved. @@ -497,10 +537,10 @@ DiskLoc fr = theCapExtent()->firstRecord; theDataFileMgr.deleteRecord(ns, fr.rec(), fr, true); compact(); - if( ++passes >= 5000 ) { - log() << "passes ns:" << ns << " len:" << len << '\n'; + if( ++passes > maxPasses ) { + log() << "passes ns:" << ns << " len:" << len << " maxPasses: " << maxPasses << '\n'; log() << "passes max:" << max << " nrecords:" << nrecords << " datasize: " << datasize << endl; - massert( 10345 , "passes >= 5000 in capped collection alloc", false ); + massert( 10345 , "passes >= maxPasses in capped collection alloc", false ); } } @@ -512,7 +552,7 @@ } /* you MUST call when adding an index. see pdfile.cpp */ - IndexDetails& NamespaceDetails::addIndex(const char *thisns) { + IndexDetails& NamespaceDetails::addIndex(const char *thisns, bool resetTransient) { assert( nsdetails(thisns) == this ); if( nIndexes == NIndexesBase && extraOffset == 0 ) { @@ -521,7 +561,8 @@ IndexDetails& id = idx(nIndexes); nIndexes++; - NamespaceDetailsTransient::get_w(thisns).addedIndex(); + if ( resetTransient ) + NamespaceDetailsTransient::get_w(thisns).addedIndex(); return id; } @@ -543,31 +584,39 @@ for ( int i = 0; i < nIndexes; i++ ) { IndexDetails& idx = indexes[i]; BSONObj idxKey = idx.info.obj().getObjectField("key"); // e.g., { ts : -1 } - if ( !idxKey.findElement(fieldName).eoo() ) + if ( !idxKey.getField(fieldName).eoo() ) return i; }*/ return -1; } - long long NamespaceDetails::storageSize(){ + long long NamespaceDetails::storageSize( int * numExtents ){ Extent * e = firstExtent.ext(); assert( e ); long long total = 0; + int n = 0; while ( e ){ - total += e->length; - e = e->getNextExtent(); + total += e->length; + e = e->getNextExtent(); + n++; } + + if ( numExtents ) + *numExtents = n; + return total; } /* ------------------------------------------------------------------------- */ - boost::mutex NamespaceDetailsTransient::_qcMutex; + mongo::mutex NamespaceDetailsTransient::_qcMutex; + mongo::mutex NamespaceDetailsTransient::_isMutex; map< string, shared_ptr< NamespaceDetailsTransient > > NamespaceDetailsTransient::_map; typedef map< string, shared_ptr< NamespaceDetailsTransient > >::iterator ouriter; void NamespaceDetailsTransient::reset() { + DEV assertInWriteLock(); clearQueryCache(); _keysComputed = false; _indexSpecs.clear(); @@ -595,11 +644,13 @@ _keysComputed = true; _indexKeys.clear(); NamespaceDetails *d = nsdetails(_ns.c_str()); + if ( ! d ) + return; NamespaceDetails::IndexIterator i = d->ii(); while( i.more() ) i.next().keyPattern().getFieldNames(_indexKeys); } - + void NamespaceDetailsTransient::cllStart( int logSizeMb ) { assertInWriteLock(); _cll_ns = "local.temp.oplog." 
+ _ns; @@ -607,7 +658,7 @@ namespace mongo { stringstream spec; // 128MB spec << "{size:" << logSizeMb * 1024 * 1024 << ",capped:true,autoIndexId:false}"; - setClient( _cll_ns.c_str() ); + Client::Context ct( _cll_ns ); string err; massert( 10347 , "Could not create log ns", userCreateNS( _cll_ns.c_str(), fromjson( spec.str() ), err, false ) ); NamespaceDetails *d = nsdetails( _cll_ns.c_str() ); @@ -633,7 +684,7 @@ namespace mongo { assertInWriteLock(); if ( !_cll_enabled ) return; - setClient( _cll_ns.c_str() ); + Client::Context ctx( _cll_ns ); dropNS( _cll_ns ); } diff --git a/db/namespace.h b/db/namespace.h index df4c62f..1b1a954 100644 --- a/db/namespace.h +++ b/db/namespace.h @@ -21,7 +21,7 @@ #include "../stdafx.h" #include "jsobj.h" #include "queryutil.h" -#include "storage.h" +#include "diskloc.h" #include "../util/hashtab.h" #include "../util/mmap.h" @@ -75,6 +75,10 @@ namespace mongo { NamespaceString( const char * ns ) { init(ns); } NamespaceString( const string& ns ) { init(ns.c_str()); } + string ns() const { + return db + '.' + coll; + } + bool isSystem() { return strncmp(coll.c_str(), "system.", 7) == 0; } @@ -100,6 +104,10 @@ namespace mongo { massert( 10348 , "ns name too long", s.size() < MaxNsLen); return s; } + bool isExtra() const { + const char *p = strstr(buf, "$extra"); + return p && p[6] == 0; //==0 important in case an index uses name "$extra_1" for example + } void kill() { buf[0] = 0x7f; @@ -186,6 +194,9 @@ namespace mongo { BOOST_STATIC_ASSERT( NIndexesMax == NIndexesBase + NIndexesExtra ); + /* called when loaded from disk */ + void onLoad(const Namespace& k); + NamespaceDetails( const DiskLoc &loc, bool _capped ) { /* be sure to initialize new fields here -- doesn't default to zeroes the way we use it */ firstExtent = lastExtent = capExtent = loc; @@ -251,6 +262,13 @@ namespace mongo { int backgroundIndexBuildInProgress; // 1 if in prog char reserved[76]; + /* when a background index build is in progress, we don't count the index in nIndexes until + complete, yet need to still use it in _indexRecord() - thus we use this function for that. + */ + int nIndexesBeingBuilt() const { + return nIndexes + backgroundIndexBuildInProgress; + } + /* NOTE: be careful with flags. are we manipulating them in read locks? if so, this isn't thread safe. TODO */ @@ -264,6 +282,10 @@ namespace mongo { return _indexes[idxNo]; return extra()->details[idxNo-NIndexesBase]; } + IndexDetails& backgroundIdx() { + DEV assert(backgroundIndexBuildInProgress); + return idx(nIndexes); + } class IndexIterator { friend class NamespaceDetails; @@ -324,7 +346,7 @@ namespace mongo { /* add a new index. does not add to system.indexes etc. - just to NamespaceDetails. caller must populate returned object. 
*/ - IndexDetails& addIndex(const char *thisns); + IndexDetails& addIndex(const char *thisns, bool resetTransient=true); void aboutToDeleteAnIndex() { flags &= ~Flag_HaveIdIndex; @@ -410,7 +432,7 @@ namespace mongo { void checkMigrate(); - long long storageSize(); + long long storageSize( int * numExtents = 0 ); private: bool cappedMayDelete() const { @@ -450,7 +472,7 @@ namespace mongo { static std::map< string, shared_ptr< NamespaceDetailsTransient > > _map; public: NamespaceDetailsTransient(const char *ns) : _ns(ns), _keysComputed(false), _qcWriteCount(), _cll_enabled() { } - /* _get() is not threadsafe */ + /* _get() is not threadsafe -- see get_inlock() comments */ static NamespaceDetailsTransient& _get(const char *ns); /* use get_w() when doing write operations */ static NamespaceDetailsTransient& get_w(const char *ns) { @@ -484,12 +506,16 @@ namespace mongo { /* IndexSpec caching */ private: map<const IndexDetails*,IndexSpec> _indexSpecs; + static mongo::mutex _isMutex; public: const IndexSpec& getIndexSpec( const IndexDetails * details ){ - DEV assertInWriteLock(); IndexSpec& spec = _indexSpecs[details]; - if ( spec.meta.isEmpty() ){ - spec.reset( details->info ); + if ( ! spec._finishedInit ){ + scoped_lock lk(_isMutex); + if ( ! spec._finishedInit ){ + spec.reset( details ); + assert( spec._finishedInit ); + } } return spec; } @@ -499,7 +525,7 @@ namespace mongo { int _qcWriteCount; map< QueryPattern, pair< BSONObj, long long > > _qcCache; public: - static boost::mutex _qcMutex; + static mongo::mutex _qcMutex; /* you must be in the qcMutex when calling this (and using the returned val): */ static NamespaceDetailsTransient& get_inlock(const char *ns) { return _get(ns); @@ -555,9 +581,9 @@ namespace mongo { BOOST_STATIC_ASSERT( sizeof(NamespaceDetails::Extra) <= sizeof(NamespaceDetails) ); public: NamespaceIndex(const string &dir, const string &database) : - ht( 0 ), - dir_( dir ), - database_( database ) {} + ht( 0 ), + dir_( dir ), + database_( database ) {} /* returns true if new db will be created if we init lazily */ bool exists() const; @@ -637,6 +663,7 @@ namespace mongo { private: boost::filesystem::path path() const; + void maybeMkdir() const; MemoryMappedFile f; HashTable<Namespace,NamespaceDetails> *ht; @@ -644,7 +671,8 @@ namespace mongo { string database_; }; - extern string dbpath; // --dbpath parm + extern string dbpath; // --dbpath parm + extern bool directoryperdb; // Rename a namespace within current 'client' db. // (Arguments should include db name) diff --git a/db/nonce.cpp b/db/nonce.cpp index 4c677be..d8db58d 100644 --- a/db/nonce.cpp +++ b/db/nonce.cpp @@ -49,8 +49,8 @@ namespace mongo { } nonce Security::getNonce(){ - static boost::mutex m; - boostlock lk(m); + static mongo::mutex m; + scoped_lock lk(m); /* question/todo: /dev/random works on OS X. is it better to use that than random() / srandom()? diff --git a/db/pdfile.cpp b/db/pdfile.cpp index 18df5f1..1c4608c 100644 --- a/db/pdfile.cpp +++ b/db/pdfile.cpp @@ -30,6 +30,7 @@ _ disallow system* manipulations from the database. #include "../util/mmap.h" #include "../util/hashtab.h" #include "../util/file_allocator.h" +#include "../util/processinfo.h" #include "btree.h" #include <algorithm> #include <list> @@ -40,10 +41,63 @@ _ disallow system* manipulations from the database. 
#include "queryutil.h" #include "extsort.h" #include "curop.h" +#include "background.h" namespace mongo { + map<string, unsigned> BackgroundOperation::dbsInProg; + set<string> BackgroundOperation::nsInProg; + + bool BackgroundOperation::inProgForDb(const char *db) { + assertInWriteLock(); + return dbsInProg[db] != 0; + } + + bool BackgroundOperation::inProgForNs(const char *ns) { + assertInWriteLock(); + return nsInProg.count(ns) != 0; + } + + void BackgroundOperation::assertNoBgOpInProgForDb(const char *db) { + uassert(12586, "cannot perform operation: a background operation is currently running for this database", + !inProgForDb(db)); + } + + void BackgroundOperation::assertNoBgOpInProgForNs(const char *ns) { + uassert(12587, "cannot perform operation: a background operation is currently running for this collection", + !inProgForNs(ns)); + } + + BackgroundOperation::BackgroundOperation(const char *ns) : _ns(ns) { + assertInWriteLock(); + dbsInProg[_ns.db]++; + assert( nsInProg.count(_ns.ns()) == 0 ); + nsInProg.insert(_ns.ns()); + } + + BackgroundOperation::~BackgroundOperation() { + assertInWriteLock(); + dbsInProg[_ns.db]--; + nsInProg.erase(_ns.ns()); + } + + void BackgroundOperation::dump(stringstream& ss) { + if( nsInProg.size() ) { + ss << "\n<b>Background Jobs in Progress</b>\n"; + for( set<string>::iterator i = nsInProg.begin(); i != nsInProg.end(); i++ ) + ss << " " << *i << '\n'; + } + for( map<string,unsigned>::iterator i = dbsInProg.begin(); i != dbsInProg.end(); i++ ) { + if( i->second ) + ss << "database " << i->first << ": " << i->second << '\n'; + } + } + + /* ----------------------------------------- */ + string dbpath = "/data/db/"; + bool directoryperdb = false; + string repairpath; DataFileMgr theDataFileMgr; DatabaseHolder dbHolder; @@ -53,7 +107,8 @@ namespace mongo { extern int otherTraceLevel; void addNewNamespaceToCatalog(const char *ns, const BSONObj *options = 0); void ensureIdIndexForNewNs(const char *ns) { - if ( !strstr( ns, ".system." ) && !strstr( ns, ".$freelist" ) ) { + if ( ( strstr( ns, ".system." ) == 0 || legalClientSystemNS( ns , false ) ) && + strstr( ns, ".$freelist" ) == 0 ){ log( 1 ) << "adding _id index for new collection" << endl; ensureHaveIdIndex( ns ); } @@ -63,10 +118,13 @@ namespace mongo { stringstream ss; Client * c = currentClient.get(); if ( c ){ - Database *database = c->database(); - if ( database ) { - ss << database->name << ' '; - ss << cc().ns() << ' '; + Client::Context * cx = c->getContext(); + if ( cx ){ + Database *database = cx->db(); + if ( database ) { + ss << database->name << ' '; + ss << cx->ns() << ' '; + } } } return ss.str(); @@ -105,7 +163,7 @@ namespace mongo { addNewNamespaceToCatalog(ns, j.isEmpty() ? 0 : &j); long long size = initialExtentSize(128); - BSONElement e = j.findElement("size"); + BSONElement e = j.getField("size"); if ( e.isNumber() ) { size = (long long) e.number(); size += 256; @@ -116,10 +174,10 @@ namespace mongo { bool newCapped = false; int mx = 0; - e = j.findElement("capped"); + e = j.getField("capped"); if ( e.type() == Bool && e.boolean() ) { newCapped = true; - e = j.findElement("max"); + e = j.getField("max"); if ( e.isNumber() ) { mx = (int) e.number(); } @@ -127,7 +185,7 @@ namespace mongo { // $nExtents just for debug/testing. We create '$nExtents' extents, // each of size 'size'. 
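The block above supplies the storage and bookkeeping behind background.h: a per-database counter plus a per-namespace set, maintained under the write lock by the BackgroundOperation constructor/destructor pair, so that drops and repairs can refuse to run while an index build is in flight. A simplified RAII sketch of the same idea (std containers only; an exception stands in for the uassert/assert calls, and the write-lock asserts are omitted):

#include <map>
#include <set>
#include <stdexcept>
#include <string>

class BgOpRegistry {
    std::map<std::string, unsigned> _dbs;   // database name -> number of bg ops
    std::set<std::string> _namespaces;      // namespaces with a bg op running
public:
    bool inProgForDb( const std::string& db ) const {
        std::map<std::string, unsigned>::const_iterator i = _dbs.find( db );
        return i != _dbs.end() && i->second != 0;
    }
    bool inProgForNs( const std::string& ns ) const {
        return _namespaces.count( ns ) != 0;
    }
    void add( const std::string& db , const std::string& ns ) {
        if ( !_namespaces.insert( ns ).second )
            throw std::runtime_error( "background operation already in progress for " + ns );
        _dbs[db]++;
    }
    void remove( const std::string& db , const std::string& ns ) {
        _dbs[db]--;
        _namespaces.erase( ns );
    }
};

// RAII guard mirroring BackgroundOperation's ctor/dtor: registers on
// construction, unregisters on destruction, so an exception thrown by the
// background job still clears the entry
class ScopedBgOp {
    BgOpRegistry& _reg;
    std::string _db , _ns;
public:
    ScopedBgOp( BgOpRegistry& reg , const std::string& db , const std::string& ns )
        : _reg( reg ) , _db( db ) , _ns( ns ) { _reg.add( _db , _ns ); }
    ~ScopedBgOp() { _reg.remove( _db , _ns ); }
};

dropCollection(), dropDatabase() and repairDatabase() further down then only need to call the assertNoBgOpInProg* wrappers before touching anything.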
- e = j.findElement( "$nExtents" ); + e = j.getField( "$nExtents" ); int nExtents = int( e.number() ); Database *database = cc().database(); if ( nExtents > 0 ) { @@ -487,13 +545,11 @@ namespace mongo { /*---------------------------------------------------------------------*/ auto_ptr<Cursor> DataFileMgr::findAll(const char *ns, const DiskLoc &startLoc) { - DiskLoc loc; - bool found = nsindex(ns)->find(ns, loc); - if ( !found ) { - // out() << "info: findAll() namespace does not exist: " << ns << endl; + NamespaceDetails * d = nsdetails( ns ); + if ( ! d ) return auto_ptr<Cursor>(new BasicCursor(DiskLoc())); - } + DiskLoc loc = d->firstExtent; Extent *e = getExtent(loc); DEBUGGING { @@ -512,40 +568,42 @@ namespace mongo { } out() << endl; - nsdetails(ns)->dumpDeleted(&extents); + d->dumpDeleted(&extents); } - if ( !nsdetails( ns )->capped ) { - if ( !startLoc.isNull() ) - return auto_ptr<Cursor>(new BasicCursor( startLoc )); - while ( e->firstRecord.isNull() && !e->xnext.isNull() ) { - /* todo: if extent is empty, free it for reuse elsewhere. - that is a bit complicated have to clean up the freelists. - */ - RARELY out() << "info DFM::findAll(): extent " << loc.toString() << " was empty, skipping ahead " << ns << endl; - // find a nonempty extent - // it might be nice to free the whole extent here! but have to clean up free recs then. - e = e->getNextExtent(); - } - return auto_ptr<Cursor>(new BasicCursor( e->firstRecord )); - } else { - return auto_ptr< Cursor >( new ForwardCappedCursor( nsdetails( ns ), startLoc ) ); + if ( d->capped ) + return auto_ptr< Cursor >( new ForwardCappedCursor( d , startLoc ) ); + + if ( !startLoc.isNull() ) + return auto_ptr<Cursor>(new BasicCursor( startLoc )); + + while ( e->firstRecord.isNull() && !e->xnext.isNull() ) { + /* todo: if extent is empty, free it for reuse elsewhere. + that is a bit complicated have to clean up the freelists. + */ + RARELY out() << "info DFM::findAll(): extent " << loc.toString() << " was empty, skipping ahead " << ns << endl; + // find a nonempty extent + // it might be nice to free the whole extent here! but have to clean up free recs then. + e = e->getNextExtent(); } + return auto_ptr<Cursor>(new BasicCursor( e->firstRecord )); } /* get a table scan cursor, but can be forward or reverse direction. order.$natural - if set, > 0 means forward (asc), < 0 backward (desc). 
*/ auto_ptr<Cursor> findTableScan(const char *ns, const BSONObj& order, const DiskLoc &startLoc) { - BSONElement el = order.findElement("$natural"); // e.g., { $natural : -1 } + BSONElement el = order.getField("$natural"); // e.g., { $natural : -1 } if ( el.number() >= 0 ) return DataFileMgr::findAll(ns, startLoc); - + // "reverse natural order" NamespaceDetails *d = nsdetails(ns); + if ( !d ) return auto_ptr<Cursor>(new BasicCursor(DiskLoc())); + if ( !d->capped ) { if ( !startLoc.isNull() ) return auto_ptr<Cursor>(new ReverseCursor( startLoc )); @@ -583,6 +641,8 @@ namespace mongo { NamespaceDetails* d = nsdetails(nsToDrop.c_str()); uassert( 10086 , (string)"ns not found: " + nsToDrop , d ); + BackgroundOperation::assertNoBgOpInProgForNs(nsToDrop.c_str()); + NamespaceString s(nsToDrop); assert( s.db == cc().database()->name ); if( s.isSystem() ) { @@ -634,29 +694,33 @@ namespace mongo { log(1) << "dropCollection: " << name << endl; NamespaceDetails *d = nsdetails(name.c_str()); assert( d ); + + BackgroundOperation::assertNoBgOpInProgForNs(name.c_str()); + if ( d->nIndexes != 0 ) { try { - assert( deleteIndexes(d, name.c_str(), "*", errmsg, result, true) ); + assert( dropIndexes(d, name.c_str(), "*", errmsg, result, true) ); } catch( DBException& ) { - uasserted(12503,"drop: deleteIndexes for collection failed - consider trying repair"); + uasserted(12503,"drop: dropIndexes for collection failed - consider trying repair"); } assert( d->nIndexes == 0 ); } - log(1) << "\t deleteIndexes done" << endl; + log(1) << "\t dropIndexes done" << endl; result.append("ns", name.c_str()); ClientCursor::invalidate(name.c_str()); + Top::global.collectionDropped( name ); dropNS(name); } int nUnindexes = 0; - void _unindexRecord(IndexDetails& id, BSONObj& obj, const DiskLoc& dl, bool logMissing = true) { + /* unindex all keys in index for this record. */ + static void _unindexRecord(IndexDetails& id, BSONObj& obj, const DiskLoc& dl, bool logMissing = true) { BSONObjSetDefaultOrder keys; id.getKeysFromObject(obj, keys); for ( BSONObjSetDefaultOrder::iterator i=keys.begin(); i != keys.end(); i++ ) { BSONObj j = *i; - // out() << "UNINDEX: j:" << j.toString() << " head:" << id.head.toString() << dl.toString() << endl; if ( otherTraceLevel >= 5 ) { out() << "_unindexRecord() " << obj.toString(); out() << "\n unindex:" << j.toString() << endl; @@ -666,9 +730,9 @@ namespace mongo { try { ok = id.head.btree()->unindex(id.head, id, j, dl); } - catch (AssertionException&) { + catch (AssertionException& e) { problem() << "Assertion failure: _unindex failed " << id.indexNamespace() << endl; - out() << "Assertion failure: _unindex failed" << '\n'; + out() << "Assertion failure: _unindex failed: " << e.what() << '\n'; out() << " obj:" << obj.toString() << '\n'; out() << " key:" << j.toString() << '\n'; out() << " dl:" << dl.toString() << endl; @@ -682,12 +746,14 @@ namespace mongo { } /* unindex all keys in all indexes for this record. 
*/ - void unindexRecord(NamespaceDetails *d, Record *todelete, const DiskLoc& dl, bool noWarn = false) { - if ( d->nIndexes == 0 ) return; + static void unindexRecord(NamespaceDetails *d, Record *todelete, const DiskLoc& dl, bool noWarn = false) { BSONObj obj(todelete); - NamespaceDetails::IndexIterator i = d->ii(); - while( i.more() ) { - _unindexRecord(i.next(), obj, dl, !noWarn); + int n = d->nIndexes; + for ( int i = 0; i < n; i++ ) + _unindexRecord(d->idx(i), obj, dl, !noWarn); + if( d->backgroundIndexBuildInProgress ) { + // always pass nowarn here, as this one may be missing for valid reasons as we are concurrently building it + _unindexRecord(d->idx(n), obj, dl, false); } } @@ -763,19 +829,20 @@ namespace mongo { /** Note: if the object shrinks a lot, we don't free up space, we leave extra at end of the record. */ - const DiskLoc DataFileMgr::update(const char *ns, - Record *toupdate, const DiskLoc& dl, - const char *_buf, int _len, OpDebug& debug) + const DiskLoc DataFileMgr::updateRecord( + const char *ns, + NamespaceDetails *d, + NamespaceDetailsTransient *nsdt, + Record *toupdate, const DiskLoc& dl, + const char *_buf, int _len, OpDebug& debug) { StringBuilder& ss = debug.str; dassert( toupdate == dl.rec() ); - NamespaceDetails *d = nsdetails(ns); - BSONObj objOld(toupdate); BSONObj objNew(_buf); - assert( objNew.objsize() == _len ); - assert( objNew.objdata() == _buf ); + DEV assert( objNew.objsize() == _len ); + DEV assert( objNew.objdata() == _buf ); if( !objNew.hasElement("_id") && objOld.hasElement("_id") ) { /* add back the old _id value if the update removes it. Note this implementation is slow @@ -795,7 +862,7 @@ namespace mongo { */ vector<IndexChanges> changes; getIndexChanges(changes, *d, objNew, objOld); - dupCheck(changes, *d); + dupCheck(changes, *d, dl); if ( toupdate->netLength() < objNew.objsize() ) { // doesn't fit. reallocate ----------------------------------------------------- @@ -807,13 +874,14 @@ namespace mongo { return insert(ns, objNew.objdata(), objNew.objsize(), false); } - NamespaceDetailsTransient::get_w( ns ).notifyOfWriteOp(); + nsdt->notifyOfWriteOp(); d->paddingFits(); /* have any index keys changed? */ { unsigned keyUpdates = 0; - for ( int x = 0; x < d->nIndexes; x++ ) { + int z = d->nIndexesBeingBuilt(); + for ( int x = 0; x < z; x++ ) { IndexDetails& idx = d->idx(x); for ( unsigned i = 0; i < changes[x].removed.size(); i++ ) { try { @@ -859,10 +927,8 @@ namespace mongo { return sz; } - int deb=0; - - /* add keys to indexes for a new record */ - inline void _indexRecord(NamespaceDetails *d, int idxNo, BSONObj& obj, DiskLoc newRecordLoc, bool dupsAllowed) { + /* add keys to index idxNo for a new record */ + static inline void _indexRecord(NamespaceDetails *d, int idxNo, BSONObj& obj, DiskLoc recordLoc, bool dupsAllowed) { IndexDetails& idx = d->idx(idxNo); BSONObjSetDefaultOrder keys; idx.getKeysFromObject(obj, keys); @@ -872,12 +938,16 @@ namespace mongo { if( ++n == 2 ) { d->setIndexIsMultikey(idxNo); } - assert( !newRecordLoc.isNull() ); + assert( !recordLoc.isNull() ); try { - idx.head.btree()->bt_insert(idx.head, newRecordLoc, + idx.head.btree()->bt_insert(idx.head, recordLoc, *i, order, dupsAllowed, idx); } - catch (AssertionException& ) { + catch (AssertionException& e) { + if( e.code == 10287 && idxNo == d->nIndexes ) { + DEV log() << "info: caught key already in index on bg indexing (ok)" << endl; + continue; + } if( !dupsAllowed ) { // dup key exception, presumably. 
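+ // rethrow: the caller (indexRecord) unwinds any keys this record already got into other indexes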
throw; @@ -913,10 +983,10 @@ } // throws DBException - /* _ TODO dropDups - */ unsigned long long fastBuildIndex(const char *ns, NamespaceDetails *d, IndexDetails& idx, int idxNo) { - // testSorting(); + assert( d->backgroundIndexBuildInProgress == 0 ); + CurOp * op = cc().curop(); + Timer t; log() << "Buildindex " << ns << " idxNo:" << idxNo << ' ' << idx.info.obj().toString() << endl; @@ -926,13 +996,16 @@ BSONObj order = idx.keyPattern(); idx.head.Null(); + + if ( logLevel > 1 ) printMemInfo( "before index start" ); /* get and sort all the keys ----- */ unsigned long long n = 0; auto_ptr<Cursor> c = theDataFileMgr.findAll(ns); BSONObjExternalSorter sorter(order); + sorter.hintNumObjects( d->nrecords ); unsigned long long nkeys = 0; - ProgressMeter pm( d->nrecords , 10 ); + ProgressMeter & pm = op->setMessage( "index: (1/3) external sort" , d->nrecords , 10 ); while ( c->ok() ) { BSONObj o = c->current(); DiskLoc loc = c->currLoc(); @@ -947,12 +1020,20 @@ sorter.add(*i, loc); nkeys++; } - + c->advance(); n++; pm.hit(); + if ( logLevel > 1 && n % 10000 == 0 ){ + printMemInfo( "\t iterating objects" ); + } + }; + pm.finished(); + + if ( logLevel > 1 ) printMemInfo( "before final sort" ); sorter.sort(); + if ( logLevel > 1 ) printMemInfo( "after final sort" ); log(t.seconds() > 5 ? 0 : 1) << "\t external sort used : " << sorter.numFiles() << " files " << " in " << t.seconds() << " secs" << endl; @@ -963,21 +1044,23 @@ BtreeBuilder btBuilder(dupsAllowed, idx); BSONObj keyLast; auto_ptr<BSONObjExternalSorter::Iterator> i = sorter.iterator(); - ProgressMeter pm2( nkeys , 10 ); + pm = op->setMessage( "index: (2/3) btree bottom up" , nkeys , 10 ); while( i->more() ) { RARELY killCurrentOp.checkForInterrupt(); BSONObjExternalSorter::Data d = i->next(); - //cout<<"TEMP SORTER next " << d.first.toString() << endl; try { btBuilder.addKey(d.first, d.second); } - catch( AssertionException& ) { + catch( AssertionException& e ) { if ( dupsAllowed ){ // unknown exception?? throw; } + if( e.interrupted() ) + throw; + if ( ! dropDups ) throw; @@ -987,8 +1070,11 @@ dupsToDrop.push_back(d.second); uassert( 10092 , "too many dups on index build with dropDups=true", dupsToDrop.size() < 1000000 ); } - pm2.hit(); + pm.hit(); } + pm.finished(); + op->setMessage( "index: (3/3) btree-middle" ); + log(t.seconds() > 10 ? 
0 : 1 ) << "\t done building bottom layer, going to commit" << endl; btBuilder.commit(); wassert( btBuilder.getn() == nkeys || dropDups ); } @@ -1001,32 +1087,61 @@ namespace mongo { return n; } - static class BackgroundIndexBuildJobs { + class BackgroundIndexBuildJob : public BackgroundOperation { unsigned long long addExistingToIndex(const char *ns, NamespaceDetails *d, IndexDetails& idx, int idxNo) { bool dupsAllowed = !idx.unique(); bool dropDups = idx.dropDups(); + ProgressMeter& progress = cc().curop()->setMessage( "bg index build" , d->nrecords ); + unsigned long long n = 0; - auto_ptr<Cursor> c = theDataFileMgr.findAll(ns); - while ( c->ok() ) { - BSONObj js = c->current(); + auto_ptr<ClientCursor> cc; + { + auto_ptr<Cursor> c = theDataFileMgr.findAll(ns); + cc.reset( new ClientCursor(c, ns, false) ); + } + CursorId id = cc->cursorid; + + while ( cc->c->ok() ) { + BSONObj js = cc->c->current(); try { - _indexRecord(d, idxNo, js, c->currLoc(),dupsAllowed); - c->advance(); + _indexRecord(d, idxNo, js, cc->c->currLoc(), dupsAllowed); + cc->c->advance(); } catch( AssertionException& e ) { + if( e.interrupted() ) + throw; + if ( dropDups ) { - DiskLoc toDelete = c->currLoc(); - c->advance(); + DiskLoc toDelete = cc->c->currLoc(); + bool ok = cc->c->advance(); + cc->updateLocation(); theDataFileMgr.deleteRecord( ns, toDelete.rec(), toDelete, false, true ); + if( ClientCursor::find(id, false) == 0 ) { + cc.release(); + if( !ok ) { + /* we were already at the end. normal. */ + } + else { + uasserted(12585, "cursor gone during bg index; dropDups"); + } + break; + } } else { - _log() << endl; - log(2) << "addExistingToIndex exception " << e.what() << endl; + log() << "background addExistingToIndex exception " << e.what() << endl; throw; } } n++; - }; + progress.hit(); + + if ( n % 128 == 0 && !cc->yield() ) { + cc.release(); + uasserted(12584, "cursor gone during bg index"); + break; + } + } + progress.done(); return n; } @@ -1034,72 +1149,76 @@ namespace mongo { that way on a crash/restart, we don't think we are still building one. */ set<NamespaceDetails*> bgJobsInProgress; - void prep(NamespaceDetails *d) { + void prep(const char *ns, NamespaceDetails *d) { assertInWriteLock(); - assert( bgJobsInProgress.count(d) == 0 ); bgJobsInProgress.insert(d); d->backgroundIndexBuildInProgress = 1; + d->nIndexes--; } - - public: - /* Note you cannot even do a foreground index build if a background is in progress, - as bg build assumes it is the last index in the array! - */ - void checkInProg(NamespaceDetails *d) { + void done(const char *ns, NamespaceDetails *d) { + d->nIndexes++; + d->backgroundIndexBuildInProgress = 0; + NamespaceDetailsTransient::get_w(ns).addedIndex(); // clear query optimizer cache assertInWriteLock(); - uassert(12580, "already building an index for this namespace in background", bgJobsInProgress.count(d) == 0); } -/* todo: clean bg flag on loading of NamespaceDetails */ + public: + BackgroundIndexBuildJob(const char *ns) : BackgroundOperation(ns) { } unsigned long long go(string ns, NamespaceDetails *d, IndexDetails& idx, int idxNo) { - unsigned long long n; - prep(d); + unsigned long long n = 0; + + prep(ns.c_str(), d); + assert( idxNo == d->nIndexes ); try { idx.head = BtreeBucket::addBucket(idx); n = addExistingToIndex(ns.c_str(), d, idx, idxNo); } catch(...) 
{ - assertInWriteLock(); - bgJobsInProgress.erase(d); - d->backgroundIndexBuildInProgress = 0; + if( cc().database() && nsdetails(ns.c_str()) == d ) { + assert( idxNo == d->nIndexes ); + done(ns.c_str(), d); + } + else { + log() << "ERROR: db gone during bg index?" << endl; + } throw; } + assert( idxNo == d->nIndexes ); + done(ns.c_str(), d); return n; } - } backgroundIndex; + }; // throws DBException - static void buildAnIndex(string ns, NamespaceDetails *d, IndexDetails& idx, int idxNo) { - log() << "building new index on " << idx.keyPattern() << " for " << ns << "..." << endl; + static void buildAnIndex(string ns, NamespaceDetails *d, IndexDetails& idx, int idxNo, bool background) { + log() << "building new index on " << idx.keyPattern() << " for " << ns << endl; Timer t; unsigned long long n; - BSONObj info = idx.info.obj(); - bool background = info["background"].trueValue(); - if( background ) { - log() << "WARNING: background index build not yet implemented" << endl; + if( background ) { + log(2) << "buildAnIndex: background=true\n"; } + assert( !BackgroundOperation::inProgForNs(ns.c_str()) ); // should have been checked earlier, better not be... if( !background ) { n = fastBuildIndex(ns.c_str(), d, idx, idxNo); assert( !idx.head.isNull() ); } else { - n = backgroundIndex.go(ns, d, idx, idxNo); + BackgroundIndexBuildJob j(ns.c_str()); + n = j.go(ns, d, idx, idxNo); } log() << "done for " << n << " records " << t.millis() / 1000.0 << "secs" << endl; } /* add keys to indexes for a new record */ - void indexRecord(NamespaceDetails *d, const void *buf, int len, DiskLoc newRecordLoc) { - BSONObj obj((const char *)buf); - - /*UNIQUE*/ - for ( int i = 0; i < d->nIndexes; i++ ) { + static void indexRecord(NamespaceDetails *d, BSONObj obj, DiskLoc loc) { + int n = d->nIndexesBeingBuilt(); + for ( int i = 0; i < n; i++ ) { try { bool unique = d->idx(i).unique(); - _indexRecord(d, i, obj, newRecordLoc, /*dupsAllowed*/!unique); + _indexRecord(d, i, obj, loc, /*dupsAllowed*/!unique); } catch( DBException& ) { /* try to roll back previously added index entries @@ -1108,7 +1227,7 @@ namespace mongo { */ for( int j = 0; j <= i; j++ ) { try { - _unindexRecord(d->idx(j), obj, newRecordLoc, false); + _unindexRecord(d->idx(j), obj, loc, false); } catch(...) { log(3) << "unindex fails on rollback after unique failure\n"; @@ -1119,7 +1238,7 @@ namespace mongo { } } - extern BSONObj id_obj; // { _id : ObjectId("000000000000000000000000") } + extern BSONObj id_obj; // { _id : 1 } void ensureHaveIdIndex(const char *ns) { NamespaceDetails *d = nsdetails(ns); @@ -1179,12 +1298,31 @@ namespace mongo { bool prepareToBuildIndex(const BSONObj& io, bool god, string& sourceNS, NamespaceDetails *&sourceCollection); + // We are now doing two btree scans for all unique indexes (one here, and one when we've + // written the record to the collection. This could be made more efficient inserting + // dummy data here, keeping pointers to the btree nodes holding the dummy data and then + // updating the dummy data with the DiskLoc of the real record. 
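One way to realize the optimization described in the comment above is a two-phase insert against each unique index: claim the key with a dummy entry (which doubles as the only duplicate check needed), then patch in the real location once the record is written, or release the claim on failure. A hedged sketch of the idea with the index modeled as an ordered map (every name here is invented for illustration; nothing like reserve/commit exists in the btree code):

#include <map>
#include <stdexcept>
#include <string>

typedef long long RecLoc;                  // simplified stand-in for DiskLoc
static const RecLoc kReserved = -1;        // sentinel: key claimed, record not written yet

class UniqueIndexSketch {
    std::map<std::string, RecLoc> _entries;
public:
    // phase 1: claim the key before allocating the record; replaces the
    // second full btree probe with a single insert attempt
    void reserve( const std::string& key ) {
        if ( !_entries.insert( std::make_pair( key , kReserved ) ).second )
            throw std::runtime_error( "E11000 duplicate key error (sketch)" );
    }
    // phase 2a: record written successfully - patch in its real location
    void commit( const std::string& key , RecLoc loc ) { _entries[key] = loc; }
    // phase 2b: the insert failed elsewhere - release the claim
    void abort( const std::string& key ) { _entries.erase( key ); }
};

The shipped code takes the simpler route: checkNoIndexConflicts() just below probes every unique index once up front and accepts the extra scan.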
+ void checkNoIndexConflicts( NamespaceDetails *d, const BSONObj &obj ) { + for ( int idxNo = 0; idxNo < d->nIndexes; idxNo++ ) { + if( d->idx(idxNo).unique() ) { + IndexDetails& idx = d->idx(idxNo); + BSONObjSetDefaultOrder keys; + idx.getKeysFromObject(obj, keys); + BSONObj order = idx.keyPattern(); + for ( BSONObjSetDefaultOrder::iterator i=keys.begin(); i != keys.end(); i++ ) { + uassert( 12582, "duplicate key insert for unique index of capped collection", + idx.head.btree()->findSingle(idx, idx.head, *i ).isNull() ); + } + } + } + } + /* note: if god==true, you may pass in obuf of NULL and then populate the returned DiskLoc after the call -- that will prevent a double buffer copy in some cases (btree.cpp). */ DiskLoc DataFileMgr::insert(const char *ns, const void *obuf, int len, bool god, const BSONElement &writeId, bool mayAddIndex) { bool wouldAddIndex = false; - uassert( 10093 , "cannot insert into reserved $ collection", god || strchr(ns, '$') == 0 ); + massert( 10093 , "cannot insert into reserved $ collection", god || strchr(ns, '$') == 0 ); uassert( 10094 , "invalid ns", strchr( ns , '.' ) > 0 ); const char *sys = strstr(ns, "system."); if ( sys ) { @@ -1212,8 +1350,8 @@ namespace mongo { /* todo: shouldn't be in the namespace catalog until after the allocations here work. also if this is an addIndex, those checks should happen before this! */ - // This creates first file in the database. - cc().database()->newestFile()->createExtent(ns, initialExtentSize(len)); + // This may create first file in the database. + cc().database()->allocExtent(ns, initialExtentSize(len), false); d = nsdetails(ns); if ( !god ) ensureIdIndexForNewNs(ns); @@ -1225,10 +1363,8 @@ namespace mongo { string tabletoidxns; if ( addIndex ) { BSONObj io((const char *) obuf); - backgroundIndex.checkInProg(d); - if( !prepareToBuildIndex(io, god, tabletoidxns, tableToIndex) ) { + if( !prepareToBuildIndex(io, god, tabletoidxns, tableToIndex) ) return DiskLoc(); - } } const BSONElement *newId = &writeId; @@ -1262,6 +1398,13 @@ namespace mongo { d->paddingFactor = 1.0; lenWHdr = len + Record::HeaderSize; } + + // If the collection is capped, check if the new object will violate a unique index + // constraint before allocating space. 
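+ // ('god' writes skip the probe; for a normal insert into a capped
+ // collection we must check up front, because the record cannot simply be
+ // deleted again afterwards to undo a duplicate - see massert 12583 below)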
+ if ( d->nIndexes && d->capped && !god ) { + checkNoIndexConflicts( d, BSONObj( reinterpret_cast<const char *>( obuf ) ) ); + } + DiskLoc loc = d->alloc(ns, lenWHdr, extentLoc); if ( loc.isNull() ) { // out of space @@ -1321,27 +1464,35 @@ namespace mongo { NamespaceDetailsTransient::get_w( ns ).notifyOfWriteOp(); if ( tableToIndex ) { + BSONObj info = loc.obj(); + bool background = info["background"].trueValue(); + int idxNo = tableToIndex->nIndexes; - IndexDetails& idx = tableToIndex->addIndex(tabletoidxns.c_str()); // clear transient info caches so they refresh; increments nIndexes + IndexDetails& idx = tableToIndex->addIndex(tabletoidxns.c_str(), !background); // clear transient info caches so they refresh; increments nIndexes idx.info = loc; try { - buildAnIndex(tabletoidxns, tableToIndex, idx, idxNo); + buildAnIndex(tabletoidxns, tableToIndex, idx, idxNo, background); } catch( DBException& ) { - // save our error msg string as an exception on deleteIndexes will overwrite our message + // save our error msg string as an exception or dropIndexes will overwrite our message LastError *le = lastError.get(); - assert( le ); - string saveerrmsg = le->msg; - assert( !saveerrmsg.empty() ); + int savecode = 0; + string saveerrmsg; + if ( le ) { + savecode = le->code; + saveerrmsg = le->msg; + } // roll back this index string name = idx.indexName(); BSONObjBuilder b; string errmsg; - bool ok = deleteIndexes(tableToIndex, tabletoidxns.c_str(), name.c_str(), errmsg, b, true); + bool ok = dropIndexes(tableToIndex, tabletoidxns.c_str(), name.c_str(), errmsg, b, true); if( !ok ) { log() << "failed to drop index after a unique key error building it: " << errmsg << ' ' << tabletoidxns << ' ' << name << endl; } - raiseError(12506,saveerrmsg.c_str()); + + assert( le && !saveerrmsg.empty() ); + raiseError(savecode,saveerrmsg.c_str()); throw; } } @@ -1349,11 +1500,13 @@ namespace mongo { /* add this record to our indexes */ if ( d->nIndexes ) { try { - indexRecord(d, r->data/*buf*/, len, loc); + BSONObj obj(r->data); + indexRecord(d, obj, loc); } catch( AssertionException& e ) { // should be a dup key error on _id index - if( tableToIndex || d->capped ) { + if( tableToIndex || d->capped ) { + massert( 12583, "unexpected index insertion failure on capped collection", !d->capped ); string s = e.toString(); s += " : on addIndex/capped - collection and its index will not match"; uassert_nothrow(s.c_str()); @@ -1406,19 +1559,6 @@ namespace mongo { return r; } - void DataFileMgr::init(const string& path ) { - /* boost::filesystem::path path( dir ); - path /= "temp.dat"; - string pathString = path.string(); - temp.open(pathString.c_str(), 64 * 1024 * 1024); - */ - } - - void pdfileInit() { - // namespaceIndex.init(dbpath); - theDataFileMgr.init(dbpath); - } - } // namespace mongo #include "clientcursor.h" @@ -1427,63 +1567,75 @@ namespace mongo { void dropDatabase(const char *ns) { // ns is of the form "<dbname>.$cmd" - char cl[256]; - nsToDatabase(ns, cl); - log(1) << "dropDatabase " << cl << endl; - assert( cc().database()->name == cl ); + char db[256]; + nsToDatabase(ns, db); + log(1) << "dropDatabase " << db << endl; + assert( cc().database()->name == db ); + + BackgroundOperation::assertNoBgOpInProgForDb(db); - closeDatabase( cl ); - _deleteDataFiles(cl); + closeDatabase( db ); + _deleteDataFiles(db); } typedef boost::filesystem::path Path; // back up original database files to 'temp' dir void _renameForBackup( const char *database, const Path &reservedPath ) { + Path newPath( reservedPath ); + if ( 
directoryperdb ) + newPath /= database; class Renamer : public FileOp { public: - Renamer( const Path &reservedPath ) : reservedPath_( reservedPath ) {} + Renamer( const Path &newPath ) : newPath_( newPath ) {} private: - const boost::filesystem::path &reservedPath_; + const boost::filesystem::path &newPath_; virtual bool apply( const Path &p ) { if ( !boost::filesystem::exists( p ) ) return false; - boost::filesystem::rename( p, reservedPath_ / ( p.leaf() + ".bak" ) ); + boost::filesystem::rename( p, newPath_ / ( p.leaf() + ".bak" ) ); return true; } virtual const char * op() const { return "renaming"; } - } renamer( reservedPath ); + } renamer( newPath ); _applyOpToDataFiles( database, renamer, true ); } // move temp files to standard data dir void _replaceWithRecovered( const char *database, const char *reservedPathString ) { - class : public FileOp { + Path newPath( dbpath ); + if ( directoryperdb ) + newPath /= database; + class Replacer : public FileOp { + public: + Replacer( const Path &newPath ) : newPath_( newPath ) {} + private: + const boost::filesystem::path &newPath_; virtual bool apply( const Path &p ) { if ( !boost::filesystem::exists( p ) ) return false; - boost::filesystem::rename( p, boost::filesystem::path(dbpath) / p.leaf() ); + boost::filesystem::rename( p, newPath_ / p.leaf() ); return true; } virtual const char * op() const { return "renaming"; } - } renamer; - _applyOpToDataFiles( database, renamer, true, reservedPathString ); + } replacer( newPath ); + _applyOpToDataFiles( database, replacer, true, reservedPathString ); } // generate a directory name for storing temp data files Path uniqueReservedPath( const char *prefix ) { - Path dbPath = Path( dbpath ); + Path repairPath = Path( repairpath ); Path reservedPath; int i = 0; bool exists = false; do { stringstream ss; ss << prefix << "_repairDatabase_" << i++; - reservedPath = dbPath / ss.str(); + reservedPath = repairPath / ss.str(); BOOST_CHECK_EXCEPTION( exists = boost::filesystem::exists( reservedPath ) ); } while ( exists ); return reservedPath; @@ -1540,6 +1692,8 @@ namespace mongo { problem() << "repairDatabase " << dbName << endl; assert( cc().database()->name == dbName ); + BackgroundOperation::assertNoBgOpInProgForDb(dbName); + boost::intmax_t totalSize = dbSize( dbName ); boost::intmax_t freeSize = freeSpace(); if ( freeSize > -1 && freeSize < totalSize ) { @@ -1553,14 +1707,19 @@ namespace mongo { Path reservedPath = uniqueReservedPath( ( preserveClonedFilesOnFailure || backupOriginalFiles ) ? 
- "backup" : "tmp" ); + "backup" : "$tmp" ); BOOST_CHECK_EXCEPTION( boost::filesystem::create_directory( reservedPath ) ); string reservedPathString = reservedPath.native_directory_string(); - assert( setClient( dbName, reservedPathString.c_str() ) ); - - bool res = cloneFrom(localhost.c_str(), errmsg, dbName, - /*logForReplication=*/false, /*slaveok*/false, /*replauth*/false, /*snapshot*/false); - closeDatabase( dbName, reservedPathString.c_str() ); + + bool res; + { // clone to temp location, which effectively does repair + Client::Context ctx( dbName, reservedPathString ); + assert( ctx.justCreated() ); + + res = cloneFrom(localhost.c_str(), errmsg, dbName, + /*logForReplication=*/false, /*slaveok*/false, /*replauth*/false, /*snapshot*/false); + closeDatabase( dbName, reservedPathString.c_str() ); + } if ( !res ) { problem() << "clone failed for " << dbName << " with error: " << errmsg << endl; @@ -1569,13 +1728,15 @@ namespace mongo { return false; } - assert( !setClient( dbName ) ); + Client::Context ctx( dbName ); closeDatabase( dbName ); - if ( backupOriginalFiles ) + if ( backupOriginalFiles ) { _renameForBackup( dbName, reservedPath ); - else + } else { _deleteDataFiles( dbName ); + BOOST_CHECK_EXCEPTION( boost::filesystem::create_directory( Path( dbpath ) / dbName ) ); + } _replaceWithRecovered( dbName, reservedPathString.c_str() ); @@ -1591,6 +1752,8 @@ namespace mongo { string c = database; c += '.'; boost::filesystem::path p(path); + if ( directoryperdb ) + p /= database; boost::filesystem::path q; q = p / (c+"ns"); bool ok = false; @@ -1619,8 +1782,8 @@ namespace mongo { NamespaceDetails* nsdetails_notinline(const char *ns) { return nsdetails(ns); } - bool DatabaseHolder::closeAll( const string& path , BSONObjBuilder& result ){ - log(2) << "DatabaseHolder::closeAll path:" << path << endl; + bool DatabaseHolder::closeAll( const string& path , BSONObjBuilder& result , bool force ){ + log() << "DatabaseHolder::closeAll path:" << path << endl; dbMutex.assertWriteLocked(); map<string,Database*>& m = _paths[path]; @@ -1633,14 +1796,23 @@ namespace mongo { BSONObjBuilder bb( result.subarrayStart( "dbs" ) ); int n = 0; + int nNotClosed = 0; for( set< string >::iterator i = dbs.begin(); i != dbs.end(); ++i ) { string name = *i; log(2) << "DatabaseHolder::closeAll path:" << path << " name:" << name << endl; - setClient( name.c_str() , path ); - closeDatabase( name.c_str() , path ); - bb.append( bb.numStr( n++ ).c_str() , name ); + Client::Context ctx( name , path ); + if( !force && BackgroundOperation::inProgForDb(name.c_str()) ) { + log() << "WARNING: can't close database " << name << " because a bg job is in progress - try killOp command" << endl; + nNotClosed++; + } + else { + closeDatabase( name.c_str() , path ); + bb.append( bb.numStr( n++ ).c_str() , name ); + } } bb.done(); + if( nNotClosed ) + result.append("nNotClosed", nNotClosed); return true; } diff --git a/db/pdfile.h b/db/pdfile.h index 19a8322..85dc191 100644 --- a/db/pdfile.h +++ b/db/pdfile.h @@ -27,7 +27,7 @@ #include "../stdafx.h" #include "../util/mmap.h" -#include "storage.h" +#include "diskloc.h" #include "jsobjmanipulator.h" #include "namespace.h" #include "client.h" @@ -98,8 +98,10 @@ namespace mongo { static Extent* allocFromFreeList(const char *ns, int approxSize, bool capped = false); /** @return DiskLoc where item ends up */ - const DiskLoc update( + const DiskLoc updateRecord( const char *ns, + NamespaceDetails *d, + NamespaceDetailsTransient *nsdt, Record *toupdate, const DiskLoc& dl, const char *buf, int 
len, OpDebug& debug); // The object o may be updated if modified on insert. @@ -392,6 +394,10 @@ namespace mongo { void _applyOpToDataFiles( const char *database, FileOp &fo, bool afterAllocator = false, const string& path = dbpath ); inline void _deleteDataFiles(const char *database) { + if ( directoryperdb ) { + BOOST_CHECK_EXCEPTION( boost::filesystem::remove_all( boost::filesystem::path( dbpath ) / database ) ); + return; + } class : public FileOp { virtual bool apply( const boost::filesystem::path &p ) { return boost::filesystem::remove( p ); @@ -443,6 +449,6 @@ namespace mongo { void ensureHaveIdIndex(const char *ns); - bool deleteIndexes( NamespaceDetails *d, const char *ns, const char *name, string &errmsg, BSONObjBuilder &anObjBuilder, bool maydeleteIdIndex ); + bool dropIndexes( NamespaceDetails *d, const char *ns, const char *name, string &errmsg, BSONObjBuilder &anObjBuilder, bool maydeleteIdIndex ); } // namespace mongo diff --git a/db/query.cpp b/db/query.cpp index 9c82609..761a312 100644 --- a/db/query.cpp +++ b/db/query.cpp @@ -55,11 +55,11 @@ namespace mongo { justOne_( justOne ), count_(), bestCount_( bestCount ), - nScanned_() { + _nscanned() { } virtual void init() { c_ = qp().newCursor(); - matcher_.reset( new CoveredIndexMatcher( qp().query(), qp().indexKey() ) ); + _matcher.reset( new CoveredIndexMatcher( qp().query(), qp().indexKey() ) ); } virtual void next() { if ( !c_->ok() ) { @@ -69,20 +69,20 @@ namespace mongo { DiskLoc rloc = c_->currLoc(); - if ( matcher_->matches(c_->currKey(), rloc ) ) { + if ( _matcher->matches(c_->currKey(), rloc ) ) { if ( !c_->getsetdup(rloc) ) ++count_; } c_->advance(); - ++nScanned_; + ++_nscanned; if ( count_ > bestCount_ ) bestCount_ = count_; if ( count_ > 0 ) { if ( justOne_ ) setComplete(); - else if ( nScanned_ >= 100 && count_ == bestCount_ ) + else if ( _nscanned >= 100 && count_ == bestCount_ ) setComplete(); } } @@ -95,16 +95,17 @@ namespace mongo { bool justOne_; int count_; int &bestCount_; - long long nScanned_; + long long _nscanned; auto_ptr< Cursor > c_; - auto_ptr< CoveredIndexMatcher > matcher_; + auto_ptr< CoveredIndexMatcher > _matcher; }; /* ns: namespace, e.g. <database>.<collection> pattern: the "where" clause / criteria justOne: stop after 1 match + god: allow access to system namespaces, and don't yield */ - int deleteObjects(const char *ns, BSONObj pattern, bool justOne, bool logop, bool god) { + long long deleteObjects(const char *ns, BSONObj pattern, bool justOne, bool logop, bool god) { if( !god ) { if ( strstr(ns, ".system.") ) { /* note a delete from system.indexes would corrupt the db @@ -124,7 +125,7 @@ namespace mongo { return 0; uassert( 10101 , "can't remove from a capped collection" , ! d->capped ); - int nDeleted = 0; + long long nDeleted = 0; QueryPlanSet s( ns, pattern, BSONObj() ); int best = 0; DeleteOp original( justOne, best ); @@ -136,18 +137,14 @@ namespace mongo { CoveredIndexMatcher matcher(pattern, creal->indexKeyPattern()); - auto_ptr<ClientCursor> cc; - cc.reset( new ClientCursor() ); - cc->c = creal; - cc->ns = ns; - cc->noTimeout(); + auto_ptr<ClientCursor> cc( new ClientCursor(creal, ns, false) ); cc->setDoingDeletes( true ); CursorId id = cc->cursorid; unsigned long long nScanned = 0; do { - if ( ++nScanned % 128 == 0 && !matcher.docMatcher().atomic() ) { + if ( ++nScanned % 128 == 0 && !god && !matcher.docMatcher().atomic() ) { if ( ! 
cc->yield() ){ cc.release(); // has already been deleted elsewhere break; @@ -233,32 +230,9 @@ namespace mongo { log( k == n ) << "killcursors: found " << k << " of " << n << '\n'; } - BSONObj id_obj = fromjson("{\"_id\":ObjectId( \"000000000000000000000000\" )}"); + BSONObj id_obj = fromjson("{\"_id\":1}"); BSONObj empty_obj = fromjson("{}"); - /* This is for languages whose "objects" are not well ordered (JSON is well ordered). - [ { a : ... } , { b : ... } ] -> { a : ..., b : ... } - */ - inline BSONObj transformOrderFromArrayFormat(BSONObj order) { - /* note: this is slow, but that is ok as order will have very few pieces */ - BSONObjBuilder b; - char p[2] = "0"; - - while ( 1 ) { - BSONObj j = order.getObjectField(p); - if ( j.isEmpty() ) - break; - BSONElement e = j.firstElement(); - uassert( 10102 , "bad order array", !e.eoo()); - uassert( 10103 , "bad order array [2]", e.isNumber()); - b.append(e); - (*p)++; - uassert( 10104 , "too many ordering elements", *p <= '9'); - } - - return b.obj(); - } - //int dump = 0; @@ -328,7 +302,7 @@ namespace mongo { } else { BSONObj js = c->current(); - fillQueryResultFromObj(b, cc->filter.get(), js); + fillQueryResultFromObj(b, cc->fields.get(), js); n++; if ( (ntoreturn>0 && (n >= ntoreturn || b.len() > MaxBytesToReturnToClientAtOnce)) || (ntoreturn==0 && b.len()>1*1024*1024) ) { @@ -365,8 +339,8 @@ namespace mongo { virtual void init() { query_ = spec_.getObjectField( "query" ); c_ = qp().newCursor(); - matcher_.reset( new CoveredIndexMatcher( query_, c_->indexKeyPattern() ) ); - if ( qp().exactKeyMatch() && ! matcher_->needRecord() ) { + _matcher.reset( new CoveredIndexMatcher( query_, c_->indexKeyPattern() ) ); + if ( qp().exactKeyMatch() && ! _matcher->needRecord() ) { query_ = qp().simplifiedQuery( qp().indexKey() ); bc_ = dynamic_cast< BtreeCursor* >( c_.get() ); bc_->forgetEndKey(); @@ -398,7 +372,7 @@ namespace mongo { _gotOne(); } } else { - if ( !matcher_->matches(c_->currKey(), c_->currLoc() ) ) { + if ( !_matcher->matches(c_->currKey(), c_->currLoc() ) ) { } else if( !c_->getsetdup(c_->currLoc()) ) { _gotOne(); @@ -434,7 +408,7 @@ namespace mongo { auto_ptr< Cursor > c_; BSONObj query_; BtreeCursor *bc_; - auto_ptr< CoveredIndexMatcher > matcher_; + auto_ptr< CoveredIndexMatcher > _matcher; BSONObj firstMatch_; }; @@ -479,438 +453,389 @@ namespace mongo { // Implements database 'query' requests using the query optimizer's QueryOp interface class UserQueryOp : public QueryOp { public: - UserQueryOp( int ntoskip, int ntoreturn, const BSONObj &order, bool wantMore, - bool explain, FieldMatcher *filter, int queryOptions ) : - b_( 32768 ), - ntoskip_( ntoskip ), - ntoreturn_( ntoreturn ), - order_( order ), - wantMore_( wantMore ), - explain_( explain ), - filter_( filter ), - ordering_(), - nscanned_(), - queryOptions_( queryOptions ), - n_(), - soSize_(), - saveClientCursor_(), - findingStart_( (queryOptions & QueryOption_OplogReplay) != 0 ), - findingStartCursor_() - { - uassert( 10105 , "bad skip value in query", ntoskip >= 0); - } - + + UserQueryOp( const ParsedQuery& pq ) : + //int ntoskip, int ntoreturn, const BSONObj &order, bool wantMore, + // bool explain, FieldMatcher *filter, int queryOptions ) : + _buf( 32768 ) , // TODO be smarter here + _pq( pq ) , + _ntoskip( pq.getSkip() ) , + _nscanned(0), _nscannedObjects(0), + _n(0), + _inMemSort(false), + _saveClientCursor(false), + _oplogReplay( pq.hasOption( QueryOption_OplogReplay) ) + {} + virtual void init() { - b_.skip( sizeof( QueryResult ) ); + _buf.skip( sizeof( QueryResult ) 
); - // findingStart mode is used to find the first operation of interest when - // we are scanning through a repl log. For efficiency in the common case, - // where the first operation of interest is closer to the tail than the head, - // we start from the tail of the log and work backwards until we find the - // first operation of interest. Then we scan forward from that first operation, - // actually returning results to the client. During the findingStart phase, - // we release the db mutex occasionally to avoid blocking the db process for - // an extended period of time. - if ( findingStart_ ) { - // Use a ClientCursor here so we can release db mutex while scanning - // oplog (can take quite a while with large oplogs). - findingStartCursor_ = new ClientCursor(); - findingStartCursor_->noTimeout(); - findingStartCursor_->c = qp().newReverseCursor(); - findingStartCursor_->ns = qp().ns(); + if ( _oplogReplay ) { + _findingStartCursor.reset( new FindingStartCursor( qp() ) ); } else { - c_ = qp().newCursor(); + _c = qp().newCursor( DiskLoc() , _pq.getNumToReturn() + _pq.getSkip() ); } - - matcher_.reset(new CoveredIndexMatcher(qp().query(), qp().indexKey())); - + _matcher.reset(new CoveredIndexMatcher( qp().query() , qp().indexKey())); + if ( qp().scanAndOrderRequired() ) { - ordering_ = true; - so_.reset( new ScanAndOrder( ntoskip_, ntoreturn_, order_ ) ); - wantMore_ = false; + _inMemSort = true; + _so.reset( new ScanAndOrder( _pq.getSkip() , _pq.getNumToReturn() , _pq.getOrder() ) ); } } + virtual void next() { - if ( findingStart_ ) { - if ( !findingStartCursor_ || !findingStartCursor_->c->ok() ) { - findingStart_ = false; - c_ = qp().newCursor(); - } else if ( !matcher_->matches( findingStartCursor_->c->currKey(), findingStartCursor_->c->currLoc() ) ) { - findingStart_ = false; - c_ = qp().newCursor( findingStartCursor_->c->currLoc() ); + if ( _findingStartCursor.get() ) { + if ( _findingStartCursor->done() ) { + _c = _findingStartCursor->cRelease(); + _findingStartCursor.reset( 0 ); } else { - findingStartCursor_->c->advance(); - RARELY { - CursorId id = findingStartCursor_->cursorid; - findingStartCursor_->updateLocation(); - { - dbtemprelease t; - } - findingStartCursor_ = ClientCursor::find( id, false ); - } - return; + _findingStartCursor->next(); } + return; } - if ( findingStartCursor_ ) { - ClientCursor::erase( findingStartCursor_->cursorid ); - findingStartCursor_ = 0; - } - - if ( !c_->ok() ) { + if ( !_c->ok() ) { finish(); return; } - bool mayCreateCursor1 = wantMore_ && ntoreturn_ != 1 && useCursors; + bool mayCreateCursor1 = _pq.wantMore() && ! _inMemSort && _pq.getNumToReturn() != 1 && useCursors; if( 0 ) { - BSONObj js = c_->current(); - cout << "SCANNING " << js << endl; + cout << "SCANNING this: " << this << " key: " << _c->currKey() << " obj: " << _c->current() << endl; } - nscanned_++; - if ( !matcher_->matches(c_->currKey(), c_->currLoc() ) ) { - ; + _nscanned++; + if ( !_matcher->matches(_c->currKey(), _c->currLoc() , &_details ) ) { + // not a match, continue onward + if ( _details.loadedObject ) + _nscannedObjects++; } else { - DiskLoc cl = c_->currLoc(); - if( !c_->getsetdup(cl) ) { - BSONObj js = c_->current(); + _nscannedObjects++; + DiskLoc cl = _c->currLoc(); + if( !_c->getsetdup(cl) ) { // got a match. + + BSONObj js = _pq.returnKey() ? _c->currKey() : _c->current(); assert( js.objsize() >= 0 ); //defensive for segfaults - if ( ordering_ ) { + + if ( _inMemSort ) { // note: no cursors for non-indexed, ordered results. results must be fairly small. 
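+                        // A minimal illustrative sketch (not part of this patch) of how the
+                        // ScanAndOrder members used here fit together: add() buffers each
+                        // match in memory, fill() later writes them out in the requested
+                        // order, which is why no cursor is handed back for this path.
+                        //
+                        //   ScanAndOrder so( skip, limit, order );
+                        //   for( ; c->ok(); c->advance() )
+                        //       if ( matcher.matches( c->currKey(), c->currLoc() ) )
+                        //           so.add( c->current() );
+                        //   so.fill( buf, fields, nReturned );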
- so_->add(js); + _so->add(js); } - else if ( ntoskip_ > 0 ) { - ntoskip_--; - } else { - if ( explain_ ) { - n_++; - if ( n_ >= ntoreturn_ && !wantMore_ ) { + else if ( _ntoskip > 0 ) { + _ntoskip--; + } + else { + if ( _pq.isExplain() ) { + _n++; + if ( _n >= _pq.getNumToReturn() && !_pq.wantMore() ) { // .limit() was used, show just that much. finish(); return; } } else { - fillQueryResultFromObj(b_, filter_, js); - n_++; - if ( (ntoreturn_>0 && (n_ >= ntoreturn_ || b_.len() > MaxBytesToReturnToClientAtOnce)) || - (ntoreturn_==0 && (b_.len()>1*1024*1024 || n_>=101)) ) { - /* if ntoreturn is zero, we return up to 101 objects. on the subsequent getmore, there - is only a size limit. The idea is that on a find() where one doesn't use much results, - we don't return much, but once getmore kicks in, we start pushing significant quantities. - - The n limit (vs. size) is important when someone fetches only one small field from big - objects, which causes massive scanning server-side. - */ + if ( _pq.returnKey() ){ + BSONObjBuilder bb( _buf ); + bb.appendKeys( _c->indexKeyPattern() , js ); + bb.done(); + } + else { + fillQueryResultFromObj( _buf , _pq.getFields() , js ); + } + _n++; + if ( ! _c->supportGetMore() ){ + if ( _pq.enough( _n ) || _buf.len() >= MaxBytesToReturnToClientAtOnce ){ + finish(); + return; + } + } + else if ( _pq.enoughForFirstBatch( _n , _buf.len() ) ){ /* if only 1 requested, no cursor saved for efficiency...we assume it is findOne() */ if ( mayCreateCursor1 ) { - c_->advance(); - if ( c_->ok() ) { + _c->advance(); + if ( _c->ok() ) { // more...so save a cursor - saveClientCursor_ = true; + _saveClientCursor = true; } } finish(); return; - } + } } } } } - c_->advance(); + _c->advance(); } + void finish() { - if ( explain_ ) { - n_ = ordering_ ? so_->size() : n_; - } else if ( ordering_ ) { - so_->fill(b_, filter_, n_); - } - if ( mayCreateCursor2() ) { - c_->setTailable(); + if ( _pq.isExplain() ) { + _n = _inMemSort ? _so->size() : _n; + } + else if ( _inMemSort ) { + _so->fill( _buf, _pq.getFields() , _n ); } + + if ( _pq.hasOption( QueryOption_CursorTailable ) && _pq.getNumToReturn() != 1 ) + _c->setTailable(); + // If the tailing request succeeded. 
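+            // i.e. the ClientCursor is saved even when there are no results yet,
+            // so a later getMore can return documents appended to the capped
+            // collection after this batch was built (runQuery below restricts
+            // tailable cursors to capped collections with $natural order via
+            // uasserts 13051 and 13052).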
- if ( c_->tailable() ) { - saveClientCursor_ = true; - } + if ( _c->tailable() ) + _saveClientCursor = true; + setComplete(); } - virtual bool mayRecordPlan() const { return ntoreturn_ != 1; } + + virtual bool mayRecordPlan() const { return _pq.getNumToReturn() != 1; } + virtual QueryOp *clone() const { - return new UserQueryOp( ntoskip_, ntoreturn_, order_, wantMore_, explain_, filter_, queryOptions_ ); - } - BufBuilder &builder() { return b_; } - bool scanAndOrderRequired() const { return ordering_; } - auto_ptr< Cursor > cursor() { return c_; } - auto_ptr< CoveredIndexMatcher > matcher() { return matcher_; } - int n() const { return n_; } - long long nscanned() const { return nscanned_; } - bool saveClientCursor() const { return saveClientCursor_; } - bool mayCreateCursor2() const { return ( queryOptions_ & QueryOption_CursorTailable ) && ntoreturn_ != 1; } + return new UserQueryOp( _pq ); + } + + BufBuilder &builder() { return _buf; } + bool scanAndOrderRequired() const { return _inMemSort; } + auto_ptr< Cursor > cursor() { return _c; } + auto_ptr< CoveredIndexMatcher > matcher() { return _matcher; } + int n() const { return _n; } + long long nscanned() const { return _nscanned; } + long long nscannedObjects() const { return _nscannedObjects; } + bool saveClientCursor() const { return _saveClientCursor; } + private: - BufBuilder b_; - int ntoskip_; - int ntoreturn_; - BSONObj order_; - bool wantMore_; - bool explain_; - FieldMatcher *filter_; - bool ordering_; - auto_ptr< Cursor > c_; - long long nscanned_; - int queryOptions_; - auto_ptr< CoveredIndexMatcher > matcher_; - int n_; - int soSize_; - bool saveClientCursor_; - auto_ptr< ScanAndOrder > so_; - bool findingStart_; - ClientCursor * findingStartCursor_; + BufBuilder _buf; + const ParsedQuery& _pq; + + long long _ntoskip; + long long _nscanned; + long long _nscannedObjects; + int _n; // found so far + + MatchDetails _details; + + bool _inMemSort; + auto_ptr< ScanAndOrder > _so; + + auto_ptr< Cursor > _c; + + auto_ptr< CoveredIndexMatcher > _matcher; + + bool _saveClientCursor; + bool _oplogReplay; + auto_ptr< FindingStartCursor > _findingStartCursor; }; /* run a query -- includes checking for and running a Command */ auto_ptr< QueryResult > runQuery(Message& m, QueryMessage& q, CurOp& curop ) { StringBuilder& ss = curop.debug().str; + ParsedQuery pq( q ); const char *ns = q.ns; int ntoskip = q.ntoskip; - int _ntoreturn = q.ntoreturn; BSONObj jsobj = q.query; - auto_ptr< FieldMatcher > filter = q.fields; // what fields to return (unspecified = full object) int queryOptions = q.queryOptions; BSONObj snapshotHint; - Timer t; if( logLevel >= 2 ) log() << "runQuery: " << ns << jsobj << endl; long long nscanned = 0; - bool wantMore = true; - int ntoreturn = _ntoreturn; - if ( _ntoreturn < 0 ) { - /* _ntoreturn greater than zero is simply a hint on how many objects to send back per - "cursor batch". - A negative number indicates a hard limit. - */ - ntoreturn = -_ntoreturn; - wantMore = false; - } - ss << "query " << ns << " ntoreturn:" << ntoreturn; + ss << ns << " ntoreturn:" << pq.getNumToReturn(); curop.setQuery(jsobj); - BufBuilder bb; BSONObjBuilder cmdResBuf; long long cursorid = 0; - bb.skip(sizeof(QueryResult)); - auto_ptr< QueryResult > qr; int n = 0; Client& c = cc(); - /* we assume you are using findOne() for running a cmd... 
*/ - if ( ntoreturn == 1 && runCommands(ns, jsobj, curop, bb, cmdResBuf, false, queryOptions) ) { - n = 1; - qr.reset( (QueryResult *) bb.buf() ); - bb.decouple(); - qr->setResultFlagsToOk(); - qr->len = bb.len(); - ss << " reslen:" << bb.len(); - // qr->channel = 0; - qr->setOperation(opReply); - qr->cursorId = cursorid; - qr->startingFrom = 0; - qr->nReturned = n; + + if ( pq.couldBeCommand() ){ + BufBuilder bb; + bb.skip(sizeof(QueryResult)); + + if ( runCommands(ns, jsobj, curop, bb, cmdResBuf, false, queryOptions) ) { + ss << " command "; + curop.markCommand(); + n = 1; + qr.reset( (QueryResult *) bb.buf() ); + bb.decouple(); + qr->setResultFlagsToOk(); + qr->len = bb.len(); + ss << " reslen:" << bb.len(); + // qr->channel = 0; + qr->setOperation(opReply); + qr->cursorId = cursorid; + qr->startingFrom = 0; + qr->nReturned = n; + } + return qr; } - else { - /* regular query */ - - AuthenticationInfo *ai = currentClient.get()->ai; - uassert( 10106 , "unauthorized", ai->isAuthorized(c.database()->name.c_str())); - - /* we allow queries to SimpleSlave's -- but not to the slave (nonmaster) member of a replica pair - so that queries to a pair are realtime consistent as much as possible. use setSlaveOk() to - query the nonmaster member of a replica pair. - */ - uassert( 10107 , "not master", isMaster() || (queryOptions & QueryOption_SlaveOk) || slave == SimpleSlave ); - - BSONElement hint; - BSONObj min; - BSONObj max; - bool explain = false; - bool _gotquery = false; - bool snapshot = false; - BSONObj query; - { - BSONElement e = jsobj.findElement("$query"); - if ( e.eoo() ) - e = jsobj.findElement("query"); - if ( !e.eoo() && (e.type() == Object || e.type() == Array) ) { - query = e.embeddedObject(); - _gotquery = true; - } + + // regular query + + mongolock lk(false); // read lock + Client::Context ctx( ns , dbpath , &lk ); + + /* we allow queries to SimpleSlave's -- but not to the slave (nonmaster) member of a replica pair + so that queries to a pair are realtime consistent as much as possible. use setSlaveOk() to + query the nonmaster member of a replica pair. + */ + uassert( 10107 , "not master" , isMaster() || pq.hasOption( QueryOption_SlaveOk ) || replSettings.slave == SimpleSlave ); + + BSONElement hint = useHints ? pq.getHint() : BSONElement(); + bool explain = pq.isExplain(); + bool snapshot = pq.isSnapshot(); + BSONObj query = pq.getFilter(); + BSONObj order = pq.getOrder(); + + if ( pq.hasOption( QueryOption_CursorTailable ) ) { + NamespaceDetails *d = nsdetails( ns ); + uassert( 13051, "tailable cursor requested on non capped collection", d && d->capped ); + if ( order.isEmpty() ) { + order = BSON( "$natural" << 1 ); + } else { + uassert( 13052, "only {$natural:1} order allowed for tailable cursor", order == BSON( "$natural" << 1 ) ); } - BSONObj order; - { - BSONElement e = jsobj.findElement("$orderby"); - if ( e.eoo() ) - e = jsobj.findElement("orderby"); - if ( !e.eoo() ) { - order = e.embeddedObjectUserCheck(); - if ( e.type() == Array ) - order = transformOrderFromArrayFormat(order); + } + + if( snapshot ) { + NamespaceDetails *d = nsdetails(ns); + if ( d ){ + int i = d->findIdIndex(); + if( i < 0 ) { + if ( strstr( ns , ".system." 
) == 0 ) + log() << "warning: no _id index on $snapshot query, ns:" << ns << endl; } - } - if ( !_gotquery && order.isEmpty() ) - query = jsobj; - else { - explain = jsobj.getBoolField("$explain"); - if ( useHints ) - hint = jsobj.getField("$hint"); - min = jsobj.getObjectField("$min"); - max = jsobj.getObjectField("$max"); - BSONElement e = jsobj.getField("$snapshot"); - snapshot = !e.eoo() && e.trueValue(); - if( snapshot ) { - uassert( 12001 , "E12001 can't sort with $snapshot", order.isEmpty()); - uassert( 12002 , "E12002 can't use hint with $snapshot", hint.eoo()); - NamespaceDetails *d = nsdetails(ns); - if ( d ){ - int i = d->findIdIndex(); - if( i < 0 ) { - if ( strstr( ns , ".system." ) == 0 ) - log() << "warning: no _id index on $snapshot query, ns:" << ns << endl; - } - else { - /* [dm] the name of an _id index tends to vary, so we build the hint the hard way here. - probably need a better way to specify "use the _id index" as a hint. if someone is - in the query optimizer please fix this then! - */ - BSONObjBuilder b; - b.append("$hint", d->idx(i).indexName()); - snapshotHint = b.obj(); - hint = snapshotHint.firstElement(); - } - } + else { + /* [dm] the name of an _id index tends to vary, so we build the hint the hard way here. + probably need a better way to specify "use the _id index" as a hint. if someone is + in the query optimizer please fix this then! + */ + BSONObjBuilder b; + b.append("$hint", d->idx(i).indexName()); + snapshotHint = b.obj(); + hint = snapshotHint.firstElement(); } } + } - /* The ElemIter will not be happy if this isn't really an object. So throw exception - here when that is true. - (Which may indicate bad data from client.) - */ - if ( query.objsize() == 0 ) { - out() << "Bad query object?\n jsobj:"; - out() << jsobj.toString() << "\n query:"; - out() << query.toString() << endl; - uassert( 10110 , "bad query object", false); - } + /* The ElemIter will not be happy if this isn't really an object. So throw exception + here when that is true. + (Which may indicate bad data from client.) + */ + if ( query.objsize() == 0 ) { + out() << "Bad query object?\n jsobj:"; + out() << jsobj.toString() << "\n query:"; + out() << query.toString() << endl; + uassert( 10110 , "bad query object", false); + } - bool idHackWorked = false; - if ( strcmp( query.firstElement().fieldName() , "_id" ) == 0 && query.nFields() == 1 && query.firstElement().isSimpleType() ){ - nscanned = 1; + if ( ! explain && isSimpleIdQuery( query ) && !pq.hasOption( QueryOption_CursorTailable ) ) { + nscanned = 1; - bool nsFound = false; - bool indexFound = false; + bool nsFound = false; + bool indexFound = false; - BSONObj resObject; - bool found = Helpers::findById( c, ns , query , resObject , &nsFound , &indexFound ); - if ( nsFound == false || indexFound == true ){ - idHackWorked = true; - if ( found ){ - n = 1; - fillQueryResultFromObj( bb , filter.get() , resObject ); - } - qr.reset( (QueryResult *) bb.buf() ); - bb.decouple(); - qr->setResultFlagsToOk(); - qr->len = bb.len(); - ss << " reslen:" << bb.len(); - qr->setOperation(opReply); - qr->cursorId = cursorid; - qr->startingFrom = 0; - qr->nReturned = n; - } - } - - if ( ! 
idHackWorked ){ // non-simple _id lookup - BSONObj oldPlan; - if ( explain && hint.eoo() && min.isEmpty() && max.isEmpty() ) { - QueryPlanSet qps( ns, query, order ); - if ( qps.usingPrerecordedPlan() ) - oldPlan = qps.explain(); - } - QueryPlanSet qps( ns, query, order, &hint, !explain, min, max ); - UserQueryOp original( ntoskip, ntoreturn, order, wantMore, explain, filter.get(), queryOptions ); - shared_ptr< UserQueryOp > o = qps.runOp( original ); - UserQueryOp &dqo = *o; - massert( 10362 , dqo.exceptionMessage(), dqo.complete() ); - n = dqo.n(); - nscanned = dqo.nscanned(); - if ( dqo.scanAndOrderRequired() ) - ss << " scanAndOrder "; - auto_ptr< Cursor > c = dqo.cursor(); - log( 5 ) << " used cursor: " << c.get() << endl; - if ( dqo.saveClientCursor() ) { - ClientCursor *cc = new ClientCursor(); - if ( queryOptions & QueryOption_NoCursorTimeout ) - cc->noTimeout(); - cc->c = c; - cursorid = cc->cursorid; - cc->query = jsobj.getOwned(); - DEV out() << " query has more, cursorid: " << cursorid << endl; - cc->matcher = dqo.matcher(); - cc->ns = ns; - cc->pos = n; - cc->filter = filter; - cc->originalMessage = m; - cc->updateLocation(); - if ( !cc->c->ok() && cc->c->tailable() ) { - DEV out() << " query has no more but tailable, cursorid: " << cursorid << endl; - } else { - DEV out() << " query has more, cursorid: " << cursorid << endl; - } - } - if ( explain ) { - BSONObjBuilder builder; - builder.append("cursor", c->toString()); - builder.append("startKey", c->prettyStartKey()); - builder.append("endKey", c->prettyEndKey()); - builder.append("nscanned", double( dqo.nscanned() ) ); - builder.append("n", n); - if ( dqo.scanAndOrderRequired() ) - builder.append("scanAndOrder", true); - builder.append("millis", t.millis()); - if ( !oldPlan.isEmpty() ) - builder.append( "oldPlan", oldPlan.firstElement().embeddedObject().firstElement().embeddedObject() ); - if ( hint.eoo() ) - builder.appendElements(qps.explain()); - BSONObj obj = builder.done(); - fillQueryResultFromObj(dqo.builder(), 0, obj); + BSONObj resObject; + bool found = Helpers::findById( c, ns , query , resObject , &nsFound , &indexFound ); + if ( nsFound == false || indexFound == true ){ + BufBuilder bb(sizeof(QueryResult)+resObject.objsize()+32); + bb.skip(sizeof(QueryResult)); + + ss << " idhack "; + if ( found ){ n = 1; + fillQueryResultFromObj( bb , pq.getFields() , resObject ); } - qr.reset( (QueryResult *) dqo.builder().buf() ); - dqo.builder().decouple(); - qr->cursorId = cursorid; + qr.reset( (QueryResult *) bb.buf() ); + bb.decouple(); qr->setResultFlagsToOk(); - qr->len = dqo.builder().len(); - ss << " reslen:" << qr->len; + qr->len = bb.len(); + ss << " reslen:" << bb.len(); qr->setOperation(opReply); + qr->cursorId = cursorid; qr->startingFrom = 0; - qr->nReturned = n; + qr->nReturned = n; + return qr; + } + } + + // regular, not QO bypass query + + BSONObj oldPlan; + if ( explain && ! 
pq.hasIndexSpecifier() ){ + QueryPlanSet qps( ns, query, order ); + if ( qps.usingPrerecordedPlan() ) + oldPlan = qps.explain(); + } + QueryPlanSet qps( ns, query, order, &hint, !explain, pq.getMin(), pq.getMax() ); + UserQueryOp original( pq ); + shared_ptr< UserQueryOp > o = qps.runOp( original ); + UserQueryOp &dqo = *o; + massert( 10362 , dqo.exceptionMessage(), dqo.complete() ); + n = dqo.n(); + nscanned = dqo.nscanned(); + if ( dqo.scanAndOrderRequired() ) + ss << " scanAndOrder "; + auto_ptr<Cursor> cursor = dqo.cursor(); + log( 5 ) << " used cursor: " << cursor.get() << endl; + if ( dqo.saveClientCursor() ) { + // the clientcursor now owns the Cursor* and 'c' is released: + ClientCursor *cc = new ClientCursor(cursor, ns, !(queryOptions & QueryOption_NoCursorTimeout)); + cursorid = cc->cursorid; + cc->query = jsobj.getOwned(); + DEV out() << " query has more, cursorid: " << cursorid << endl; + cc->matcher = dqo.matcher(); + cc->pos = n; + cc->fields = pq.getFieldPtr(); + cc->originalMessage = m; + cc->updateLocation(); + if ( !cc->c->ok() && cc->c->tailable() ) { + DEV out() << " query has no more but tailable, cursorid: " << cursorid << endl; + } else { + DEV out() << " query has more, cursorid: " << cursorid << endl; } } + if ( explain ) { + BSONObjBuilder builder; + builder.append("cursor", cursor->toString()); + builder.appendArray("indexBounds", cursor->prettyIndexBounds()); + builder.appendNumber("nscanned", dqo.nscanned() ); + builder.appendNumber("nscannedObjects", dqo.nscannedObjects() ); + builder.append("n", n); + if ( dqo.scanAndOrderRequired() ) + builder.append("scanAndOrder", true); + builder.append("millis", curop.elapsedMillis()); + if ( !oldPlan.isEmpty() ) + builder.append( "oldPlan", oldPlan.firstElement().embeddedObject().firstElement().embeddedObject() ); + if ( hint.eoo() ) + builder.appendElements(qps.explain()); + BSONObj obj = builder.done(); + fillQueryResultFromObj(dqo.builder(), 0, obj); + n = 1; + } + qr.reset( (QueryResult *) dqo.builder().buf() ); + dqo.builder().decouple(); + qr->cursorId = cursorid; + qr->setResultFlagsToOk(); + qr->len = dqo.builder().len(); + ss << " reslen:" << qr->len; + qr->setOperation(opReply); + qr->startingFrom = 0; + qr->nReturned = n; + - int duration = t.millis(); - Database *database = c.database(); - if ( (database && database->profile) || duration >= 100 ) { + int duration = curop.elapsedMillis(); + bool dbprofile = curop.shouldDBProfile( duration ); + if ( dbprofile || duration >= cmdLine.slowMS ) { ss << " nscanned:" << nscanned << ' '; if ( ntoskip ) ss << " ntoskip:" << ntoskip; - if ( database && database->profile ) + if ( dbprofile ) ss << " \nquery: "; ss << jsobj << ' '; } @@ -22,7 +22,7 @@ #include "../util/message.h" #include "dbmessage.h" #include "jsobj.h" -#include "storage.h" +#include "diskloc.h" /* db request message format @@ -71,13 +71,15 @@ namespace mongo { + extern const int MaxBytesToReturnToClientAtOnce; + // for an existing query (ie a ClientCursor), send back additional information. QueryResult* getMore(const char *ns, int ntoreturn, long long cursorid , CurOp& op); struct UpdateResult { bool existing; bool mod; - unsigned long long num; + long long num; UpdateResult( bool e, bool m, unsigned long long n ) : existing(e) , mod(m), num(n ){} @@ -100,16 +102,213 @@ namespace mongo { /* returns true if an existing object was updated, false if no existing object was found. 
multi - update multiple objects - mostly useful with things like $set + god - allow access to system namespaces and don't yield */ - UpdateResult updateObjects(const char *ns, BSONObj updateobj, BSONObj pattern, bool upsert, bool multi , bool logop , OpDebug& debug ); + UpdateResult updateObjects(const char *ns, const BSONObj& updateobj, BSONObj pattern, bool upsert, bool multi , bool logop , OpDebug& debug ); // If justOne is true, deletedId is set to the id of the deleted object. - int deleteObjects(const char *ns, BSONObj pattern, bool justOne, bool logop = false, bool god=false); + long long deleteObjects(const char *ns, BSONObj pattern, bool justOne, bool logop = false, bool god=false); long long runCount(const char *ns, const BSONObj& cmd, string& err); auto_ptr< QueryResult > runQuery(Message& m, QueryMessage& q, CurOp& curop ); + /* This is for languages whose "objects" are not well ordered (JSON is well ordered). + [ { a : ... } , { b : ... } ] -> { a : ..., b : ... } + */ + inline BSONObj transformOrderFromArrayFormat(BSONObj order) { + /* note: this is slow, but that is ok as order will have very few pieces */ + BSONObjBuilder b; + char p[2] = "0"; + + while ( 1 ) { + BSONObj j = order.getObjectField(p); + if ( j.isEmpty() ) + break; + BSONElement e = j.firstElement(); + uassert( 10102 , "bad order array", !e.eoo()); + uassert( 10103 , "bad order array [2]", e.isNumber()); + b.append(e); + (*p)++; + uassert( 10104 , "too many ordering elements", *p <= '9'); + } + + return b.obj(); + } + + /** + * this represents a total user query + * includes fields from the query message, both possible query levels + * parses everything up front + */ + class ParsedQuery { + public: + ParsedQuery( QueryMessage& qm ) + : _ns( qm.ns ) , _ntoskip( qm.ntoskip ) , _ntoreturn( qm.ntoreturn ) , _options( qm.queryOptions ){ + init( qm.query ); + initFields( qm.fields ); + } + ParsedQuery( const char* ns , int ntoskip , int ntoreturn , int queryoptions , const BSONObj& query , const BSONObj& fields ) + : _ns( ns ) , _ntoskip( ntoskip ) , _ntoreturn( ntoreturn ) , _options( queryoptions ){ + init( query ); + initFields( fields ); + } + + ~ParsedQuery(){} + + const char * ns() const { return _ns; } + + const BSONObj& getFilter() const { return _filter; } + FieldMatcher* getFields() const { return _fields.get(); } + shared_ptr<FieldMatcher> getFieldPtr() const { return _fields; } + + int getSkip() const { return _ntoskip; } + int getNumToReturn() const { return _ntoreturn; } + bool wantMore() const { return _wantMore; } + int getOptions() const { return _options; } + bool hasOption( int x ) const { return x & _options; } + + + bool isExplain() const { return _explain; } + bool isSnapshot() const { return _snapshot; } + bool returnKey() const { return _returnKey; } + + const BSONObj& getMin() const { return _min; } + const BSONObj& getMax() const { return _max; } + const BSONObj& getOrder() const { return _order; } + const BSONElement& getHint() const { return _hint; } + + bool couldBeCommand() const { + /* we assume you are using findOne() for running a cmd... */ + return _ntoreturn == 1 && strstr( _ns , ".$cmd" ); + } + + bool hasIndexSpecifier() const { + return ! _hint.eoo() || ! _min.isEmpty() || ! _max.isEmpty(); + } + + /* if ntoreturn is zero, we return up to 101 objects. on the subsequent getmore, there + is only a size limit. The idea is that on a find() where one doesn't use much results, + we don't return much, but once getmore kicks in, we start pushing significant quantities. 
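+
+           Concretely (using the constants in enoughForFirstBatch() below): a
+           find() sent with ntoreturn == 0 closes its first batch at 101
+           objects or ~1MB of reply, whichever comes first, while enough()
+           never cuts it off; later batches are bounded only by reply size.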
+ + The n limit (vs. size) is important when someone fetches only one small field from big + objects, which causes massive scanning server-side. + */ + bool enoughForFirstBatch( int n , int len ) const { + if ( _ntoreturn == 0 ) + return ( len > 1024 * 1024 ) || n >= 101; + return n >= _ntoreturn || len > MaxBytesToReturnToClientAtOnce; + } + + bool enough( int n ) const { + if ( _ntoreturn == 0 ) + return false; + return n >= _ntoreturn; + } + + private: + void init( const BSONObj& q ){ + _reset(); + uassert( 10105 , "bad skip value in query", _ntoskip >= 0); + + if ( _ntoreturn < 0 ){ + /* _ntoreturn greater than zero is simply a hint on how many objects to send back per + "cursor batch". + A negative number indicates a hard limit. + */ + _wantMore = false; + _ntoreturn = -_ntoreturn; + } + + + BSONElement e = q["query"]; + if ( ! e.isABSONObj() ) + e = q["$query"]; + + if ( e.isABSONObj() ){ + _filter = e.embeddedObject(); + _initTop( q ); + } + else { + _filter = q; + } + } + + void _reset(){ + _wantMore = true; + _explain = false; + _snapshot = false; + _returnKey = false; + } + + void _initTop( const BSONObj& top ){ + BSONObjIterator i( top ); + while ( i.more() ){ + BSONElement e = i.next(); + const char * name = e.fieldName(); + + if ( strcmp( "$orderby" , name ) == 0 || + strcmp( "orderby" , name ) == 0 ){ + if ( e.type() == Object ) + _order = e.embeddedObject(); + else if ( e.type() == Array ) + _order = transformOrderFromArrayFormat( _order ); + else + assert( 0 ); + } + else if ( strcmp( "$explain" , name ) == 0 ) + _explain = e.trueValue(); + else if ( strcmp( "$snapshot" , name ) == 0 ) + _snapshot = e.trueValue(); + else if ( strcmp( "$min" , name ) == 0 ) + _min = e.embeddedObject(); + else if ( strcmp( "$max" , name ) == 0 ) + _max = e.embeddedObject(); + else if ( strcmp( "$hint" , name ) == 0 ) + _hint = e; + else if ( strcmp( "$returnKey" , name ) == 0 ) + _returnKey = e.trueValue(); + + } + + if ( _snapshot ){ + uassert( 12001 , "E12001 can't sort with $snapshot", _order.isEmpty() ); + uassert( 12002 , "E12002 can't use hint with $snapshot", _hint.eoo() ); + } + + } + + void initFields( const BSONObj& fields ){ + if ( fields.isEmpty() ) + return; + _fields.reset( new FieldMatcher() ); + _fields->add( fields ); + } + + ParsedQuery( const ParsedQuery& other ){ + assert(0); + } + + const char* _ns; + int _ntoskip; + int _ntoreturn; + int _options; + + BSONObj _filter; + shared_ptr< FieldMatcher > _fields; + + bool _wantMore; + + bool _explain; + bool _snapshot; + bool _returnKey; + BSONObj _min; + BSONObj _max; + BSONElement _hint; + BSONObj _order; + }; + + } // namespace mongo #include "clientcursor.h" diff --git a/db/queryoptimizer.cpp b/db/queryoptimizer.cpp index 499417a..fa08323 100644 --- a/db/queryoptimizer.cpp +++ b/db/queryoptimizer.cpp @@ -24,6 +24,9 @@ #include "queryoptimizer.h" #include "cmdline.h" +//#define DEBUGQO(x) cout << x << endl; +#define DEBUGQO(x) + namespace mongo { void checkTableScanAllowed( const char * ns ){ @@ -39,7 +42,7 @@ namespace mongo { uassert( 10111 , (string)"table scans not allowed:" + ns , ! 
cmdLine.notablescan ); } - + double elementDirection( const BSONElement &e ) { if ( e.isNumber() ) return e.number(); @@ -48,7 +51,7 @@ namespace mongo { QueryPlan::QueryPlan( NamespaceDetails *_d, int _idxNo, - const FieldRangeSet &fbs, const BSONObj &order, const BSONObj &startKey, const BSONObj &endKey ) : + const FieldRangeSet &fbs, const BSONObj &order, const BSONObj &startKey, const BSONObj &endKey , string special ) : d(_d), idxNo(_idxNo), fbs_( fbs ), order_( order ), @@ -58,7 +61,9 @@ namespace mongo { exactKeyMatch_( false ), direction_( 0 ), endKeyInclusive_( endKey.isEmpty() ), - unhelpful_( false ) { + unhelpful_( false ), + _special( special ), + _type(0){ if ( !fbs_.matchPossible() ) { unhelpful_ = true; @@ -75,6 +80,14 @@ namespace mongo { return; } + if ( _special.size() ){ + optimal_ = true; + _type = index_->getSpec().getType(); + massert( 13040 , (string)"no type for special: " + _special , _type ); + scanAndOrderRequired_ = _type->scanAndOrderRequired( fbs.query() , order ); + return; + } + BSONObj idxKey = index_->keyPattern(); BSONObjIterator o( order ); BSONObjIterator k( idxKey ); @@ -163,7 +176,11 @@ namespace mongo { unhelpful_ = true; } - auto_ptr< Cursor > QueryPlan::newCursor( const DiskLoc &startLoc ) const { + auto_ptr< Cursor > QueryPlan::newCursor( const DiskLoc &startLoc , int numWanted ) const { + + if ( _type ) + return _type->newCursor( fbs_.query() , order_ , numWanted ); + if ( !fbs_.matchPossible() ){ if ( fbs_.nNontrivialRanges() ) checkTableScanAllowed( fbs_.ns() ); @@ -206,13 +223,14 @@ namespace mongo { void QueryPlan::registerSelf( long long nScanned ) const { if ( fbs_.matchPossible() ) { - boostlock lk(NamespaceDetailsTransient::_qcMutex); + scoped_lock lk(NamespaceDetailsTransient::_qcMutex); NamespaceDetailsTransient::get_inlock( ns() ).registerIndexForPattern( fbs_.pattern( order_ ), indexKey(), nScanned ); } } QueryPlanSet::QueryPlanSet( const char *_ns, const BSONObj &query, const BSONObj &order, const BSONElement *hint, bool honorRecordedPlan, const BSONObj &min, const BSONObj &max ) : ns(_ns), + query_( query.getOwned() ), fbs_( _ns, query ), mayRecordPlan_( true ), usingPrerecordedPlan_( false ), @@ -223,9 +241,7 @@ namespace mongo { min_( min.getOwned() ), max_( max.getOwned() ) { if ( hint && !hint->eoo() ) { - BSONObjBuilder b; - b.append( *hint ); - hint_ = b.obj(); + hint_ = hint->wrap(); } init(); } @@ -242,6 +258,7 @@ namespace mongo { } void QueryPlanSet::init() { + DEBUGQO( "QueryPlanSet::init " << ns << "\t" << query_ ); plans_.clear(); mayRecordPlan_ = true; usingPrerecordedPlan_ = false; @@ -297,9 +314,43 @@ namespace mongo { plans_.push_back( PlanPtr( new QueryPlan( d, d->idxNo(*idx), fbs_, order_, min_, max_ ) ) ); return; } - + + if ( isSimpleIdQuery( query_ ) ){ + int idx = d->findIdIndex(); + if ( idx >= 0 ){ + usingPrerecordedPlan_ = true; + mayRecordPlan_ = false; + plans_.push_back( PlanPtr( new QueryPlan( d , idx , fbs_ , order_ ) ) ); + return; + } + } + + if ( query_.isEmpty() && order_.isEmpty() ){ + plans_.push_back( PlanPtr( new QueryPlan( d, -1, fbs_, order_ ) ) ); + return; + } + + DEBUGQO( "\t special : " << fbs_.getSpecial() ); + if ( fbs_.getSpecial().size() ){ + _special = fbs_.getSpecial(); + NamespaceDetails::IndexIterator i = d->ii(); + while( i.more() ) { + int j = i.pos(); + IndexDetails& ii = i.next(); + const IndexSpec& spec = ii.getSpec(); + if ( spec.getTypeName() == _special && spec.suitability( query_ , order_ ) ){ + usingPrerecordedPlan_ = true; + mayRecordPlan_ = true; + plans_.push_back( 
PlanPtr( new QueryPlan( d , j , fbs_ , order_ , + BSONObj() , BSONObj() , _special ) ) ); + return; + } + } + uassert( 13038 , (string)"can't find special index: " + _special + " for: " + query_.toString() , 0 ); + } + if ( honorRecordedPlan_ ) { - boostlock lk(NamespaceDetailsTransient::_qcMutex); + scoped_lock lk(NamespaceDetailsTransient::_qcMutex); NamespaceDetailsTransient& nsd = NamespaceDetailsTransient::get_inlock( ns ); BSONObj bestIndex = nsd.indexForPattern( fbs_.pattern( order_ ) ); if ( !bestIndex.isEmpty() ) { @@ -334,7 +385,7 @@ namespace mongo { if ( !d ) return; - // If table scan is optimal or natural order requested + // If table scan is optimal or natural order requested or tailable cursor requested if ( !fbs_.matchPossible() || ( fbs_.nNontrivialRanges() == 0 && order_.isEmpty() ) || ( !order_.isEmpty() && !strcmp( order_.firstElement().fieldName(), "$natural" ) ) ) { // Table scan plan @@ -342,8 +393,19 @@ namespace mongo { return; } + bool normalQuery = hint_.isEmpty() && min_.isEmpty() && max_.isEmpty(); + PlanSet plans; for( int i = 0; i < d->nIndexes; ++i ) { + IndexDetails& id = d->idx(i); + const IndexSpec& spec = id.getSpec(); + IndexSuitability suitability = HELPFUL; + if ( normalQuery ){ + suitability = spec.suitability( query_ , order_ ); + if ( suitability == USELESS ) + continue; + } + PlanPtr p( new QueryPlan( d, i, fbs_, order_ ) ); if ( p->optimal() ) { addPlan( p, checkFirst ); @@ -367,7 +429,7 @@ namespace mongo { if ( res->complete() || plans_.size() > 1 ) return res; { - boostlock lk(NamespaceDetailsTransient::_qcMutex); + scoped_lock lk(NamespaceDetailsTransient::_qcMutex); NamespaceDetailsTransient::get_inlock( fbs_.ns() ).registerIndexForPattern( fbs_.pattern( order_ ), BSONObj(), 0 ); } init(); @@ -380,7 +442,10 @@ namespace mongo { vector< BSONObj > arr; for( PlanSet::const_iterator i = plans_.begin(); i != plans_.end(); ++i ) { auto_ptr< Cursor > c = (*i)->newCursor(); - arr.push_back( BSON( "cursor" << c->toString() << "startKey" << c->prettyStartKey() << "endKey" << c->prettyEndKey() ) ); + BSONObjBuilder explain; + explain.append( "cursor", c->toString() ); + explain.appendArray( "indexBounds", c->prettyIndexBounds() ); + arr.push_back( explain.obj() ); } BSONObjBuilder b; b.append( "allPlans", arr ); @@ -433,7 +498,7 @@ namespace mongo { } if ( errCount == ops.size() ) break; - if ( plans_.usingPrerecordedPlan_ && nScanned > plans_.oldNScanned_ * 10 ) { + if ( plans_.usingPrerecordedPlan_ && nScanned > plans_.oldNScanned_ * 10 && plans_._special.empty() ) { plans_.addOtherPlans( true ); PlanSet::iterator i = plans_.plans_.begin(); ++i; @@ -558,7 +623,7 @@ namespace mongo { return 0; } - setClient( ns ); + Client::Context ctx( ns ); IndexDetails *id = 0; NamespaceDetails *d = nsdetails( ns ); if ( !d ) { @@ -576,9 +641,11 @@ namespace mongo { while( i.more() ) { IndexDetails& ii = i.next(); if ( indexWorks( ii.keyPattern(), min.isEmpty() ? 
max : min, ret.first, ret.second ) ) {
-                id = &ii;
-                keyPattern = ii.keyPattern();
-                break;
+                if ( ii.getSpec().getType() == 0 ){
+                    id = &ii;
+                    keyPattern = ii.keyPattern();
+                    break;
+                }
             }
         }
diff --git a/db/queryoptimizer.h b/db/queryoptimizer.h
index e4a79d8..1cb5052 100644
--- a/db/queryoptimizer.h
+++ b/db/queryoptimizer.h
@@ -25,6 +25,8 @@ namespace mongo {
 
     class IndexDetails;
+    class IndexType;
+
     class QueryPlan : boost::noncopyable {
     public:
         QueryPlan(NamespaceDetails *_d,
@@ -32,7 +34,8 @@ namespace mongo {
                   const FieldRangeSet &fbs,
                   const BSONObj &order,
                   const BSONObj &startKey = BSONObj(),
-                  const BSONObj &endKey = BSONObj() );
+                  const BSONObj &endKey = BSONObj() ,
+                  string special="" );
 
         /* If true, no other index can do better. */
         bool optimal() const { return optimal_; }
@@ -46,10 +49,11 @@ namespace mongo {
            requested sort order */
         bool unhelpful() const { return unhelpful_; }
         int direction() const { return direction_; }
-        auto_ptr< Cursor > newCursor( const DiskLoc &startLoc = DiskLoc() ) const;
+        auto_ptr< Cursor > newCursor( const DiskLoc &startLoc = DiskLoc() , int numWanted=0 ) const;
         auto_ptr< Cursor > newReverseCursor() const;
         BSONObj indexKey() const;
         const char *ns() const { return fbs_.ns(); }
+        NamespaceDetails *nsd() const { return d; }
         BSONObj query() const { return fbs_.query(); }
         BSONObj simplifiedQuery( const BSONObj& fields = BSONObj() ) const { return fbs_.simplifiedQuery( fields ); }
         const FieldRange &range( const char *fieldName ) const { return fbs_.range( fieldName ); }
@@ -69,6 +73,8 @@ namespace mongo {
         BoundList indexBounds_;
         bool endKeyInclusive_;
         bool unhelpful_;
+        string _special;
+        IndexType * _type;
     };
 
     // Inherit from this interface to implement a new query operation.
@@ -78,11 +84,15 @@ namespace mongo {
     public:
         QueryOp() : complete_(), qp_(), error_() {}
         virtual ~QueryOp() {}
+
+        /** this gets called after a query plan is set? ERH 2/16/10 */
         virtual void init() = 0;
         virtual void next() = 0;
         virtual bool mayRecordPlan() const = 0;
-        // Return a copy of the inheriting class, which will be run with its own
-        // query plan.
+
+        /** @return a copy of the inheriting class, which will be run with its own
+            query plan.
+        */
         virtual QueryOp *clone() const = 0;
         bool complete() const { return complete_; }
         bool error() const { return error_; }
@@ -143,6 +153,7 @@ namespace mongo {
             static void nextOp( QueryOp &op );
         };
         const char *ns;
+        BSONObj query_;
         FieldRangeSet fbs_;
         PlanSet plans_;
         bool mayRecordPlan_;
@@ -153,9 +164,17 @@ namespace mongo {
         bool honorRecordedPlan_;
         BSONObj min_;
         BSONObj max_;
+        string _special;
     };
 
     // NOTE min, max, and keyPattern will be updated to be consistent with the selected index.
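    // A hedged usage sketch (names only from the declaration below, with the
    // failure behavior assumed): the caller supplies raw bounds and the
    // routine picks a plain, non-special index (see the getType() == 0 check
    // above), rewriting min, max and keyPattern to fit it:
    //
    //   BSONObj min = BSON( "a" << 1 ), max = BSON( "a" << 10 ), keyPattern;
    //   string errmsg;
    //   IndexDetails *id = indexDetailsForRange( ns, errmsg, min, max, keyPattern );
    //   if ( !id ) { /* assumed: no usable index, errmsg says why */ }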
IndexDetails *indexDetailsForRange( const char *ns, string &errmsg, BSONObj &min, BSONObj &max, BSONObj &keyPattern ); + + inline bool isSimpleIdQuery( const BSONObj& query ){ + return + strcmp( query.firstElement().fieldName() , "_id" ) == 0 && + query.nFields() == 1 && + query.firstElement().isSimpleType(); + } } // namespace mongo diff --git a/db/queryutil.cpp b/db/queryutil.cpp index d8854be..c01b89e 100644 --- a/db/queryutil.cpp +++ b/db/queryutil.cpp @@ -24,96 +24,118 @@ #include "../util/unittest.h" namespace mongo { - namespace { - /** returns a string that when used as a matcher, would match a super set of regex() - returns "" for complex regular expressions - used to optimize queries in some simple regex cases that start with '^' - */ - inline string simpleRegexHelper(const char* regex, const char* flags){ - string r = ""; - - bool extended = false; - while (*flags){ - switch (*(flags++)){ - case 'm': // multiline - continue; - case 'x': // extended - extended = true; - break; - default: - return r; // cant use index - } - } + /** returns a string that when used as a matcher, would match a super set of regex() + returns "" for complex regular expressions + used to optimize queries in some simple regex cases that start with '^' - if ( *(regex++) != '^' ) - return r; + if purePrefix != NULL, sets it to whether the regex can be converted to a range query + */ + string simpleRegex(const char* regex, const char* flags, bool* purePrefix){ + string r = ""; - stringstream ss; + if (purePrefix) *purePrefix = false; - while(*regex){ - char c = *(regex++); - if ( c == '*' || c == '?' ){ - // These are the only two symbols that make the last char optional - r = ss.str(); - r = r.substr( 0 , r.size() - 1 ); - return r; //breaking here fails with /^a?/ - } else if (c == '\\'){ - // slash followed by non-alphanumeric represents the following char - c = *(regex++); - if ((c >= 'A' && c <= 'Z') || - (c >= 'a' && c <= 'z') || - (c >= '0' && c <= '0') || - (c == '\0')) - { - r = ss.str(); - break; - } else { - ss << c; - } - } else if (strchr("^$.[|()+{", c)){ - // list of "metacharacters" from man pcrepattern - r = ss.str(); + bool extended = false; + while (*flags){ + switch (*(flags++)){ + case 'm': // multiline + continue; + case 'x': // extended + extended = true; break; - } else if (extended && c == '#'){ - // comment + default: + return r; // cant use index + } + } + + if ( *(regex++) != '^' ) + return r; + + stringstream ss; + + while(*regex){ + char c = *(regex++); + if ( c == '*' || c == '?' 
){ + // These are the only two symbols that make the last char optional + r = ss.str(); + r = r.substr( 0 , r.size() - 1 ); + return r; //breaking here fails with /^a?/ + } else if (c == '\\'){ + // slash followed by non-alphanumeric represents the following char + c = *(regex++); + if ((c >= 'A' && c <= 'Z') || + (c >= 'a' && c <= 'z') || + (c >= '0' && c <= '0') || + (c == '\0')) + { r = ss.str(); break; - } else if (extended && isspace(c)){ - continue; } else { - // self-matching char ss << c; } - } - - if ( r.size() == 0 && *regex == 0 ) + } else if (strchr("^$.[|()+{", c)){ + // list of "metacharacters" from man pcrepattern r = ss.str(); + break; + } else if (extended && c == '#'){ + // comment + r = ss.str(); + break; + } else if (extended && isspace(c)){ + continue; + } else { + // self-matching char + ss << c; + } + } - return r; + if ( r.empty() && *regex == 0 ){ + r = ss.str(); + if (purePrefix) *purePrefix = !r.empty(); } - inline string simpleRegex(const BSONElement& e){ - switch(e.type()){ - case RegEx: - return simpleRegexHelper(e.regex(), e.regexFlags()); - case Object:{ - BSONObj o = e.embeddedObject(); - return simpleRegexHelper(o["$regex"].valuestrsafe(), o["$options"].valuestrsafe()); - } - default: assert(false); return ""; //return squashes compiler warning + + return r; + } + inline string simpleRegex(const BSONElement& e){ + switch(e.type()){ + case RegEx: + return simpleRegex(e.regex(), e.regexFlags()); + case Object:{ + BSONObj o = e.embeddedObject(); + return simpleRegex(o["$regex"].valuestrsafe(), o["$options"].valuestrsafe()); } + default: assert(false); return ""; //return squashes compiler warning } } + + string simpleRegexEnd( string regex ) { + ++regex[ regex.length() - 1 ]; + return regex; + } - FieldRange::FieldRange( const BSONElement &e, bool optimize ) { - if ( !e.eoo() && e.type() != RegEx && e.getGtLtOp() == BSONObj::opIN ) { + + FieldRange::FieldRange( const BSONElement &e, bool isNot, bool optimize ) { + // NOTE with $not, we could potentially form a complementary set of intervals. 
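+        // e.g. (sketch): { x : { $not : { $lt : 5 } } } is handled below by
+        // flipping the operator ($lt -> $gte), giving the single interval
+        // [ 5, MaxKey ]; a true complement of { $in : [ 2, 3 ] } would need
+        // the multi-interval set (MinKey,2) U (2,3) U (3,MaxKey), which the
+        // new operator|= could represent but is not attempted here.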
+ if ( !isNot && !e.eoo() && e.type() != RegEx && e.getGtLtOp() == BSONObj::opIN ) { set< BSONElement, element_lt > vals; + vector< FieldRange > regexes; + uassert( 12580 , "invalid query" , e.isABSONObj() ); BSONObjIterator i( e.embeddedObject() ); - while( i.more() ) - vals.insert( i.next() ); + while( i.more() ) { + BSONElement ie = i.next(); + if ( ie.type() == RegEx ) { + regexes.push_back( FieldRange( ie, false, optimize ) ); + } else { + vals.insert( ie ); + } + } for( set< BSONElement, element_lt >::const_iterator i = vals.begin(); i != vals.end(); ++i ) intervals_.push_back( FieldInterval(*i) ); + for( vector< FieldRange >::const_iterator i = regexes.begin(); i != regexes.end(); ++i ) + *this |= *i; + return; } @@ -149,15 +171,66 @@ namespace mongo { || (e.type() == Object && !e.embeddedObject()["$regex"].eoo()) ) { - const string r = simpleRegex(e); - if ( r.size() ) { - lower = addObj( BSON( "" << r ) ).firstElement(); - upper = addObj( BSON( "" << simpleRegexEnd( r ) ) ).firstElement(); - upperInclusive = false; - } + if ( !isNot ) { // no optimization for negated regex - we could consider creating 2 intervals comprising all nonmatching prefixes + const string r = simpleRegex(e); + if ( r.size() ) { + lower = addObj( BSON( "" << r ) ).firstElement(); + upper = addObj( BSON( "" << simpleRegexEnd( r ) ) ).firstElement(); + upperInclusive = false; + } else { + BSONObjBuilder b1(32), b2(32); + b1.appendMinForType( "" , String ); + lower = addObj( b1.obj() ).firstElement(); + + b2.appendMaxForType( "" , String ); + upper = addObj( b2.obj() ).firstElement(); + upperInclusive = false; //MaxForType String is an empty Object + } + + // regex matches self - regex type > string type + if (e.type() == RegEx){ + BSONElement re = addObj( BSON( "" << e ) ).firstElement(); + intervals_.push_back( FieldInterval(re) ); + } else { + BSONObj orig = e.embeddedObject(); + BSONObjBuilder b; + b.appendRegex("", orig["$regex"].valuestrsafe(), orig["$options"].valuestrsafe()); + BSONElement re = addObj( b.obj() ).firstElement(); + intervals_.push_back( FieldInterval(re) ); + } + + } return; } - switch( e.getGtLtOp() ) { + int op = e.getGtLtOp(); + if ( isNot ) { + switch( op ) { + case BSONObj::Equality: + case BSONObj::opALL: + case BSONObj::opMOD: // NOTE for mod and type, we could consider having 1-2 intervals comprising the complementary types (multiple intervals already possible with $in) + case BSONObj::opTYPE: + op = BSONObj::NE; // no bound calculation + break; + case BSONObj::NE: + op = BSONObj::Equality; + break; + case BSONObj::LT: + op = BSONObj::GTE; + break; + case BSONObj::LTE: + op = BSONObj::GT; + break; + case BSONObj::GT: + op = BSONObj::LTE; + break; + case BSONObj::GTE: + op = BSONObj::LT; + break; + default: // otherwise doesn't matter + break; + } + } + switch( op ) { case BSONObj::Equality: lower = upper = e; break; @@ -174,8 +247,32 @@ namespace mongo { case BSONObj::opALL: { massert( 10370 , "$all requires array", e.type() == Array ); BSONObjIterator i( e.embeddedObject() ); - if ( i.more() ) - lower = upper = i.next(); + bool bound = false; + while ( i.more() ){ + BSONElement x = i.next(); + if ( x.type() == Object && x.embeddedObject().firstElement().getGtLtOp() == BSONObj::opELEM_MATCH ){ + // taken care of elsewhere + } + else if ( x.type() != RegEx ) { + lower = upper = x; + bound = true; + break; + } + } + if ( !bound ) { // if no good non regex bound found, try regex bounds + BSONObjIterator i( e.embeddedObject() ); + while( i.more() ) { + BSONElement x = i.next(); + if 
( x.type() != RegEx ) + continue; + string simple = simpleRegex( x.regex(), x.regexFlags() ); + if ( !simple.empty() ) { + lower = addObj( BSON( "" << simple ) ).firstElement(); + upper = addObj( BSON( "" << simpleRegexEnd( simple ) ) ).firstElement(); + break; + } + } + } break; } case BSONObj::opMOD: { @@ -206,10 +303,18 @@ namespace mongo { break; } + case BSONObj::opREGEX: + case BSONObj::opOPTIONS: + // do nothing + break; case BSONObj::opELEM_MATCH: { log() << "warning: shouldn't get here?" << endl; break; } + case BSONObj::opNEAR: + case BSONObj::opWITHIN: + _special = "2d"; + break; default: break; } @@ -269,19 +374,118 @@ namespace mongo { intervals_ = newIntervals; for( vector< BSONObj >::const_iterator i = other.objData_.begin(); i != other.objData_.end(); ++i ) objData_.push_back( *i ); + if ( _special.size() == 0 && other._special.size() ) + _special = other._special; return *this; } - string FieldRange::simpleRegexEnd( string regex ) { - ++regex[ regex.length() - 1 ]; - return regex; - } + void handleInterval( const FieldInterval &lower, FieldBound &low, FieldBound &high, vector< FieldInterval > &newIntervals ) { + if ( low.bound_.eoo() ) { + low = lower.lower_; high = lower.upper_; + } else { + if ( high.bound_.woCompare( lower.lower_.bound_, false ) < 0 ) { // when equal but neither inclusive, just assume they overlap, since current btree scanning code just as efficient either way + FieldInterval tmp; + tmp.lower_ = low; + tmp.upper_ = high; + newIntervals.push_back( tmp ); + low = lower.lower_; high = lower.upper_; + } else { + high = lower.upper_; + } + } + } + + const FieldRange &FieldRange::operator|=( const FieldRange &other ) { + vector< FieldInterval > newIntervals; + FieldBound low; + FieldBound high; + vector< FieldInterval >::const_iterator i = intervals_.begin(); + vector< FieldInterval >::const_iterator j = other.intervals_.begin(); + while( i != intervals_.end() && j != other.intervals_.end() ) { + int cmp = i->lower_.bound_.woCompare( j->lower_.bound_, false ); + if ( ( cmp == 0 && i->lower_.inclusive_ ) || cmp < 0 ) { + handleInterval( *i, low, high, newIntervals ); + ++i; + } else { + handleInterval( *j, low, high, newIntervals ); + ++j; + } + } + while( i != intervals_.end() ) { + handleInterval( *i, low, high, newIntervals ); + ++i; + } + while( j != other.intervals_.end() ) { + handleInterval( *j, low, high, newIntervals ); + ++j; + } + FieldInterval tmp; + tmp.lower_ = low; + tmp.upper_ = high; + newIntervals.push_back( tmp ); + intervals_ = newIntervals; + for( vector< BSONObj >::const_iterator i = other.objData_.begin(); i != other.objData_.end(); ++i ) + objData_.push_back( *i ); + if ( _special.size() == 0 && other._special.size() ) + _special = other._special; + return *this; + } BSONObj FieldRange::addObj( const BSONObj &o ) { objData_.push_back( o ); return o; } + string FieldRangeSet::getSpecial() const { + string s = ""; + for ( map<string,FieldRange>::iterator i=ranges_.begin(); i!=ranges_.end(); i++ ){ + if ( i->second.getSpecial().size() == 0 ) + continue; + uassert( 13033 , "can't have 2 special fields" , s.size() == 0 ); + s = i->second.getSpecial(); + } + return s; + } + + void FieldRangeSet::processOpElement( const char *fieldName, const BSONElement &f, bool isNot, bool optimize ) { + BSONElement g = f; + int op2 = g.getGtLtOp(); + if ( op2 == BSONObj::opALL ) { + BSONElement h = g; + massert( 13050 , "$all requires array", h.type() == Array ); + BSONObjIterator i( h.embeddedObject() ); + if( i.more() ) { + BSONElement x = i.next(); + if 
( x.type() == Object && x.embeddedObject().firstElement().getGtLtOp() == BSONObj::opELEM_MATCH ) { + g = x.embeddedObject().firstElement(); + op2 = g.getGtLtOp(); + } + } + } + if ( op2 == BSONObj::opELEM_MATCH ) { + BSONObjIterator k( g.embeddedObjectUserCheck() ); + while ( k.more() ){ + BSONElement h = k.next(); + StringBuilder buf(32); + buf << fieldName << "." << h.fieldName(); + string fullname = buf.str(); + + int op3 = getGtLtOp( h ); + if ( op3 == BSONObj::Equality ){ + ranges_[ fullname ] &= FieldRange( h , isNot , optimize ); + } + else { + BSONObjIterator l( h.embeddedObject() ); + while ( l.more() ){ + ranges_[ fullname ] &= FieldRange( l.next() , isNot , optimize ); + } + } + } + } else { + ranges_[ fieldName ] &= FieldRange( f , isNot , optimize ); + } + } + FieldRangeSet::FieldRangeSet( const char *ns, const BSONObj &query , bool optimize ) : ns_( ns ), query_( query.getOwned() ) { BSONObjIterator i( query_ ); @@ -293,36 +497,38 @@ namespace mongo { if ( strcmp( e.fieldName(), "$where" ) == 0 ) continue; - int op = getGtLtOp( e ); + bool equality = ( getGtLtOp( e ) == BSONObj::Equality ); + if ( equality && e.type() == Object ) { + equality = ( strcmp( e.embeddedObject().firstElement().fieldName(), "$not" ) != 0 ); + } - if ( op == BSONObj::Equality || op == BSONObj::opREGEX || op == BSONObj::opOPTIONS ) { - ranges_[ e.fieldName() ] &= FieldRange( e , optimize ); - } - else if ( op == BSONObj::opELEM_MATCH ){ - BSONObjIterator i( e.embeddedObjectUserCheck().firstElement().embeddedObjectUserCheck() ); - while ( i.more() ){ - BSONElement f = i.next(); - StringBuilder buf(32); - buf << e.fieldName() << "." << f.fieldName(); - string fullname = buf.str(); - - int op2 = getGtLtOp( f ); - if ( op2 == BSONObj::Equality ){ - ranges_[ fullname ] &= FieldRange( f , optimize ); - } - else { - BSONObjIterator j( f.embeddedObject() ); - while ( j.more() ){ - ranges_[ fullname ] &= FieldRange( j.next() , optimize ); + if ( equality || ( e.type() == Object && !e.embeddedObject()[ "$regex" ].eoo() ) ) { + ranges_[ e.fieldName() ] &= FieldRange( e , false , optimize ); + } + if ( !equality ) { + BSONObjIterator j( e.embeddedObject() ); + while( j.more() ) { + BSONElement f = j.next(); + if ( strcmp( f.fieldName(), "$not" ) == 0 ) { + switch( f.type() ) { + case Object: { + BSONObjIterator k( f.embeddedObject() ); + while( k.more() ) { + BSONElement g = k.next(); + uassert( 13034, "invalid use of $not", g.getGtLtOp() != BSONObj::Equality ); + processOpElement( e.fieldName(), g, true, optimize ); + } + break; + } + case RegEx: + processOpElement( e.fieldName(), f, true, optimize ); + break; + default: + uassert( 13041, "invalid use of $not", false ); } + } else { + processOpElement( e.fieldName(), f, false, optimize ); } - } - } - else { - BSONObjIterator i( e.embeddedObject() ); - while( i.more() ) { - BSONElement f = i.next(); - ranges_[ e.fieldName() ] &= FieldRange( f , optimize ); } } } @@ -445,8 +651,8 @@ namespace mongo { /////////////////// void FieldMatcher::add( const BSONObj& o ){ - massert( 10371 , "can only add to FieldMatcher once", source_.isEmpty()); - source_ = o; + massert( 10371 , "can only add to FieldMatcher once", _source.isEmpty()); + _source = o; BSONObjIterator i( o ); int true_false = -1; @@ -457,23 +663,24 @@ namespace mongo { // validate input if (true_false == -1){ true_false = e.trueValue(); - include_ = !e.trueValue(); - }else{ - if((bool) true_false != e.trueValue()) - errmsg = "You cannot currently mix including and excluding fields. 
Contact us if this is an issue."; + _include = !e.trueValue(); + } + else{ + uassert( 10053 , "You cannot currently mix including and excluding fields. Contact us if this is an issue." , + (bool)true_false == e.trueValue() ); } } } void FieldMatcher::add(const string& field, bool include){ if (field.empty()){ // this is the field the user referred to - include_ = include; + _include = include; } else { const size_t dot = field.find('.'); const string subfield = field.substr(0,dot); const string rest = (dot == string::npos ? "" : field.substr(dot+1,string::npos)); - boost::shared_ptr<FieldMatcher>& fm = fields_[subfield]; + boost::shared_ptr<FieldMatcher>& fm = _fields[subfield]; if (!fm) fm.reset(new FieldMatcher(!include)); @@ -482,7 +689,7 @@ namespace mongo { } BSONObj FieldMatcher::getSpec() const{ - return source_; + return _source; } //b will be the value part of an array-typed BSONElement @@ -509,7 +716,7 @@ namespace mongo { break; } default: - if (include_) + if (_include) b.appendAs(e, b.numStr(i++).c_str()); } @@ -518,18 +725,20 @@ namespace mongo { } void FieldMatcher::append( BSONObjBuilder& b , const BSONElement& e ) const { - FieldMap::const_iterator field = fields_.find( e.fieldName() ); + FieldMap::const_iterator field = _fields.find( e.fieldName() ); - if (field == fields_.end()){ - if (include_) + if (field == _fields.end()){ + if (_include) b.append(e); - } else { + } + else { FieldMatcher& subfm = *field->second; - - if (subfm.fields_.empty() || !(e.type()==Object || e.type()==Array) ){ - if (subfm.include_) + + if (subfm._fields.empty() || !(e.type()==Object || e.type()==Array) ){ + if (subfm._include) b.append(e); - } else if (e.type() == Object){ + } + else if (e.type() == Object){ BSONObjBuilder subb; BSONObjIterator it(e.embeddedObject()); while (it.more()){ @@ -537,7 +746,8 @@ namespace mongo { } b.append(e.fieldName(), subb.obj()); - } else { //Array + } + else { //Array BSONObjBuilder subb; subfm.appendArray(subb, e.embeddedObject()); b.appendArray(e.fieldName(), subb.obj()); diff --git a/db/queryutil.h b/db/queryutil.h index 2122a7f..7d8be78 100644 --- a/db/queryutil.h +++ b/db/queryutil.h @@ -48,8 +48,9 @@ namespace mongo { // determine index limits class FieldRange { public: - FieldRange( const BSONElement &e = BSONObj().firstElement() , bool optimize=true ); + FieldRange( const BSONElement &e = BSONObj().firstElement() , bool isNot=false , bool optimize=true ); const FieldRange &operator&=( const FieldRange &other ); + const FieldRange &operator|=( const FieldRange &other ); BSONElement min() const { assert( !empty() ); return intervals_[ 0 ].lower_.bound_; } BSONElement max() const { assert( !empty() ); return intervals_[ intervals_.size() - 1 ].upper_.bound_; } bool minInclusive() const { assert( !empty() ); return intervals_[ 0 ].lower_.inclusive_; } @@ -69,11 +70,13 @@ namespace mongo { } bool empty() const { return intervals_.empty(); } const vector< FieldInterval > &intervals() const { return intervals_; } + string getSpecial() const { return _special; } + private: BSONObj addObj( const BSONObj &o ); - string simpleRegexEnd( string regex ); vector< FieldInterval > intervals_; vector< BSONObj > objData_; + string _special; }; // implements query pattern matching, used to determine if a query is @@ -171,7 +174,9 @@ namespace mongo { } QueryPattern pattern( const BSONObj &sort = BSONObj() ) const; BoundList indexBounds( const BSONObj &keyPattern, int direction ) const; + string getSpecial() const; private: + void processOpElement( const char *fieldName, 
const BSONElement &f, bool isNot, bool optimize ); static FieldRange *trivialRange_; static FieldRange &trivialRange(); mutable map< string, FieldRange > ranges_; @@ -185,26 +190,34 @@ namespace mongo { class FieldMatcher { public: - FieldMatcher(bool include=false) : errmsg(NULL), include_(include) {} + FieldMatcher(bool include=false) : _include(include){} void add( const BSONObj& o ); void append( BSONObjBuilder& b , const BSONElement& e ) const; BSONObj getSpec() const; - - const char* errmsg; //null if FieldMatcher is valid private: void add( const string& field, bool include ); void appendArray( BSONObjBuilder& b , const BSONObj& a ) const; - bool include_; // true if default at this level is to include + bool _include; // true if default at this level is to include //TODO: benchmark vector<pair> vs map typedef map<string, boost::shared_ptr<FieldMatcher> > FieldMap; - FieldMap fields_; - BSONObj source_; + FieldMap _fields; + BSONObj _source; }; + /** returns a string that when used as a matcher, would match a super set of regex() + returns "" for complex regular expressions + used to optimize queries in some simple regex cases that start with '^' + + if purePrefix != NULL, sets it to whether the regex can be converted to a range query + */ + string simpleRegex(const char* regex, const char* flags, bool* purePrefix=NULL); + + /** returns the upper bound of a query that matches prefix */ + string simpleRegexEnd( string prefix ); } // namespace mongo @@ -1,4 +1,20 @@ // rec.h +/* + * Copyright (C) 2010 10gen Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License, version 3, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + /* TODO for _RECSTORE diff --git a/db/reccache.cpp b/db/reccache.cpp index 66dd4e3..6e1f3de 100644 --- a/db/reccache.cpp +++ b/db/reccache.cpp @@ -1,134 +1,150 @@ -// storage.cpp
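
The simpleRegex()/simpleRegexEnd() helpers declared in queryutil.h above turn an anchored, literal-prefix regex into an index range scan: /^abc/ can only match keys in [ "abc", "abd" ). A minimal standalone sketch of the idea (my own simplified version, not the patch's actual implementation, which also handles flags, escaping, and the purePrefix flag):

#include <cassert>
#include <cctype>
#include <string>

// Simplified sketch: extract a literal prefix from an anchored regex.
// The real simpleRegex() understands much more of the regex grammar.
static std::string literalPrefix(const std::string& regex) {
    if (regex.empty() || regex[0] != '^')
        return "";                              // not anchored: no range possible
    std::string out;
    for (size_t i = 1; i < regex.size(); ++i) {
        char c = regex[i];
        if (!isalnum((unsigned char)c))
            break;                              // stop at the first metacharacter
        out += c;
    }
    return out;
}

// Mirrors simpleRegexEnd(): increment the last byte of the (non-empty) prefix
// to get an exclusive upper bound for the index scan.
static std::string prefixEnd(std::string prefix) {
    ++prefix[prefix.size() - 1];
    return prefix;
}

int main() {
    std::string p = literalPrefix("^abc.*def");  // "abc"
    assert(p == "abc");
    assert(prefixEnd(p) == "abd");               // scan [ "abc", "abd" )
}
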
-
-#include "stdafx.h"
-#include "pdfile.h"
-#include "reccache.h"
-#include "rec.h"
-#include "db.h"
-
-namespace mongo {
-
-RecCache theRecCache(BucketSize);
-
-// 100k * 8KB = 800MB
-unsigned RecCache::MAXNODES = 50000;
-
-void setRecCacheSize(unsigned mb) {
- unsigned long long MB = mb;
- log(2) << "reccache size: " << MB << "MB\n";
- uassert( 10114 , "bad cache size", MB > 0 && MB < 1000000 );
- RecCache::MAXNODES = (unsigned) MB * 1024 * 1024 / 8192;
- log(3) << "RecCache::MAXNODES=" << RecCache::MAXNODES << '\n';
-}
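
The arithmetic in setRecCacheSize() above maps a megabyte budget onto a count of 8KB cache nodes. A small worked example (hypothetical values, not part of the patch):

#include <cstdio>

// mb megabytes of cache / 8KB per node = node budget, as computed above.
static unsigned nodesForCacheMB(unsigned long long mb) {
    return (unsigned)(mb * 1024 * 1024 / 8192);
}

int main() {
    std::printf("64 MB  -> %u nodes\n", nodesForCacheMB(64));   // 8192
    std::printf("800 MB -> %u nodes\n", nodesForCacheMB(800));  // 102400, i.e. ~100k
    return 0;
}

Note that the default of 50000 nodes therefore corresponds to roughly 400MB, not the 800MB in the "100k * 8KB" comment above; the comment appears to describe a different default than the one in the code.
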
-
-void writerThread() {
- sleepsecs(10);
- while( 1 ) {
- try {
- theRecCache.writeLazily();
- }
- catch(...) {
- log() << "exception in writerThread()" << endl;
- sleepsecs(3);
- }
- }
-}
-
+/* + * Copyright (C) 2010 10gen Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License, version 3, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +// storage.cpp + +#include "stdafx.h" +#include "pdfile.h" +#include "reccache.h" +#include "rec.h" +#include "db.h" + +namespace mongo { + +RecCache theRecCache(BucketSize); + +// 100k * 8KB = 800MB +unsigned RecCache::MAXNODES = 50000; + +void setRecCacheSize(unsigned mb) { + unsigned long long MB = mb; + log(2) << "reccache size: " << MB << "MB\n"; + uassert( 10114 , "bad cache size", MB > 0 && MB < 1000000 ); + RecCache::MAXNODES = (unsigned) MB * 1024 * 1024 / 8192; + log(3) << "RecCache::MAXNODES=" << RecCache::MAXNODES << '\n'; +} + +void writerThread() { + sleepsecs(10); + while( 1 ) { + try { + theRecCache.writeLazily(); + } + catch(...) { + log() << "exception in writerThread()" << endl; + sleepsecs(3); + } + } +} + // called on program exit. -void recCacheCloseAll() {
-#if defined(_RECSTORE)
- theRecCache.closing();
-#endif
-}
-
-int ndirtywritten;
-
-inline static string escape(const char *ns) {
- char buf[256];
- char *p = buf;
- while( 1 ) {
- if( *ns == '$' ) *p = '~';
- else
- *p = *ns;
- if( *ns == 0 )
- break;
- p++; ns++;
- }
- assert( p - buf < (int) sizeof(buf) );
- return buf;
-}
-
-inline static string unescape(const char *ns) {
- char buf[256];
- char *p = buf;
- while( 1 ) {
- if( *ns == '~' ) *p = '$';
- else
- *p = *ns;
- if( *ns == 0 )
- break;
- p++; ns++;
- }
- assert( p - buf < (int) sizeof(buf) );
- return buf;
-}
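
escape()/unescape() above exist so that namespaces containing '$' (such as local.oplog.$main) yield filesystem-safe <n>-<ns>.idx names. A hypothetical round-trip demo of the same character mapping:

#include <cassert>
#include <string>

// Same mapping as escape()/unescape() above, on std::string.
static std::string escapeNs(std::string ns) {
    for (size_t i = 0; i < ns.size(); ++i)
        if (ns[i] == '$') ns[i] = '~';
    return ns;
}
static std::string unescapeNs(std::string ns) {
    for (size_t i = 0; i < ns.size(); ++i)
        if (ns[i] == '~') ns[i] = '$';
    return ns;
}

int main() {
    std::string ns = "local.oplog.$main";
    std::string esc = escapeNs(ns);      // "local.oplog.~main"
    assert(unescapeNs(esc) == ns);       // lossless, provided no namespace
                                         // contains a literal '~'
}
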
-
-string RecCache::directory() {
- return cc().database()->path;
-}
-
-/* filename format is
-
- <n>-<ns>.idx
-*/
-
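
The <n>-<ns>.idx convention that _initStore() below parses is easiest to see with a concrete (hypothetical) name: "3-test.foo.idx" means store number 3 for namespace test.foo, and "7-local.oplog.~main.idx" unescapes to local.oplog.$main. A minimal parser mirroring the stringstream logic below:

#include <cassert>
#include <cstring>
#include <sstream>
#include <string>

// Parse "<n>-<ns>.idx" into its store number and escaped namespace,
// mirroring the checks _initStore() performs below.
static bool parseStoreName(const std::string& fname, int& n, std::string& escapedNs) {
    std::istringstream ss(fname);
    char dash;
    if (!(ss >> n >> dash) || n < 0 || dash != '-')
        return false;
    std::string rest;
    ss >> rest;
    const char* q = std::strstr(rest.c_str(), ".idx");
    if (!q)
        return false;
    escapedNs.assign(rest.c_str(), q - rest.c_str());
    return true;
}

int main() {
    int n;
    std::string ns;
    assert(parseStoreName("3-test.foo.idx", n, ns));
    assert(n == 3 && ns == "test.foo");
}
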
-BasicRecStore* RecCache::_initStore(string fname) {
-
- assert( strchr(fname.c_str(), '/') == 0 );
- assert( strchr(fname.c_str(), '\\') == 0 );
-
- stringstream ss(fname);
- int n;
- ss >> n;
- assert( n >= 0 );
- char ch;
- ss >> ch;
- assert( ch == '-' );
- string rest;
- ss >> rest;
- const char *p = rest.c_str();
- const char *q = strstr(p, ".idx");
- assert( q );
- string escaped_ns(p, q-p);
-
- // arbitrary limit. if you are hitting, we should use fewer files and put multiple
- // indexes in a single file (which is easy to do)
- massert( 10374 , "too many index files", n < 10000 );
-
- if( stores.size() < (unsigned)n+1 )
- stores.resize(n+1);
- assert( stores[n] == 0 );
- BasicRecStore *rs = new BasicRecStore(n);
- path pf(directory());
- pf /= fname;
- string full = pf.string();
- rs->init(full.c_str(), recsize);
- stores[n] = rs;
- string ns = unescape(escaped_ns.c_str());
- storesByNsKey[mknskey(ns.c_str())] = rs;
- return rs;
-}
-
-BasicRecStore* RecCache::initStore(int n) {
- string ns;
- {
- stringstream ss;
- ss << '/' << n << '-';
- ns = ss.str();
- }
-
- /* this will be slow if there are thousands of files */
- path dir(directory());
+void recCacheCloseAll() { +#if defined(_RECSTORE) + theRecCache.closing(); +#endif +} + +int ndirtywritten; + +inline static string escape(const char *ns) { + char buf[256]; + char *p = buf; + while( 1 ) { + if( *ns == '$' ) *p = '~'; + else + *p = *ns; + if( *ns == 0 ) + break; + p++; ns++; + } + assert( p - buf < (int) sizeof(buf) ); + return buf; +} + +inline static string unescape(const char *ns) { + char buf[256]; + char *p = buf; + while( 1 ) { + if( *ns == '~' ) *p = '$'; + else + *p = *ns; + if( *ns == 0 ) + break; + p++; ns++; + } + assert( p - buf < (int) sizeof(buf) ); + return buf; +} + +string RecCache::directory() { + return cc().database()->path; +} + +/* filename format is + + <n>-<ns>.idx +*/ + +BasicRecStore* RecCache::_initStore(string fname) { + + assert( strchr(fname.c_str(), '/') == 0 ); + assert( strchr(fname.c_str(), '\\') == 0 ); + + stringstream ss(fname); + int n; + ss >> n; + assert( n >= 0 ); + char ch; + ss >> ch; + assert( ch == '-' ); + string rest; + ss >> rest; + const char *p = rest.c_str(); + const char *q = strstr(p, ".idx"); + assert( q ); + string escaped_ns(p, q-p); + + // arbitrary limit. if you are hitting, we should use fewer files and put multiple + // indexes in a single file (which is easy to do) + massert( 10374 , "too many index files", n < 10000 ); + + if( stores.size() < (unsigned)n+1 ) + stores.resize(n+1); + assert( stores[n] == 0 ); + BasicRecStore *rs = new BasicRecStore(n); + path pf(directory()); + pf /= fname; + string full = pf.string(); + rs->init(full.c_str(), recsize); + stores[n] = rs; + string ns = unescape(escaped_ns.c_str()); + storesByNsKey[mknskey(ns.c_str())] = rs; + return rs; +} + +BasicRecStore* RecCache::initStore(int n) { + string ns; + { + stringstream ss; + ss << '/' << n << '-'; + ns = ss.str(); + } + + /* this will be slow if there are thousands of files */ + path dir(directory()); directory_iterator end; try { directory_iterator i(dir); @@ -152,27 +168,27 @@ BasicRecStore* RecCache::initStore(int n) { } stringstream ss; ss << "index datafile missing? n=" << n; - uasserted(12500,ss.str());
- return 0;
-}
-
-/* find the filename for a given ns.
- format is
- <n>-<escaped_ns>.idx
- returns filename. found is true if found. If false, a proposed name is returned for (optional) creation
- of the file.
-*/
-string RecCache::findStoreFilename(const char *_ns, bool& found) {
- string namefrag;
- {
- stringstream ss;
- ss << '-';
- ss << escape(_ns);
- ss << ".idx";
- namefrag = ss.str();
- }
-
- path dir(directory());
+ uasserted(12500,ss.str()); + return 0; +} + +/* find the filename for a given ns. + format is + <n>-<escaped_ns>.idx + returns filename. found is true if found. If false, a proposed name is returned for (optional) creation + of the file. +*/ +string RecCache::findStoreFilename(const char *_ns, bool& found) { + string namefrag; + { + stringstream ss; + ss << '-'; + ss << escape(_ns); + ss << ".idx"; + namefrag = ss.str(); + } + + path dir(directory()); directory_iterator end; int nmax = -1; try { @@ -204,198 +220,198 @@ string RecCache::findStoreFilename(const char *_ns, bool& found) { ss << nmax+1 << namefrag; found = false; return ss.str(); -}
-
-void RecCache::initStoreByNs(const char *_ns, const string& nskey) {
- bool found;
- string fn = findStoreFilename(_ns, found);
+} + +void RecCache::initStoreByNs(const char *_ns, const string& nskey) { + bool found; + string fn = findStoreFilename(_ns, found); _initStore(fn); -}
-
-inline void RecCache::writeIfDirty(Node *n) {
- if( n->dirty ) {
- ndirtywritten++;
- n->dirty = false;
- store(n->loc).update(fileOfs(n->loc), n->data, recsize);
- }
-}
-
-void RecCache::closeFiles(string dbname, string path) {
- assertInWriteLock();
- boostlock lk(rcmutex);
-
- // first we write all dirty pages. it is not easy to check which Nodes are for a particular
- // db, so we just write them all.
- writeDirty( dirtyl.begin(), true );
-
- string key = path + dbname + '.';
- unsigned sz = key.size();
- for( map<string, BasicRecStore*>::iterator i = storesByNsKey.begin(); i != storesByNsKey.end(); i++ ) {
- map<string, BasicRecStore*>::iterator j = i;
- i++;
- if( strncmp(j->first.c_str(), key.c_str(), sz) == 0 ) {
- assert( stores[j->second->fileNumber] != 0 );
- stores[j->second->fileNumber] = 0;
- delete j->second;
- storesByNsKey.erase(j);
- }
- }
-}
-
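One subtlety in closeFiles() above: the iterator i is advanced both by the for statement and inside the loop body, so the loop can skip every other entry and can even step past end() when the last element matches. A sketch of the conventional erase idiom, reusing the original's names (an observation about this code, not a change the patch itself makes):

    for( map<string, BasicRecStore*>::iterator i = storesByNsKey.begin(); i != storesByNsKey.end(); ) {
        map<string, BasicRecStore*>::iterator j = i++;   // advance exactly once
        if( strncmp(j->first.c_str(), key.c_str(), sz) == 0 ) {
            assert( stores[j->second->fileNumber] != 0 );
            stores[j->second->fileNumber] = 0;
            delete j->second;
            storesByNsKey.erase(j);                      // j is now invalid; i is safe
        }
    }
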
-void RecCache::closing() {
- boostlock lk(rcmutex);
- (cout << "TEMP: recCacheCloseAll() writing dirty pages...\n").flush();
- writeDirty( dirtyl.begin(), true );
- for( unsigned i = 0; i < stores.size(); i++ ) {
- if( stores[i] ) {
- delete stores[i];
- }
- }
- (cout << "TEMP: write dirty done\n").flush();
-}
-
-/* note that this is written in order, as much as possible, given that dirtyl is of type set. */
-void RecCache::writeDirty( set<DiskLoc>::iterator startAt, bool rawLog ) {
- try {
- ndirtywritten=0;
- for( set<DiskLoc>::iterator i = startAt; i != dirtyl.end(); i++ ) {
- map<DiskLoc, Node*>::iterator j = m.find(*i);
- if( j != m.end() )
- writeIfDirty(j->second);
- }
- OCCASIONALLY out() << "TEMP: ndirtywritten: " << ndirtywritten << endl;
- }
- catch(...) {
+} + +inline void RecCache::writeIfDirty(Node *n) { + if( n->dirty ) { + ndirtywritten++; + n->dirty = false; + store(n->loc).update(fileOfs(n->loc), n->data, recsize); + } +} + +void RecCache::closeFiles(string dbname, string path) { + assertInWriteLock(); + scoped_lock lk(rcmutex); + + // first we write all dirty pages. it is not easy to check which Nodes are for a particular + // db, so we just write them all. + writeDirty( dirtyl.begin(), true ); + + string key = path + dbname + '.'; + unsigned sz = key.size(); + for( map<string, BasicRecStore*>::iterator i = storesByNsKey.begin(); i != storesByNsKey.end(); i++ ) { + map<string, BasicRecStore*>::iterator j = i; + i++; + if( strncmp(j->first.c_str(), key.c_str(), sz) == 0 ) { + assert( stores[j->second->fileNumber] != 0 ); + stores[j->second->fileNumber] = 0; + delete j->second; + storesByNsKey.erase(j); + } + } +} + +void RecCache::closing() { + scoped_lock lk(rcmutex); + (cout << "TEMP: recCacheCloseAll() writing dirty pages...\n").flush(); + writeDirty( dirtyl.begin(), true ); + for( unsigned i = 0; i < stores.size(); i++ ) { + if( stores[i] ) { + delete stores[i]; + } + } + (cout << "TEMP: write dirty done\n").flush(); +} + +/* note that this is written in order, as much as possible, given that dirtyl is of type set. */ +void RecCache::writeDirty( set<DiskLoc>::iterator startAt, bool rawLog ) { + try { + ndirtywritten=0; + for( set<DiskLoc>::iterator i = startAt; i != dirtyl.end(); i++ ) { + map<DiskLoc, Node*>::iterator j = m.find(*i); + if( j != m.end() ) + writeIfDirty(j->second); + } + OCCASIONALLY out() << "TEMP: ndirtywritten: " << ndirtywritten << endl; + } + catch(...) { const char *message = "Problem: bad() in RecCache::writeDirty, file io error\n"; if ( rawLog ) rawOut( message ); else ( log() << message ).flush(); - }
- dirtyl.clear();
-}
-
-void RecCache::writeLazily() {
- int sleep = 0;
- int k;
- {
- boostlock lk(rcmutex);
- Timer t;
- set<DiskLoc>::iterator i = dirtyl.end();
- for( k = 0; k < 100; k++ ) {
- if( i == dirtyl.begin() ) {
- // we're not very far behind
- sleep = k < 20 ? 2000 : 1000;
- break;
- }
- i--;
- }
- writeDirty(i);
- if( sleep == 0 ) {
- sleep = t.millis() * 4 + 10;
- }
- }
-
- OCCASIONALLY cout << "writeLazily " << k << " sleep:" << sleep << '\n';
- sleepmillis(sleep);
-}
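
writeLazily() above flushes at most 100 of the oldest dirty pages per pass, then chooses its own sleep: a fixed 1-2s sleep when the backlog is small, otherwise a sleep proportional to how long the pass took. Restated as a standalone function (a sketch of the policy, not code from the patch):

#include <cstdio>

// k = number of dirty pages found this pass (capped at 100 above),
// passMillis = how long writing them took.
static int nextSleepMillis(int k, int passMillis) {
    if (k < 100)                       // backlog fit in one pass:
        return k < 20 ? 2000 : 1000;   // nearly idle sleeps longest
    return passMillis * 4 + 10;        // behind: writes use ~20% of wall time
}

int main() {
    std::printf("%d\n", nextSleepMillis(5, 0));     // 2000ms, cache nearly clean
    std::printf("%d\n", nextSleepMillis(100, 50));  // 210ms, still catching up
}
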
-
-void RecCache::_ejectOld() {
- boostlock lk(rcmutex);
- if( nnodes <= MAXNODES )
- return;
- Node *n = oldest;
- while( 1 ) {
- if( nnodes <= MAXNODES - 4 ) {
- n->older = 0;
- oldest = n;
- assert( oldest ) ;
- break;
- }
- nnodes--;
- assert(n);
- Node *nxt = n->newer;
- writeIfDirty(n);
- m.erase(n->loc);
- delete n;
- n = nxt;
- }
-}
-
-void RecCache::dump() {
- Node *n = oldest;
- Node *last = 0;
- while( n ) {
- assert( n->older == last );
- last = n;
-// cout << n << ' ' << n->older << ' ' << n->newer << '\n';
- n=n->newer;
- }
- assert( newest == last );
-// cout << endl;
-}
-
-/* cleans up everything EXCEPT storesByNsKey.
- note this function is slow and should not be invoked often
-*/
-void RecCache::closeStore(BasicRecStore *rs) {
- int n = rs->fileNumber + Base;
- for( set<DiskLoc>::iterator i = dirtyl.begin(); i != dirtyl.end(); ) {
- DiskLoc k = *i++;
- if( k.a() == n )
- dirtyl.erase(k);
- }
-
- for( map<DiskLoc,Node*>::iterator i = m.begin(); i != m.end(); ) {
- DiskLoc k = i->first;
- i++;
- if( k.a() == n )
- m.erase(k);
- }
-
- assert( stores[rs->fileNumber] != 0 );
- stores[rs->fileNumber] = 0;
-/*
- for( unsigned i = 0; i < stores.size(); i++ ) {
- if( stores[i] == rs ) {
- stores[i] = 0;
- break;
- }
- }*/
- delete rs; // closes file
-}
-
-void RecCache::drop(const char *_ns) {
- // todo: test with a non clean shutdown file
- boostlock lk(rcmutex);
-
- map<string, BasicRecStore*>::iterator it = storesByNsKey.find(mknskey(_ns));
- string fname;
- if( it != storesByNsKey.end() ) {
- fname = it->second->filename;
- closeStore(it->second); // cleans up stores[] etc.
- storesByNsKey.erase(it);
- }
- else {
- bool found;
- fname = findStoreFilename(_ns, found);
- if( !found ) {
- log() << "RecCache::drop: no idx file found for " << _ns << endl;
- return;
- }
- path pf(directory());
- pf /= fname;
- fname = pf.string();
- }
- try {
- if( !boost::filesystem::exists(fname) )
- log() << "RecCache::drop: can't find file to remove " << fname << endl;
- boost::filesystem::remove(fname);
- }
- catch(...) {
- log() << "RecCache::drop: exception removing file " << fname << endl;
- }
-}
-
-}
+ } + dirtyl.clear(); +} + +void RecCache::writeLazily() { + int sleep = 0; + int k; + { + scoped_lock lk(rcmutex); + Timer t; + set<DiskLoc>::iterator i = dirtyl.end(); + for( k = 0; k < 100; k++ ) { + if( i == dirtyl.begin() ) { + // we're not very far behind + sleep = k < 20 ? 2000 : 1000; + break; + } + i--; + } + writeDirty(i); + if( sleep == 0 ) { + sleep = t.millis() * 4 + 10; + } + } + + OCCASIONALLY cout << "writeLazily " << k << " sleep:" << sleep << '\n'; + sleepmillis(sleep); +} + +void RecCache::_ejectOld() { + scoped_lock lk(rcmutex); + if( nnodes <= MAXNODES ) + return; + Node *n = oldest; + while( 1 ) { + if( nnodes <= MAXNODES - 4 ) { + n->older = 0; + oldest = n; + assert( oldest ) ; + break; + } + nnodes--; + assert(n); + Node *nxt = n->newer; + writeIfDirty(n); + m.erase(n->loc); + delete n; + n = nxt; + } +} + +void RecCache::dump() { + Node *n = oldest; + Node *last = 0; + while( n ) { + assert( n->older == last ); + last = n; +// cout << n << ' ' << n->older << ' ' << n->newer << '\n'; + n=n->newer; + } + assert( newest == last ); +// cout << endl; +} + +/* cleans up everything EXCEPT storesByNsKey. + note this function is slow should not be invoked often +*/ +void RecCache::closeStore(BasicRecStore *rs) { + int n = rs->fileNumber + Base; + for( set<DiskLoc>::iterator i = dirtyl.begin(); i != dirtyl.end(); ) { + DiskLoc k = *i++; + if( k.a() == n ) + dirtyl.erase(k); + } + + for( map<DiskLoc,Node*>::iterator i = m.begin(); i != m.end(); ) { + DiskLoc k = i->first; + i++; + if( k.a() == n ) + m.erase(k); + } + + assert( stores[rs->fileNumber] != 0 ); + stores[rs->fileNumber] = 0; +/* + for( unsigned i = 0; i < stores.size(); i++ ) { + if( stores[i] == rs ) { + stores[i] = 0; + break; + } + }*/ + delete rs; // closes file +} + +void RecCache::drop(const char *_ns) { + // todo: test with a non clean shutdown file + scoped_lock lk(rcmutex); + + map<string, BasicRecStore*>::iterator it = storesByNsKey.find(mknskey(_ns)); + string fname; + if( it != storesByNsKey.end() ) { + fname = it->second->filename; + closeStore(it->second); // cleans up stores[] etc. + storesByNsKey.erase(it); + } + else { + bool found; + fname = findStoreFilename(_ns, found); + if( !found ) { + log() << "RecCache::drop: no idx file found for " << _ns << endl; + return; + } + path pf(directory()); + pf /= fname; + fname = pf.string(); + } + try { + if( !boost::filesystem::exists(fname) ) + log() << "RecCache::drop: can't find file to remove " << fname << endl; + boost::filesystem::remove(fname); + } + catch(...) { + log() << "RecCache::drop: exception removing file " << fname << endl; + } +} + +} diff --git a/db/reccache.h b/db/reccache.h index 42943c5..d354587 100644 --- a/db/reccache.h +++ b/db/reccache.h @@ -1,4 +1,20 @@ // reccache.h +/* + * Copyright (C) 2010 10gen Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License, version 3, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ */ + /* CachedBasicRecStore This is our store which implements a traditional page-cache type of storage @@ -33,7 +49,7 @@ class RecCache { bool dirty; Node *older, *newer; // lru }; - boost::mutex &rcmutex; // mainly to coordinate with the lazy writer thread + mongo::mutex rcmutex; // mainly to coordinate with the lazy writer thread unsigned recsize; map<DiskLoc, Node*> m; // the cache Node *newest, *oldest; @@ -118,7 +134,7 @@ private: public: /* all public functions (except constructor) should use the mutex */ - RecCache(unsigned recsz) : rcmutex( *( new boost::mutex() ) ), recsize(recsz) { + RecCache(unsigned recsz) : recsize(recsz) { nnodes = 0; newest = oldest = 0; } @@ -140,7 +156,7 @@ public: */ void dirty(DiskLoc d) { assert( d.a() >= Base ); - boostlock lk(rcmutex); + scoped_lock lk(rcmutex); map<DiskLoc, Node*>::iterator i = m.find(d); if( i != m.end() ) { Node *n = i->second; @@ -155,7 +171,7 @@ public: assert( d.a() >= Base ); assert( len == recsize ); - boostlock lk(rcmutex); + scoped_lock lk(rcmutex); map<DiskLoc, Node*>::iterator i = m.find(d); if( i != m.end() ) { touch(i->second); @@ -172,7 +188,7 @@ public: void drop(const char *ns); DiskLoc insert(const char *ns, const void *obuf, int len, bool god) { - boostlock lk(rcmutex); + scoped_lock lk(rcmutex); BasicRecStore& rs = store(ns); fileofs o = rs.insert((const char *) obuf, len); assert( o % recsize == 0 ); @@ -229,9 +245,11 @@ public: */ inline void dbunlocking_read() { + /* Client *c = currentClient.get(); if ( c ) c->top.clientStop(); + */ } inline void dbunlocking_write() { @@ -1,8 +1,24 @@ // reci.h +/* + * Copyright (C) 2010 10gen Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License, version 3, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + #pragma once -#include "storage.h" +#include "diskloc.h" namespace mongo { diff --git a/db/recstore.h b/db/recstore.h index 2e6a90a..bdb3d77 100644 --- a/db/recstore.h +++ b/db/recstore.h @@ -1,108 +1,124 @@ -// recstore.h
-
-#pragma once
-
-#include "../util/file.h"
-
-namespace mongo {
-
-using boost::uint32_t;
-using boost::uint64_t;
-
-/* Current version supports only consistent record sizes within a store. */
-
-class BasicRecStore {
- struct RecStoreHeader {
- uint32_t version;
- uint32_t recsize;
- uint64_t leof; // logical eof, actual file might be prealloc'd further
- uint64_t firstDeleted; // 0 = no deleted recs
- uint32_t cleanShutdown; // 0 = clean
- char reserved[8192-8-8-4-4-4]; // we want our records page-aligned in the file if they are a multiple of a page's size -- so we make this 8KB with that goal
- RecStoreHeader() {
- version = 65;
- recsize = 0;
- leof = sizeof(RecStoreHeader);
- firstDeleted = 0;
- cleanShutdown = 1;
- memset(reserved, 0, sizeof(reserved));
- }
- };
-
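The reserved array above is sized so the header occupies exactly one 8KB page: the live fields total 4+4+8+8+4 = 28 bytes, and 8192-28 = 8164 bytes of padding follow, which is also why writeHeader() further down writes only the first 28 bytes. A compile-time restatement (a hypothetical mirror of the struct; with this field order the uint64_t members fall on 8-byte offsets, so no implicit padding is inserted):

#include <boost/static_assert.hpp>
#include <boost/cstdint.hpp>

struct HeaderLayout {                         // mirror of RecStoreHeader above
    boost::uint32_t version;                  // offset 0
    boost::uint32_t recsize;                  // offset 4
    boost::uint64_t leof;                     // offset 8 (already 8-byte aligned)
    boost::uint64_t firstDeleted;             // offset 16
    boost::uint32_t cleanShutdown;            // offset 24
    char reserved[8192 - 8 - 8 - 4 - 4 - 4];  // offset 28, 8164 bytes
};

BOOST_STATIC_ASSERT(sizeof(HeaderLayout) == 8192);  // header fills one 8KB page
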
-public:
- BasicRecStore(int _fileNumber) : fileNumber(_fileNumber) { }
- ~BasicRecStore();
- void init(const char *fn, unsigned recsize);
- fileofs insert(const char *buf, unsigned len);
- void update(fileofs o, const char *buf, unsigned len);
- void remove(fileofs o, unsigned len);
- void get(fileofs o, char *buf, unsigned len);
-
- int fileNumber; // this goes in DiskLoc::a
-
- string filename;
-
-private:
-
- void writeHeader();
- File f;
- fileofs len;
- RecStoreHeader h; // h.reserved is wasteful here; fix later.
- void write(fileofs ofs, const char *data, unsigned len) {
- f.write(ofs, data, len);
- massert( 10380 , "basicrecstore write io error", !f.bad());
- }
-};
-
-/* --- implementation --- */
-
-inline BasicRecStore::~BasicRecStore() {
- h.cleanShutdown = 0;
- if( f.is_open() ) {
- writeHeader();
- f.fsync();
- }
-}
-
-inline void BasicRecStore::writeHeader() {
- write(0, (const char *) &h, 28); // update header in file for new leof
- uassert( 10115 , "file io error in BasicRecStore [1]", !f.bad());
-}
-
-inline fileofs BasicRecStore::insert(const char *buf, unsigned reclen) {
- if( h.firstDeleted ) {
- uasserted(11500, "deleted not yet implemented recstoreinsert");
- }
- massert( 10381 , "bad len", reclen == h.recsize);
- fileofs ofs = h.leof;
- h.leof += reclen;
- if( h.leof > len ) {
- // grow the file. we grow quite a bit to avoid excessive file system fragmentation
- len += (len / 8) + h.recsize;
- uassert( 10116 , "recstore file too big for 32 bit", len <= 0x7fffffff || sizeof(std::streamoff) > 4 );
- write(len, "", 0);
- }
- writeHeader();
- write(ofs, buf, reclen);
- uassert( 10117 , "file io error in BasicRecStore [2]", !f.bad());
- return ofs;
-}
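
The growth rule in insert() above, len += (len / 8) + h.recsize, expands the file by 12.5% of its current size plus one record each time the logical eof catches up with the physical length, trading a little preallocation for fewer filesystem fragments. A worked example with hypothetical numbers:

#include <cstdio>

typedef unsigned long long fileofs_t;   // stand-in for fileofs

static fileofs_t grownLength(fileofs_t len, unsigned recsize) {
    return len + (len / 8) + recsize;   // same rule as insert() above
}

int main() {
    // an 8MB store of 8KB records grows to ~9MB in one step:
    fileofs_t len = 8ULL * 1024 * 1024;
    std::printf("%llu -> %llu\n", len, grownLength(len, 8192));
    // 8388608 -> 9445376 (8MB + 1MB + 8KB)
}
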
-
-/* so far, it's ok to read or update a subset of a record */
-
-inline void BasicRecStore::update(fileofs o, const char *buf, unsigned len) {
- assert(o <= h.leof && o >= sizeof(RecStoreHeader));
- write(o, buf, len);
-}
-
-inline void BasicRecStore::get(fileofs o, char *buf, unsigned len) {
- assert(o <= h.leof && o >= sizeof(RecStoreHeader));
- f.read(o, buf, len);
- massert( 10382 , "basicrecstore::get I/O error", !f.bad());
-}
-
-inline void BasicRecStore::remove(fileofs o, unsigned len) {
- uasserted(11501, "not yet implemented recstoreremove");
-}
-
-}
+// recstore.h +/* + * Copyright (C) 2010 10gen Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License, version 3, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + + +#pragma once + +#include "../util/file.h" + +namespace mongo { + +using boost::uint32_t; +using boost::uint64_t; + +/* Current version supports only consistent record sizes within a store. */ + +class BasicRecStore { + struct RecStoreHeader { + uint32_t version; + uint32_t recsize; + uint64_t leof; // logical eof, actual file might be prealloc'd further + uint64_t firstDeleted; // 0 = no deleted recs + uint32_t cleanShutdown; // 0 = clean + char reserved[8192-8-8-4-4-4]; // we want our records page-aligned in the file if they are a multiple of a page's size -- so we make this 8KB with that goal + RecStoreHeader() { + version = 65; + recsize = 0; + leof = sizeof(RecStoreHeader); + firstDeleted = 0; + cleanShutdown = 1; + memset(reserved, 0, sizeof(reserved)); + } + }; + +public: + BasicRecStore(int _fileNumber) : fileNumber(_fileNumber) { } + ~BasicRecStore(); + void init(const char *fn, unsigned recsize); + fileofs insert(const char *buf, unsigned len); + void update(fileofs o, const char *buf, unsigned len); + void remove(fileofs o, unsigned len); + void get(fileofs o, char *buf, unsigned len); + + int fileNumber; // this goes in DiskLoc::a + + string filename; + +private: + + void writeHeader(); + File f; + fileofs len; + RecStoreHeader h; // h.reserved is wasteful here; fix later. + void write(fileofs ofs, const char *data, unsigned len) { + f.write(ofs, data, len); + massert( 10380 , "basicrecstore write io error", !f.bad()); + } +}; + +/* --- implementation --- */ + +inline BasicRecStore::~BasicRecStore() { + h.cleanShutdown = 0; + if( f.is_open() ) { + writeHeader(); + f.fsync(); + } +} + +inline void BasicRecStore::writeHeader() { + write(0, (const char *) &h, 28); // update header in file for new leof + uassert( 10115 , "file io error in BasicRecStore [1]", !f.bad()); +} + +inline fileofs BasicRecStore::insert(const char *buf, unsigned reclen) { + if( h.firstDeleted ) { + uasserted(11500, "deleted not yet implemented recstoreinsert"); + } + massert( 10381 , "bad len", reclen == h.recsize); + fileofs ofs = h.leof; + h.leof += reclen; + if( h.leof > len ) { + // grow the file. 
we grow quite a bit to avoid excessive file system fragmentations + len += (len / 8) + h.recsize; + uassert( 10116 , "recstore file too big for 32 bit", len <= 0x7fffffff || sizeof(std::streamoff) > 4 ); + write(len, "", 0); + } + writeHeader(); + write(ofs, buf, reclen); + uassert( 10117 , "file io error in BasicRecStore [2]", !f.bad()); + return ofs; +} + +/* so far, it's ok to read or update a subset of a record */ + +inline void BasicRecStore::update(fileofs o, const char *buf, unsigned len) { + assert(o <= h.leof && o >= sizeof(RecStoreHeader)); + write(o, buf, len); +} + +inline void BasicRecStore::get(fileofs o, char *buf, unsigned len) { + assert(o <= h.leof && o >= sizeof(RecStoreHeader)); + f.read(o, buf, len); + massert( 10382 , "basicrestore::get I/O error", !f.bad()); +} + +inline void BasicRecStore::remove(fileofs o, unsigned len) { + uasserted(11501, "not yet implemented recstoreremove"); +} + +} diff --git a/db/repl.cpp b/db/repl.cpp index 04c8d73..62b2986 100644 --- a/db/repl.cpp +++ b/db/repl.cpp @@ -1,10 +1,8 @@ // repl.cpp /* TODO - PAIRING _ on a syncexception, don't allow going back to master state? - */ /** @@ -39,6 +37,7 @@ #include "repl.h" #include "../util/message.h" #include "../client/dbclient.h" +#include "../client/connpool.h" #include "pdfile.h" #include "query.h" #include "db.h" @@ -47,6 +46,9 @@ #include "cmdline.h" namespace mongo { + + // our config from command line etc. + ReplSettings replSettings; void ensureHaveIdIndex(const char *ns); @@ -63,11 +65,12 @@ namespace mongo { */ const char *replAllDead = 0; - extern bool autoresync; time_t lastForcedResync = 0; IdTracker &idTracker = *( new IdTracker() ); + int __findingStartInitialTimeout = 5; // configurable for testing + } // namespace mongo #include "replset.h" @@ -137,6 +140,7 @@ namespace mongo { virtual bool logTheOp() { return false; } + virtual LockType locktype(){ return WRITE; } CmdReplacePeer() : Command("replacepeer") { } virtual bool run(const char *ns, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) { if ( replPair == 0 ) { @@ -196,9 +200,12 @@ namespace mongo { virtual bool logTheOp() { return false; } + virtual LockType locktype(){ return WRITE; } CmdForceDead() : Command("forcedead") { } virtual bool run(const char *ns, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) { - replAllDead = "forced by command"; + replAllDead = "replication forced to stop by 'forcedead' command"; + log() << "*********************************************************\n"; + log() << "received 'forcedead' command, replication forced to stop" << endl; return true; } } cmdForceDead; @@ -215,6 +222,7 @@ namespace mongo { virtual bool logTheOp() { return false; } + virtual LockType locktype(){ return WRITE; } CmdResync() : Command("resync") { } virtual bool run(const char *ns, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) { if ( cmdObj.getBoolField( "force" ) ) { @@ -253,12 +261,85 @@ namespace mongo { } } cmdResync; + bool anyReplEnabled(){ + return replPair || replSettings.slave || replSettings.master; + } + + void appendReplicationInfo( BSONObjBuilder& result , bool authed , int level ){ + + if ( replAllDead ) { + result.append("ismaster", 0.0); + if( authed ) { + if ( replPair ) + result.append("remote", replPair->remote); + } + string s = string("dead: ") + replAllDead; + result.append("info", s); + } + else if ( replPair ) { + result.append("ismaster", replPair->state); + if( authed ) { + result.append("remote", replPair->remote); + if ( 
!replPair->info.empty() ) + result.append("info", replPair->info); + } + } + else { + result.append("ismaster", replSettings.slave ? 0 : 1); + result.append("msg", "not paired"); + } + + if ( level ){ + BSONObjBuilder sources( result.subarrayStart( "sources" ) ); + + readlock lk( "local.sources" ); + Client::Context ctx( "local.sources" ); + auto_ptr<Cursor> c = findTableScan("local.sources", BSONObj()); + int n = 0; + while ( c->ok() ){ + BSONObj s = c->current(); + + BSONObjBuilder bb; + bb.append( s["host"] ); + string sourcename = s["source"].valuestr(); + if ( sourcename != "main" ) + bb.append( s["source"] ); + + { + BSONElement e = s["syncedTo"]; + BSONObjBuilder t( bb.subobjStart( "syncedTo" ) ); + t.appendDate( "time" , e.timestampTime() ); + t.append( "inc" , e.timestampInc() ); + t.done(); + } + + if ( level > 1 ){ + dbtemprelease unlock; + ScopedDbConnection conn( s["host"].valuestr() ); + BSONObj first = conn->findOne( (string)"local.oplog.$" + sourcename , Query().sort( BSON( "$natural" << 1 ) ) ); + BSONObj last = conn->findOne( (string)"local.oplog.$" + sourcename , Query().sort( BSON( "$natural" << -1 ) ) ); + bb.appendDate( "masterFirst" , first["ts"].timestampTime() ); + bb.appendDate( "masterLast" , last["ts"].timestampTime() ); + double lag = (double) (last["ts"].timestampTime() - s["syncedTo"].timestampTime()); + bb.append( "lagSeconds" , lag / 1000 ); + conn.done(); + } + + sources.append( BSONObjBuilder::numStr( n++ ) , bb.obj() ); + c->advance(); + } + + sources.done(); + } + } + class CmdIsMaster : public Command { public: virtual bool requiresAuth() { return false; } virtual bool slaveOk() { return true; } + virtual LockType locktype(){ return NONE; } CmdIsMaster() : Command("ismaster") { } virtual bool run(const char *ns, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool /*fromRepl*/) { /* currently request to arbiter is (somewhat arbitrarily) an ismaster request that is not @@ -266,30 +347,9 @@ namespace mongo { we allow unauthenticated ismaster but we aren't as verbose informationally if one is not authenticated for admin db to be safe. */ - AuthenticationInfo *ai = currentClient.get()->ai; - bool authed = ai->isAuthorized("admin"); - - if ( replAllDead ) { - result.append("ismaster", 0.0); - if( authed ) { - if ( replPair ) - result.append("remote", replPair->remote); - result.append("info", replAllDead); - } - } - else if ( replPair ) { - result.append("ismaster", replPair->state); - if( authed ) { - result.append("remote", replPair->remote); - if ( !replPair->info.empty() ) - result.append("info", replPair->info); - } - } - else { - result.append("ismaster", slave ? 
0 : 1); - result.append("msg", "not paired"); - } + bool authed = cc().getAuthenticationInfo()->isAuthorizedReads("admin"); + appendReplicationInfo( result , authed ); return true; } } cmdismaster; @@ -300,6 +360,7 @@ namespace mongo { virtual bool slaveOk() { return true; } + virtual LockType locktype(){ return WRITE; } CmdIsInitialSyncComplete() : Command( "isinitialsynccomplete" ) {} virtual bool run(const char *ns, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool /*fromRepl*/) { result.appendBool( "initialsynccomplete", getInitialSyncCompleted() ); @@ -333,7 +394,7 @@ namespace mongo { virtual bool adminOnly() { return true; } - + virtual LockType locktype(){ return WRITE; } virtual bool run(const char *ns, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool) { if ( replPair == 0 ) { massert( 10383 , "Another mongod instance believes incorrectly that this node is its peer", !cmdObj.getBoolField( "fromArbiter" ) ); @@ -541,12 +602,13 @@ namespace mongo { BSONObj o = jsobj(); log( 1 ) << "Saving repl source: " << o << endl; - OpDebug debug; - setClient("local.sources"); - UpdateResult res = updateObjects("local.sources", o, pattern, true/*upsert for pair feature*/, false,false,debug); - assert( ! res.mod ); - assert( res.num == 1 ); - cc().clearns(); + { + OpDebug debug; + Client::Context ctx("local.sources"); + UpdateResult res = updateObjects("local.sources", o, pattern, true/*upsert for pair feature*/, false,false,debug); + assert( ! res.mod ); + assert( res.num == 1 ); + } if ( replacing ) { /* if we were in "replace" mode, we now have synced up with the replacement, @@ -578,13 +640,13 @@ namespace mongo { and cursor in effect. */ void ReplSource::loadAll(SourceVector &v) { + Client::Context ctx("local.sources"); SourceVector old = v; v.clear(); bool gotPairWith = false; if ( !cmdLine.source.empty() ) { - setClient("local.sources"); // --source <host> specified. // check that no items are in sources other than that // add if missing @@ -594,8 +656,8 @@ namespace mongo { n++; ReplSource tmp(c->current()); if ( tmp.hostName != cmdLine.source ) { - log() << "--source " << cmdLine.source << " != " << tmp.hostName << " from local.sources collection" << endl; - log() << "terminating after 30 seconds" << endl; + log() << "repl: --source " << cmdLine.source << " != " << tmp.hostName << " from local.sources collection" << endl; + log() << "repl: terminating mongod after 30 seconds" << endl; sleepsecs(30); dbexit( EXIT_REPLICATION_ERROR ); } @@ -626,8 +688,10 @@ namespace mongo { if ( replPair ) { const string &remote = replPair->remote; - setClient( "local.sources" ); // --pairwith host specified. 
+ if ( replSettings.fastsync ) { + Helpers::emptyCollection( "local.sources" ); // ignore saved sources + } // check that no items are in sources other than that // add if missing auto_ptr<Cursor> c = findTableScan("local.sources", BSONObj()); @@ -652,7 +716,6 @@ namespace mongo { } } - setClient("local.sources"); auto_ptr<Cursor> c = findTableScan("local.sources", BSONObj()); while ( c->ok() ) { ReplSource tmp(c->current()); @@ -664,11 +727,21 @@ namespace mongo { tmp.syncedTo = OpTime(); tmp.replacing = true; } + } + if ( ( !replPair && tmp.syncedTo.isNull() ) || + ( replPair && replSettings.fastsync ) ) { + DBDirectClient c; + if ( c.exists( "local.oplog.$main" ) ) { + BSONObj op = c.findOne( "local.oplog.$main", Query().sort( BSON( "$natural" << -1 ) ) ); + if ( !op.isEmpty() ) { + tmp.syncedTo = op[ "ts" ].date(); + tmp._lastSavedLocalTs = op[ "ts" ].date(); + } + } } addSourceToList(v, tmp, c->current(), old); c->advance(); } - cc().clearns(); if ( !gotPairWith && replPair ) { /* add the --pairwith server */ @@ -732,7 +805,7 @@ namespace mongo { string ReplSource::resyncDrop( const char *db, const char *requester ) { log() << "resync: dropping database " << db << endl; string dummyns = string( db ) + "."; - setClient(dummyns.c_str()); + Client::Context ctx(dummyns); assert( cc().database()->name == db ); dropDatabase(dummyns.c_str()); return dummyns; @@ -741,9 +814,9 @@ namespace mongo { /* grab initial copy of a database from the master */ bool ReplSource::resync(string db) { string dummyNs = resyncDrop( db.c_str(), "internal" ); - setClient( dummyNs.c_str() ); + Client::Context ctx( dummyNs ); { - log() << "resync: cloning database " << db << endl; + log() << "resync: cloning database " << db << " to get an initial copy" << endl; ReplInfo r("resync: cloning a database"); string errmsg; bool ok = cloneFrom(hostName.c_str(), errmsg, cc().database()->name, false, /*slaveok*/ true, /*replauth*/ true, /*snapshot*/false); @@ -753,7 +826,7 @@ namespace mongo { } } - log() << "resync: done " << db << endl; + log() << "resync: done with initial clone for db: " << db << endl; return true; } @@ -864,29 +937,21 @@ namespace mongo { throw SyncException(); } - bool justCreated; - try { - justCreated = setClient(ns); - } catch ( AssertionException& ) { - problem() << "skipping bad(?) 
op in oplog, setClient() failed, ns: '" << ns << "'\n"; - addDbNextPass.erase(clientName); - return; - } + Client::Context ctx( ns ); - bool empty = cc().database()->isEmpty(); + bool empty = ctx.db()->isEmpty(); bool incompleteClone = incompleteCloneDbs.count( clientName ) != 0; - log( 6 ) << "ns: " << ns << ", justCreated: " << justCreated << ", empty: " << empty << ", incompleteClone: " << incompleteClone << endl; - - // always apply admin command command - // this is a bit hacky -- the semantics of replication/commands aren't well specified - if ( strcmp( clientName, "admin" ) == 0 && *op.getStringField( "op" ) == 'c' ) { - applyOperation( op ); - cc().clearns(); - return; - } + log( 6 ) << "ns: " << ns << ", justCreated: " << ctx.justCreated() << ", empty: " << empty << ", incompleteClone: " << incompleteClone << endl; - if ( justCreated || empty || incompleteClone ) { + // always apply admin command command + // this is a bit hacky -- the semantics of replication/commands aren't well specified + if ( strcmp( clientName, "admin" ) == 0 && *op.getStringField( "op" ) == 'c' ) { + applyOperation( op ); + return; + } + + if ( ctx.justCreated() || empty || incompleteClone ) { // we must add to incomplete list now that setClient has been called incompleteCloneDbs.insert( clientName ); if ( nClonedThisPass ) { @@ -901,9 +966,9 @@ namespace mongo { log() << "An earlier initial clone of '" << clientName << "' did not complete, now resyncing." << endl; } save(); - setClient( ns ); + Client::Context ctx(ns); nClonedThisPass++; - resync(cc().database()->name); + resync(ctx.db()->name); addDbNextPass.erase(clientName); incompleteCloneDbs.erase( clientName ); } @@ -927,7 +992,6 @@ namespace mongo { } addDbNextPass.erase( clientName ); } - cc().clearns(); } BSONObj ReplSource::idForOp( const BSONObj &op, bool &mod ) { @@ -981,16 +1045,20 @@ namespace mongo { void ReplSource::syncToTailOfRemoteLog() { string _ns = ns(); - BSONObj last = conn->findOne( _ns.c_str(), Query().sort( BSON( "$natural" << -1 ) ) ); + BSONObjBuilder b; + if ( !only.empty() ) { + b.appendRegex("ns", string("^") + only); + } + BSONObj last = conn->findOne( _ns.c_str(), Query( b.done() ).sort( BSON( "$natural" << -1 ) ) ); if ( !last.isEmpty() ) { - BSONElement ts = last.findElement( "ts" ); + BSONElement ts = last.getField( "ts" ); massert( 10386 , "non Date ts found", ts.type() == Date || ts.type() == Timestamp ); syncedTo = OpTime( ts.date() ); } } OpTime ReplSource::nextLastSavedLocalTs() const { - setClient( "local.oplog.$main" ); + Client::Context ctx( "local.oplog.$main" ); auto_ptr< Cursor > c = findTableScan( "local.oplog.$main", BSON( "$natural" << -1 ) ); if ( c->ok() ) return OpTime( c->current().getField( "ts" ).date() ); @@ -1003,7 +1071,10 @@ namespace mongo { } void ReplSource::resetSlave() { - massert( 10387 , "request to kill slave replication falied", + log() << "**********************************************************\n"; + log() << "Sending forcedead command to slave to stop its replication\n"; + log() << "Host: " << hostName << " paired: " << paired << endl; + massert( 10387 , "request to kill slave replication failed", conn->simpleCommand( "admin", 0, "forcedead" ) ); syncToTailOfRemoteLog(); { @@ -1015,7 +1086,7 @@ namespace mongo { } bool ReplSource::updateSetsWithLocalOps( OpTime &localLogTail, bool mayUnlock ) { - setClient( "local.oplog.$main" ); + Client::Context ctx( "local.oplog.$main" ); auto_ptr< Cursor > localLog = findTableScan( "local.oplog.$main", BSON( "$natural" << -1 ) ); OpTime newTail; 
for( ; localLog->ok(); localLog->advance() ) { @@ -1153,67 +1224,70 @@ namespace mongo { } return true; } - - int n = 0; - BSONObj op = c->next(); - BSONElement ts = op.findElement("ts"); - if ( ts.type() != Date && ts.type() != Timestamp ) { - string err = op.getStringField("$err"); - if ( !err.empty() ) { - problem() << "repl: $err reading remote oplog: " + err << '\n'; - massert( 10390 , "got $err reading remote oplog", false ); - } - else { - problem() << "repl: bad object read from remote oplog: " << op.toString() << '\n'; - massert( 10391 , "repl: bad object read from remote oplog", false); + + OpTime nextOpTime; + { + BSONObj op = c->next(); + BSONElement ts = op.getField("ts"); + if ( ts.type() != Date && ts.type() != Timestamp ) { + string err = op.getStringField("$err"); + if ( !err.empty() ) { + problem() << "repl: $err reading remote oplog: " + err << '\n'; + massert( 10390 , "got $err reading remote oplog", false ); + } + else { + problem() << "repl: bad object read from remote oplog: " << op.toString() << '\n'; + massert( 10391 , "repl: bad object read from remote oplog", false); + } } - } - if ( replPair && replPair->state == ReplPair::State_Master ) { + if ( replPair && replPair->state == ReplPair::State_Master ) { - OpTime nextOpTime( ts.date() ); - if ( !tailing && !initial && nextOpTime != syncedTo ) { - log() << "remote slave log filled, forcing slave resync" << endl; - resetSlave(); - return true; - } + OpTime next( ts.date() ); + if ( !tailing && !initial && next != syncedTo ) { + log() << "remote slave log filled, forcing slave resync" << endl; + resetSlave(); + return true; + } - dblock lk; - updateSetsWithLocalOps( localLogTail, true ); - } + dblock lk; + updateSetsWithLocalOps( localLogTail, true ); + } - OpTime nextOpTime( ts.date() ); - log(2) << "repl: first op time received: " << nextOpTime.toString() << '\n'; - if ( tailing || initial ) { - if ( initial ) - log(1) << "repl: initial run\n"; - else + nextOpTime = OpTime( ts.date() ); + log(2) << "repl: first op time received: " << nextOpTime.toString() << '\n'; + if ( tailing || initial ) { + if ( initial ) + log(1) << "repl: initial run\n"; + else + assert( syncedTo < nextOpTime ); + c->putBack( op ); // op will be processed in the loop below + nextOpTime = OpTime(); // will reread the op below + } + else if ( nextOpTime != syncedTo ) { // didn't get what we queried for - error + Nullstream& l = log(); + l << "repl: nextOpTime " << nextOpTime.toStringLong() << ' '; + if ( nextOpTime < syncedTo ) + l << "<??"; + else + l << ">"; + + l << " syncedTo " << syncedTo.toStringLong() << '\n'; + log() << "repl: time diff: " << (nextOpTime.getSecs() - syncedTo.getSecs()) << "sec\n"; + log() << "repl: tailing: " << tailing << '\n'; + log() << "repl: data too stale, halting replication" << endl; + replInfo = replAllDead = "data too stale halted replication"; assert( syncedTo < nextOpTime ); - sync_pullOpLog_applyOperation(op, &localLogTail); - n++; - } - else if ( nextOpTime != syncedTo ) { - Nullstream& l = log(); - l << "repl: nextOpTime " << nextOpTime.toStringLong() << ' '; - if ( nextOpTime < syncedTo ) - l << "<??"; - else - l << ">"; - - l << " syncedTo " << syncedTo.toStringLong() << '\n'; - log() << "repl: time diff: " << (nextOpTime.getSecs() - syncedTo.getSecs()) << "sec\n"; - log() << "repl: tailing: " << tailing << '\n'; - log() << "repl: data too stale, halting replication" << endl; - replInfo = replAllDead = "data too stale halted replication"; - assert( syncedTo < nextOpTime ); - throw SyncException(); - } 
- else { - /* t == syncedTo, so the first op was applied previously. */ + throw SyncException(); + } + else { + /* t == syncedTo, so the first op was applied previously. */ + } } // apply operations { + int n = 0; time_t saveLast = time(0); while ( 1 ) { /* from a.s.: @@ -1232,7 +1306,7 @@ namespace mongo { */ if ( !c->more() ) { dblock lk; - OpTime nextLastSaved = nextLastSavedLocalTs(); // this may make c->more() become true + OpTime nextLastSaved = nextLastSavedLocalTs(); { dbtemprelease t; if ( c->more() ) { @@ -1245,11 +1319,11 @@ namespace mongo { save(); // note how far we are synced up to now log() << "repl: applied " << n << " operations" << endl; nApplied = n; - log() << "repl: end sync_pullOpLog syncedTo: " << syncedTo.toStringLong() << endl; + log() << "repl: end sync_pullOpLog syncedTo: " << syncedTo.toStringLong() << endl; break; } - OCCASIONALLY if( n > 100000 || time(0) - saveLast > 60 ) { + OCCASIONALLY if( n > 0 && ( n > 100000 || time(0) - saveLast > 60 ) ) { // periodically note our progress, in case we are doing a lot of work and crash dblock lk; syncedTo = nextOpTime; @@ -1262,14 +1336,36 @@ namespace mongo { } BSONObj op = c->next(); - ts = op.findElement("ts"); - assert( ts.type() == Date || ts.type() == Timestamp ); + BSONElement ts = op.getField("ts"); + if( !( ts.type() == Date || ts.type() == Timestamp ) ) { + log() << "sync error: problem querying remote oplog record\n"; + log() << "op: " << op.toString() << '\n'; + log() << "halting replication" << endl; + replInfo = replAllDead = "sync error: no ts found querying remote oplog record"; + throw SyncException(); + } OpTime last = nextOpTime; - OpTime tmp( ts.date() ); - nextOpTime = tmp; + nextOpTime = OpTime( ts.date() ); if ( !( last < nextOpTime ) ) { - problem() << "sync error: last " << last.toString() << " >= nextOpTime " << nextOpTime.toString() << endl; - uassert( 10123 , "bad 'ts' value in sources", false); + log() << "sync error: last applied optime at slave >= nextOpTime from master" << endl; + log() << " last: " << last.toStringLong() << '\n'; + log() << " nextOpTime: " << nextOpTime.toStringLong() << '\n'; + log() << " halting replication" << endl; + replInfo = replAllDead = "sync error last >= nextOpTime"; + uassert( 10123 , "replication error last applied optime at slave >= nextOpTime from master", false); + } + if ( replSettings.slavedelay && ( unsigned( time( 0 ) ) < nextOpTime.getSecs() + replSettings.slavedelay ) ) { + c->putBack( op ); + _sleepAdviceTime = nextOpTime.getSecs() + replSettings.slavedelay + 1; + dblock lk; + if ( n > 0 ) { + syncedTo = last; + save(); + } + log() << "repl: applied " << n << " operations" << endl; + log() << "repl: syncedTo: " << syncedTo.toStringLong() << endl; + log() << "waiting until: " << _sleepAdviceTime << " to continue" << endl; + break; } sync_pullOpLog_applyOperation(op, &localLogTail); @@ -1283,8 +1379,7 @@ namespace mongo { BSONObj userReplQuery = fromjson("{\"user\":\"repl\"}"); bool replAuthenticate(DBClientConnection *conn) { - AuthenticationInfo *ai = currentClient.get()->ai; - if( !ai->isAuthorized("admin") ) { + if( ! 
cc().isAdmin() ){ log() << "replauthenticate: requires admin permissions, failing\n"; return false; } @@ -1324,7 +1419,7 @@ namespace mongo { ReplInfo r("trying to connect to sync source"); if ( !conn->connect(hostName.c_str(), errmsg) || !replAuthenticate(conn.get()) ) { resetConnection(); - log() << "repl: " << errmsg << endl; + log() << "repl: " << errmsg << endl; return false; } } @@ -1335,9 +1430,16 @@ namespace mongo { returns true if everything happy. return false if you want to reconnect. */ bool ReplSource::sync(int& nApplied) { + _sleepAdviceTime = 0; ReplInfo r("sync"); - if ( !cmdLine.quiet ) - log() << "repl: " << sourceName() << '@' << hostName << endl; + if ( !cmdLine.quiet ) { + Nullstream& l = log(); + l << "repl: from "; + if( sourceName() != "main" ) { + l << "source:" << sourceName() << ' '; + } + l << "host:" << hostName << endl; + } nClonedThisPass = 0; // FIXME Handle cases where this db isn't on default port, or default port is spec'd in hostName. @@ -1348,13 +1450,11 @@ namespace mongo { } if ( !connect() ) { + log(4) << "repl: can't connect to sync source" << endl; if ( replPair && paired ) { assert( startsWith(hostName.c_str(), replPair->remoteHost.c_str()) ); replPair->arbitrate(); } - { - ReplInfo r("can't connect to sync source"); - } return false; } @@ -1370,7 +1470,7 @@ namespace mongo { /* // get current mtime at the server. BSONObj o = conn->findOne("admin.$cmd", opTimeQuery); - BSONElement e = o.findElement("optime"); + BSONElement e = o.getField("optime"); if( e.eoo() ) { log() << "repl: failed to get cur optime from master" << endl; log() << " " << o.toString() << endl; @@ -1387,24 +1487,13 @@ namespace mongo { // cached copies of these...so don't rename them NamespaceDetails *localOplogMainDetails = 0; - Database *localOplogClient = 0; + Database *localOplogDB = 0; + + void replCheckCloseDatabase( Database * db ){ + localOplogDB = 0; + localOplogMainDetails = 0; + } - void logOp(const char *opstr, const char *ns, const BSONObj& obj, BSONObj *patt, bool *b) { - if ( master ) { - _logOp(opstr, ns, "local.oplog.$main", obj, patt, b, OpTime::now()); - char cl[ 256 ]; - nsToDatabase( ns, cl ); - } - NamespaceDetailsTransient &t = NamespaceDetailsTransient::get_w( ns ); - if ( t.cllEnabled() ) { - try { - _logOp(opstr, ns, t.cllNS().c_str(), obj, patt, b, OpTime::now()); - } catch ( const DBException & ) { - t.cllInvalidate(); - } - } - } - /* we write to local.opload.$main: { ts : ..., op: ..., ns: ..., o: ... } ts: an OpTime timestamp @@ -1415,6 +1504,7 @@ namespace mongo { "c" db cmd "db" declares presence of a database (ns is set to the db name + '.') "n" no op + logNS - e.g. "local.oplog.$main" bb: if not null, specifies a boolean to pass along to the other side as b: param. used for "justOne" or "upsert" flags on 'd', 'u' @@ -1422,7 +1512,7 @@ namespace mongo { when set, indicates this is the first thing we have logged for this database. thus, the slave does not need to copy down all the data when it sees this. 
*/ - void _logOp(const char *opstr, const char *ns, const char *logNS, const BSONObj& obj, BSONObj *o2, bool *bb, const OpTime &ts ) { + static void _logOp(const char *opstr, const char *ns, const char *logNS, const BSONObj& obj, BSONObj *o2, bool *bb, const OpTime &ts ) { if ( strncmp(ns, "local.", 6) == 0 ) return; @@ -1449,14 +1539,14 @@ namespace mongo { Record *r; if ( strncmp( logNS, "local.", 6 ) == 0 ) { // For now, assume this is olog main if ( localOplogMainDetails == 0 ) { - setClient("local."); - localOplogClient = cc().database(); + Client::Context ctx("local.", dbpath, 0, false); + localOplogDB = ctx.db(); localOplogMainDetails = nsdetails(logNS); } - cc().setns("", localOplogClient); // database = localOplogClient; + Client::Context ctx( "" , localOplogDB, false ); r = theDataFileMgr.fast_oplog_insert(localOplogMainDetails, logNS, len); } else { - setClient( logNS ); + Client::Context ctx( logNS, dbpath, 0, false ); assert( nsdetails( logNS ) ); r = theDataFileMgr.fast_oplog_insert( nsdetails( logNS ), logNS, len); } @@ -1478,6 +1568,27 @@ namespace mongo { } } + static void logKeepalive() { + BSONObj obj; + _logOp("n", "", "local.oplog.$main", obj, 0, 0, OpTime::now()); + } + + void logOp(const char *opstr, const char *ns, const BSONObj& obj, BSONObj *patt, bool *b) { + if ( replSettings.master ) { + _logOp(opstr, ns, "local.oplog.$main", obj, patt, b, OpTime::now()); + char cl[ 256 ]; + nsToDatabase( ns, cl ); + } + NamespaceDetailsTransient &t = NamespaceDetailsTransient::get_w( ns ); + if ( t.cllEnabled() ) { + try { + _logOp(opstr, ns, t.cllNS().c_str(), obj, patt, b, OpTime::now()); + } catch ( const DBException & ) { + t.cllInvalidate(); + } + } + } + /* --------------------------------------------------------------*/ /* @@ -1517,6 +1628,9 @@ namespace mongo { else if( moreToSync ) { sleepAdvice = 0; } + else if ( s->sleepAdvice() ) { + sleepAdvice = s->sleepAdvice(); + } if ( ok && !moreToSync /*&& !s->syncedTo.isNull()*/ ) { pairSync->setInitialSyncCompletedLocking(); } @@ -1560,10 +1674,10 @@ namespace mongo { { dblock lk; if ( replAllDead ) { - if ( !autoresync || !ReplSource::throttledForceResyncDead( "auto" ) ) + if ( !replSettings.autoresync || !ReplSource::throttledForceResyncDead( "auto" ) ) break; } - assert( syncing == 0 ); + assert( syncing == 0 ); // i.e., there is only one sync thread running. we will want to change/fix this. syncing++; } try { @@ -1590,7 +1704,8 @@ namespace mongo { stringstream ss; ss << "repl: sleep " << s << "sec before next pass"; string msg = ss.str(); - log() << msg << endl; + if ( ! cmdLine.quiet ) + log() << msg << endl; ReplInfo r(msg.c_str()); sleepsecs(s); } @@ -1599,14 +1714,38 @@ namespace mongo { int debug_stop_repl = 0; + static void replMasterThread() { + sleepsecs(4); + Client::initThread("replmaster"); + while( 1 ) { + { + dblock lk; + cc().getAuthenticationInfo()->authorize("admin"); + } + sleepsecs(10); + /* write a keep-alive like entry to the log. this will make things like + printReplicationStatus() and printSlaveReplicationStatus() stay up-to-date + even when things are idle. + */ + { + writelock lk(""); + try { + logKeepalive(); + } + catch(...) 
{ + log() << "caught exception in replMasterThread()" << endl; + } + } + } + } + void replSlaveThread() { sleepsecs(1); - + Client::initThread("replslave"); + { dblock lk; - - Client::initThread("replslave"); - currentClient.get()->ai->authorize("admin"); + cc().getAuthenticationInfo()->authorize("admin"); BSONObj obj; if ( Helpers::getSingleton("local.pair.startup", obj) ) { @@ -1642,10 +1781,16 @@ namespace mongo { dblock lk; const char * ns = "local.oplog.$main"; - setClient(ns); + Client::Context ctx(ns); - if ( nsdetails( ns ) ) + if ( nsdetails( ns ) ) { + DBDirectClient c; + BSONObj lastOp = c.findOne( ns, Query().sort( BSON( "$natural" << -1 ) ) ); + if ( !lastOp.isEmpty() ) { + OpTime::setLast( lastOp[ "ts" ].date() ); + } return; + } /* create an oplog collection, if it doesn't yet exist. */ BSONObjBuilder b; @@ -1653,13 +1798,19 @@ namespace mongo { if ( cmdLine.oplogSize != 0 ) sz = (double)cmdLine.oplogSize; else { + /* not specified. pick a default size */ sz = 50.0 * 1000 * 1000; if ( sizeof(int *) >= 8 ) { +#if defined(__APPLE__) + // typically these are desktops (dev machines), so keep it smallish + sz = (256-64) * 1000 * 1000; +#else sz = 990.0 * 1000 * 1000; boost::intmax_t free = freeSpace(); //-1 if call not supported. double fivePct = free * 0.05; if ( fivePct > sz ) sz = fivePct; +#endif } } @@ -1675,7 +1826,6 @@ namespace mongo { BSONObj o = b.done(); userCreateNS(ns, o, err, false); logOp( "n", "dummy", BSONObj() ); - cc().clearns(); } void startReplication() { @@ -1684,29 +1834,31 @@ namespace mongo { */ //boost::thread tempt(tempThread); - if ( !slave && !master && !replPair ) + if ( !replSettings.slave && !replSettings.master && !replPair ) return; { dblock lk; + cc().getAuthenticationInfo()->authorize("admin"); pairSync->init(); } - if ( slave || replPair ) { - if ( slave ) { - assert( slave == SimpleSlave ); + if ( replSettings.slave || replPair ) { + if ( replSettings.slave ) { + assert( replSettings.slave == SimpleSlave ); log(1) << "slave=true" << endl; } else - slave = ReplPairSlave; + replSettings.slave = ReplPairSlave; boost::thread repl_thread(replSlaveThread); } - if ( master || replPair ) { - if ( master ) + if ( replSettings.master || replPair ) { + if ( replSettings.master ) log(1) << "master=true" << endl; - master = true; + replSettings.master = true; createOplog(); + boost::thread t(replMasterThread); } } @@ -1720,6 +1872,7 @@ namespace mongo { virtual bool slaveOk() { return false; } + virtual LockType locktype(){ return WRITE; } CmdLogCollection() : Command( "logCollection" ) {} virtual void help( stringstream &help ) const { help << "examples: { logCollection: <collection ns>, start: 1 }, " @@ -32,6 +32,7 @@ #include "db.h" #include "dbhelpers.h" #include "query.h" +#include "queryoptimizer.h" #include "../client/dbclient.h" @@ -46,14 +47,31 @@ namespace mongo { --slave cmd line setting -> SimpleSlave */ typedef enum { NotSlave=0, SimpleSlave, ReplPairSlave } SlaveTypes; - extern SlaveTypes slave; - /* true means we are master and doing replication. if we are not writing to oplog (no --master or repl pairing), - this won't be true. - */ - extern bool master; + class ReplSettings { + public: + SlaveTypes slave; + + /* true means we are master and doing replication. if we are not writing to oplog (no --master or repl pairing), + this won't be true. 
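+ for example, a mongod started with neither --master nor --slave keeps the constructor defaults below (slave == NotSlave, master == false); startReplication() then returns without creating an oplog.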
+ */ + bool master; + + int opIdMem; + + bool fastsync; + + bool autoresync; + + int slavedelay; + + ReplSettings() + : slave(NotSlave) , master(false) , opIdMem(100000000) , fastsync() , autoresync(false), slavedelay() { + } + + }; - extern int opIdMem; + extern ReplSettings replSettings; bool cloneFrom(const char *masterHost, string& errmsg, const string& fromdb, bool logForReplication, bool slaveOk, bool useReplAuth, bool snapshot); @@ -115,6 +133,7 @@ namespace mongo { // returns false if the slave has been reset bool updateSetsWithLocalOps( OpTime &localLogTail, bool mayUnlock ); string ns() const { return string( "local.oplog.$" ) + sourceName(); } + unsigned _sleepAdviceTime; public: static void applyOperation(const BSONObj& op); @@ -131,11 +150,11 @@ namespace mongo { OpTime syncedTo; /* This is for repl pairs. - _lastSavedLocalTs is the most recent point in the local log that we know is consistent
- with the remote log ( ie say the local op log has entries ABCDE and the remote op log
- has ABCXY, then _lastSavedLocalTs won't be greater than C until we have reconciled
- the DE-XY difference.)
- */
+ _lastSavedLocalTs is the most recent point in the local log that we know is consistent + with the remote log ( ie say the local op log has entries ABCDE and the remote op log + has ABCXY, then _lastSavedLocalTs won't be greater than C until we have reconciled + the DE-XY difference.) + */ OpTime _lastSavedLocalTs; int nClonedThisPass; @@ -160,7 +179,13 @@ namespace mongo { operator string() const { return sourceName() + "@" + hostName; } bool haveMoreDbsToSync() const { return !addDbNextPass.empty(); } - + int sleepAdvice() const { + if ( !_sleepAdviceTime ) + return 0; + int wait = _sleepAdviceTime - unsigned( time( 0 ) ); + return wait > 0 ? wait : 0; + } + static bool throttledForceResyncDead( const char *requester ); static void forceResyncDead( const char *requester ); void forceResync( const char *requester ); @@ -173,7 +198,6 @@ namespace mongo { "c" db cmd "db" declares presence of a database (ns is set to the db name + '.') */ - void _logOp(const char *opstr, const char *ns, const char *logNs, const BSONObj& obj, BSONObj *patt, bool *b, const OpTime &ts); void logOp(const char *opstr, const char *ns, const BSONObj& obj, BSONObj *patt = 0, bool *b = 0); // class for managing a set of ids in memory @@ -239,9 +263,9 @@ namespace mongo { dbIds_( "local.temp.replIds" ), dbModIds_( "local.temp.replModIds" ), inMem_( true ), - maxMem_( opIdMem ) { + maxMem_( replSettings.opIdMem ) { } - void reset( int maxMem = opIdMem ) { + void reset( int maxMem = replSettings.opIdMem ) { memIds_.reset(); memModIds_.reset(); dbIds_.reset(); @@ -312,4 +336,146 @@ namespace mongo { int maxMem_; }; + bool anyReplEnabled(); + void appendReplicationInfo( BSONObjBuilder& result , bool authed , int level = 0 ); + + void replCheckCloseDatabase( Database * db ); + + extern int __findingStartInitialTimeout; // configurable for testing + + class FindingStartCursor { + public: + FindingStartCursor( const QueryPlan & qp ) : + _qp( qp ), + _findingStart( true ), + _findingStartMode(), + _findingStartTimer( 0 ), + _findingStartCursor( 0 ) + { init(); } + bool done() const { return !_findingStart; } + auto_ptr< Cursor > cRelease() { return _c; } + void next() { + if ( !_findingStartCursor || !_findingStartCursor->c->ok() ) { + _findingStart = false; + _c = _qp.newCursor(); // on error, start from beginning + destroyClientCursor(); + return; + } + switch( _findingStartMode ) { + case Initial: { + if ( !_matcher->matches( _findingStartCursor->c->currKey(), _findingStartCursor->c->currLoc() ) ) { + _findingStart = false; // found first record out of query range, so scan normally + _c = _qp.newCursor( _findingStartCursor->c->currLoc() ); + destroyClientCursor(); + return; + } + _findingStartCursor->c->advance(); + RARELY { + if ( _findingStartTimer.seconds() >= __findingStartInitialTimeout ) { + createClientCursor( startLoc( _findingStartCursor->c->currLoc() ) ); + _findingStartMode = FindExtent; + return; + } + } + maybeRelease(); + return; + } + case FindExtent: { + if ( !_matcher->matches( _findingStartCursor->c->currKey(), _findingStartCursor->c->currLoc() ) ) { + _findingStartMode = InExtent; + return; + } + DiskLoc prev = prevLoc( _findingStartCursor->c->currLoc() ); + if ( prev.isNull() ) { // hit beginning, so start scanning from here + createClientCursor(); + _findingStartMode = InExtent; + return; + } + // There might be a more efficient implementation than creating new cursor & client cursor each time, + // not worrying about that for now + createClientCursor( prev ); + maybeRelease(); + return; + } + case 
InExtent: { + if ( _matcher->matches( _findingStartCursor->c->currKey(), _findingStartCursor->c->currLoc() ) ) { + _findingStart = false; // found first record in query range, so scan normally + _c = _qp.newCursor( _findingStartCursor->c->currLoc() ); + destroyClientCursor(); + return; + } + _findingStartCursor->c->advance(); + maybeRelease(); + return; + } + default: { + massert( 12600, "invalid _findingStartMode", false ); + } + } + } + private: + enum FindingStartMode { Initial, FindExtent, InExtent }; + const QueryPlan &_qp; + bool _findingStart; + FindingStartMode _findingStartMode; + auto_ptr< CoveredIndexMatcher > _matcher; + Timer _findingStartTimer; + ClientCursor * _findingStartCursor; + auto_ptr< Cursor > _c; + DiskLoc startLoc( const DiskLoc &rec ) { + Extent *e = rec.rec()->myExtent( rec ); + if ( e->myLoc != _qp.nsd()->capExtent ) + return e->firstRecord; + // Likely we are on the fresh side of capExtent, so return first fresh record. + // If we are on the stale side of capExtent, then the collection is small and it + // doesn't matter if we start the extent scan with capFirstNewRecord. + return _qp.nsd()->capFirstNewRecord; + } + + DiskLoc prevLoc( const DiskLoc &rec ) { + Extent *e = rec.rec()->myExtent( rec ); + if ( e->xprev.isNull() ) + e = _qp.nsd()->lastExtent.ext(); + else + e = e->xprev.ext(); + if ( e->myLoc != _qp.nsd()->capExtent ) + return e->firstRecord; + return DiskLoc(); // reached beginning of collection + } + void createClientCursor( const DiskLoc &startLoc = DiskLoc() ) { + auto_ptr<Cursor> c = _qp.newCursor( startLoc ); + _findingStartCursor = new ClientCursor(c, _qp.ns(), false); + } + void destroyClientCursor() { + if ( _findingStartCursor ) { + ClientCursor::erase( _findingStartCursor->cursorid ); + _findingStartCursor = 0; + } + } + void maybeRelease() { + RARELY { + CursorId id = _findingStartCursor->cursorid; + _findingStartCursor->updateLocation(); + { + dbtemprelease t; + } + _findingStartCursor = ClientCursor::find( id, false ); + } + } + void init() { + // Use a ClientCursor here so we can release db mutex while scanning + // oplog (can take quite a while with large oplogs). + auto_ptr<Cursor> c = _qp.newReverseCursor(); + _findingStartCursor = new ClientCursor(c, _qp.ns(), false); + _findingStartTimer.reset(); + _findingStartMode = Initial; + BSONElement tsElt = _qp.query()[ "ts" ]; + massert( 13044, "no ts field in query", !tsElt.eoo() ); + BSONObjBuilder b; + b.append( tsElt ); + BSONObj tsQuery = b.obj(); + _matcher.reset(new CoveredIndexMatcher(tsQuery, _qp.indexKey())); + } + }; + } // namespace mongo diff --git a/db/replset.h b/db/replset.h index 98d80d6..66a8604 100644 --- a/db/replset.h +++ b/db/replset.h @@ -49,13 +49,13 @@ namespace mongo { }; int state; - string info; // commentary about our current state + ThreadSafeString info; // commentary about our current state string arbHost; // "-" for no arbiter. "host[:port]" int remotePort; string remoteHost; string remote; // host:port if port specified. // int date; // -1 not yet set; 0=slave; 1=master - + string getInfo() { stringstream ss; ss << " state: "; @@ -111,7 +111,7 @@ namespace mongo { If 'client' is not specified, the current client is used. */ inline bool isMaster( const char *client = 0 ) { - if( !slave ) + if( ! replSettings.slave ) return true; if ( !client ) { @@ -128,7 +128,7 @@ namespace mongo { return true; } else { - if( master ) { + if( replSettings.master ) { // if running with --master --slave, allow. 
note that master is also true // for repl pairs so the check for replPair above is important. return true; diff --git a/db/scanandorder.h b/db/scanandorder.h index 3f41433..f038069 100644 --- a/db/scanandorder.h +++ b/db/scanandorder.h @@ -40,7 +40,7 @@ namespace mongo { // returns the key value for o BSONObj getKeyFromObject(BSONObj o) { - return o.extractFields(pattern); + return o.extractFields(pattern,true); } }; diff --git a/db/security.cpp b/db/security.cpp index 747b04a..6a01627 100644 --- a/db/security.cpp +++ b/db/security.cpp @@ -21,12 +21,44 @@ #include "instance.h" #include "client.h" #include "curop.h" +#include "db.h" +#include "dbhelpers.h" namespace mongo { bool noauth = true; - + int AuthenticationInfo::warned = 0; + void AuthenticationInfo::print(){ + cout << "AuthenticationInfo: " << this << "\n"; + for ( map<string,Auth>::iterator i=m.begin(); i!=m.end(); i++ ){ + cout << "\t" << i->first << "\t" << i->second.level << "\n"; + } + cout << "END" << endl; + } + + + bool AuthenticationInfo::_isAuthorizedSpecialChecks( const string& dbname ) { + if ( cc().isGod() ){ + return true; + } + + if ( isLocalHost ){ + atleastreadlock l(""); + Client::GodScope gs; + Client::Context c("admin.system.users"); + BSONObj result; + if( ! Helpers::getSingleton("admin.system.users", result) ){ + if( warned == 0 ) { + warned++; + log() << "note: no users configured in admin.system.users, allowing localhost access" << endl; + } + return true; + } + } + return false; + } + } // namespace mongo diff --git a/db/security.h b/db/security.h index f61d5e1..261b123 100644 --- a/db/security.h +++ b/db/security.h @@ -22,9 +22,8 @@ #undef assert #define assert xassert -#include "db.h" -#include "dbhelpers.h" #include "nonce.h" +#include "concurrency.h" namespace mongo { @@ -38,40 +37,42 @@ namespace mongo { }; class AuthenticationInfo : boost::noncopyable { + mongo::mutex _lock; map<string, Auth> m; // dbname -> auth static int warned; public: bool isLocalHost; AuthenticationInfo() { isLocalHost = false; } - virtual ~AuthenticationInfo() { + ~AuthenticationInfo() { } - void logout(const char *dbname) { - assertInWriteLock(); + void logout(const string& dbname ) { + scoped_lock lk(_lock); m.erase(dbname); } - void authorize(const char *dbname) { - assertInWriteLock(); + void authorize(const string& dbname ) { + scoped_lock lk(_lock); m[dbname].level = 2; } - virtual bool isAuthorized(const char *dbname) { - if( m[dbname].level == 2 ) return true; + void authorizeReadOnly(const string& dbname) { + scoped_lock lk(_lock); + m[dbname].level = 1; + } + bool isAuthorized(const string& dbname) { return _isAuthorized( dbname, 2 ); } + bool isAuthorizedReads(const string& dbname) { return _isAuthorized( dbname, 1 ); } + bool isAuthorizedForLock(const string& dbname, int lockType ) { return _isAuthorized( dbname , lockType > 0 ? 
2 : 1 ); } + + void print(); + + protected: + bool _isAuthorized(const string& dbname, int level) { + if( m[dbname].level >= level ) return true; if( noauth ) return true; - if( m["admin"].level == 2 ) return true; - if( m["local"].level == 2 ) return true; - if( isLocalHost ) { - readlock l(""); - Client::Context c("admin.system.users"); - BSONObj result; - if( Helpers::getSingleton("admin.system.users", result) ) - return false; - if( warned == 0 ) { - warned++; - log() << "warning: no users configured in admin.system.users, allowing localhost access" << endl; - } - return true; - } - return false; + if( m["admin"].level >= level ) return true; + if( m["local"].level >= level ) return true; + return _isAuthorizedSpecialChecks( dbname ); } + + bool _isAuthorizedSpecialChecks( const string& dbname ); }; } // namespace mongo diff --git a/db/security_commands.cpp b/db/security_commands.cpp index 9d63744..326d6e4 100644 --- a/db/security_commands.cpp +++ b/db/security_commands.cpp @@ -1,4 +1,20 @@ // security_commands.cpp +/* + * Copyright (C) 2010 10gen Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License, version 3, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + // security.cpp links with both dbgrid and db. this file db only -- at least for now. // security.cpp @@ -39,6 +55,7 @@ namespace mongo { virtual bool slaveOk() { return true; } + virtual LockType locktype(){ return NONE; } CmdGetNonce() : Command("getnonce") {} bool run(const char *ns, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) { nonce *n = new nonce(security.getNonce()); @@ -58,12 +75,12 @@ namespace mongo { virtual bool slaveOk() { return true; } + virtual LockType locktype(){ return NONE; } CmdLogout() : Command("logout") {} bool run(const char *ns, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) { // database->name is the one we are logging out... - Client& client = cc(); - AuthenticationInfo *ai = client.ai; - ai->logout(client.database()->name.c_str()); + AuthenticationInfo *ai = cc().getAuthenticationInfo(); + ai->logout(nsToDatabase(ns)); return true; } } cmdLogout; @@ -77,6 +94,7 @@ namespace mongo { virtual bool slaveOk() { return true; } + virtual LockType locktype(){ return WRITE; } // TODO: make this READ CmdAuthenticate() : Command("authenticate") {} bool run(const char *ns, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) { log(1) << " authenticate: " << cmdObj << endl; @@ -88,7 +106,7 @@ namespace mongo { if( user.empty() || key.empty() || received_nonce.empty() ) { log() << "field missing/wrong type in received authenticate command " << cc().database()->name - << '\n'; + << endl; errmsg = "auth fails"; sleepmillis(10); return false; @@ -107,7 +125,7 @@ namespace mongo { } if ( reject ) { - log() << "auth: bad nonce received or getnonce not called. could be a driver bug or a security attack. db:" << cc().database()->name << '\n'; + log() << "auth: bad nonce received or getnonce not called. could be a driver bug or a security attack. 
db:" << cc().database()->name << endl; errmsg = "auth fails"; sleepmillis(30); return false; @@ -124,7 +142,7 @@ namespace mongo { b << "user" << user; BSONObj query = b.done(); if( !Helpers::findOne(systemUsers.c_str(), query, userObj) ) { - log() << "auth: couldn't find user " << user << ", " << systemUsers << '\n'; + log() << "auth: couldn't find user " << user << ", " << systemUsers << endl; errmsg = "auth fails"; return false; } @@ -146,13 +164,24 @@ namespace mongo { string computed = digestToString( d ); if ( key != computed ){ - log() << "auth: key mismatch " << user << ", ns:" << ns << '\n'; + log() << "auth: key mismatch " << user << ", ns:" << ns << endl; errmsg = "auth fails"; return false; } - AuthenticationInfo *ai = currentClient.get()->ai; - ai->authorize(cc().database()->name.c_str()); + AuthenticationInfo *ai = cc().getAuthenticationInfo(); + + if ( userObj[ "readOnly" ].isBoolean() && userObj[ "readOnly" ].boolean() ) { + if ( readLockSupported() ){ + ai->authorizeReadOnly( cc().database()->name.c_str() ); + } + else { + log() << "warning: old version of boost, read-only users not supported" << endl; + ai->authorize( cc().database()->name.c_str() ); + } + } else { + ai->authorize( cc().database()->name.c_str() ); + } return true; } } cmdAuthenticate; diff --git a/db/stats/counters.cpp b/db/stats/counters.cpp new file mode 100644 index 0000000..8e90902 --- /dev/null +++ b/db/stats/counters.cpp @@ -0,0 +1,131 @@ +// counters.cpp +/* + * Copyright (C) 2010 10gen Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License, version 3, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + + +#include "stdafx.h" +#include "../jsobj.h" +#include "counters.h" + +namespace mongo { + + OpCounters::OpCounters(){ + int zero = 0; + + BSONObjBuilder b; + b.append( "insert" , zero ); + b.append( "query" , zero ); + b.append( "update" , zero ); + b.append( "delete" , zero ); + b.append( "getmore" , zero ); + b.append( "command" , zero ); + _obj = b.obj(); + + _insert = (int*)_obj["insert"].value(); + _query = (int*)_obj["query"].value(); + _update = (int*)_obj["update"].value(); + _delete = (int*)_obj["delete"].value(); + _getmore = (int*)_obj["getmore"].value(); + _command = (int*)_obj["command"].value(); + } + + void OpCounters::gotOp( int op , bool isCommand ){ + switch ( op ){ + case dbInsert: gotInsert(); break; + case dbQuery: + if ( isCommand ) + gotCommand(); + else + gotQuery(); + break; + + case dbUpdate: gotUpdate(); break; + case dbDelete: gotDelete(); break; + case dbGetMore: gotGetMore(); break; + case dbKillCursors: + case opReply: + case dbMsg: + break; + default: log() << "OpCounters::gotOp unknown op: " << op << endl; + } + } + + IndexCounters::IndexCounters(){ + _memSupported = _pi.blockCheckSupported(); + + _btreeMemHits = 0; + _btreeMemMisses = 0; + _btreeAccesses = 0; + + + _maxAllowed = ( numeric_limits< long long >::max() ) / 2; + _resets = 0; + + _sampling = 0; + _samplingrate = 100; + } + + void IndexCounters::append( BSONObjBuilder& b ){ + if ( ! 
_memSupported ){ + b.append( "note" , "not supported on this platform" ); + return; + } + + BSONObjBuilder bb( b.subobjStart( "btree" ) ); + bb.appendNumber( "accesses" , _btreeAccesses ); + bb.appendNumber( "hits" , _btreeMemHits ); + bb.appendNumber( "misses" , _btreeMemMisses ); + + bb.append( "resets" , _resets ); + + bb.append( "missRatio" , (_btreeAccesses ? (_btreeMemMisses / (double)_btreeAccesses) : 0) ); + + bb.done(); + + if ( _btreeAccesses > _maxAllowed ){ + _btreeAccesses = 0; + _btreeMemMisses = 0; + _btreeMemHits = 0; + _resets++; + } + } + + FlushCounters::FlushCounters() + : _total_time(0) + , _flushes(0) + , _last() + {} + + void FlushCounters::flushed(int ms){ + _flushes++; + _total_time += ms; + _last_time = ms; + _last = jsTime(); + } + + void FlushCounters::append( BSONObjBuilder& b ){ + b.appendNumber( "flushes" , _flushes ); + b.appendNumber( "total_ms" , _total_time ); + b.appendNumber( "average_ms" , (_flushes ? (_total_time / double(_flushes)) : 0.0) ); + b.appendNumber( "last_ms" , _last_time ); + b.append("last_finished", _last); + } + + + OpCounters globalOpCounters; + IndexCounters globalIndexCounters; + FlushCounters globalFlushCounters; +} diff --git a/db/stats/counters.h b/db/stats/counters.h new file mode 100644 index 0000000..41c2cd2 --- /dev/null +++ b/db/stats/counters.h @@ -0,0 +1,121 @@ +// counters.h +/* + * Copyright (C) 2010 10gen Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License, version 3, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + + +#include "../../stdafx.h" +#include "../jsobj.h" +#include "../../util/message.h" +#include "../../util/processinfo.h" + +namespace mongo { + + /** + * for storing operation counters + * note: not thread safe. ok with that for speed + */ + class OpCounters { + public: + + OpCounters(); + + int * getInsert(){ return _insert; } + int * getQuery(){ return _query; } + int * getUpdate(){ return _update; } + int * getDelete(){ return _delete; } + int * getGetMore(){ return _getmore; } + int * getCommand(){ return _command; } + + void gotInsert(){ _insert[0]++; } + void gotQuery(){ _query[0]++; } + void gotUpdate(){ _update[0]++; } + void gotDelete(){ _delete[0]++; } + void gotGetMore(){ _getmore[0]++; } + void gotCommand(){ _command[0]++; } + + void gotOp( int op , bool isCommand ); + + BSONObj& getObj(){ return _obj; } + private: + BSONObj _obj; + int * _insert; + int * _query; + int * _update; + int * _delete; + int * _getmore; + int * _command; + }; + + extern OpCounters globalOpCounters; + + class IndexCounters { + public: + IndexCounters(); + + void btree( char * node ){ + if ( ! 
_memSupported ) + return; + if ( _sampling++ % _samplingrate ) + return; + btree( _pi.blockInMemory( node ) ); + } + + void btree( bool memHit ){ + if ( memHit ) + _btreeMemHits++; + else + _btreeMemMisses++; + _btreeAccesses++; + } + void btreeHit(){ _btreeMemHits++; _btreeAccesses++; } + void btreeMiss(){ _btreeMemMisses++; _btreeAccesses++; } + + void append( BSONObjBuilder& b ); + + private: + ProcessInfo _pi; + bool _memSupported; + + int _sampling; + int _samplingrate; + + int _resets; + long long _maxAllowed; + + long long _btreeMemMisses; + long long _btreeMemHits; + long long _btreeAccesses; + }; + + extern IndexCounters globalIndexCounters; + + class FlushCounters { + public: + FlushCounters(); + + void flushed(int ms); + + void append( BSONObjBuilder& b ); + + private: + long long _total_time; + long long _flushes; + int _last_time; + Date_t _last; + }; + + extern FlushCounters globalFlushCounters; +} diff --git a/db/stats/snapshots.cpp b/db/stats/snapshots.cpp new file mode 100644 index 0000000..71ddd72 --- /dev/null +++ b/db/stats/snapshots.cpp @@ -0,0 +1,144 @@ +// snapshots.cpp + +/** +* Copyright (C) 2008 10gen Inc. +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +#include "stdafx.h" +#include "snapshots.h" +#include "../client.h" +#include "../clientcursor.h" + +/** + handles snapshotting performance metrics and other such things + */ +namespace mongo { + void SnapshotData::takeSnapshot(){ + _created = curTimeMicros64(); + _globalUsage = Top::global.getGlobalData(); + _totalWriteLockedTime = dbMutex.info().getTimeLocked(); + Top::global.cloneMap(_usage); + } + + SnapshotDelta::SnapshotDelta( const SnapshotData& older , const SnapshotData& newer ) + : _older( older ) , _newer( newer ) + { + assert( _newer._created > _older._created ); + _elapsed = _newer._created - _older._created; + + } + + Top::CollectionData SnapshotDelta::globalUsageDiff(){ + return Top::CollectionData( _older._globalUsage , _newer._globalUsage ); + } + Top::UsageMap SnapshotDelta::collectionUsageDiff(){ + Top::UsageMap u; + + for ( Top::UsageMap::const_iterator i=_newer._usage.begin(); i != _newer._usage.end(); i++ ){ + Top::UsageMap::const_iterator j = _older._usage.find(i->first); + if (j != _older._usage.end()) + u[i->first] = Top::CollectionData( j->second , i->second ); + } + return u; + } + + Snapshots::Snapshots(int n) + : _n(n) + , _snapshots(new SnapshotData[n]) + , _loc(0) + , _stored(0) + {} + + const SnapshotData* Snapshots::takeSnapshot(){ + scoped_lock lk(_lock); + _loc = ( _loc + 1 ) % _n; + _snapshots[_loc].takeSnapshot(); + if ( _stored < _n ) + _stored++; + return &_snapshots[_loc]; + } + + auto_ptr<SnapshotDelta> Snapshots::computeDelta( int numBack ){ + scoped_lock lk(_lock); + auto_ptr<SnapshotDelta> p; + if ( numBack < numDeltas() ) + p.reset( new SnapshotDelta( getPrev(numBack+1) , getPrev(numBack) ) ); + return p; + } + + const SnapshotData& Snapshots::getPrev( int numBack ){ + int x = _loc - numBack; + if ( x < 0 ) + x += _n; 
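+ // e.g. (illustrative): with _n == 100, _loc == 2, numBack == 5, x == -3 wraps to 97, the slot filled five snapshots earlier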
+ return _snapshots[x]; + } + + void Snapshots::outputLockInfoHTML( stringstream& ss ){ + scoped_lock lk(_lock); + ss << "\n<table>"; + ss << "<tr><th>elapsed(ms)</th><th>% write locked</th></tr>\n"; + + for ( int i=0; i<numDeltas(); i++ ){ + SnapshotDelta d( getPrev(i+1) , getPrev(i) ); + ss << "<tr>" + << "<td>" << ( d.elapsed() / 1000 ) << "</td>" + << "<td>" << (unsigned)(100*d.percentWriteLocked()) << "%</td>" + << "</tr>" + ; + } + + ss << "</table>\n"; + } + + void SnapshotThread::run(){ + Client::initThread("snapshotthread"); + Client& client = cc(); + + long long numLoops = 0; + + const SnapshotData* prev = 0; + + while ( ! inShutdown() ){ + try { + const SnapshotData* s = statsSnapshots.takeSnapshot(); + + if ( prev ){ + unsigned long long elapsed = s->_created - prev->_created; + + if ( cmdLine.cpu ){ + SnapshotDelta d( *prev , *s ); + log() << "cpu: elapsed:" << (elapsed/1000) <<" writelock: " << (int)(100*d.percentWriteLocked()) << "%" << endl; + } + + // TODO: this should really be somewhere else, like in a special ClientCursor thread + ClientCursor::idleTimeReport( (unsigned)(elapsed/1000) ); + } + + prev = s; + } + catch ( std::exception& e ){ + log() << "ERROR in SnapshotThread: " << e.what() << endl; + } + + numLoops++; + sleepsecs(4); + } + + client.shutdown(); + } + + Snapshots statsSnapshots; + SnapshotThread snapshotThread; +} diff --git a/db/stats/snapshots.h b/db/stats/snapshots.h new file mode 100644 index 0000000..542318a --- /dev/null +++ b/db/stats/snapshots.h @@ -0,0 +1,113 @@ +// snapshots.h + +/** +* Copyright (C) 2008 10gen Inc. +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +#pragma once +#include "../../stdafx.h" +#include "../jsobj.h" +#include "top.h" +#include "../../util/background.h" + +/** + handles snapshotting performance metrics and other such things + */ +namespace mongo { + + class SnapshotThread; + + /** + * stores a point in time snapshot + * i.e. 
all counters at a given time + */ + class SnapshotData { + void takeSnapshot(); + + unsigned long long _created; + Top::CollectionData _globalUsage; + unsigned long long _totalWriteLockedTime; // micros of total time locked + Top::UsageMap _usage; + + friend class SnapshotThread; + friend class SnapshotDelta; + friend class Snapshots; + }; + + /** + * contains performance information for a time period + */ + class SnapshotDelta { + public: + SnapshotDelta( const SnapshotData& older , const SnapshotData& newer ); + + unsigned long long start() const { + return _older._created; + } + + unsigned long long elapsed() const { + return _elapsed; + } + + unsigned long long timeInWriteLock() const { + return _newer._totalWriteLockedTime - _older._totalWriteLockedTime; + } + double percentWriteLocked() const { + double e = (double) elapsed(); + double w = (double) timeInWriteLock(); + return w/e; + } + + Top::CollectionData globalUsageDiff(); + Top::UsageMap collectionUsageDiff(); + + private: + const SnapshotData& _older; + const SnapshotData& _newer; + + unsigned long long _elapsed; + }; + + class Snapshots { + public: + Snapshots(int n=100); + + const SnapshotData* takeSnapshot(); + + int numDeltas() const { return _stored-1; } + + const SnapshotData& getPrev( int numBack = 0 ); + auto_ptr<SnapshotDelta> computeDelta( int numBack = 0 ); + + + void outputLockInfoHTML( stringstream& ss ); + private: + mongo::mutex _lock; + int _n; + boost::scoped_array<SnapshotData> _snapshots; + int _loc; + int _stored; + }; + + class SnapshotThread : public BackgroundJob { + public: + void run(); + }; + + extern Snapshots statsSnapshots; + extern SnapshotThread snapshotThread; + + +} diff --git a/db/stats/top.cpp b/db/stats/top.cpp new file mode 100644 index 0000000..0f27943 --- /dev/null +++ b/db/stats/top.cpp @@ -0,0 +1,181 @@ +// top.cpp +/* + * Copyright (C) 2010 10gen Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License, version 3, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ */ + + +#include "stdafx.h" +#include "top.h" +#include "../../util/message.h" +#include "../commands.h" + +namespace mongo { + + Top::UsageData::UsageData( const UsageData& older , const UsageData& newer ) + : time(newer.time-older.time) , + count(newer.count-older.count) + { + + } + + Top::CollectionData::CollectionData( const CollectionData& older , const CollectionData& newer ) + : total( older.total , newer.total ) , + readLock( older.readLock , newer.readLock ) , + writeLock( older.writeLock , newer.writeLock ) , + queries( older.queries , newer.queries ) , + getmore( older.getmore , newer.getmore ) , + insert( older.insert , newer.insert ) , + update( older.update , newer.update ) , + remove( older.remove , newer.remove ), + commands( older.commands , newer.commands ) + { + + } + + + void Top::record( const string& ns , int op , int lockType , long long micros , bool command ){ + //cout << "record: " << ns << "\t" << op << "\t" << command << endl; + scoped_lock lk(_lock); + + if ( ( command || op == dbQuery ) && ns == _lastDropped ){ + _lastDropped = ""; + return; + } + + CollectionData& coll = _usage[ns]; + _record( coll , op , lockType , micros , command ); + _record( _global , op , lockType , micros , command ); + } + + void Top::collectionDropped( const string& ns ){ + //cout << "collectionDropped: " << ns << endl; + scoped_lock lk(_lock); + _usage.erase(ns); + _lastDropped = ns; + } + + void Top::_record( CollectionData& c , int op , int lockType , long long micros , bool command ){ + c.total.inc( micros ); + + if ( lockType > 0 ) + c.writeLock.inc( micros ); + else if ( lockType < 0 ) + c.readLock.inc( micros ); + + switch ( op ){ + case 0: + // use 0 for unknown, non-specific + break; + case dbUpdate: + c.update.inc( micros ); + break; + case dbInsert: + c.insert.inc( micros ); + break; + case dbQuery: + if ( command ) + c.commands.inc( micros ); + else + c.queries.inc( micros ); + break; + case dbGetMore: + c.getmore.inc( micros ); + break; + case dbDelete: + c.remove.inc( micros ); + break; + case opReply: + case dbMsg: + case dbKillCursors: + log() << "unexpected op in Top::record: " << op << endl; + break; + default: + log() << "unknown op in Top::record: " << op << endl; + } + + } + + void Top::cloneMap(Top::UsageMap& out){ + scoped_lock lk(_lock); + out = _usage; + } + + void Top::append( BSONObjBuilder& b ){ + scoped_lock lk( _lock ); + append( b , _usage ); + } + + void Top::append( BSONObjBuilder& b , const char * name , const UsageData& map ){ + BSONObjBuilder bb( b.subobjStart( name ) ); + bb.appendNumber( "time" , map.time ); + bb.appendNumber( "count" , map.count ); + bb.done(); + } + + void Top::append( BSONObjBuilder& b , const UsageMap& map ){ + for ( UsageMap::const_iterator i=map.begin(); i!=map.end(); i++ ){ + BSONObjBuilder bb( b.subobjStart( i->first.c_str() ) ); + + const CollectionData& coll = i->second; + + append( b , "total" , coll.total ); + + append( b , "readLock" , coll.readLock ); + append( b , "writeLock" , coll.writeLock ); + + append( b , "queries" , coll.queries ); + append( b , "getmore" , coll.getmore ); + append( b , "insert" , coll.insert ); + append( b , "update" , coll.update ); + append( b , "remove" , coll.remove ); + append( b , "commands" , coll.commands ); + + bb.done(); + } + } + + class TopCmd : public Command { + public: + TopCmd() : Command( "top" ){} + + virtual bool slaveOk(){ return true; } + virtual bool adminOnly(){ return true; } + virtual LockType locktype(){ return READ; } + virtual void help( stringstream& help ) 
const { help << "usage by collection"; } + + virtual bool run(const char *ns, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl){ + { + BSONObjBuilder b( result.subobjStart( "totals" ) ); + Top::global.append( b ); + b.done(); + } + return true; + } + + } topCmd; + + Top Top::global; + + TopOld::T TopOld::_snapshotStart = TopOld::currentTime(); + TopOld::D TopOld::_snapshotDuration; + TopOld::UsageMap TopOld::_totalUsage; + TopOld::UsageMap TopOld::_snapshotA; + TopOld::UsageMap TopOld::_snapshotB; + TopOld::UsageMap &TopOld::_snapshot = TopOld::_snapshotA; + TopOld::UsageMap &TopOld::_nextSnapshot = TopOld::_snapshotB; + mongo::mutex TopOld::topMutex; + + +} diff --git a/db/stats/top.h b/db/stats/top.h new file mode 100644 index 0000000..8dab3b0 --- /dev/null +++ b/db/stats/top.h @@ -0,0 +1,248 @@ +// top.h : DB usage monitor. + +/* Copyright 2009 10gen Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include <boost/date_time/posix_time/posix_time.hpp> +#undef assert +#define assert xassert + +namespace mongo { + + /** + * tracks usage by collection + */ + class Top { + + public: + class UsageData { + public: + UsageData() : time(0) , count(0){} + UsageData( const UsageData& older , const UsageData& newer ); + long long time; + long long count; + + void inc( long long micros ){ + count++; + time += micros; + } + }; + + class CollectionData { + public: + /** + * constructs a diff + */ + CollectionData(){} + CollectionData( const CollectionData& older , const CollectionData& newer ); + + UsageData total; + + UsageData readLock; + UsageData writeLock; + + UsageData queries; + UsageData getmore; + UsageData insert; + UsageData update; + UsageData remove; + UsageData commands; + }; + + typedef map<string,CollectionData> UsageMap; + + public: + void record( const string& ns , int op , int lockType , long long micros , bool command ); + void append( BSONObjBuilder& b ); + void cloneMap(UsageMap& out); + CollectionData getGlobalData(){ return _global; } + void collectionDropped( const string& ns ); + + public: // static stuff + static Top global; + + void append( BSONObjBuilder& b , const char * name , const UsageData& map ); + void append( BSONObjBuilder& b , const UsageMap& map ); + + private: + + void _record( CollectionData& c , int op , int lockType , long long micros , bool command ); + + mongo::mutex _lock; + CollectionData _global; + UsageMap _usage; + string _lastDropped; + }; + + /* Records per namespace utilization of the mongod process. + No two functions of this class may be called concurrently. + */ + class TopOld { + typedef boost::posix_time::ptime T; + typedef boost::posix_time::time_duration D; + typedef boost::tuple< D, int, int, int > UsageData; + public: + TopOld() : _read(false), _write(false) { } + + /* these are used to record activity: */ + + void clientStart( const char *client ) { + clientStop(); + _currentStart = currentTime(); + _current = client; + } + + /* indicate current request is a read operation. 
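setRead()/setWrite() mark the current request; clientStop() then folds them into the per-namespace counts that usage() reports as reads and writes.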
*/ + void setRead() { _read = true; } + + void setWrite() { _write = true; } + + void clientStop() { + if ( _currentStart == T() ) + return; + D d = currentTime() - _currentStart; + + { + scoped_lock L(topMutex); + recordUsage( _current, d ); + } + + _currentStart = T(); + _read = false; + _write = false; + } + + /* these are used to fetch the stats: */ + + struct Usage { + string ns; + D time; + double pct; + int reads, writes, calls; + }; + + static void usage( vector< Usage > &res ) { + scoped_lock L(topMutex); + + // Populate parent namespaces + UsageMap snapshot; + UsageMap totalUsage; + fillParentNamespaces( snapshot, _snapshot ); + fillParentNamespaces( totalUsage, _totalUsage ); + + multimap< D, string, more > sorted; + for( UsageMap::iterator i = snapshot.begin(); i != snapshot.end(); ++i ) + sorted.insert( make_pair( i->second.get<0>(), i->first ) ); + for( multimap< D, string, more >::iterator i = sorted.begin(); i != sorted.end(); ++i ) { + if ( trivialNs( i->second.c_str() ) ) + continue; + Usage u; + u.ns = i->second; + u.time = totalUsage[ u.ns ].get<0>(); + u.pct = _snapshotDuration != D() ? 100.0 * i->first.ticks() / _snapshotDuration.ticks() : 0; + u.reads = snapshot[ u.ns ].get<1>(); + u.writes = snapshot[ u.ns ].get<2>(); + u.calls = snapshot[ u.ns ].get<3>(); + res.push_back( u ); + } + for( UsageMap::iterator i = totalUsage.begin(); i != totalUsage.end(); ++i ) { + if ( snapshot.count( i->first ) != 0 || trivialNs( i->first.c_str() ) ) + continue; + Usage u; + u.ns = i->first; + u.time = i->second.get<0>(); + u.pct = 0; + u.reads = 0; + u.writes = 0; + u.calls = 0; + res.push_back( u ); + } + } + + static void completeSnapshot() { + scoped_lock L(topMutex); + + if ( &_snapshot == &_snapshotA ) { + _snapshot = _snapshotB; + _nextSnapshot = _snapshotA; + } else { + _snapshot = _snapshotA; + _nextSnapshot = _snapshotB; + } + _snapshotDuration = currentTime() - _snapshotStart; + _snapshotStart = currentTime(); + _nextSnapshot.clear(); + } + + private: + static mongo::mutex topMutex; + static bool trivialNs( const char *ns ) { + const char *ret = strrchr( ns, '.' ); + return ret && ret[ 1 ] == '\0'; + } + typedef map<string,UsageData> UsageMap; // duration, # reads, # writes, # total calls + static T currentTime() { + return boost::posix_time::microsec_clock::universal_time(); + } + void recordUsage( const string &client, D duration ) { + recordUsageForMap( _totalUsage, client, duration ); + recordUsageForMap( _nextSnapshot, client, duration ); + } + void recordUsageForMap( UsageMap &map, const string &client, D duration ) { + UsageData& g = map[client]; + g.get< 0 >() += duration; + if ( _read && !_write ) + g.get< 1 >()++; + else if ( !_read && _write ) + g.get< 2 >()++; + g.get< 3 >()++; + } + static void fillParentNamespaces( UsageMap &to, const UsageMap &from ) { + for( UsageMap::const_iterator i = from.begin(); i != from.end(); ++i ) { + string current = i->first; + size_t dot = current.rfind( "." ); + if ( dot == string::npos || dot != current.length() - 1 ) { + inc( to[ current ], i->second ); + } + while( dot != string::npos ) { + current = current.substr( 0, dot ); + inc( to[ current ], i->second ); + dot = current.rfind( "." 
); + } + } + } + static void inc( UsageData &to, const UsageData &from ) { + to.get<0>() += from.get<0>(); + to.get<1>() += from.get<1>(); + to.get<2>() += from.get<2>(); + to.get<3>() += from.get<3>(); + } + struct more { bool operator()( const D &a, const D &b ) { return a > b; } }; + string _current; + T _currentStart; + static T _snapshotStart; + static D _snapshotDuration; + static UsageMap _totalUsage; + static UsageMap _snapshotA; + static UsageMap _snapshotB; + static UsageMap &_snapshot; + static UsageMap &_nextSnapshot; + bool _read; + bool _write; + }; + +} // namespace mongo diff --git a/db/storage.cpp b/db/storage.cpp index 4da2d82..7ddfc65 100644 --- a/db/storage.cpp +++ b/db/storage.cpp @@ -1,4 +1,20 @@ // storage.cpp +/* + * Copyright (C) 2010 10gen Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License, version 3, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + #include "stdafx.h" #include "pdfile.h" diff --git a/db/update.cpp b/db/update.cpp index 0639a99..d6a5c5e 100644 --- a/db/update.cpp +++ b/db/update.cpp @@ -26,8 +26,11 @@ namespace mongo { + //#define DEBUGUPDATE(x) cout << x << endl; +#define DEBUGUPDATE(x) + const char* Mod::modNames[] = { "$inc", "$set", "$push", "$pushAll", "$pull", "$pullAll" , "$pop", "$unset" , - "$bitand" , "$bitor" , "$bit" }; + "$bitand" , "$bitor" , "$bit" , "$addToSet" }; unsigned Mod::modNamesNum = sizeof(Mod::modNames)/sizeof(char*); bool Mod::_pullElementMatch( BSONElement& toMatch ) const { @@ -46,13 +49,42 @@ namespace mongo { return matcher->matches( toMatch.embeddedObject() ); } - void Mod::apply( BSONObjBuilder& b , BSONElement in ){ + template< class Builder > + void Mod::appendIncremented( Builder& bb , const BSONElement& in, ModState& ms ) const { + BSONType a = in.type(); + BSONType b = elt.type(); + + if ( a == NumberDouble || b == NumberDouble ){ + ms.incType = NumberDouble; + ms.incdouble = elt.numberDouble() + in.numberDouble(); + } + else if ( a == NumberLong || b == NumberLong ){ + ms.incType = NumberLong; + ms.inclong = elt.numberLong() + in.numberLong(); + } + else { + ms.incType = NumberInt; + ms.incint = elt.numberInt() + in.numberInt(); + } + + ms.appendIncValue( bb ); + } + + template< class Builder > + void appendUnset( Builder &b ) { + } + + template<> + void appendUnset( BSONArrayBuilder &b ) { + b.appendNull(); + } + + template< class Builder > + void Mod::apply( Builder& b , BSONElement in , ModState& ms ) const { switch ( op ){ case INC: { - // TODO: this is horrible - inc( in ); - b.appendAs( elt , shortFieldName ); + appendIncremented( b , in , ms ); break; } @@ -63,10 +95,10 @@ namespace mongo { } case UNSET: { - //Explicit NOOP + appendUnset( b ); break; } - + case PUSH: { uassert( 10131 , "$push can only be applied to an array" , in.type() == Array ); BSONObjBuilder bb( b.subarrayStart( shortFieldName ) ); @@ -77,13 +109,60 @@ namespace mongo { n++; } - pushStartSize = n; + ms.pushStartSize = n; bb.appendAs( elt , bb.numStr( n ) ); bb.done(); break; } + case ADDTOSET: { + uassert( 12592 , 
"$addToSet can only be applied to an array" , in.type() == Array ); + BSONObjBuilder bb( b.subarrayStart( shortFieldName ) ); + + BSONObjIterator i( in.embeddedObject() ); + int n=0; + + if ( isEach() ){ + + BSONElementSet toadd; + parseEach( toadd ); + + while ( i.more() ){ + BSONElement cur = i.next(); + bb.append( cur ); + n++; + toadd.erase( cur ); + } + + for ( BSONElementSet::iterator j=toadd.begin(); j!=toadd.end(); j++ ){ + bb.appendAs( *j , BSONObjBuilder::numStr( n++ ) ); + } + + } + else { + + bool found = false; + + while ( i.more() ){ + BSONElement cur = i.next(); + bb.append( cur ); + n++; + if ( elt.woCompare( cur , false ) == 0 ) + found = true; + } + + if ( ! found ) + bb.appendAs( elt , bb.numStr( n ) ); + + } + + bb.done(); + break; + } + + + case PUSH_ALL: { uassert( 10132 , "$pushAll can only be applied to an array" , in.type() == Array ); uassert( 10133 , "$pushAll has to be passed an array" , elt.type() ); @@ -97,7 +176,7 @@ namespace mongo { n++; } - pushStartSize = n; + ms.pushStartSize = n; i = BSONObjIterator( elt.embeddedObject() ); while ( i.more() ){ @@ -172,8 +251,8 @@ namespace mongo { } } - pushStartSize = n; - assert( pushStartSize == in.embeddedObject().nFields() ); + ms.pushStartSize = n; + assert( ms.pushStartSize == in.embeddedObject().nFields() ); bb.done(); break; } @@ -226,97 +305,130 @@ namespace mongo { } } - bool ModSet::canApplyInPlaceAndVerify(const BSONObj &obj) const { - bool inPlacePossible = true; + auto_ptr<ModSetState> ModSet::prepare(const BSONObj &obj) const { + ModSetState * mss = new ModSetState( obj ); // Perform this check first, so that we don't leave a partially modified object on uassert. for ( ModHolder::const_iterator i = _mods.begin(); i != _mods.end(); ++i ) { + ModState& ms = mss->_mods[i->first]; + const Mod& m = i->second; BSONElement e = obj.getFieldDotted(m.fieldName); - + + ms.m = &m; + ms.old = e; + if ( e.eoo() ) { - inPlacePossible = (m.op == Mod::UNSET); + mss->amIInPlacePossible( m.op == Mod::UNSET ); + continue; } - else { - switch( m.op ) { - case Mod::INC: - uassert( 10140 , "Cannot apply $inc modifier to non-number", e.isNumber() || e.eoo() ); - if ( !e.isNumber() ) - inPlacePossible = false; - break; - case Mod::SET: - inPlacePossible = - m.elt.type() == e.type() && - m.elt.valuesize() == e.valuesize(); - break; - case Mod::PUSH: - case Mod::PUSH_ALL: - uassert( 10141 , "Cannot apply $push/$pushAll modifier to non-array", e.type() == Array || e.eoo() ); - inPlacePossible = false; - break; - case Mod::PULL: - case Mod::PULL_ALL: { - uassert( 10142 , "Cannot apply $pull/$pullAll modifier to non-array", e.type() == Array || e.eoo() ); - BSONObjIterator i( e.embeddedObject() ); - while( inPlacePossible && i.more() ) { - BSONElement arrI = i.next(); - if ( m.op == Mod::PULL ) { - if ( m._pullElementMatch( arrI ) ) - inPlacePossible = false; - } - else if ( m.op == Mod::PULL_ALL ) { - BSONObjIterator j( m.elt.embeddedObject() ); - while( inPlacePossible && j.moreWithEOO() ) { - BSONElement arrJ = j.next(); - if ( arrJ.eoo() ) - break; - if ( arrI.woCompare( arrJ, false ) == 0 ) { - inPlacePossible = false; - } - } + + switch( m.op ) { + case Mod::INC: + uassert( 10140 , "Cannot apply $inc modifier to non-number", e.isNumber() || e.eoo() ); + if ( mss->amIInPlacePossible( e.isNumber() ) ){ + // check more typing info here + if ( m.elt.type() != e.type() ){ + // if i'm incrememnting with a double, then the storage has to be a double + mss->amIInPlacePossible( m.elt.type() != NumberDouble ); + } + } + break; + + case 
Mod::SET: + mss->amIInPlacePossible( m.elt.type() == e.type() && + m.elt.valuesize() == e.valuesize() ); + break; + + case Mod::PUSH: + case Mod::PUSH_ALL: + uassert( 10141 , "Cannot apply $push/$pushAll modifier to non-array", e.type() == Array || e.eoo() ); + mss->amIInPlacePossible( false ); + break; + + case Mod::PULL: + case Mod::PULL_ALL: { + uassert( 10142 , "Cannot apply $pull/$pullAll modifier to non-array", e.type() == Array || e.eoo() ); + BSONObjIterator i( e.embeddedObject() ); + while( mss->_inPlacePossible && i.more() ) { + BSONElement arrI = i.next(); + if ( m.op == Mod::PULL ) { + mss->amIInPlacePossible( ! m._pullElementMatch( arrI ) ); + } + else if ( m.op == Mod::PULL_ALL ) { + BSONObjIterator j( m.elt.embeddedObject() ); + while( mss->_inPlacePossible && j.moreWithEOO() ) { + BSONElement arrJ = j.next(); + if ( arrJ.eoo() ) + break; + mss->amIInPlacePossible( arrI.woCompare( arrJ, false ) ); } } - break; } - case Mod::POP: { - uassert( 10143 , "Cannot apply $pop modifier to non-array", e.type() == Array || e.eoo() ); - if ( ! e.embeddedObject().isEmpty() ) - inPlacePossible = false; - break; + break; + } + + case Mod::POP: { + uassert( 10143 , "Cannot apply $pop modifier to non-array", e.type() == Array || e.eoo() ); + mss->amIInPlacePossible( e.embeddedObject().isEmpty() ); + break; + } + + case Mod::ADDTOSET: { + uassert( 12591 , "Cannot apply $addToSet modifier to non-array", e.type() == Array || e.eoo() ); + + BSONObjIterator i( e.embeddedObject() ); + if ( m.isEach() ){ + BSONElementSet toadd; + m.parseEach( toadd ); + while( i.more() ) { + BSONElement arrI = i.next(); + toadd.erase( arrI ); + } + mss->amIInPlacePossible( toadd.size() == 0 ); } - default: - // mods we don't know about shouldn't be done in place - inPlacePossible = false; + else { + bool found = false; + while( i.more() ) { + BSONElement arrI = i.next(); + if ( arrI.woCompare( m.elt , false ) == 0 ){ + found = true; + break; + } + } + mss->amIInPlacePossible( found ); } + break; + } + + default: + // mods we don't know about shouldn't be done in place + mss->amIInPlacePossible( false ); } } - return inPlacePossible; + return auto_ptr<ModSetState>( mss ); } - void ModSet::applyModsInPlace(const BSONObj &obj) const { - for ( ModHolder::const_iterator i = _mods.begin(); i != _mods.end(); ++i ) { - const Mod& m = i->second; - BSONElement e = obj.getFieldDotted(m.fieldName); + void ModSetState::applyModsInPlace() { + for ( ModStateHolder::iterator i = _mods.begin(); i != _mods.end(); ++i ) { + ModState& m = i->second; - switch ( m.op ){ + switch ( m.m->op ){ case Mod::UNSET: case Mod::PULL: case Mod::PULL_ALL: + case Mod::ADDTOSET: + // this should have been handled by prepare break; // [dm] the BSONElementManipulator statements below are for replication (correct?) 
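// note (inferred, not an original comment): an in-place $inc is re-logged as a $set of the computed value via fixedName/fixed below, so the op log records an absolute value rather than a relative increment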
case Mod::INC: - m.inc(e); - m.setElementToOurNumericValue(e); + m.m->incrementMe( m.old ); + m.fixedName = "$set"; + m.fixed = &(m.old); break; case Mod::SET: - if ( e.isNumber() && m.elt.isNumber() ) { - // todo: handle NumberLong: - m.setElementToOurNumericValue(e); - } - else { - BSONElementManipulator( e ).replaceTypeAndValue( m.elt ); - } + BSONElementManipulator( m.old ).replaceTypeAndValue( m.m->elt ); break; default: uassert( 10144 , "can't apply mod in place - shouldn't have gotten here" , 0 ); @@ -342,18 +454,19 @@ namespace mongo { fields[ base + top.fieldName() ] = top; } - void ModSet::_appendNewFromMods( const string& root , Mod& m , BSONObjBuilder& b , set<string>& onedownseen ){ - const char * temp = m.fieldName; + template< class Builder > + void ModSetState::_appendNewFromMods( const string& root , ModState& m , Builder& b , set<string>& onedownseen ){ + const char * temp = m.fieldName(); temp += root.size(); const char * dot = strchr( temp , '.' ); if ( dot ){ - string nr( m.fieldName , 0 , 1 + ( dot - m.fieldName ) ); + string nr( m.fieldName() , 0 , 1 + ( dot - m.fieldName() ) ); string nf( temp , 0 , dot - temp ); if ( onedownseen.count( nf ) ) return; onedownseen.insert( nf ); BSONObjBuilder bb ( b.subobjStart( nf.c_str() ) ); - createNewFromMods( nr , bb , BSONObj() ); + createNewFromMods( nr , bb , BSONObj() ); // don't infer an array from name bb.done(); } else { @@ -362,29 +475,37 @@ namespace mongo { } - void ModSet::createNewFromMods( const string& root , BSONObjBuilder& b , const BSONObj &obj ){ + template< class Builder > + void ModSetState::createNewFromMods( const string& root , Builder& b , const BSONObj &obj ){ BSONObjIteratorSorted es( obj ); BSONElement e = es.next(); - ModHolder::iterator m = _mods.lower_bound( root ); - ModHolder::iterator mend = _mods.lower_bound( root + "{" ); + ModStateHolder::iterator m = _mods.lower_bound( root ); + ModStateHolder::iterator mend = _mods.lower_bound( root + '{' ); set<string> onedownseen; while ( e.type() && m != mend ){ string field = root + e.fieldName(); - FieldCompareResult cmp = compareDottedFieldNames( m->second.fieldName , field ); - + FieldCompareResult cmp = compareDottedFieldNames( m->second.m->fieldName , field ); + switch ( cmp ){ case LEFT_SUBFIELD: { // Mod is embedded under this element uassert( 10145 , "LEFT_SUBFIELD only supports Object" , e.type() == Object || e.type() == Array ); if ( onedownseen.count( e.fieldName() ) == 0 ){ onedownseen.insert( e.fieldName() ); - BSONObjBuilder bb ( e.type() == Object ?
b.subobjStart( e.fieldName() ) : b.subarrayStart( e.fieldName() ) ); - stringstream nr; nr << root << e.fieldName() << "."; - createNewFromMods( nr.str() , bb , e.embeddedObject() ); - bb.done(); + if ( e.type() == Object ) { + BSONObjBuilder bb( b.subobjStart( e.fieldName() ) ); + stringstream nr; nr << root << e.fieldName() << "."; + createNewFromMods( nr.str() , bb , e.embeddedObject() ); + bb.done(); + } else { + BSONArrayBuilder ba( b.subarrayStart( e.fieldName() ) ); + stringstream nr; nr << root << e.fieldName() << "."; + createNewFromMods( nr.str() , ba , e.embeddedObject() ); + ba.done(); + } // inc both as we handled both e = es.next(); m++; @@ -401,7 +522,7 @@ namespace mongo { m++; continue; case RIGHT_BEFORE: // field that doesn't have a MOD - b.append( e ); + b.append( e ); // if array, ignore field name e = es.next(); continue; case RIGHT_SUBFIELD: @@ -414,7 +535,7 @@ namespace mongo { // finished looping the mods, just adding the rest of the elements while ( e.type() ){ - b.append( e ); + b.append( e ); // if array, ignore field name e = es.next(); } @@ -424,9 +545,9 @@ namespace mongo { } } - BSONObj ModSet::createNewFromMods( const BSONObj &obj ) { - BSONObjBuilder b( (int)(obj.objsize() * 1.1) ); - createNewFromMods( "" , b , obj ); + BSONObj ModSetState::createNewFromMods() { + BSONObjBuilder b( (int)(_obj.objsize() * 1.1) ); + createNewFromMods( "" , b , _obj ); return b.obj(); } @@ -451,10 +572,12 @@ namespace mongo { newObj = bb.obj(); } - if ( canApplyInPlaceAndVerify( newObj ) ) - applyModsInPlace( newObj ); + auto_ptr<ModSetState> mss = prepare( newObj ); + + if ( mss->canApplyInPlace() ) + mss->applyModsInPlace(); else - newObj = createNewFromMods( newObj ); + newObj = mss->createNewFromMods(); return newObj; } @@ -468,17 +591,24 @@ namespace mongo { { $pullAll : { a:[99,1010] } } NOTE: MODIFIES source from object! */ - void ModSet::getMods(const BSONObj &from) { + ModSet::ModSet( + const BSONObj &from , + const set<string>& idxKeys, + const set<string> *backgroundKeys) + : _isIndexed(0) , _hasDynamicArray( false ) { + BSONObjIterator it(from); + while ( it.more() ) { BSONElement e = it.next(); const char *fn = e.fieldName(); + uassert( 10147 , "Invalid modifier specified" + string( fn ), e.type() == Object ); BSONObj j = e.embeddedObject(); + BSONObjIterator jt(j); Mod::Op op = opFromStr( fn ); - if ( op == Mod::INC ) - strcpy((char *) fn, "$set"); // rewrite for op log + while ( jt.more() ) { BSONElement f = jt.next(); // x:44 @@ -490,28 +620,46 @@ namespace mongo { uassert( 10151 , "have conflict mod" , ! 
haveConflictingMod( fieldName ) ); uassert( 10152 , "Modifier $inc allowed for numbers only", f.isNumber() || op != Mod::INC ); uassert( 10153 , "Modifier $pushAll/pullAll allowed for arrays only", f.type() == Array || ( op != Mod::PUSH_ALL && op != Mod::PULL_ALL ) ); - + + _hasDynamicArray = _hasDynamicArray || strstr( fieldName , ".$" ) > 0; + Mod m; m.init( op , f ); m.setFieldName( f.fieldName() ); - - // horrible - to be cleaned up - if ( f.type() == NumberDouble ) { - m.ndouble = (double *) f.value(); - m.nint = 0; - } else if ( f.type() == NumberInt ) { - m.ndouble = 0; - m.nint = (int *) f.value(); - } - else if( f.type() == NumberLong ) { - m.ndouble = 0; - m.nint = 0; - m.nlong = (long long *) f.value(); + + if ( m.isIndexed( idxKeys ) || + (backgroundKeys && m.isIndexed(*backgroundKeys)) ) { + _isIndexed++; } _mods[m.fieldName] = m; + + DEBUGUPDATE( "\t\t " << fieldName << "\t" << _hasDynamicArray ); + } + } + + } + + ModSet * ModSet::fixDynamicArray( const char * elemMatchKey ) const { + ModSet * n = new ModSet(); + n->_isIndexed = _isIndexed; + n->_hasDynamicArray = _hasDynamicArray; + for ( ModHolder::const_iterator i=_mods.begin(); i!=_mods.end(); i++ ){ + string s = i->first; + size_t idx = s.find( ".$" ); + if ( idx == string::npos ){ + n->_mods[s] = i->second; + continue; } + StringBuilder buf(s.size()+strlen(elemMatchKey)); + buf << s.substr(0,idx+1) << elemMatchKey << s.substr(idx+2); + string fixed = buf.str(); + DEBUGUPDATE( "fixed dynamic: " << s << " -->> " << fixed ); + n->_mods[fixed] = i->second; + ModHolder::iterator temp = n->_mods.find( fixed ); + temp->second.setFieldName( temp->first.c_str() ); } + return n; } void checkNoMods( BSONObj o ) { @@ -526,46 +674,58 @@ namespace mongo { class UpdateOp : public QueryOp { public: - UpdateOp() : nscanned_() {} + UpdateOp() : _nscanned() {} virtual void init() { BSONObj pattern = qp().query(); - c_.reset( qp().newCursor().release() ); - if ( !c_->ok() ) + _c.reset( qp().newCursor().release() ); + if ( ! _c->ok() ) setComplete(); else - matcher_.reset( new CoveredIndexMatcher( pattern, qp().indexKey() ) ); + _matcher.reset( new CoveredIndexMatcher( pattern, qp().indexKey() ) ); } virtual void next() { - if ( !c_->ok() ) { + if ( ! 
_c->ok() ) {
setComplete();
return;
}
- nscanned_++;
- if ( matcher_->matches(c_->currKey(), c_->currLoc()) ) {
+ _nscanned++;
+ if ( _matcher->matches(_c->currKey(), _c->currLoc(), &_details ) ) {
setComplete();
return;
}
- c_->advance();
+ _c->advance();
}
bool curMatches(){
- return matcher_->matches(c_->currKey(), c_->currLoc() );
+ return _matcher->matches(_c->currKey(), _c->currLoc() , &_details );
}
virtual bool mayRecordPlan() const { return false; }
virtual QueryOp *clone() const {
return new UpdateOp();
}
- shared_ptr< Cursor > c() { return c_; }
- long long nscanned() const { return nscanned_; }
+ shared_ptr< Cursor > c() { return _c; }
+ long long nscanned() const { return _nscanned; }
+ MatchDetails& getMatchDetails(){ return _details; }
private:
- shared_ptr< Cursor > c_;
- long long nscanned_;
- auto_ptr< CoveredIndexMatcher > matcher_;
+ shared_ptr< Cursor > _c;
+ long long _nscanned;
+ auto_ptr< CoveredIndexMatcher > _matcher;
+ MatchDetails _details;
};
- UpdateResult updateObjects(const char *ns, BSONObj updateobjOrig, BSONObj patternOrig, bool upsert, bool multi, bool logop , OpDebug& debug ) {
+ UpdateResult updateObjects(const char *ns, const BSONObj& updateobj, BSONObj patternOrig, bool upsert, bool multi, bool logop , OpDebug& debug ) {
+ DEBUGUPDATE( "update: " << ns << " update: " << updateobj << " query: " << patternOrig << " upsert: " << upsert << " multi: " << multi );
int profile = cc().database()->profile;
StringBuilder& ss = debug.str;
+
+ if ( logLevel > 2 )
+ ss << " update: " << updateobj;
+
+ /* idea with these here is to make them loop invariant for multi updates, and thus be a bit faster for that case */
+ /* NOTE: when yield() is added herein, these must be refreshed after each call to yield! */
+ NamespaceDetails *d = nsdetails(ns); // can be null if an upsert...
+ NamespaceDetailsTransient *nsdt = &NamespaceDetailsTransient::get_w(ns);
+ /* end note */
uassert( 10155 , "cannot update reserved $ collection", strchr(ns, '$') == 0 );
if ( strstr(ns, ".system.") ) {
@@ -573,6 +733,21 @@ namespace mongo {
uassert( 10156 , "cannot update system collection", legalClientSystemNS( ns , true ) );
}
+ auto_ptr<ModSet> mods;
+ bool isOperatorUpdate = updateobj.firstElement().fieldName()[0] == '$';
+ int modsIsIndexed = false; // really the # of indexes
+ if ( isOperatorUpdate ){
+ if( d && d->backgroundIndexBuildInProgress ) {
+ set<string> bgKeys;
+ d->backgroundIdx().keyPattern().getFieldNames(bgKeys);
+ mods.reset( new ModSet(updateobj, nsdt->indexKeys(), &bgKeys) );
+ }
+ else {
+ mods.reset( new ModSet(updateobj, nsdt->indexKeys()) );
+ }
+ modsIsIndexed = mods->isIndexed();
+ }
+
set<DiskLoc> seenObjects;
QueryPlanSet qps( ns, patternOrig, BSONObj() );
@@ -593,11 +768,10 @@ namespace mongo {
c->advance();
continue;
}
-
+
BSONObj js(r);
BSONObj pattern = patternOrig;
- BSONObj updateobj = updateobjOrig;
if ( logop ) {
BSONObjBuilder idPattern;
@@ -620,43 +794,46 @@
/* look for $inc etc. note as listed here,
all fields to inc must be this type, you can't set some regular ones at the moment.
*/
-
- const char *firstField = updateobj.firstElement().fieldName();
-
- if ( firstField[0] == '$' ) {
-
+ if ( isOperatorUpdate ) {
+
if ( multi ){
c->advance(); // go to next record in case this one moves
if ( seenObjects.count( loc ) )
continue;
- updateobj = updateobj.copy();
}
- ModSet mods;
- mods.getMods(updateobj);
- NamespaceDetailsTransient& ndt = NamespaceDetailsTransient::get_w(ns);
- set<string>& idxKeys = ndt.indexKeys();
- int isIndexed = mods.isIndexed( idxKeys );
-
- if ( isIndexed && multi ){
+ if ( modsIsIndexed && multi ){
c->noteLocation();
}
- if ( isIndexed <= 0 && mods.canApplyInPlaceAndVerify( loc.obj() ) ) {
- mods.applyModsInPlace( loc.obj() );
- //seenObjects.insert( loc );
+ const BSONObj& onDisk = loc.obj();
+
+ ModSet * useMods = mods.get();
+
+ auto_ptr<ModSet> mymodset;
+ if ( u->getMatchDetails().elemMatchKey && mods->hasDynamicArray() ){
+ useMods = mods->fixDynamicArray( u->getMatchDetails().elemMatchKey );
+ mymodset.reset( useMods );
+ }
+
+
+ auto_ptr<ModSetState> mss = useMods->prepare( onDisk );
+
+ if ( modsIsIndexed <= 0 && mss->canApplyInPlace() ){
+ mss->applyModsInPlace();// const_cast<BSONObj&>(onDisk) );
+
if ( profile )
ss << " fastmod ";
- if ( isIndexed ){
+ if ( modsIsIndexed ){
seenObjects.insert( loc );
}
}
else {
- BSONObj newObj = mods.createNewFromMods( loc.obj() );
- uassert( 12522 , "$ operator made objcet too large" , newObj.isValid() );
- DiskLoc newLoc = theDataFileMgr.update(ns, r, loc , newObj.objdata(), newObj.objsize(), debug);
- if ( newLoc != loc || isIndexed ){
+ BSONObj newObj = mss->createNewFromMods();
+ uassert( 12522 , "$ operator made object too large" , newObj.objsize() <= ( 4 * 1024 * 1024 ) );
+ DiskLoc newLoc = theDataFileMgr.updateRecord(ns, d, nsdt, r, loc , newObj.objdata(), newObj.objsize(), debug);
+ if ( newLoc != loc || modsIsIndexed ) {
// object moved, need to make sure we don't get it again
seenObjects.insert( newLoc );
}
@@ -664,25 +841,27 @@ namespace mongo {
}
if ( logop ) {
-
- assert( mods.size() );
+ DEV assert( mods->size() );
- if ( mods.haveArrayDepMod() ) {
+ if ( mss->haveArrayDepMod() ) {
BSONObjBuilder patternBuilder;
patternBuilder.appendElements( pattern );
- mods.appendSizeSpecForArrayDepMods( patternBuilder );
+ mss->appendSizeSpecForArrayDepMods( patternBuilder );
pattern = patternBuilder.obj();
}
- if ( mods.needOpLogRewrite() )
- updateobj = mods.getOpLogRewrite();
-
- logOp("u", ns, updateobj, &pattern );
+ if ( mss->needOpLogRewrite() ){
+ DEBUGUPDATE( "\t rewrite update: " << mss->getOpLogRewrite() );
+ logOp("u", ns, mss->getOpLogRewrite() , &pattern );
+ }
+ else {
+ logOp("u", ns, updateobj, &pattern );
+ }
}
numModded++;
if ( ! multi )
break;
- if ( multi && isIndexed )
+ if ( multi && modsIsIndexed )
c->checkLocation();
continue;
}
@@ -691,7 +870,7 @@ namespace mongo {
BSONElementManipulator::lookForTimestamps( updateobj );
checkNoMods( updateobj );
- theDataFileMgr.update(ns, r, loc , updateobj.objdata(), updateobj.objsize(), debug);
+ theDataFileMgr.updateRecord(ns, d, nsdt, r, loc , updateobj.objdata(), updateobj.objsize(), debug);
if ( logop )
logOp("u", ns, updateobj, &pattern );
return UpdateResult( 1 , 0 , 1 );
@@ -705,13 +884,9 @@ namespace mongo {
ss << " nscanned:" << u->nscanned();
if ( upsert ) {
- if ( updateobjOrig.firstElement().fieldName()[0] == '$' ) {
+ if ( updateobj.firstElement().fieldName()[0] == '$' ) {
/* upsert of an $inc. build a default */
- ModSet mods;
- mods.getMods(updateobjOrig);
-
- BSONObj newObj = mods.createNewFromQuery( patternOrig );
-
+ BSONObj newObj = mods->createNewFromQuery( patternOrig );
if ( profile )
ss << " fastmodinsert ";
theDataFileMgr.insert(ns, newObj);
@@ -722,12 +897,13 @@ namespace mongo {
return UpdateResult( 0 , 1 , 1 );
}
uassert( 10159 , "multi update only works with $ operators" , ! multi );
- checkNoMods( updateobjOrig );
+ checkNoMods( updateobj );
if ( profile )
ss << " upsert ";
- theDataFileMgr.insert(ns, updateobjOrig);
+ BSONObj no = updateobj;
+ theDataFileMgr.insert(ns, no);
if ( logop )
- logOp( "i", ns, updateobjOrig );
+ logOp( "i", ns, no );
return UpdateResult( 0 , 0 , 1 );
}
return UpdateResult( 0 , 0 , 0 );
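
[Editor's sketch, not part of the patch] The update.cpp changes above split update application into two phases: a ModSet is parsed once per update statement and stays const, and ModSet::prepare() then builds a per-document ModSetState that chooses between the in-place fast path and a full object rewrite. A minimal sketch of the calling pattern, using the names from this patch (the standalone function and its arguments are illustrative only):

    // sketch only: assumes db/update.h is included and that updateobj,
    // onDisk and idxKeys come from the surrounding update path
    void applyOneUpdate( const BSONObj& updateobj , const BSONObj& onDisk ,
                         const set<string>& idxKeys ) {
        ModSet mods( updateobj , idxKeys );                 // parsed once, reusable across documents
        auto_ptr<ModSetState> mss = mods.prepare( onDisk ); // per-document state
        if ( mods.isIndexed() <= 0 && mss->canApplyInPlace() ) {
            mss->applyModsInPlace();                        // patches onDisk's buffer directly
        }
        else {
            BSONObj newObj = mss->createNewFromMods();      // fresh object for updateRecord()
        }
    }

For a multi update this is the loop-invariant win: the parse and index analysis happen once, and only the cheap per-document state is rebuilt for each matched record.
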
diff --git a/db/update.h b/db/update.h
index 26a8a8d..e14b0fb 100644
--- a/db/update.h
+++ b/db/update.h
@@ -23,11 +23,17 @@
namespace mongo {
- /* Used for modifiers such as $inc, $set, $push, ... */
+ class ModState;
+ class ModSetState;
+
+ /* Used for modifiers such as $inc, $set, $push, ...
+ * stores the info about a single operation
+ * once created should never be modified
+ */
struct Mod {
// See opFromStr below
- // 0 1 2 3 4 5 6 7 8 9 10
- enum Op { INC, SET, PUSH, PUSH_ALL, PULL, PULL_ALL , POP, UNSET, BITAND, BITOR , BIT } op;
+ // 0 1 2 3 4 5 6 7 8 9 10 11
+ enum Op { INC, SET, PUSH, PUSH_ALL, PULL, PULL_ALL , POP, UNSET, BITAND, BITOR , BIT , ADDTOSET } op;
static const char* modNames[];
static unsigned modNamesNum;
@@ -35,13 +41,7 @@ namespace mongo {
const char *fieldName;
const char *shortFieldName;
- // kind of lame; fix one day?
- double *ndouble;
- int *nint;
- long long *nlong;
-
BSONElement elt; // x:5 note: this is the actual element from the updateobj
- int pushStartSize;
boost::shared_ptr<Matcher> matcher;
void init( Op o , BSONElement& e ){
@@ -59,36 +59,32 @@ namespace mongo {
else
shortFieldName = fieldName;
}
-
- /* [dm] why is this const? (or rather, why was setn const?) i see why but think maybe clearer if were not.
*/
- void inc(BSONElement& n) const {
- uassert( 10160 , "$inc value is not a number", n.isNumber() );
- if( ndouble )
- *ndouble += n.numberDouble();
- else if( nint )
- *nint += n.numberInt();
- else
- *nlong += n.numberLong();
- }
-
- void setElementToOurNumericValue(BSONElement& e) const {
- BSONElementManipulator manip(e);
- if( e.type() == NumberLong )
- manip.setLong(_getlong());
- else
- manip.setNumber(_getn());
- }
-
- double _getn() const {
- if( ndouble ) return *ndouble;
- if( nint ) return *nint;
- return (double) *nlong;
- }
- long long _getlong() const {
- if( nlong ) return *nlong;
- if( ndouble ) return (long long) *ndouble;
- return *nint;
+
+ /**
+ * @param in increments the actual value inside in
+ */
+ void incrementMe( BSONElement& in ) const {
+ BSONElementManipulator manip( in );
+
+ switch ( in.type() ){
+ case NumberDouble:
+ manip.setNumber( elt.numberDouble() + in.numberDouble() );
+ break;
+ case NumberLong:
+ manip.setLong( elt.numberLong() + in.numberLong() );
+ break;
+ case NumberInt:
+ manip.setInt( elt.numberInt() + in.numberInt() );
+ break;
+ default:
+ assert(0);
+ }
+ }
+
+ template< class Builder >
+ void appendIncremented( Builder& bb , const BSONElement& in, ModState& ms ) const;
+
bool operator<( const Mod &other ) const {
return strcmp( fieldName, other.fieldName ) < 0;
}
@@ -120,34 +116,15 @@ namespace mongo {
return false;
}
- void apply( BSONObjBuilder& b , BSONElement in );
+ template< class Builder >
+ void apply( Builder& b , BSONElement in , ModState& ms ) const;
/**
* @return true iff toMatch should be removed from the array
*/
bool _pullElementMatch( BSONElement& toMatch ) const;
- bool needOpLogRewrite() const {
- switch( op ){
- case BIT:
- case BITAND:
- case BITOR:
- // TODO: should we convert this to $set?
- return false; - default: - return false; - } - } - - void appendForOpLog( BSONObjBuilder& b ) const { - const char * name = modNames[op]; - - BSONObjBuilder bb( b.subobjStart( name ) ); - bb.append( elt ); - bb.done(); - } - - void _checkForAppending( BSONElement& e ){ + void _checkForAppending( const BSONElement& e ) const { if ( e.type() == Object ){ // this is a tiny bit slow, but rare and important // only when setting something TO an object, not setting something in an object @@ -157,12 +134,38 @@ namespace mongo { } } + bool isEach() const { + if ( elt.type() != Object ) + return false; + BSONElement e = elt.embeddedObject().firstElement(); + if ( e.type() != Array ) + return false; + return strcmp( e.fieldName() , "$each" ) == 0; + } + + BSONObj getEach() const { + return elt.embeddedObjectUserCheck().firstElement().embeddedObjectUserCheck(); + } + + void parseEach( BSONElementSet& s ) const { + BSONObjIterator i(getEach()); + while ( i.more() ){ + s.insert( i.next() ); + } + } + }; - class ModSet { + /** + * stores a set of Mods + * once created, should never be changed + */ + class ModSet : boost::noncopyable { typedef map<string,Mod> ModHolder; ModHolder _mods; - + int _isIndexed; + bool _hasDynamicArray; + static void extractFields( map< string, BSONElement > &fields, const BSONElement &top, const string &base ); FieldCompareResult compare( const ModHolder::iterator &m, map< string, BSONElement >::iterator &p, const map< string, BSONElement >::iterator &pEnd ) const { @@ -180,45 +183,6 @@ namespace mongo { return compareDottedFieldNames( m->first, p->first.c_str() ); } - - void _appendNewFromMods( const string& root , Mod& m , BSONObjBuilder& b , set<string>& onedownseen ); - - void appendNewFromMod( Mod& m , BSONObjBuilder& b ){ - switch ( m.op ){ - - case Mod::PUSH: { - BSONObjBuilder arr( b.subarrayStart( m.shortFieldName ) ); - arr.appendAs( m.elt, "0" ); - arr.done(); - m.pushStartSize = -1; - break; - } - - case Mod::PUSH_ALL: { - b.appendAs( m.elt, m.shortFieldName ); - m.pushStartSize = -1; - break; - } - - case Mod::UNSET: - case Mod::PULL: - case Mod::PULL_ALL: - // no-op b/c unset/pull of nothing does nothing - break; - - case Mod::INC: - case Mod::SET: { - m._checkForAppending( m.elt ); - b.appendAs( m.elt, m.shortFieldName ); - break; - } - default: - stringstream ss; - ss << "unknown mod in appendNewFromMod: " << m.op; - throw UserException( 9015, ss.str() ); - } - - } bool mayAddEmbedded( map< string, BSONElement > &existing, string right ) { for( string left = EmbeddedBuilder::splitDot( right ); @@ -279,39 +243,51 @@ namespace mongo { } break; } + case 'a': { + if ( fn[2] == 'd' && fn[3] == 'd' ){ + // add + if ( fn[4] == 'T' && fn[5] == 'o' && fn[6] == 'S' && fn[7] == 'e' && fn[8] == 't' && fn[9] == 0 ) + return Mod::ADDTOSET; + + } + } default: break; } uassert( 10161 , "Invalid modifier specified " + string( fn ), false ); return Mod::INC; } - public: + ModSet(){} - void getMods( const BSONObj &from ); - /** - will return if can be done in place, or uassert if there is an error - @return whether or not the mods can be done in place - */ - bool canApplyInPlaceAndVerify( const BSONObj &obj ) const; - void applyModsInPlace( const BSONObj &obj ) const; + public: + + ModSet( const BSONObj &from , + const set<string>& idxKeys = set<string>(), + const set<string>* backgroundKeys = 0 + ); - // new recursive version, will replace at some point - void createNewFromMods( const string& root , BSONObjBuilder& b , const BSONObj &obj ); + // TODO: this is inefficient - 
should probably just handle when iterating
+ ModSet * fixDynamicArray( const char * elemMatchKey ) const;
- BSONObj createNewFromMods( const BSONObj &obj );
+ bool hasDynamicArray() const { return _hasDynamicArray; }
+ /**
+ * creates a ModSetState suitable for operation on obj
+ * doesn't change or modify this ModSet or any underlying Mod
+ */
+ auto_ptr<ModSetState> prepare( const BSONObj& obj ) const;
+
+ /**
+ * given a query pattern, builds an object suitable for an upsert
+ * will take the query spec and combine all $ operators
+ */
BSONObj createNewFromQuery( const BSONObj& query );
/**
*
*/
- int isIndexed( const set<string>& idxKeys ) const {
- int numIndexes = 0;
- for ( ModHolder::const_iterator i = _mods.begin(); i != _mods.end(); i++ ){
- if ( i->second.isIndexed( idxKeys ) )
- numIndexes++;
- }
- return numIndexes;
+ int isIndexed() const {
+ return _isIndexed;
}
unsigned size() const { return _mods.size(); }
@@ -341,10 +317,190 @@ namespace mongo {
}
+ };
+
+ /**
+ * stores any information about a single Mod operating on a single Object
+ */
+ class ModState {
+ public:
+ const Mod * m;
+ BSONElement old;
+
+ const char * fixedName;
+ BSONElement * fixed;
+ int pushStartSize;
+
+ BSONType incType;
+ int incint;
+ double incdouble;
+ long long inclong;
+
+ ModState(){
+ fixedName = 0;
+ fixed = 0;
+ pushStartSize = -1;
+ incType = EOO;
+ }
+
+ Mod::Op op() const {
+ return m->op;
+ }
+
+ const char * fieldName() const {
+ return m->fieldName;
+ }
+
+ bool needOpLogRewrite() const {
+ if ( fixed || fixedName || incType )
+ return true;
+
+ switch( op() ){
+ case Mod::BIT:
+ case Mod::BITAND:
+ case Mod::BITOR:
+ // TODO: should we convert this to $set?
+ return false;
+ default:
+ return false;
+ }
+ }
+
+ void appendForOpLog( BSONObjBuilder& b ) const {
+ if ( incType ){
+ BSONObjBuilder bb( b.subobjStart( "$set" ) );
+ appendIncValue( bb );
+ bb.done();
+ return;
+ }
+
+ const char * name = fixedName ? fixedName : Mod::modNames[op()];
+
+ BSONObjBuilder bb( b.subobjStart( name ) );
+ if ( fixed )
+ bb.appendAs( *fixed , m->fieldName );
+ else
+ bb.append( m->elt );
+ bb.done();
+ }
+
+ template< class Builder >
+ void apply( Builder& b , BSONElement in ){
+ m->apply( b , in , *this );
+ }
+
+ template< class Builder >
+ void appendIncValue( Builder& b ) const {
+ switch ( incType ){
+ case NumberDouble:
+ b.append( m->shortFieldName , incdouble ); break;
+ case NumberLong:
+ b.append( m->shortFieldName , inclong ); break;
+ case NumberInt:
+ b.append( m->shortFieldName , incint ); break;
+ default:
+ assert(0);
+ }
+ }
+ };
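
[Editor's note, not part of the patch] ModState::appendForOpLog() above is what keeps the in-place $inc fast path replication-safe: once incType is recorded, the oplog entry carries the computed result as a $set rather than the original operator, so replaying it is idempotent on a secondary. For example (values invented for illustration):

    update sent by client:   { $inc : { n : 1 } }   applied to a document { n : 4 }
    entry written to oplog:  { $set : { n : 5 } }
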
+
+ /**
+ * this is used to hold state, meta data while applying a ModSet to a BSONObj
+ * the goal is to make ModSet const so it's re-usable
+ */
+ class ModSetState : boost::noncopyable {
+ struct FieldCmp {
+ bool operator()( const string &l, const string &r ) const {
+ return lexNumCmp( l.c_str(), r.c_str() ) < 0;
+ }
+ };
+ typedef map<string,ModState,FieldCmp> ModStateHolder;
+ const BSONObj& _obj;
+ ModStateHolder _mods;
+ bool _inPlacePossible;
+
+ ModSetState( const BSONObj& obj )
+ : _obj( obj ) , _inPlacePossible(true){
+ }
+
+ /**
+ * @return if in place is still possible
+ */
+ bool amIInPlacePossible( bool inPlacePossible ){
+ if ( ! inPlacePossible )
+ _inPlacePossible = false;
+ return _inPlacePossible;
+ }
+
+ template< class Builder >
+ void createNewFromMods( const string& root , Builder& b , const BSONObj &obj );
+
+ template< class Builder >
+ void _appendNewFromMods( const string& root , ModState& m , Builder& b , set<string>& onedownseen );
+
+ template< class Builder >
+ void appendNewFromMod( ModState& ms , Builder& b ){
+ //const Mod& m = *(ms.m); // HACK
+ Mod& m = *((Mod*)(ms.m)); // HACK
+
+ switch ( m.op ){
+
+ case Mod::PUSH:
+ case Mod::ADDTOSET: {
+ if ( m.isEach() ){
+ b.appendArray( m.shortFieldName , m.getEach() );
+ }
+ else {
+ BSONObjBuilder arr( b.subarrayStart( m.shortFieldName ) );
+ arr.appendAs( m.elt, "0" );
+ arr.done();
+ }
+ break;
+ }
+
+ case Mod::PUSH_ALL: {
+ b.appendAs( m.elt, m.shortFieldName );
+ break;
+ }
+
+ case Mod::UNSET:
+ case Mod::PULL:
+ case Mod::PULL_ALL:
+ // no-op b/c unset/pull of nothing does nothing
+ break;
+
+ case Mod::INC:
+ ms.fixedName = "$set";
+ case Mod::SET: {
+ m._checkForAppending( m.elt );
+ b.appendAs( m.elt, m.shortFieldName );
+ break;
+ }
+ default:
+ stringstream ss;
+ ss << "unknown mod in appendNewFromMod: " << m.op;
+ throw UserException( 9015, ss.str() );
+ }
+
+ }
+
+ public:
+
+ bool canApplyInPlace() const {
+ return _inPlacePossible;
+ }
+
+ /**
+ * modifies the underlying _obj
+ */
+ void applyModsInPlace();
+
+ BSONObj createNewFromMods();
+
// re-writing for oplog
bool needOpLogRewrite() const {
- for ( ModHolder::const_iterator i = _mods.begin(); i != _mods.end(); i++ )
+ for ( ModStateHolder::const_iterator i = _mods.begin(); i != _mods.end(); i++ )
if ( i->second.needOpLogRewrite() )
return true;
return false;
@@ -352,31 +508,33 @@ namespace mongo {
BSONObj getOpLogRewrite() const {
BSONObjBuilder b;
- for ( ModHolder::const_iterator i = _mods.begin(); i != _mods.end(); i++ )
+ for ( ModStateHolder::const_iterator i = _mods.begin(); i != _mods.end(); i++ )
i->second.appendForOpLog( b );
return b.obj();
}
bool haveArrayDepMod() const {
- for ( ModHolder::const_iterator i = _mods.begin(); i != _mods.end(); i++ )
- if ( i->second.arrayDep() )
+ for ( ModStateHolder::const_iterator i = _mods.begin(); i != _mods.end(); i++ )
+ if ( i->second.m->arrayDep() )
return true;
return false;
}
void appendSizeSpecForArrayDepMods( BSONObjBuilder &b ) const {
- for ( ModHolder::const_iterator i = _mods.begin(); i != _mods.end(); i++ ) {
- const Mod& m = i->second;
- if ( m.arrayDep() ){
+ for ( ModStateHolder::const_iterator i = _mods.begin(); i != _mods.end(); i++ ) {
+ const ModState& m = i->second;
+ if ( m.m->arrayDep() ){
if ( m.pushStartSize == -1 )
- b.appendNull( m.fieldName );
+ b.appendNull( m.fieldName() );
else
- b << m.fieldName << BSON( "$size" << m.pushStartSize );
+ b << m.fieldName() << BSON( "$size" << m.pushStartSize );
}
}
}
+
+
+ friend class ModSet;
};
-
}
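
[Editor's sketch, not part of the patch] The new ADDTOSET branch in appendNewFromMod() accepts either a plain value or the { $each : [...] } form that Mod::isEach() detects by peeking at the first embedded element. A minimal sketch of building the bulk form with the same builder pattern the patch itself uses (the field name "tags" is made up):

    // builds { $addToSet : { tags : { $each : [ "x" , "y" ] } } }
    BSONObjBuilder b;
    BSONObjBuilder mod( b.subobjStart( "$addToSet" ) );
    BSONObjBuilder field( mod.subobjStart( "tags" ) );
    BSONObjBuilder each( field.subarrayStart( "$each" ) );
    each.append( "0" , "x" ); // array elements are keyed "0", "1", ...
    each.append( "1" , "y" );
    each.done();
    field.done();
    mod.done();
    BSONObj update = b.obj(); // Mod::isEach() returns true for this shape

Mod::parseEach() then flattens the $each array into a BSONElementSet, presumably feeding the duplicate check against an existing array when the modifier is applied.
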