Diffstat (limited to 'db/repl.h')
-rw-r--r--  db/repl.h  315
1 files changed, 315 insertions, 0 deletions
diff --git a/db/repl.h b/db/repl.h
new file mode 100644
index 0000000..a4c1737
--- /dev/null
+++ b/db/repl.h
@@ -0,0 +1,315 @@
+// repl.h - replication
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/* replication data overview
+
+   at the slave:
+     local.sources { host: ..., source: ..., only: ..., syncedTo: ..., localLogTs: ..., dbsNextPass: { ... }, incompleteCloneDbs: { ... } }
+
+   at the master:
+     local.oplog.$<source>
+     local.oplog.$main is the default
+*/
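+
+/* For illustration only (not part of the original header): a sketch of what one
+   local.sources document might look like on a slave pulling everything from a
+   single master.  The host value and subobject contents are hypothetical.
+
+     { host: "db1.example.com:27017", source: "main",
+       syncedTo: <OpTime>, localLogTs: <OpTime>,
+       dbsNextPass: {}, incompleteCloneDbs: {} }
+*/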
+
+#pragma once
+
+#include "pdfile.h"
+#include "db.h"
+#include "dbhelpers.h"
+#include "query.h"
+
+#include "../client/dbclient.h"
+
+#include "../util/optime.h"
+
+namespace mongo {
+
+    class DBClientConnection;
+    class DBClientCursor;
+
+    /* Are we a replication slave? (either a simple slave, or the nonmaster side of a repl pair)
+       --slave cmd line setting -> SimpleSlave
+    */
+    typedef enum { NotSlave=0, SimpleSlave, ReplPairSlave } SlaveTypes;
+    extern SlaveTypes slave;
+
+    /* true means we are master and doing replication.  if we are not writing to the oplog
+       (no --master or repl pairing), this won't be true.
+    */
+    extern bool master;
+
+    extern int opIdMem;
+
+    bool cloneFrom(const char *masterHost, string& errmsg, const string& fromdb, bool logForReplication,
+                   bool slaveOk, bool useReplAuth, bool snapshot);
+
+    /* A replication exception */
+    class SyncException : public DBException {
+    public:
+        virtual const char* what() const throw() { return "sync exception"; }
+        virtual int getCode() { return 10001; }
+    };
+
+    /* A Source is a source from which we can pull (replicate) data.
+       stored in collection local.sources.
+
+       Can be a group of things to replicate for several databases.
+
+       { host: ..., source: ..., only: ..., syncedTo: ..., localLogTs: ..., dbsNextPass: { ... }, incompleteCloneDbs: { ... } }
+
+       'source' defaults to 'main'; support for multiple source names is
+       not yet implemented (always use 'main' for now).
+    */
+    class ReplSource {
+        bool resync(string db);
+
+        /* pull some operations from the master's oplog, and apply them. */
+        bool sync_pullOpLog(int& nApplied);
+
+        void sync_pullOpLog_applyOperation(BSONObj& op, OpTime *localLogTail);
+
+        auto_ptr<DBClientConnection> conn;
+        auto_ptr<DBClientCursor> cursor;
+
+        /* we only clone one database per pass, even if many need to be done.  This helps us
+           avoid overflowing the master's transaction log by doing too much work before going
+           back to read more transactions.  (Imagine a scenario of slave startup where we try to
+           clone 100 databases in one pass.)
+        */
+        set<string> addDbNextPass;
+
+        set<string> incompleteCloneDbs;
+
+        ReplSource();
+
+        // returns the dummy ns used to do the drop
+        string resyncDrop( const char *db, const char *requester );
+        // returns true if connected on return
+        bool connect();
+        // returns a possibly unowned id spec for the operation.
+        static BSONObj idForOp( const BSONObj &op, bool &mod );
+        static void updateSetsWithOp( const BSONObj &op, bool mayUpdateStorage );
+        // call without the db mutex
+        void syncToTailOfRemoteLog();
+        // call with the db mutex
+        OpTime nextLastSavedLocalTs() const;
+        void setLastSavedLocalTs( const OpTime &nextLocalTs );
+        // call without the db mutex
+        void resetSlave();
+        // call with the db mutex
+        // returns false if the slave has been reset
+        bool updateSetsWithLocalOps( OpTime &localLogTail, bool mayUnlock );
+        string ns() const { return string( "local.oplog.$" ) + sourceName(); }
+
+    public:
+        static void applyOperation(const BSONObj& op);
+        bool replacing;     // in "replace mode" -- see CmdReplacePeer
+        bool paired;        // --pair in use
+        string hostName;    // ip addr or hostname plus optionally, ":<port>"
+        string _sourceName; // a logical source name.
+        string sourceName() const {
+            return _sourceName.empty() ? "main" : _sourceName;
+        }
+        string only; // only a certain db.  note that in the sources collection, this may not be changed once you start replicating.
+
+        /* the last time point we have already synced up to (in the remote/master's oplog). */
+        OpTime syncedTo;
+
+        /* This is for repl pairs.
+           _lastSavedLocalTs is the most recent point in the local log that we know is consistent
+           with the remote log (i.e., say the local op log has entries ABCDE and the remote op log
+           has ABCXY, then _lastSavedLocalTs won't be greater than C until we have reconciled
+           the DE-XY difference.)
+        */
+        OpTime _lastSavedLocalTs;
+
+        int nClonedThisPass;
+
+        typedef vector< shared_ptr< ReplSource > > SourceVector;
+        static void loadAll(SourceVector&);
+        explicit ReplSource(BSONObj);
+        bool sync(int& nApplied);
+        void save(); // write ourself to local.sources
+        void resetConnection() {
+            cursor = auto_ptr<DBClientCursor>(0);
+            conn = auto_ptr<DBClientConnection>(0);
+        }
+
+        // make a jsobj from our member fields of the form
+        //   { host: ..., source: ..., syncedTo: ... }
+        BSONObj jsobj();
+
+        bool operator==(const ReplSource &r) const {
+            return hostName == r.hostName && sourceName() == r.sourceName();
+        }
+        operator string() const { return sourceName() + "@" + hostName; }
+
+        bool haveMoreDbsToSync() const { return !addDbNextPass.empty(); }
+
+        static bool throttledForceResyncDead( const char *requester );
+        static void forceResyncDead( const char *requester );
+        void forceResync( const char *requester );
+    };
+
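+    /* Illustrative sketch (not part of the original header): roughly how a slave's sync
+       pass might drive ReplSource using only the public members declared above.  The
+       function name is hypothetical. */
+    inline int exampleSyncAllSources() {
+        ReplSource::SourceVector sources;
+        ReplSource::loadAll( sources );              // read the configured sources from local.sources
+        int nTotalApplied = 0;
+        for ( ReplSource::SourceVector::iterator i = sources.begin(); i != sources.end(); ++i ) {
+            int nApplied = 0;
+            (*i)->sync( nApplied );                  // pull and apply ops from this source's oplog
+            nTotalApplied += nApplied;
+        }
+        return nTotalApplied;
+    }
+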
+    /* Write an operation to the log (local.oplog.$main).  opstr values:
+         "i"  insert
+         "u"  update
+         "d"  delete
+         "c"  db cmd
+         "db" declares presence of a database (ns is set to the db name + '.')
+    */
+    void _logOp(const char *opstr, const char *ns, const char *logNs, const BSONObj& obj, BSONObj *patt, bool *b, const OpTime &ts);
+    void logOp(const char *opstr, const char *ns, const BSONObj& obj, BSONObj *patt = 0, bool *b = 0);
+
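+    /* Illustrative sketch (not part of the original header): recording an insert in the
+       oplog via logOp() declared above.  The namespace "test.foo" and the document are
+       hypothetical; real callers do this from inside the database write path. */
+    inline void exampleLogInsert() {
+        BSONObjBuilder bb;
+        bb.append( "_id", 1 );
+        bb.append( "msg", "hello" );
+        BSONObj doc = bb.obj();
+        // "i" marks an insert; the optional patt and b arguments are left at their defaults here.
+        logOp( "i", "test.foo", doc );
+    }
+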
+    // class for managing a set of ids in memory
+    class MemIds {
+    public:
+        MemIds() : size_() {}
+        friend class IdTracker;
+        void reset() { imp_.clear(); }
+        bool get( const char *ns, const BSONObj &id ) { return imp_[ ns ].count( id ); }
+        void set( const char *ns, const BSONObj &id, bool val ) {
+            if ( val ) {
+                if ( imp_[ ns ].insert( id.getOwned() ).second ) {
+                    size_ += id.objsize() + sizeof( BSONObj );
+                }
+            } else {
+                if ( imp_[ ns ].erase( id ) == 1 ) {
+                    size_ -= id.objsize() + sizeof( BSONObj );
+                }
+            }
+        }
+        long long roughSize() const {
+            return size_;
+        }
+    private:
+        typedef map< string, BSONObjSetDefaultOrder > IdSets;
+        IdSets imp_;
+        long long size_;
+    };
+
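+    /* Illustrative sketch (not part of the original header): basic MemIds usage.
+       The namespace and _id value are hypothetical. */
+    inline void exampleMemIds() {
+        MemIds ids;
+        BSONObjBuilder bb;
+        bb.append( "_id", 5 );
+        BSONObj id = bb.obj();
+        ids.set( "test.foo", id, true );        // remember this id
+        bool seen = ids.get( "test.foo", id );  // now true
+        long long bytes = ids.roughSize();      // approximate memory consumed
+        ids.set( "test.foo", id, false );       // forget it again
+        (void)seen; (void)bytes;
+    }
+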
+    // class for managing a set of ids in a db collection
+    // All functions must be called with db mutex held
+    class DbIds {
+    public:
+        DbIds( const string & name ) : impl_( name, BSON( "ns" << 1 << "id" << 1 ) ) {}
+        void reset() {
+            impl_.reset();
+        }
+        bool get( const char *ns, const BSONObj &id ) {
+            return impl_.get( key( ns, id ) );
+        }
+        void set( const char *ns, const BSONObj &id, bool val ) {
+            impl_.set( key( ns, id ), val );
+        }
+    private:
+        static BSONObj key( const char *ns, const BSONObj &id ) {
+            BSONObjBuilder b;
+            b << "ns" << ns;
+            // rename _id to id since there may be duplicates
+            b.appendAs( id.firstElement(), "id" );
+            return b.obj();
+        }
+        DbSet impl_;
+    };
+
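+    /* For illustration (not part of the original header): given ns "test.foo" and an id
+       spec { _id: 5 }, DbIds::key() above produces { ns: "test.foo", id: 5 }, which is
+       the document stored in (and looked up from) the backing DbSet collection. */
+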
+    // class for tracking ids and mod ids, in memory or on disk
+    // All functions must be called with db mutex held
+    // The class structure is a bit sloppy; for now we just want to keep the
+    // in-memory version speedy.
+    // see http://www.mongodb.org/display/DOCS/Pairing+Internals
+    class IdTracker {
+    public:
+        IdTracker() :
+            dbIds_( "local.temp.replIds" ),
+            dbModIds_( "local.temp.replModIds" ),
+            inMem_( true ),
+            maxMem_( opIdMem ) {
+        }
+        void reset( int maxMem = opIdMem ) {
+            memIds_.reset();
+            memModIds_.reset();
+            dbIds_.reset();
+            dbModIds_.reset();
+            maxMem_ = maxMem;
+            inMem_ = true;
+        }
+        bool haveId( const char *ns, const BSONObj &id ) {
+            if ( inMem_ )
+                return get( memIds_, ns, id );
+            else
+                return get( dbIds_, ns, id );
+        }
+        bool haveModId( const char *ns, const BSONObj &id ) {
+            if ( inMem_ )
+                return get( memModIds_, ns, id );
+            else
+                return get( dbModIds_, ns, id );
+        }
+        void haveId( const char *ns, const BSONObj &id, bool val ) {
+            if ( inMem_ )
+                set( memIds_, ns, id, val );
+            else
+                set( dbIds_, ns, id, val );
+        }
+        void haveModId( const char *ns, const BSONObj &id, bool val ) {
+            if ( inMem_ )
+                set( memModIds_, ns, id, val );
+            else
+                set( dbModIds_, ns, id, val );
+        }
+        // will release the db mutex
+        void mayUpgradeStorage() {
+            if ( !inMem_ || memIds_.roughSize() + memModIds_.roughSize() <= maxMem_ )
+                return;
+            log() << "saving master modified id information to collection" << endl;
+            upgrade( memIds_, dbIds_ );
+            upgrade( memModIds_, dbModIds_ );
+            memIds_.reset();
+            memModIds_.reset();
+            inMem_ = false;
+        }
+        bool inMem() const { return inMem_; }
+    private:
+        template< class T >
+        bool get( T &ids, const char *ns, const BSONObj &id ) {
+            return ids.get( ns, id );
+        }
+        template< class T >
+        void set( T &ids, const char *ns, const BSONObj &id, bool val ) {
+            ids.set( ns, id, val );
+        }
+        void upgrade( MemIds &a, DbIds &b ) {
+            for( MemIds::IdSets::const_iterator i = a.imp_.begin(); i != a.imp_.end(); ++i ) {
+                for( BSONObjSetDefaultOrder::const_iterator j = i->second.begin(); j != i->second.end(); ++j ) {
+                    set( b, i->first.c_str(), *j, true );
+                    RARELY {
+                        dbtemprelease t;
+                    }
+                }
+            }
+        }
+        MemIds memIds_;
+        MemIds memModIds_;
+        DbIds dbIds_;
+        DbIds dbModIds_;
+        bool inMem_;
+        int maxMem_;
+    };
+
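+    /* Illustrative sketch (not part of the original header): how pairing code might consult
+       IdTracker while scanning ops.  The function name and its ns/id arguments are
+       hypothetical; callers must hold the db mutex, and per the comment above,
+       mayUpgradeStorage() can release it. */
+    inline void exampleTrackId( IdTracker &tracker, const char *ns, const BSONObj &id ) {
+        if ( !tracker.haveId( ns, id ) )
+            tracker.haveId( ns, id, true );   // record that this id has been touched
+        tracker.mayUpgradeStorage();          // spill to the local.temp.repl* collections if over the memory budget
+    }
+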
+} // namespace mongo