diff options
author | Antonin Kral <a.kral@bobek.cz> | 2011-03-17 00:05:43 +0100 |
---|---|---|
committer | Antonin Kral <a.kral@bobek.cz> | 2011-03-17 00:05:43 +0100 |
commit | 582fc32574a3b158c81e49cb00e6ae59205e66ba (patch) | |
tree | ac64a3243e0d2121709f685695247052858115c8 /s/d_split.cpp | |
parent | 2761bffa96595ac1698d86bbc2e95ebb0d4d6e93 (diff) | |
download | mongodb-582fc32574a3b158c81e49cb00e6ae59205e66ba.tar.gz |
Imported Upstream version 1.8.0
Diffstat (limited to 's/d_split.cpp')
-rw-r--r-- | s/d_split.cpp | 722 |
1 files changed, 648 insertions, 74 deletions
diff --git a/s/d_split.cpp b/s/d_split.cpp index fdefc7e..0896803 100644 --- a/s/d_split.cpp +++ b/s/d_split.cpp @@ -1,4 +1,4 @@ -// d_split.cpp +// @file d_split.cpp /** * Copyright (C) 2008 10gen Inc. @@ -27,6 +27,13 @@ #include "../db/query.h" #include "../db/queryoptimizer.h" +#include "../client/connpool.h" +#include "../client/distlock.h" + +#include "chunk.h" // for static genID only +#include "config.h" +#include "d_logic.h" + namespace mongo { // TODO: Fold these checks into each command. @@ -43,19 +50,19 @@ namespace mongo { public: CmdMedianKey() : Command( "medianKey" ) {} virtual bool slaveOk() const { return true; } - virtual LockType locktype() const { return READ; } + virtual LockType locktype() const { return READ; } virtual void help( stringstream &help ) const { - help << - "Internal command.\n" - "example: { medianKey:\"blog.posts\", keyPattern:{x:1}, min:{x:10}, max:{x:55} }\n" - "NOTE: This command may take a while to run"; + help << + "Internal command.\n" + "example: { medianKey:\"blog.posts\", keyPattern:{x:1}, min:{x:10}, max:{x:55} }\n" + "NOTE: This command may take a while to run"; } - bool run(const string& dbname, BSONObj& jsobj, string& errmsg, BSONObjBuilder& result, bool fromRepl ){ + bool run(const string& dbname, BSONObj& jsobj, string& errmsg, BSONObjBuilder& result, bool fromRepl ) { const char *ns = jsobj.getStringField( "medianKey" ); BSONObj min = jsobj.getObjectField( "min" ); BSONObj max = jsobj.getObjectField( "max" ); BSONObj keyPattern = jsobj.getObjectField( "keyPattern" ); - + Client::Context ctx( ns ); IndexDetails *id = cmdIndexDetailsForRange( ns, errmsg, min, max, keyPattern ); @@ -66,22 +73,22 @@ namespace mongo { int num = 0; NamespaceDetails *d = nsdetails(ns); int idxNo = d->idxNo(*id); - + // only yielding on firt half for now // after this it should be in ram, so 2nd should be fast { shared_ptr<Cursor> c( new BtreeCursor( d, idxNo, *id, min, max, false, 1 ) ); scoped_ptr<ClientCursor> cc( new ClientCursor( QueryOption_NoCursorTimeout , c , ns ) ); - while ( c->ok() ){ + while ( c->ok() ) { num++; c->advance(); if ( ! cc->yieldSometimes() ) break; } } - + num /= 2; - + BtreeCursor c( d, idxNo, *id, min, max, false, 1 ); for( ; num; c.advance(), --num ); @@ -99,15 +106,15 @@ namespace mongo { int x = median.woCompare( min , BSONObj() , false ); int y = median.woCompare( max , BSONObj() , false ); - if ( x == 0 || y == 0 ){ + if ( x == 0 || y == 0 ) { // its on an edge, ok } - else if ( x < 0 && y < 0 ){ + else if ( x < 0 && y < 0 ) { log( LL_ERROR ) << "median error (1) min: " << min << " max: " << max << " median: " << median << endl; errmsg = "median error 1"; return false; } - else if ( x > 0 && y > 0 ){ + else if ( x > 0 && y > 0 ) { log( LL_ERROR ) << "median error (2) min: " << min << " max: " << max << " median: " << median << endl; errmsg = "median error 2"; return false; @@ -117,95 +124,662 @@ namespace mongo { } } cmdMedianKey; - class SplitVector : public Command { - public: - SplitVector() : Command( "splitVector" , false ){} + class CheckShardingIndex : public Command { + public: + CheckShardingIndex() : Command( "checkShardingIndex" , false ) {} virtual bool slaveOk() const { return false; } virtual LockType locktype() const { return READ; } virtual void help( stringstream &help ) const { - help << - "Internal command.\n" - "example: { splitVector : \"myLargeCollection\" , keyPattern : {x:1} , maxChunkSize : 200 }\n" - "maxChunkSize unit in MBs\n" - "NOTE: This command may take a while to run"; + help << "Internal command.\n"; } - bool run(const string& dbname, BSONObj& jsobj, string& errmsg, BSONObjBuilder& result, bool fromRepl ){ - const char* ns = jsobj.getStringField( "splitVector" ); + + bool run(const string& dbname, BSONObj& jsobj, string& errmsg, BSONObjBuilder& result, bool fromRepl ) { + + const char* ns = jsobj.getStringField( "checkShardingIndex" ); BSONObj keyPattern = jsobj.getObjectField( "keyPattern" ); - long long maxChunkSize = 0; - BSONElement maxSizeElem = jsobj[ "maxChunkSize" ]; - if ( ! maxSizeElem.eoo() ){ - maxChunkSize = maxSizeElem.numberLong() * 1<<20; - } else { - errmsg = "need to specify the desired max chunk size"; + // If min and max are not provided use the "minKey" and "maxKey" for the sharding key pattern. + BSONObj min = jsobj.getObjectField( "min" ); + BSONObj max = jsobj.getObjectField( "max" ); + if ( min.isEmpty() && max.isEmpty() ) { + BSONObjBuilder minBuilder; + BSONObjBuilder maxBuilder; + BSONForEach(key, keyPattern) { + minBuilder.appendMinKey( key.fieldName() ); + maxBuilder.appendMaxKey( key.fieldName() ); + } + min = minBuilder.obj(); + max = maxBuilder.obj(); + } + else if ( min.isEmpty() || max.isEmpty() ) { + errmsg = "either provide both min and max or leave both empty"; return false; } - - Client::Context ctx( ns ); - BSONObjBuilder minBuilder; - BSONObjBuilder maxBuilder; - BSONForEach(key, keyPattern){ - minBuilder.appendMinKey( key.fieldName() ); - maxBuilder.appendMaxKey( key.fieldName() ); + Client::Context ctx( ns ); + NamespaceDetails *d = nsdetails( ns ); + if ( ! d ) { + errmsg = "ns not found"; + return false; } - BSONObj min = minBuilder.obj(); - BSONObj max = maxBuilder.obj(); IndexDetails *idx = cmdIndexDetailsForRange( ns , errmsg , min , max , keyPattern ); - if ( idx == NULL ){ + if ( idx == NULL ) { errmsg = "couldn't find index over splitting key"; return false; } - NamespaceDetails *d = nsdetails( ns ); - BtreeCursor c( d , d->idxNo(*idx) , *idx , min , max , false , 1 ); + BtreeCursor * bc = new BtreeCursor( d , d->idxNo(*idx) , *idx , min , max , false , 1 ); + shared_ptr<Cursor> c( bc ); + scoped_ptr<ClientCursor> cc( new ClientCursor( QueryOption_NoCursorTimeout , c , ns ) ); + if ( ! cc->ok() ) { + // range is empty + return true; + } - // We'll use the average object size and number of object to find approximately how many keys - // each chunk should have. We'll split a little smaller than the specificied by 'maxSize' - // assuming a recently sharded collectio is still going to grow. + // for now, the only check is that all shard keys are filled + // null is ok, + // TODO if $exist for nulls were picking the index, it could be used instead efficiently + while ( cc->ok() ) { + BSONObj currKey = c->currKey(); + + BSONObjIterator i( currKey ); + int n = 0; + while ( i.more() ) { + BSONElement key = i.next(); + n++; - const long long dataSize = d->datasize; - const long long recCount = d->nrecords; - long long keyCount = 0; - if (( dataSize > 0 ) && ( recCount > 0 )){ - const long long avgRecSize = dataSize / recCount; - keyCount = 90 * maxChunkSize / (100 * avgRecSize); + if ( key.type() && key.type() != jstNULL ) + continue; + + BSONObj obj = c->current(); + BSONObjIterator j( keyPattern ); + BSONElement real; + for ( int x=0; x<n; x++ ) + real = j.next(); + + real = obj.getFieldDotted( real.fieldName() ); + + if ( real.type() ) + continue; + + ostringstream os; + os << "found null value in key " << bc->prettyKey( currKey ) << " for doc: " << real["_id"]; + log() << "checkShardingIndex for '" << ns << "' failed: " << os.str() << endl; + + errmsg = os.str(); + return false; + } + cc->advance(); } - // We traverse the index and add the keyCount-th key to the result vector. If that key - // appeared in the vector before, we omit it. The assumption here is that all the - // instances of a key value live in the same chunk. + return true; + } + } cmdCheckShardingIndex; - Timer timer; - long long currCount = 0; - vector<BSONObj> splitKeys; - BSONObj currKey; - while ( c.ok() ){ - currCount++; - if ( currCount > keyCount ){ - if ( ! currKey.isEmpty() && (currKey.woCompare( c.currKey() ) == 0 ) ) - continue; - - currKey = c.currKey(); - splitKeys.push_back( c.prettyKey( currKey ) ); - currCount = 0; + class SplitVector : public Command { + public: + SplitVector() : Command( "splitVector" , false ) {} + virtual bool slaveOk() const { return false; } + virtual LockType locktype() const { return READ; } + virtual void help( stringstream &help ) const { + help << + "Internal command.\n" + "examples:\n" + " { splitVector : \"blog.post\" , keyPattern:{x:1} , min:{x:10} , max:{x:20}, maxChunkSize:200 }\n" + " maxChunkSize unit in MBs\n" + " May optionally specify 'maxSplitPoints' and 'maxChunkObjects' to avoid traversing the whole chunk\n" + " \n" + " { splitVector : \"blog.post\" , keyPattern:{x:1} , min:{x:10} , max:{x:20}, force: true }\n" + " 'force' will produce one split point even if data is small; defaults to false\n" + "NOTE: This command may take a while to run"; + } + + bool run(const string& dbname, BSONObj& jsobj, string& errmsg, BSONObjBuilder& result, bool fromRepl ) { + + // + // 1.a We'll parse the parameters in two steps. First, make sure the we can use the split index to get + // a good approximation of the size of the chunk -- without needing to access the actual data. + // + + const char* ns = jsobj.getStringField( "splitVector" ); + BSONObj keyPattern = jsobj.getObjectField( "keyPattern" ); + + // If min and max are not provided use the "minKey" and "maxKey" for the sharding key pattern. + BSONObj min = jsobj.getObjectField( "min" ); + BSONObj max = jsobj.getObjectField( "max" ); + if ( min.isEmpty() && max.isEmpty() ) { + BSONObjBuilder minBuilder; + BSONObjBuilder maxBuilder; + BSONForEach(key, keyPattern) { + minBuilder.appendMinKey( key.fieldName() ); + maxBuilder.appendMaxKey( key.fieldName() ); } - c.advance(); + min = minBuilder.obj(); + max = maxBuilder.obj(); + } + else if ( min.isEmpty() || max.isEmpty() ) { + errmsg = "either provide both min and max or leave both empty"; + return false; } - ostringstream os; - os << "Finding the split vector for " << ns << " over "<< keyPattern; - logIfSlow( timer , os.str() ); + long long maxSplitPoints = 0; + BSONElement maxSplitPointsElem = jsobj[ "maxSplitPoints" ]; + if ( maxSplitPointsElem.isNumber() ) { + maxSplitPoints = maxSplitPointsElem.numberLong(); + } - // Warning: we are sending back an array of keys but are currently limited to - // 4MB work of 'result' size. This should be okay for now. + long long maxChunkObjects = 0; + BSONElement MaxChunkObjectsElem = jsobj[ "maxChunkObjects" ]; + if ( MaxChunkObjectsElem.isNumber() ) { + maxChunkObjects = MaxChunkObjectsElem.numberLong(); + } + + vector<BSONObj> splitKeys; + + { + // Get the size estimate for this namespace + Client::Context ctx( ns ); + NamespaceDetails *d = nsdetails( ns ); + if ( ! d ) { + errmsg = "ns not found"; + return false; + } + + IndexDetails *idx = cmdIndexDetailsForRange( ns , errmsg , min , max , keyPattern ); + if ( idx == NULL ) { + errmsg = "couldn't find index over splitting key"; + return false; + } + + const long long recCount = d->stats.nrecords; + const long long dataSize = d->stats.datasize; + + // + // 1.b Now that we have the size estimate, go over the remaining parameters and apply any maximum size + // restrictions specified there. + // + + // 'force'-ing a split is equivalent to having maxChunkSize be the size of the current chunk, i.e., the + // logic below will split that chunk in half + long long maxChunkSize = 0; + bool force = false; + { + BSONElement maxSizeElem = jsobj[ "maxChunkSize" ]; + BSONElement forceElem = jsobj[ "force" ]; + + if ( forceElem.trueValue() ) { + force = true; + maxChunkSize = dataSize; + + } + else if ( maxSizeElem.isNumber() ) { + maxChunkSize = maxSizeElem.numberLong() * 1<<20; + + } + else { + maxSizeElem = jsobj["maxChunkSizeBytes"]; + if ( maxSizeElem.isNumber() ) { + maxChunkSize = maxSizeElem.numberLong(); + } + } + + if ( maxChunkSize <= 0 ) { + errmsg = "need to specify the desired max chunk size (maxChunkSize or maxChunkSizeBytes)"; + return false; + } + } + + + // If there's not enough data for more than one chunk, no point continuing. + if ( dataSize < maxChunkSize || recCount == 0 ) { + vector<BSONObj> emptyVector; + result.append( "splitKeys" , emptyVector ); + return true; + } + + log() << "request split points lookup for chunk " << ns << " " << min << " -->> " << max << endl; + + // We'll use the average object size and number of object to find approximately how many keys + // each chunk should have. We'll split at half the maxChunkSize or maxChunkObjects, if + // provided. + const long long avgRecSize = dataSize / recCount; + long long keyCount = maxChunkSize / (2 * avgRecSize); + if ( maxChunkObjects && ( maxChunkObjects < keyCount ) ) { + log() << "limiting split vector to " << maxChunkObjects << " (from " << keyCount << ") objects " << endl; + keyCount = maxChunkObjects; + } + + // + // 2. Traverse the index and add the keyCount-th key to the result vector. If that key + // appeared in the vector before, we omit it. The invariant here is that all the + // instances of a given key value live in the same chunk. + // + + Timer timer; + long long currCount = 0; + long long numChunks = 0; + + BtreeCursor * bc = new BtreeCursor( d , d->idxNo(*idx) , *idx , min , max , false , 1 ); + shared_ptr<Cursor> c( bc ); + scoped_ptr<ClientCursor> cc( new ClientCursor( QueryOption_NoCursorTimeout , c , ns ) ); + if ( ! cc->ok() ) { + errmsg = "can't open a cursor for splitting (desired range is possibly empty)"; + return false; + } + + // Use every 'keyCount'-th key as a split point. We add the initial key as a sentinel, to be removed + // at the end. If a key appears more times than entries allowed on a chunk, we issue a warning and + // split on the following key. + set<BSONObj> tooFrequentKeys; + splitKeys.push_back( c->currKey().getOwned() ); + while ( 1 ) { + while ( cc->ok() ) { + currCount++; + BSONObj currKey = c->currKey(); + + DEV assert( currKey.woCompare( max ) <= 0 ); + + if ( currCount > keyCount ) { + // Do not use this split key if it is the same used in the previous split point. + if ( currKey.woCompare( splitKeys.back() ) == 0 ) { + tooFrequentKeys.insert( currKey.getOwned() ); + + } + else { + splitKeys.push_back( currKey.getOwned() ); + currCount = 0; + numChunks++; + + LOG(4) << "picked a split key: " << bc->prettyKey( currKey ) << endl; + } + + } + + cc->advance(); + + // Stop if we have enough split points. + if ( maxSplitPoints && ( numChunks >= maxSplitPoints ) ) { + log() << "max number of requested split points reached (" << numChunks + << ") before the end of chunk " << ns << " " << min << " -->> " << max + << endl; + break; + } + + if ( ! cc->yieldSometimes() ) { + // we were near and and got pushed to the end + // i think returning the splits we've already found is fine + + // don't use the btree cursor pointer to acces keys beyond this point but ok + // to use it for format the keys we've got already + + break; + } + } + + if ( splitKeys.size() > 1 || ! force ) + break; + + force = false; + keyCount = currCount / 2; + currCount = 0; + log() << "splitVector doing another cycle because of force, keyCount now: " << keyCount << endl; + + c.reset( new BtreeCursor( d , d->idxNo(*idx) , *idx , min , max , false , 1 ) ); + cc.reset( new ClientCursor( QueryOption_NoCursorTimeout , c , ns ) ); + } + + // + // 3. Format the result and issue any warnings about the data we gathered while traversing the + // index + // + + // Warn for keys that are more numerous than maxChunkSize allows. + for ( set<BSONObj>::const_iterator it = tooFrequentKeys.begin(); it != tooFrequentKeys.end(); ++it ) { + warning() << "chunk is larger than " << maxChunkSize + << " bytes because of key " << bc->prettyKey( *it ) << endl; + } + + // Remove the sentinel at the beginning before returning and add fieldnames. + splitKeys.erase( splitKeys.begin() ); + for ( vector<BSONObj>::iterator it = splitKeys.begin(); it != splitKeys.end() ; ++it ) { + *it = bc->prettyKey( *it ); + } + + if ( timer.millis() > cmdLine.slowMS ) { + warning() << "Finding the split vector for " << ns << " over "<< keyPattern + << " keyCount: " << keyCount << " numSplits: " << splitKeys.size() + << " lookedAt: " << currCount << " took " << timer.millis() << "ms" + << endl; + } + + // Warning: we are sending back an array of keys but are currently limited to + // 4MB work of 'result' size. This should be okay for now. + + } result.append( "splitKeys" , splitKeys ); + return true; } } cmdSplitVector; + // ** temporary ** 2010-10-22 + // chunkInfo is a helper to collect and log information about the chunks generated in splitChunk. + // It should hold the chunk state for this module only, while we don't have min/max key info per chunk on the + // mongod side. Do not build on this; it will go away. + struct ChunkInfo { + BSONObj min; + BSONObj max; + ShardChunkVersion lastmod; + + ChunkInfo() { } + ChunkInfo( BSONObj aMin , BSONObj aMax , ShardChunkVersion aVersion ) : min(aMin) , max(aMax) , lastmod(aVersion) {} + void appendShortVersion( const char* name, BSONObjBuilder& b ) const; + string toString() const; + }; + + void ChunkInfo::appendShortVersion( const char * name , BSONObjBuilder& b ) const { + BSONObjBuilder bb( b.subobjStart( name ) ); + bb.append( "min" , min ); + bb.append( "max" , max ); + bb.appendTimestamp( "lastmod" , lastmod ); + bb.done(); + } + + string ChunkInfo::toString() const { + ostringstream os; + os << "lastmod: " << lastmod.toString() << " min: " << min << " max: " << endl; + return os.str(); + } + // ** end temporary ** + + class SplitChunkCommand : public Command { + public: + SplitChunkCommand() : Command( "splitChunk" ) {} + virtual void help( stringstream& help ) const { + help << + "internal command usage only\n" + "example:\n" + " { splitChunk:\"db.foo\" , keyPattern: {a:1} , min : {a:100} , max: {a:200} { splitKeys : [ {a:150} , ... ]}"; + } + + virtual bool slaveOk() const { return false; } + virtual bool adminOnly() const { return true; } + virtual LockType locktype() const { return NONE; } + + bool run(const string& dbname, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl ) { + + // + // 1. check whether parameters passed to splitChunk are sound + // + + const string ns = cmdObj.firstElement().str(); + if ( ns.empty() ) { + errmsg = "need to specify namespace in command"; + return false; + } + + BSONObj keyPattern = cmdObj["keyPattern"].Obj(); + if ( keyPattern.isEmpty() ) { + errmsg = "need to specify the key pattern the collection is sharded over"; + return false; + } + + BSONObj min = cmdObj["min"].Obj(); + if ( min.isEmpty() ) { + errmsg = "neet to specify the min key for the chunk"; + return false; + } + + BSONObj max = cmdObj["max"].Obj(); + if ( max.isEmpty() ) { + errmsg = "neet to specify the max key for the chunk"; + return false; + } + + string from = cmdObj["from"].str(); + if ( from.empty() ) { + errmsg = "need specify server to split chunk at"; + return false; + } + + BSONObj splitKeysElem = cmdObj["splitKeys"].Obj(); + if ( splitKeysElem.isEmpty() ) { + errmsg = "need to provide the split points to chunk over"; + return false; + } + vector<BSONObj> splitKeys; + BSONObjIterator it( splitKeysElem ); + while ( it.more() ) { + splitKeys.push_back( it.next().Obj().getOwned() ); + } + + BSONElement shardId = cmdObj["shardId"]; + if ( shardId.eoo() ) { + errmsg = "need to provide shardId"; + return false; + } + + // It is possible that this is the first sharded command this mongod is asked to perform. If so, + // start sharding apparatus. We'd still be missing some more shard-related info but we'll get it + // in step 2. below. + if ( ! shardingState.enabled() ) { + if ( cmdObj["configdb"].type() != String ) { + errmsg = "sharding not enabled"; + return false; + } + string configdb = cmdObj["configdb"].String(); + shardingState.enable( configdb ); + configServer.init( configdb ); + } + + Shard myShard( from ); + + log() << "received splitChunk request: " << cmdObj << endl; + + // + // 2. lock the collection's metadata and get highest version for the current shard + // + + DistributedLock lockSetup( ConnectionString( shardingState.getConfigServer() , ConnectionString::SYNC) , ns ); + dist_lock_try dlk( &lockSetup, string("split-") + min.toString() ); + if ( ! dlk.got() ) { + errmsg = "the collection's metadata lock is taken"; + result.append( "who" , dlk.other() ); + return false; + } + + // TODO This is a check migrate does to the letter. Factor it out and share. 2010-10-22 + + ShardChunkVersion maxVersion; + string shard; + ChunkInfo origChunk; + { + ScopedDbConnection conn( shardingState.getConfigServer() ); + + BSONObj x = conn->findOne( ShardNS::chunk , Query( BSON( "ns" << ns ) ).sort( BSON( "lastmod" << -1 ) ) ); + maxVersion = x["lastmod"]; + + BSONObj currChunk = conn->findOne( ShardNS::chunk , shardId.wrap( "_id" ) ).getOwned(); + assert( currChunk["shard"].type() ); + assert( currChunk["min"].type() ); + assert( currChunk["max"].type() ); + shard = currChunk["shard"].String(); + conn.done(); + + BSONObj currMin = currChunk["min"].Obj(); + BSONObj currMax = currChunk["max"].Obj(); + if ( currMin.woCompare( min ) || currMax.woCompare( max ) ) { + errmsg = "chunk boundaries are outdated (likely a split occurred)"; + result.append( "currMin" , currMin ); + result.append( "currMax" , currMax ); + result.append( "requestedMin" , min ); + result.append( "requestedMax" , max ); + + log( LL_WARNING ) << "aborted split because " << errmsg << ": " << min << "->" << max + << " is now " << currMin << "->" << currMax << endl; + return false; + } + + if ( shard != myShard.getName() ) { + errmsg = "location is outdated (likely balance or migrate occurred)"; + result.append( "from" , myShard.getName() ); + result.append( "official" , shard ); + + log( LL_WARNING ) << "aborted split because " << errmsg << ": chunk is at " << shard + << " and not at " << myShard.getName() << endl; + return false; + } + + if ( maxVersion < shardingState.getVersion( ns ) ) { + errmsg = "official version less than mine?"; + result.appendTimestamp( "officialVersion" , maxVersion ); + result.appendTimestamp( "myVersion" , shardingState.getVersion( ns ) ); + + log( LL_WARNING ) << "aborted split because " << errmsg << ": official " << maxVersion + << " mine: " << shardingState.getVersion(ns) << endl; + return false; + } + + origChunk.min = currMin.getOwned(); + origChunk.max = currMax.getOwned(); + origChunk.lastmod = currChunk["lastmod"]; + + // since this could be the first call that enable sharding we also make sure to have the chunk manager up to date + shardingState.gotShardName( shard ); + ShardChunkVersion shardVersion; + shardingState.trySetVersion( ns , shardVersion /* will return updated */ ); + + log() << "splitChunk accepted at version " << shardVersion << endl; + + } + + // + // 3. create the batch of updates to metadata ( the new chunks ) to be applied via 'applyOps' command + // + + BSONObjBuilder logDetail; + origChunk.appendShortVersion( "before" , logDetail ); + log(1) << "before split on " << origChunk << endl; + vector<ChunkInfo> newChunks; + + ShardChunkVersion myVersion = maxVersion; + BSONObj startKey = min; + splitKeys.push_back( max ); // makes it easier to have 'max' in the next loop. remove later. + + BSONObjBuilder cmdBuilder; + BSONArrayBuilder updates( cmdBuilder.subarrayStart( "applyOps" ) ); + + for ( vector<BSONObj>::const_iterator it = splitKeys.begin(); it != splitKeys.end(); ++it ) { + BSONObj endKey = *it; + + // splits only update the 'minor' portion of version + myVersion.incMinor(); + + // build an update operation against the chunks collection of the config database with + // upsert true + BSONObjBuilder op; + op.append( "op" , "u" ); + op.appendBool( "b" , true ); + op.append( "ns" , ShardNS::chunk ); + + // add the modified (new) chunk infomation as the update object + BSONObjBuilder n( op.subobjStart( "o" ) ); + n.append( "_id" , Chunk::genID( ns , startKey ) ); + n.appendTimestamp( "lastmod" , myVersion ); + n.append( "ns" , ns ); + n.append( "min" , startKey ); + n.append( "max" , endKey ); + n.append( "shard" , shard ); + n.done(); + + // add the chunk's _id as the query part of the update statement + BSONObjBuilder q( op.subobjStart( "o2" ) ); + q.append( "_id" , Chunk::genID( ns , startKey ) ); + q.done(); + + updates.append( op.obj() ); + + // remember this chunk info for logging later + newChunks.push_back( ChunkInfo( startKey , endKey, myVersion ) ); + + startKey = endKey; + } + + updates.done(); + + { + BSONArrayBuilder preCond( cmdBuilder.subarrayStart( "preCondition" ) ); + BSONObjBuilder b; + b.append( "ns" , ShardNS::chunk ); + b.append( "q" , BSON( "query" << BSON( "ns" << ns ) << "orderby" << BSON( "lastmod" << -1 ) ) ); + { + BSONObjBuilder bb( b.subobjStart( "res" ) ); + bb.appendTimestamp( "lastmod" , maxVersion ); + bb.done(); + } + preCond.append( b.obj() ); + preCond.done(); + } + + // + // 4. apply the batch of updates to metadata and to the chunk manager + // + + BSONObj cmd = cmdBuilder.obj(); + + LOG(1) << "splitChunk update: " << cmd << endl; + + bool ok; + BSONObj cmdResult; + { + ScopedDbConnection conn( shardingState.getConfigServer() ); + ok = conn->runCommand( "config" , cmd , cmdResult ); + conn.done(); + } + + if ( ! ok ) { + stringstream ss; + ss << "saving chunks failed. cmd: " << cmd << " result: " << cmdResult; + error() << ss.str() << endl; + msgasserted( 13593 , ss.str() ); // assert(13593) + } + + // install a chunk manager with knowledge about newly split chunks in this shard's state + splitKeys.pop_back(); // 'max' was used as sentinel + maxVersion.incMinor(); + shardingState.splitChunk( ns , min , max , splitKeys , maxVersion ); + + // + // 5. logChanges + // + + // single splits are logged different than multisplits + if ( newChunks.size() == 2 ) { + newChunks[0].appendShortVersion( "left" , logDetail ); + newChunks[1].appendShortVersion( "right" , logDetail ); + configServer.logChange( "split" , ns , logDetail.obj() ); + + } + else { + BSONObj beforeDetailObj = logDetail.obj(); + BSONObj firstDetailObj = beforeDetailObj.getOwned(); + const int newChunksSize = newChunks.size(); + + for ( int i=0; i < newChunksSize; i++ ) { + BSONObjBuilder chunkDetail; + chunkDetail.appendElements( beforeDetailObj ); + chunkDetail.append( "number", i ); + chunkDetail.append( "of" , newChunksSize ); + newChunks[i].appendShortVersion( "chunk" , chunkDetail ); + configServer.logChange( "multi-split" , ns , chunkDetail.obj() ); + } + } + + return true; + } + } cmdSplitChunk; + } // namespace mongo |