Diffstat (limited to 'db/pdfile.cpp')
-rw-r--r-- | db/pdfile.cpp | 800 |
1 file changed, 471 insertions, 329 deletions
diff --git a/db/pdfile.cpp b/db/pdfile.cpp
index 216f21a..20a7423 100644
--- a/db/pdfile.cpp
+++ b/db/pdfile.cpp
@@ -20,7 +20,6 @@ todo:
 _ table scans must be sequential, not next/prev pointers
 _ coalesce deleted
-_ disallow system* manipulations from the database.
 */
@@ -37,21 +36,21 @@ _ disallow system* manipulations from the database.
 #include "query.h"
 #include "repl.h"
 #include "dbhelpers.h"
-#include "namespace.h"
+#include "namespace-inl.h"
 #include "queryutil.h"
 #include "extsort.h"
-#include "curop.h"
+#include "curop-inl.h"
 #include "background.h"
 
 namespace mongo {
 
     bool inDBRepair = false;
 
     struct doingRepair {
-        doingRepair(){
+        doingRepair() {
             assert( ! inDBRepair );
             inDBRepair = true;
         }
-        ~doingRepair(){
+        ~doingRepair() {
             inDBRepair = false;
         }
     };
@@ -64,42 +63,42 @@ namespace mongo {
         return dbsInProg[db] != 0;
     }
 
-    bool BackgroundOperation::inProgForNs(const char *ns) {
+    bool BackgroundOperation::inProgForNs(const char *ns) {
         assertInWriteLock();
         return nsInProg.count(ns) != 0;
     }
 
-    void BackgroundOperation::assertNoBgOpInProgForDb(const char *db) {
+    void BackgroundOperation::assertNoBgOpInProgForDb(const char *db) {
         uassert(12586,
                 "cannot perform operation: a background operation is currently running for this database",
-                !inProgForDb(db));
+                !inProgForDb(db));
     }
 
-    void BackgroundOperation::assertNoBgOpInProgForNs(const char *ns) {
+    void BackgroundOperation::assertNoBgOpInProgForNs(const char *ns) {
         uassert(12587,
                 "cannot perform operation: a background operation is currently running for this collection",
-                !inProgForNs(ns));
-    }
+                !inProgForNs(ns));
+    }
 
-    BackgroundOperation::BackgroundOperation(const char *ns) : _ns(ns) {
+    BackgroundOperation::BackgroundOperation(const char *ns) : _ns(ns) {
         assertInWriteLock();
         dbsInProg[_ns.db]++;
         assert( nsInProg.count(_ns.ns()) == 0 );
         nsInProg.insert(_ns.ns());
     }
 
-    BackgroundOperation::~BackgroundOperation() {
+    BackgroundOperation::~BackgroundOperation() {
         assertInWriteLock();
         dbsInProg[_ns.db]--;
         nsInProg.erase(_ns.ns());
     }
 
     void BackgroundOperation::dump(stringstream& ss) {
-        if( nsInProg.size() ) {
+        if( nsInProg.size() ) {
             ss << "\n<b>Background Jobs in Progress</b>\n";
             for( set<string>::iterator i = nsInProg.begin(); i != nsInProg.end(); i++ )
                 ss << "  " << *i << '\n';
         }
-        for( map<string,unsigned>::iterator i = dbsInProg.begin(); i != dbsInProg.end(); i++ ) {
-            if( i->second )
+        for( map<string,unsigned>::iterator i = dbsInProg.begin(); i != dbsInProg.end(); i++ ) {
+            if( i->second )
                 ss << "database " << i->first << ": " << i->second << '\n';
         }
     }
@@ -114,24 +113,23 @@ namespace mongo {
 
     DataFileMgr theDataFileMgr;
     DatabaseHolder dbHolder;
     int MAGIC = 0x1000;
-//    int curOp = -2;
 
     extern int otherTraceLevel;
 
     void addNewNamespaceToCatalog(const char *ns, const BSONObj *options = 0);
 
     void ensureIdIndexForNewNs(const char *ns) {
         if ( ( strstr( ns, ".system." ) == 0 || legalClientSystemNS( ns , false ) ) &&
-             strstr( ns, ".$freelist" ) == 0 ){
+                strstr( ns, ".$freelist" ) == 0 ) {
             log( 1 ) << "adding _id index for collection " << ns << endl;
             ensureHaveIdIndex( ns );
-        }
+        }
     }
 
     string getDbContext() {
         stringstream ss;
         Client * c = currentClient.get();
-        if ( c ){
+        if ( c ) {
             Client::Context * cx = c->getContext();
-            if ( cx ){
+            if ( cx ) {
                 Database *database = cx->db();
                 if ( database ) {
                     ss << database->name << ' ';
@@ -142,20 +140,44 @@ namespace mongo {
         return ss.str();
     }
 
-    BSONObj::BSONObj(const Record *r) {
-        init(r->data, false);
-    }
-
     /*---------------------------------------------------------------------*/
 
-    int initialExtentSize(int len) {
+    // inheritable class to implement an operation that may be applied to all
+    // files in a database using _applyOpToDataFiles()
+    class FileOp {
+    public:
+        virtual ~FileOp() {}
+        // Return true if file exists and operation successful
+        virtual bool apply( const boost::filesystem::path &p ) = 0;
+        virtual const char * op() const = 0;
+    };
+
+    void _applyOpToDataFiles( const char *database, FileOp &fo, bool afterAllocator = false, const string& path = dbpath );
+
+    void _deleteDataFiles(const char *database) {
+        if ( directoryperdb ) {
+            FileAllocator::get()->waitUntilFinished();
+            BOOST_CHECK_EXCEPTION( boost::filesystem::remove_all( boost::filesystem::path( dbpath ) / database ) );
+            return;
+        }
+        class : public FileOp {
+            virtual bool apply( const boost::filesystem::path &p ) {
+                return boost::filesystem::remove( p );
+            }
+            virtual const char * op() const {
+                return "remove";
+            }
+        } deleter;
+        _applyOpToDataFiles( database, deleter, true );
+    }
+
+    int Extent::initialSize(int len) {
         long long sz = len * 16;
         if ( len < 1000 ) sz = len * 64;
         if ( sz > 1000000000 )
             sz = 1000000000;
         int z = ((int)sz) & 0xffffff00;
         assert( z > len );
-        //DEV tlog() << "initialExtentSize(" << len << ") returns " << z << endl;
         return z;
     }
@@ -165,7 +187,7 @@ namespace mongo {
             return false;
         }
 
-        log(1) << "create collection " << ns << ' ' << options << '\n';
+        log(1) << "create collection " << ns << ' ' << options << endl;
 
         /* todo: do this only when we have allocated space successfully? or we could insert with a { ok: 0 } field
                  and then go back and set to ok : 1 after we are done.
         */
@@ -174,33 +196,48 @@ namespace mongo {
         if( !isFreeList )
             addNewNamespaceToCatalog(ns, options.isEmpty() ? 0 : &options);
 
-        long long size = initialExtentSize(128);
-        BSONElement e = options.getField("size");
-        if ( e.isNumber() ) {
-            size = e.numberLong();
-            size += 256;
-            size &= 0xffffffffffffff00LL;
+        long long size = Extent::initialSize(128);
+        {
+            BSONElement e = options.getField("size");
+            if ( e.isNumber() ) {
+                size = e.numberLong();
+                size += 256;
+                size &= 0xffffffffffffff00LL;
+            }
         }
-
+
         uassert( 10083 , "invalid size spec", size > 0 );
 
         bool newCapped = false;
         int mx = 0;
-        e = options.getField("capped");
-        if ( e.type() == Bool && e.boolean() ) {
+        if( options.getBoolField("capped") ) {
             newCapped = true;
-            e = options.getField("max");
+            BSONElement e = options.getField("max");
             if ( e.isNumber() ) {
                 mx = e.numberInt();
             }
         }
 
-        // $nExtents just for debug/testing. We create '$nExtents' extents,
-        // each of size 'size'.
-        e = options.getField( "$nExtents" );
-        int nExtents = int( e.number() );
+        // $nExtents just for debug/testing.
+        BSONElement e = options.getField( "$nExtents" );
         Database *database = cc().database();
-        if ( nExtents > 0 ) {
+        if ( e.type() == Array ) {
+            // We create one extent per array entry, with size specified
+            // by the array value.
+            BSONObjIterator i( e.embeddedObject() );
+            while( i.more() ) {
+                BSONElement e = i.next();
+                int size = int( e.number() );
+                assert( size <= 0x7fffffff );
+                // $nExtents is just for testing - always allocate new extents
+                // rather than reuse existing extents so we have some predictibility
+                // in the extent size used by our tests
+                database->suitableFile( (int) size, false )->createExtent( ns, (int) size, newCapped );
+            }
+        }
+        else if ( int( e.number() ) > 0 ) {
+            // We create '$nExtents' extents, each of size 'size'.
+            int nExtents = int( e.number() );
             assert( size <= 0x7fffffff );
             for ( int i = 0; i < nExtents; ++i ) {
                 assert( size <= 0x7fffffff );
@@ -209,10 +246,16 @@ namespace mongo {
                 // in the extent size used by our tests
                 database->suitableFile( (int) size, false )->createExtent( ns, (int) size, newCapped );
             }
-        } else {
+        }
+        else {
+            // This is the non test case, where we don't have a $nExtents spec.
             while ( size > 0 ) {
                 int max = MongoDataFile::maxSize() - DataFileHeader::HeaderSize;
                 int desiredExtentSize = (int) (size > max ? max : size);
+                if ( desiredExtentSize < Extent::minSize() ) {
+                    desiredExtentSize = Extent::minSize();
+                }
+                desiredExtentSize &= 0xffffff00;
                 Extent *e = database->allocExtent( ns, desiredExtentSize, newCapped );
                 size -= e->length;
             }
@@ -223,15 +266,16 @@ namespace mongo {
 
         bool ensure = false;
         if ( options.getField( "autoIndexId" ).type() ) {
-            if ( options["autoIndexId"].trueValue() ){
+            if ( options["autoIndexId"].trueValue() ) {
                 ensure = true;
             }
-        } else {
+        }
+        else {
             if ( !newCapped ) {
                 ensure=true;
             }
         }
-        if( ensure ) {
+        if( ensure ) {
             if( deferIdIndex )
                 *deferIdIndex = true;
             else
@@ -239,7 +283,7 @@ namespace mongo {
         }
 
         if ( mx > 0 )
-            d->max = mx;
+            getDur().writingInt( d->max ) = mx;
 
         return true;
     }
@@ -250,7 +294,7 @@ namespace mongo {
     */
     bool userCreateNS(const char *ns, BSONObj options, string& err, bool logForReplication, bool *deferIdIndex) {
         const char *coll = strchr( ns, '.' ) + 1;
-        massert( 10356 , "invalid ns", coll && *coll );
+        massert( 10356 , str::stream() << "invalid ns: " << ns , coll && *coll );
         char cl[ 256 ];
         nsToDatabase( ns, cl );
         bool ok = _userCreateNS(ns, options, err, deferIdIndex);
@@ -272,14 +316,22 @@ namespace mongo {
     int MongoDataFile::maxSize() {
         if ( sizeof( int* ) == 4 ) {
             return 512 * 1024 * 1024;
-        } else if ( cmdLine.smallfiles ) {
+        }
+        else if ( cmdLine.smallfiles ) {
             return 0x7ff00000 >> 2;
-        } else {
+        }
+        else {
             return 0x7ff00000;
         }
     }
 
-    void MongoDataFile::badOfs(int ofs) const {
+    void MongoDataFile::badOfs2(int ofs) const {
+        stringstream ss;
+        ss << "bad offset:" << ofs << " accessing file: " << mmf.filename() << " - consider repairing database";
+        uasserted(13441, ss.str());
+    }
+
+    void MongoDataFile::badOfs(int ofs) const {
         stringstream ss;
         ss << "bad offset:" << ofs << " accessing file: " << mmf.filename() << " - consider repairing database";
         uasserted(13440, ss.str());
@@ -293,26 +345,18 @@ namespace mongo {
         else
             size = 0x7ff00000;
 
-        if ( strstr(filename, "_hudsonSmall") ) {
-            int mult = 1;
-            if ( fileNo > 1 && fileNo < 1000 )
-                mult = fileNo;
-            size = 1024 * 512 * mult;
-            log() << "Warning : using small files for _hudsonSmall" << endl;
-        }
-        else if ( cmdLine.smallfiles ){
+        if ( cmdLine.smallfiles ) {
             size = size >> 2;
         }
-
-
+
+
         return size;
     }
 
     void MongoDataFile::open( const char *filename, int minSize, bool preallocateOnly ) {
         {
             /* check quotas
-               very simple temporary implementation - we will in future look up
-               the quota from the grid database
+               very simple temporary implementation for now
             */
             if ( cmdLine.quota && fileNo > cmdLine.quotaFiles && !MMF::exists(filename) ) {
                 /* todo: if we were adding / changing keys in an index did we do some
@@ -340,58 +384,66 @@ namespace mongo {
         if ( size > maxSize() )
             size = maxSize();
 
-        assert( ( size >= 64*1024*1024 ) || cmdLine.smallfiles || ( strstr( filename, "_hudsonSmall" ) ) );
+        assert( size >= 64*1024*1024 || cmdLine.smallfiles );
         assert( size % 4096 == 0 );
 
         if ( preallocateOnly ) {
             if ( cmdLine.prealloc ) {
-                theFileAllocator().requestAllocation( filename, size );
+                FileAllocator::get()->requestAllocation( filename, size );
             }
             return;
         }
-
-        _p = mmf.map(filename, size);
-        header = (DataFileHeader *) _p.at(0, DataFileHeader::HeaderSize);
-        if( sizeof(char *) == 4 )
-            uassert( 10084 , "can't map file memory - mongo requires 64 bit build for larger datasets", header);
+
+        {
+            assert( _mb == 0 );
+            unsigned long long sz = size;
+            if( mmf.create(filename, sz, false) )
+                _mb = mmf.getView();
+            assert( sz <= 0x7fffffff );
+            size = (int) sz;
+        }
+        //header = (DataFileHeader *) _p;
+        if( sizeof(char *) == 4 )
+            uassert( 10084 , "can't map file memory - mongo requires 64 bit build for larger datasets", _mb != 0);
         else
-            uassert( 10085 , "can't map file memory", header);
-        header->init(fileNo, size);
+            uassert( 10085 , "can't map file memory", _mb != 0);
+        header()->init(fileNo, size, filename);
     }
 
-    void MongoDataFile::flush( bool sync ){
+    void MongoDataFile::flush( bool sync ) {
        mmf.flush( sync );
     }
 
-    void addNewExtentToNamespace(const char *ns, Extent *e, DiskLoc eloc, DiskLoc emptyLoc, bool capped) {
-        DiskLoc oldExtentLoc;
+    void addNewExtentToNamespace(const char *ns, Extent *e, DiskLoc eloc, DiskLoc emptyLoc, bool capped) {
         NamespaceIndex *ni = nsindex(ns);
         NamespaceDetails *details = ni->details(ns);
         if ( details ) {
             assert( !details->lastExtent.isNull() );
             assert( !details->firstExtent.isNull() );
-            e->xprev = details->lastExtent;
-            details->lastExtent.ext()->xnext = eloc;
+            getDur().writingDiskLoc(e->xprev) = details->lastExtent;
+            getDur().writingDiskLoc(details->lastExtent.ext()->xnext) = eloc;
             assert( !eloc.isNull() );
-            details->lastExtent = eloc;
+            getDur().writingDiskLoc(details->lastExtent) = eloc;
         }
         else {
             ni->add_ns(ns, eloc, capped);
             details = ni->details(ns);
         }
 
-        details->lastExtentSize = e->length;
-        DEBUGGING out() << "temp: newextent adddelrec " << ns << endl;
+        {
+            NamespaceDetails *dw = details->writingWithoutExtra();
+            dw->lastExtentSize = e->length;
+        }
         details->addDeletedRec(emptyLoc.drec(), emptyLoc);
     }
 
     Extent* MongoDataFile::createExtent(const char *ns, int approxSize, bool newCapped, int loops) {
-        massert( 10357 , "shutdown in progress", !goingAway );
-        massert( 10358 , "bad new extent size", approxSize >= 0 && approxSize <= Extent::maxSize() );
-        massert( 10359 , "header==0 on new extent: 32 bit mmap space exceeded?", header ); // null if file open failed
-        int ExtentSize = approxSize <= header->unusedLength ? approxSize : header->unusedLength;
+        massert( 10357 , "shutdown in progress", ! inShutdown() );
+        massert( 10358 , "bad new extent size", approxSize >= Extent::minSize() && approxSize <= Extent::maxSize() );
+        massert( 10359 , "header==0 on new extent: 32 bit mmap space exceeded?", header() ); // null if file open failed
+        int ExtentSize = approxSize <= header()->unusedLength ? approxSize : header()->unusedLength;
         DiskLoc loc;
-        if ( ExtentSize <= 0 ) {
+        if ( ExtentSize < Extent::minSize() ) {
            /* not there could be a lot of looping here is db just started and
               no files are open yet.  we might want to do something about that. */
            if ( loops > 8 ) {
@@ -401,12 +453,14 @@ namespace mongo {
             log() << "newExtent: " << ns << " file " << fileNo << " full, adding a new file\n";
             return cc().database()->addAFile( 0, true )->createExtent(ns, approxSize, newCapped, loops+1);
         }
-        int offset = header->unused.getOfs();
-        header->unused.setOfs( fileNo, offset + ExtentSize );
-        header->unusedLength -= ExtentSize;
-        loc.setOfs(fileNo, offset);
+        int offset = header()->unused.getOfs();
+
+        DataFileHeader *h = getDur().writing(header());
+        h->unused.set( fileNo, offset + ExtentSize );
+        h->unusedLength -= ExtentSize;
+        loc.set(fileNo, offset);
         Extent *e = _getExtent(loc);
-        DiskLoc emptyLoc = e->init(ns, ExtentSize, fileNo, offset);
+        DiskLoc emptyLoc = getDur().writing(e)->init(ns, ExtentSize, fileNo, offset);
 
         addNewExtentToNamespace(ns, e, loc, emptyLoc, newCapped);
@@ -415,7 +469,7 @@ namespace mongo {
         return e;
     }
 
-    Extent* DataFileMgr::allocFromFreeList(const char *ns, int approxSize, bool capped) {
+    Extent* DataFileMgr::allocFromFreeList(const char *ns, int approxSize, bool capped) {
         string s = cc().database()->name + ".$freelist";
         NamespaceDetails *f = nsdetails(s.c_str());
         if( f ) {
@@ -426,7 +480,7 @@ namespace mongo {
                 if( low > 2048 ) low -= 256;
                 high = (int) (approxSize * 1.05) + 256;
             }
-            else {
+            else {
                 low = (int) (approxSize * 0.8);
                 high = (int) (approxSize * 1.4);
             }
@@ -436,20 +490,20 @@ namespace mongo {
             int bestDiff = 0x7fffffff;
             {
                 DiskLoc L = f->firstExtent;
-                while( !L.isNull() ) {
+                while( !L.isNull() ) {
                     Extent * e = L.ext();
-                    if( e->length >= low && e->length <= high ) {
+                    if( e->length >= low && e->length <= high ) {
                         int diff = abs(e->length - approxSize);
-                        if( diff < bestDiff ) {
+                        if( diff < bestDiff ) {
                             bestDiff = diff;
                             best = e;
-                            if( diff == 0 )
+                            if( diff == 0 )
                                 break;
                         }
                     }
                     L = e->xnext;
                     ++n;
-
+
                 }
             }
             OCCASIONALLY if( n > 512 ) log() << "warning: newExtent " << n << " scanned\n";
@@ -457,13 +511,13 @@ namespace mongo {
                 Extent *e = best;
                 // remove from the free list
                 if( !e->xprev.isNull() )
-                    e->xprev.ext()->xnext = e->xnext;
+                    e->xprev.ext()->xnext.writing() = e->xnext;
                 if( !e->xnext.isNull() )
-                    e->xnext.ext()->xprev = e->xprev;
+                    e->xnext.ext()->xprev.writing() = e->xprev;
                 if( f->firstExtent == e->myLoc )
-                    f->firstExtent = e->xnext;
+                    f->firstExtent.writing() = e->xnext;
                 if( f->lastExtent == e->myLoc )
-                    f->lastExtent = e->xprev;
+                    f->lastExtent.writing() = e->xprev;
 
                 // use it
                 OCCASIONALLY if( n > 512 ) log() << "warning: newExtent " << n << " scanned\n";
@@ -479,9 +533,11 @@ namespace mongo {
 
     /*---------------------------------------------------------------------*/
 
-    DiskLoc Extent::reuse(const char *nsname) {
-        /*TODOMMF - work to do when extent is freed. */
-        log(3) << "reset extent was:" << nsDiagnostic.buf << " now:" << nsname << '\n';
+    DiskLoc Extent::reuse(const char *nsname) {
+        return getDur().writing(this)->_reuse(nsname);
+    }
+    DiskLoc Extent::_reuse(const char *nsname) {
+        log(3) << "reset extent was:" << nsDiagnostic.toString() << " now:" << nsname << '\n';
         massert( 10360 , "Extent::reset bad magic value", magic == 0x41424344 );
         xnext.Null();
         xprev.Null();
@@ -493,12 +549,9 @@ namespace mongo {
         emptyLoc.inc( (int) (_extentData-(char*)this) );
 
         int delRecLength = length - (_extentData - (char *) this);
-        //DeletedRecord *empty1 = (DeletedRecord *) extentData;
-        DeletedRecord *empty = DataFileMgr::makeDeletedRecord(emptyLoc, delRecLength);//(DeletedRecord *) getRecord(emptyLoc);
-        //assert( empty == empty1 );
-
-        // do we want to zero the record? memset(empty, ...)
+
+        DeletedRecord *empty = DataFileMgr::makeDeletedRecord(emptyLoc, delRecLength);//(DeletedRecord *) getRecord(emptyLoc);
+        empty = getDur().writing(empty);
         empty->lengthWithHeaders = delRecLength;
         empty->extentOfs = myLoc.getOfs();
         empty->nextDeleted.Null();
@@ -509,7 +562,7 @@ namespace mongo {
     /* assumes already zeroed -- insufficient for block 'reuse' perhaps */
     DiskLoc Extent::init(const char *nsname, int _length, int _fileNo, int _offset) {
         magic = 0x41424344;
-        myLoc.setOfs(_fileNo, _offset);
+        myLoc.set(_fileNo, _offset);
         xnext.Null();
         xprev.Null();
         nsDiagnostic = nsname;
@@ -521,9 +574,7 @@ namespace mongo {
         emptyLoc.inc( (int) (_extentData-(char*)this) );
 
         int l = _length - (_extentData - (char *) this);
-        //DeletedRecord *empty1 = (DeletedRecord *) extentData;
-        DeletedRecord *empty = DataFileMgr::makeDeletedRecord(emptyLoc, l);
-        //assert( empty == empty1 );
+        DeletedRecord *empty = getDur().writing( DataFileMgr::makeDeletedRecord(emptyLoc, l) );
         empty->lengthWithHeaders = l;
         empty->extentOfs = myLoc.getOfs();
         return emptyLoc;
@@ -582,7 +633,7 @@ namespace mongo {
         }
         return maxExtentSize;
     }
-
+
     /*---------------------------------------------------------------------*/
 
     shared_ptr<Cursor> DataFileMgr::findAll(const char *ns, const DiskLoc &startLoc) {
@@ -612,12 +663,12 @@ namespace mongo {
             d->dumpDeleted(&extents);
         }
 
-        if ( d->capped )
+        if ( d->capped )
             return shared_ptr<Cursor>( new ForwardCappedCursor( d , startLoc ) );
-
+
         if ( !startLoc.isNull() )
-            return shared_ptr<Cursor>(new BasicCursor( startLoc ));
-
+            return shared_ptr<Cursor>(new BasicCursor( startLoc ));
+
         while ( e->firstRecord.isNull() && !e->xnext.isNull() ) {
             /* todo: if extent is empty, free it for reuse elsewhere.
                that is a bit complicated have to clean up the freelists.
@@ -638,37 +689,38 @@ namespace mongo {
         if ( el.number() >= 0 )
             return DataFileMgr::findAll(ns, startLoc);
-
+
         // "reverse natural order"
         NamespaceDetails *d = nsdetails(ns);
-
+
         if ( !d )
             return shared_ptr<Cursor>(new BasicCursor(DiskLoc()));
-
+
         if ( !d->capped ) {
             if ( !startLoc.isNull() )
-                return shared_ptr<Cursor>(new ReverseCursor( startLoc ));
+                return shared_ptr<Cursor>(new ReverseCursor( startLoc ));
             Extent *e = d->lastExtent.ext();
             while ( e->lastRecord.isNull() && !e->xprev.isNull() ) {
                 OCCASIONALLY out() << "  findTableScan: extent empty, skipping ahead" << endl;
                 e = e->getPrevExtent();
             }
             return shared_ptr<Cursor>(new ReverseCursor( e->lastRecord ));
-        } else {
+        }
+        else {
             return shared_ptr<Cursor>( new ReverseCappedCursor( d, startLoc ) );
         }
     }
 
-    void printFreeList() {
+    void printFreeList() {
         string s = cc().database()->name + ".$freelist";
         log() << "dump freelist " << s << '\n';
         NamespaceDetails *freeExtents = nsdetails(s.c_str());
-        if( freeExtents == 0 ) {
+        if( freeExtents == 0 ) {
             log() << "  freeExtents==0" << endl;
             return;
         }
         DiskLoc a = freeExtents->firstExtent;
-        while( !a.isNull() ) {
+        while( !a.isNull() ) {
             Extent *e = a.ext();
             log() << "  " << a.toString() << " len:" << e->length << " prev:" << e->xprev.toString() << '\n';
             a = e->xnext;
@@ -687,7 +739,7 @@ namespace mongo {
         NamespaceString s(nsToDrop);
         assert( s.db == cc().database()->name );
         if( s.isSystem() ) {
-            if( s.coll == "system.profile" )
+            if( s.coll == "system.profile" )
                 uassert( 10087 , "turn off profiling before dropping system.profile collection", cc().database()->profile == 0 );
             else
                 uasserted( 12502, "can't drop system ns" );
@@ -698,32 +750,31 @@ namespace mongo {
             BSONObj cond = BSON( "name" << nsToDrop );   // { name: "colltodropname" }
             string system_namespaces = cc().database()->name + ".system.namespaces";
             /*int n = */ deleteObjects(system_namespaces.c_str(), cond, false, false, true);
-            // no check of return code as this ns won't exist for some of the new storage engines
+            // no check of return code as this ns won't exist for some of the new storage engines
         }
 
         // free extents
         if( !d->firstExtent.isNull() ) {
             string s = cc().database()->name + ".$freelist";
             NamespaceDetails *freeExtents = nsdetails(s.c_str());
-            if( freeExtents == 0 ) {
+            if( freeExtents == 0 ) {
                 string err;
                 _userCreateNS(s.c_str(), BSONObj(), err, 0);
                 freeExtents = nsdetails(s.c_str());
                 massert( 10361 , "can't create .$freelist", freeExtents);
             }
-            if( freeExtents->firstExtent.isNull() ) {
-                freeExtents->firstExtent = d->firstExtent;
-                freeExtents->lastExtent = d->lastExtent;
+            if( freeExtents->firstExtent.isNull() ) {
+                freeExtents->firstExtent.writing() = d->firstExtent;
+                freeExtents->lastExtent.writing() = d->lastExtent;
             }
-            else {
+            else {
                 DiskLoc a = freeExtents->firstExtent;
                 assert( a.ext()->xprev.isNull() );
-                a.ext()->xprev = d->lastExtent;
-                d->lastExtent.ext()->xnext = a;
-                freeExtents->firstExtent = d->firstExtent;
-
-                d->firstExtent.setInvalid();
-                d->lastExtent.setInvalid();
+                getDur().writingDiskLoc( a.ext()->xprev ) = d->lastExtent;
+                getDur().writingDiskLoc( d->lastExtent.ext()->xnext ) = a;
+                getDur().writingDiskLoc( freeExtents->firstExtent ) = d->firstExtent;
+                getDur().writingDiskLoc( d->firstExtent ).setInvalid();
+                getDur().writingDiskLoc( d->lastExtent ).setInvalid();
             }
         }
@@ -740,7 +791,7 @@ namespace mongo {
         BackgroundOperation::assertNoBgOpInProgForNs(name.c_str());
 
         if ( d->nIndexes != 0 ) {
-            try {
+            try {
                 assert( dropIndexes(d, name.c_str(), "*", errmsg, result, true) );
             }
             catch( DBException& e ) {
@@ -754,11 +805,10 @@ namespace mongo {
         log(1) << "\t dropIndexes done" << endl;
         result.append("ns", name.c_str());
         ClientCursor::invalidate(name.c_str());
-        Client::invalidateNS( name );
         Top::global.collectionDropped( name );
-        dropNS(name);
+        dropNS(name);
     }
-
+
     int nUnindexes = 0;
 
     /* unindex all keys in index for this record. */
@@ -797,63 +847,69 @@ namespace mongo {
         int n = d->nIndexes;
         for ( int i = 0; i < n; i++ )
             _unindexRecord(d->idx(i), obj, dl, !noWarn);
-        if( d->backgroundIndexBuildInProgress ) {
+        if( d->indexBuildInProgress ) { // background index
            // always pass nowarn here, as this one may be missing for valid reasons as we are concurrently building it
-            _unindexRecord(d->idx(n), obj, dl, false);
+            _unindexRecord(d->idx(n), obj, dl, false);
        }
    }
 
-    /* deletes a record, just the pdfile portion -- no index cleanup, no cursor cleanup, etc.
+    /* deletes a record, just the pdfile portion -- no index cleanup, no cursor cleanup, etc.
       caller must check if capped
    */
-    void DataFileMgr::_deleteRecord(NamespaceDetails *d, const char *ns, Record *todelete, const DiskLoc& dl)
-    {
+    void DataFileMgr::_deleteRecord(NamespaceDetails *d, const char *ns, Record *todelete, const DiskLoc& dl) {
        /* remove ourself from the record next/prev chain */
        {
            if ( todelete->prevOfs != DiskLoc::NullOfs )
-                todelete->getPrev(dl).rec()->nextOfs = todelete->nextOfs;
+                getDur().writingInt( todelete->getPrev(dl).rec()->nextOfs ) = todelete->nextOfs;
            if ( todelete->nextOfs != DiskLoc::NullOfs )
-                todelete->getNext(dl).rec()->prevOfs = todelete->prevOfs;
+                getDur().writingInt( todelete->getNext(dl).rec()->prevOfs ) = todelete->prevOfs;
        }
 
        /* remove ourself from extent pointers */
        {
-            Extent *e = todelete->myExtent(dl);
+            Extent *e = getDur().writing( todelete->myExtent(dl) );
            if ( e->firstRecord == dl ) {
                if ( todelete->nextOfs == DiskLoc::NullOfs )
                    e->firstRecord.Null();
                else
-                    e->firstRecord.setOfs(dl.a(), todelete->nextOfs);
+                    e->firstRecord.set(dl.a(), todelete->nextOfs);
            }
            if ( e->lastRecord == dl ) {
                if ( todelete->prevOfs == DiskLoc::NullOfs )
                    e->lastRecord.Null();
                else
-                    e->lastRecord.setOfs(dl.a(), todelete->prevOfs);
+                    e->lastRecord.set(dl.a(), todelete->prevOfs);
            }
        }
 
        /* add to the free list */
        {
-            d->nrecords--;
-            d->datasize -= todelete->netLength();
-            /* temp: if in system.indexes, don't reuse, and zero out: we want to be
-               careful until validated more, as IndexDetails has pointers
-               to this disk location.  so an incorrectly done remove would cause
-               a lot of problems.
-            */
+            {
+                NamespaceDetails::Stats *s = getDur().writing(&d->stats);
+                s->datasize -= todelete->netLength();
+                s->nrecords--;
+            }
+
            if ( strstr(ns, ".system.indexes") ) {
-                memset(todelete, 0, todelete->lengthWithHeaders);
+                /* temp: if in system.indexes, don't reuse, and zero out: we want to be
+                   careful until validated more, as IndexDetails has pointers
+                   to this disk location.  so an incorrectly done remove would cause
+                   a lot of problems.
+                */
+                memset(getDur().writingPtr(todelete, todelete->lengthWithHeaders), 0, todelete->lengthWithHeaders);
            }
            else {
-                DEV memset(todelete->data, 0, todelete->netLength()); // attempt to notice invalid reuse.
+                DEV {
+                    unsigned long long *p = (unsigned long long *) todelete->data;
+                    *getDur().writing(p) = 0;
+                    //DEV memset(todelete->data, 0, todelete->netLength()); // attempt to notice invalid reuse.
+                }
                d->addDeletedRec((DeletedRecord*)todelete, dl);
            }
        }
    }
 
-    void DataFileMgr::deleteRecord(const char *ns, Record *todelete, const DiskLoc& dl, bool cappedOK, bool noWarn)
-    {
+    void DataFileMgr::deleteRecord(const char *ns, Record *todelete, const DiskLoc& dl, bool cappedOK, bool noWarn) {
        dassert( todelete == dl.rec() );
 
        NamespaceDetails* d = nsdetails(ns);
@@ -880,8 +936,7 @@ namespace mongo {
                                  NamespaceDetails *d,
                                  NamespaceDetailsTransient *nsdt,
                                  Record *toupdate, const DiskLoc& dl,
-                                  const char *_buf, int _len, OpDebug& debug, bool &changedId, bool god)
-    {
+                                  const char *_buf, int _len, OpDebug& debug, bool god) {
        StringBuilder& ss = debug.str;
        dassert( toupdate == dl.rec() );
@@ -891,7 +946,7 @@ namespace mongo {
        DEV assert( objNew.objdata() == _buf );
 
        if( !objNew.hasElement("_id") && objOld.hasElement("_id") ) {
-            /* add back the old _id value if the update removes it.  Note this implementation is slow
+            /* add back the old _id value if the update removes it.  Note this implementation is slow
               (copies entire object multiple times), but this shouldn't happen often, so going for simple
               code, not speed.
            */
@@ -903,11 +958,13 @@ namespace mongo {
            objNew = b.obj();
        }
 
-        /* duplicate key check. we descend the btree twice - once for this check, and once for the actual inserts, further
+        /* duplicate key check. we descend the btree twice - once for this check, and once for the actual inserts, further
           below.  that is suboptimal, but it's pretty complicated to do it the other way without rollbacks...
        */
        vector<IndexChanges> changes;
+        bool changedId = false;
        getIndexChanges(changes, *d, objNew, objOld, changedId);
+        uassert( 13596 , str::stream() << "cannot change _id of a document old:" << objOld << " new:" << objNew , ! changedId );
        dupCheck(changes, *d, dl);
 
        if ( toupdate->netLength() < objNew.objsize() ) {
@@ -946,8 +1003,8 @@ namespace mongo {
                try {
                    /* we did the dupCheck() above.  so we don't have to worry about it here. */
                    idx.head.btree()->bt_insert(
-                        idx.head,
-                        dl, *changes[x].added[i], ordering, /*dupsAllowed*/true, idx);
+                        idx.head,
+                        dl, *changes[x].added[i], ordering, /*dupsAllowed*/true, idx);
                }
                catch (AssertionException& e) {
                    ss << " exception update index ";
@@ -959,25 +1016,30 @@ namespace mongo {
            ss << '\n' << keyUpdates << " key updates ";
        }
 
-        //  update in place
-        memcpy(toupdate->data, objNew.objdata(), objNew.objsize());
+        //  update in place
+        int sz = objNew.objsize();
+        memcpy(getDur().writingPtr(toupdate->data, sz), objNew.objdata(), sz);
        return dl;
    }
 
-    int followupExtentSize(int len, int lastExtentLen) {
+    int Extent::followupSize(int len, int lastExtentLen) {
        assert( len < Extent::maxSize() );
-        int x = initialExtentSize(len);
+        int x = initialSize(len);
        int y = (int) (lastExtentLen < 4000000 ? lastExtentLen * 4.0 : lastExtentLen * 1.2);
        int sz = y > x ? y : x;
-        if ( sz < lastExtentLen )
-            sz = lastExtentLen;
-        else if ( sz > Extent::maxSize() )
+
+        if ( sz < lastExtentLen ) {
+            // this means there was an int overflow
+            // so we should turn it into maxSize
+            sz = Extent::maxSize();
+        }
+        else if ( sz > Extent::maxSize() ) {
            sz = Extent::maxSize();
-
+        }
+
        sz = ((int)sz) & 0xffffff00;
        assert( sz > len );
-
+
        return sz;
    }
@@ -990,7 +1052,7 @@ namespace mongo {
        Ordering ordering = Ordering::make(order);
        int n = 0;
        for ( BSONObjSetDefaultOrder::iterator i=keys.begin(); i != keys.end(); i++ ) {
-            if( ++n == 2 ) {
+            if( ++n == 2 ) {
                d->setIndexIsMultikey(idxNo);
            }
            assert( !recordLoc.isNull() );
@@ -999,7 +1061,7 @@ namespace mongo {
                                                      *i, ordering, dupsAllowed, idx);
            }
            catch (AssertionException& e) {
-                if( e.getCode() == 10287 && idxNo == d->nIndexes ) {
+                if( e.getCode() == 10287 && idxNo == d->nIndexes ) {
                    DEV log() << "info: caught key already in index on bg indexing (ok)" << endl;
                    continue;
                }
@@ -1012,8 +1074,7 @@ namespace mongo {
        }
    }
 
-    void testSorting()
-    {
+    void testSorting() {
        BSONObjBuilder b;
        b.appendNull("");
        BSONObj x = b.obj();
@@ -1027,9 +1088,9 @@ namespace mongo {
        sorter.add(x, DiskLoc(3,77));
 
        sorter.sort();
-
+
        auto_ptr<BSONObjExternalSorter::Iterator> i = sorter.iterator();
-        while( i->more() ) {
+        while( i->more() ) {
            BSONObjExternalSorter::Data d = i->next();
            /*cout << d.second.toString() << endl;
            cout << d.first.objsize() << endl;
@@ -1039,7 +1100,6 @@ namespace mongo {
 
    // throws DBException
    unsigned long long fastBuildIndex(const char *ns, NamespaceDetails *d, IndexDetails& idx, int idxNo) {
-        assert( d->backgroundIndexBuildInProgress == 0 );
        CurOp * op = cc().curop();
 
        Timer t;
@@ -1050,17 +1110,17 @@ namespace mongo {
        bool dropDups = idx.dropDups() || inDBRepair;
        BSONObj order = idx.keyPattern();
 
-        idx.head.Null();
-
+        getDur().writingDiskLoc(idx.head).Null();
+
        if ( logLevel > 1 ) printMemInfo( "before index start" );
 
        /* get and sort all the keys ----- */
        unsigned long long n = 0;
        shared_ptr<Cursor> c = theDataFileMgr.findAll(ns);
        BSONObjExternalSorter sorter(order);
-        sorter.hintNumObjects( d->nrecords );
+        sorter.hintNumObjects( d->stats.nrecords );
        unsigned long long nkeys = 0;
-        ProgressMeterHolder pm( op->setMessage( "index: (1/3) external sort" , d->nrecords , 10 ) );
+        ProgressMeterHolder pm( op->setMessage( "index: (1/3) external sort" , d->stats.nrecords , 10 ) );
        while ( c->ok() ) {
            BSONObj o = c->current();
            DiskLoc loc = c->currLoc();
@@ -1069,17 +1129,17 @@ namespace mongo {
            idx.getKeysFromObject(o, keys);
            int k = 0;
            for ( BSONObjSetDefaultOrder::iterator i=keys.begin(); i != keys.end(); i++ ) {
-                if( ++k == 2 )
+                if( ++k == 2 ) {
                    d->setIndexIsMultikey(idxNo);
-                //cout<<"SORTER ADD " << i->toString() << ' ' << loc.toString() << endl;
+                }
                sorter.add(*i, loc);
                nkeys++;
            }
-
+
            c->advance();
            n++;
            pm.hit();
-            if ( logLevel > 1 && n % 10000 == 0 ){
+            if ( logLevel > 1 && n % 10000 == 0 ) {
                printMemInfo( "\t iterating objects" );
            }
@@ -1089,37 +1149,37 @@ namespace mongo {
        if ( logLevel > 1 ) printMemInfo( "before final sort" );
        sorter.sort();
        if ( logLevel > 1 ) printMemInfo( "after final sort" );
-
+
        log(t.seconds() > 5 ? 0 : 1) << "\t external sort used : " << sorter.numFiles() << " files " << " in " << t.seconds() << " secs" << endl;
 
        list<DiskLoc> dupsToDrop;
 
-        /* build index --- */
+        /* build index --- */
        {
            BtreeBuilder btBuilder(dupsAllowed, idx);
            BSONObj keyLast;
            auto_ptr<BSONObjExternalSorter::Iterator> i = sorter.iterator();
            assert( pm == op->setMessage( "index: (2/3) btree bottom up" , nkeys , 10 ) );
-            while( i->more() ) {
+            while( i->more() ) {
                RARELY killCurrentOp.checkForInterrupt();
                BSONObjExternalSorter::Data d = i->next();
 
-                try {
+                try {
                    btBuilder.addKey(d.first, d.second);
                }
-                catch( AssertionException& e ) {
-                    if ( dupsAllowed ){
+                catch( AssertionException& e ) {
+                    if ( dupsAllowed ) {
                        // unknow exception??
                        throw;
                    }
-
+
                    if( e.interrupted() )
                        throw;
 
                    if ( ! dropDups )
                        throw;
 
-                    /* we could queue these on disk, but normally there are very few dups, so instead we
+                    /* we could queue these on disk, but normally there are very few dups, so instead we
                       keep in ram and have a limit.
                    */
                    dupsToDrop.push_back(d.second);
@@ -1131,9 +1191,11 @@ namespace mongo {
            op->setMessage( "index: (3/3) btree-middle" );
            log(t.seconds() > 10 ? 0 : 1 ) << "\t done building bottom layer, going to commit" << endl;
            btBuilder.commit();
-            wassert( btBuilder.getn() == nkeys || dropDups );
+            if ( btBuilder.getn() != nkeys && ! dropDups ) {
+                warning() << "not all entries were added to the index, probably some keys were too large" << endl;
+            }
        }
-
+
        log(1) << "\t fastBuildIndex dupsToDrop:" << dupsToDrop.size() << endl;
 
        for( list<DiskLoc>::iterator i = dupsToDrop.begin(); i != dupsToDrop.end(); i++ )
@@ -1142,13 +1204,13 @@ namespace mongo {
        return n;
    }
 
-    class BackgroundIndexBuildJob : public BackgroundOperation {
+    class BackgroundIndexBuildJob : public BackgroundOperation {
 
        unsigned long long addExistingToIndex(const char *ns, NamespaceDetails *d, IndexDetails& idx, int idxNo) {
            bool dupsAllowed = !idx.unique();
            bool dropDups = idx.dropDups();
 
-            ProgressMeter& progress = cc().curop()->setMessage( "bg index build" , d->nrecords );
+            ProgressMeter& progress = cc().curop()->setMessage( "bg index build" , d->stats.nrecords );
 
            unsigned long long n = 0;
            auto_ptr<ClientCursor> cc;
@@ -1156,25 +1218,26 @@ namespace mongo {
                shared_ptr<Cursor> c = theDataFileMgr.findAll(ns);
                cc.reset( new ClientCursor(QueryOption_NoCursorTimeout, c, ns) );
            }
-            CursorId id = cc->cursorid;
+            CursorId id = cc->cursorid();
 
-            while ( cc->c->ok() ) {
-                BSONObj js = cc->c->current();
-                try {
-                    _indexRecord(d, idxNo, js, cc->c->currLoc(), dupsAllowed);
-                    cc->c->advance();
-                } catch( AssertionException& e ) {
+            while ( cc->ok() ) {
+                BSONObj js = cc->current();
+                try {
+                    _indexRecord(d, idxNo, js, cc->currLoc(), dupsAllowed);
+                    cc->advance();
+                }
+                catch( AssertionException& e ) {
                    if( e.interrupted() )
                        throw;
 
                    if ( dropDups ) {
-                        DiskLoc toDelete = cc->c->currLoc();
-                        bool ok = cc->c->advance();
+                        DiskLoc toDelete = cc->currLoc();
+                        bool ok = cc->advance();
                        cc->updateLocation();
                        theDataFileMgr.deleteRecord( ns, toDelete.rec(), toDelete, false, true );
                        if( ClientCursor::find(id, false) == 0 ) {
                            cc.release();
-                            if( !ok ) {
+                            if( !ok ) {
                                /* we were already at the end. normal. */
                            }
                            else {
@@ -1182,7 +1245,8 @@ namespace mongo {
                            }
                            break;
                        }
-                    } else {
+                    }
+                    else {
                        log() << "background addExistingToIndex exception " << e.what() << endl;
                        throw;
                    }
@@ -1200,7 +1264,7 @@ namespace mongo {
            return n;
        }
 
-        /* we do set a flag in the namespace for quick checking, but this is our authoritative info -
+        /* we do set a flag in the namespace for quick checking, but this is our authoritative info -
           that way on a crash/restart, we don't think we are still building one. */
        set<NamespaceDetails*> bgJobsInProgress;
@@ -1208,12 +1272,8 @@ namespace mongo {
            assertInWriteLock();
            uassert( 13130 , "can't start bg index b/c in recursive lock (db.eval?)" , dbMutex.getState() == 1 );
            bgJobsInProgress.insert(d);
-            d->backgroundIndexBuildInProgress = 1;
-            d->nIndexes--;
        }
        void done(const char *ns, NamespaceDetails *d) {
-            d->nIndexes++;
-            d->backgroundIndexBuildInProgress = 0;
            NamespaceDetailsTransient::get_w(ns).addedIndex(); // clear query optimizer cache
            assertInWriteLock();
        }
@@ -1221,16 +1281,16 @@ namespace mongo {
    public:
        BackgroundIndexBuildJob(const char *ns) : BackgroundOperation(ns) { }
 
-        unsigned long long go(string ns, NamespaceDetails *d, IndexDetails& idx, int idxNo) {
+        unsigned long long go(string ns, NamespaceDetails *d, IndexDetails& idx, int idxNo) {
            unsigned long long n = 0;
 
            prep(ns.c_str(), d);
            assert( idxNo == d->nIndexes );
-            try {
+            try {
                idx.head = BtreeBucket::addBucket(idx);
                n = addExistingToIndex(ns.c_str(), d, idx, idxNo);
            }
-            catch(...) {
+            catch(...) {
                if( cc().database() && nsdetails(ns.c_str()) == d ) {
                    assert( idxNo == d->nIndexes );
                    done(ns.c_str(), d);
@@ -1246,25 +1306,51 @@ namespace mongo {
        }
    };
 
+    /**
+     * For the lifetime of this object, an index build is indicated on the specified
+     * namespace and the newest index is marked as absent.  This simplifies
+     * the cleanup required on recovery.
+     */
+    class RecoverableIndexState {
+    public:
+        RecoverableIndexState( NamespaceDetails *d ) : _d( d ) {
+            indexBuildInProgress() = 1;
+            nIndexes()--;
+        }
+        ~RecoverableIndexState() {
+            DESTRUCTOR_GUARD (
+                nIndexes()++;
+                indexBuildInProgress() = 0;
+            )
+        }
+    private:
+        int &nIndexes() { return getDur().writingInt( _d->nIndexes ); }
+        int &indexBuildInProgress() { return getDur().writingInt( _d->indexBuildInProgress ); }
+        NamespaceDetails *_d;
+    };
+
    // throws DBException
-    static void buildAnIndex(string ns, NamespaceDetails *d, IndexDetails& idx, int idxNo, bool background) {
+    static void buildAnIndex(string ns, NamespaceDetails *d, IndexDetails& idx, int idxNo, bool background) {
        tlog() << "building new index on " << idx.keyPattern() << " for " << ns << ( background ? " background" : "" ) << endl;
        Timer t;
-        unsigned long long n;
+        unsigned long long n;
 
        if( background ) {
            log(2) << "buildAnIndex: background=true\n";
        }
 
        assert( !BackgroundOperation::inProgForNs(ns.c_str()) ); // should have been checked earlier, better not be...
+        assert( d->indexBuildInProgress == 0 );
+        assertInWriteLock();
+        RecoverableIndexState recoverable( d );
 
        if( inDBRepair || !background ) {
-            n = fastBuildIndex(ns.c_str(), d, idx, idxNo);
-            assert( !idx.head.isNull() );
-        }
-        else {
+            n = fastBuildIndex(ns.c_str(), d, idx, idxNo);
+            assert( !idx.head.isNull() );
+        }
+        else {
            BackgroundIndexBuildJob j(ns.c_str());
            n = j.go(ns, d, idx, idxNo);
-        }
+        }
        tlog() << "done for " << n << " records " << t.millis() / 1000.0 << "secs" << endl;
    }
@@ -1272,20 +1358,20 @@ namespace mongo {
    static void indexRecord(NamespaceDetails *d, BSONObj obj, DiskLoc loc) {
        int n = d->nIndexesBeingBuilt();
        for ( int i = 0; i < n; i++ ) {
-            try {
+            try {
                bool unique = d->idx(i).unique();
                _indexRecord(d, i, obj, loc, /*dupsAllowed*/!unique);
            }
-            catch( DBException& ) {
+            catch( DBException& ) {
                /* try to roll back previously added index entries
                   note <= i (not < i) is important here as the index we were just attempted
                   may be multikey and require some cleanup.
                */
-                for( int j = 0; j <= i; j++ ) {
+                for( int j = 0; j <= i; j++ ) {
                    try {
                        _unindexRecord(d->idx(j), obj, loc, false);
                    }
-                    catch(...) {
+                    catch(...) {
                        log(3) << "unindex fails on rollback after unique failure\n";
                    }
                }
@@ -1301,7 +1387,7 @@ namespace mongo {
        if ( d == 0 || (d->flags & NamespaceDetails::Flag_HaveIdIndex) )
            return;
 
-        d->flags |= NamespaceDetails::Flag_HaveIdIndex;
+        *getDur().writing(&d->flags) |= NamespaceDetails::Flag_HaveIdIndex;
 
        {
            NamespaceDetails::IndexIterator i = d->ii();
@@ -1324,7 +1410,7 @@ namespace mongo {
    }
 
 #pragma pack(1)
-    struct IDToInsert_ {
+    struct IDToInsert_ {
        char type;
        char _id[4];
        OID oid;
@@ -1338,13 +1424,13 @@ namespace mongo {
        IDToInsert() : BSONElement( ( char * )( &idToInsert_ ) ) {}
    } idToInsert;
 #pragma pack()
-
+
    void DataFileMgr::insertAndLog( const char *ns, const BSONObj &o, bool god ) {
        BSONObj tmp = o;
        insertWithObjMod( ns, tmp, god );
        logOp( "i", ns, tmp );
    }
-
+
    DiskLoc DataFileMgr::insertWithObjMod(const char *ns, BSONObj &o, bool god) {
        DiskLoc loc = insert( ns, o.objdata(), o.objsize(), god );
        if ( !loc.isNull() )
@@ -1356,12 +1442,12 @@ namespace mongo {
        insert( ns, o.objdata(), o.objsize(), god );
    }
 
-    bool prepareToBuildIndex(const BSONObj& io, bool god, string& sourceNS, NamespaceDetails *&sourceCollection);
+    bool prepareToBuildIndex(const BSONObj& io, bool god, string& sourceNS, NamespaceDetails *&sourceCollection, BSONObj& fixedIndexObject );
 
    // We are now doing two btree scans for all unique indexes (one here, and one when we've
    // written the record to the collection.  This could be made more efficient inserting
    // dummy data here, keeping pointers to the btree nodes holding the dummy data and then
-    // updating the dummy data with the DiskLoc of the real record.
+    // updating the dummy data with the DiskLoc of the real record.
    void checkNoIndexConflicts( NamespaceDetails *d, const BSONObj &obj ) {
        for ( int idxNo = 0; idxNo < d->nIndexes; idxNo++ ) {
            if( d->idx(idxNo).unique() ) {
                IndexDetails& idx = d->idx(idxNo);
                BSONObjSetDefaultOrder keys;
                idx.getKeysFromObject(obj, keys);
                BSONObj order = idx.keyPattern();
                for ( BSONObjSetDefaultOrder::iterator i=keys.begin(); i != keys.end(); i++ ) {
                    uassert( 12582, "duplicate key insert for unique index of capped collection",
-                             idx.head.btree()->findSingle(idx, idx.head, *i ).isNull() );
+                             idx.head.btree()->findSingle(idx, idx.head, *i ).isNull() );
                }
            }
-        }
+        }
    }
 
-    /* note: if god==true, you may pass in obuf of NULL and then populate the returned DiskLoc
+    /* note: if god==true, you may pass in obuf of NULL and then populate the returned DiskLoc
       after the call -- that will prevent a double buffer copy in some cases (btree.cpp).
    */
    DiskLoc DataFileMgr::insert(const char *ns, const void *obuf, int len, bool god, const BSONElement &writeId, bool mayAddIndex) {
        bool wouldAddIndex = false;
-        massert( 10093 , "cannot insert into reserved $ collection", god || nsDollarCheck( ns ) );
-        uassert( 10094 , "invalid ns", strchr( ns , '.' ) > 0 );
+        massert( 10093 , "cannot insert into reserved $ collection", god || isANormalNSName( ns ) );
+        uassert( 10094 , str::stream() << "invalid ns: " << ns , isValidNS( ns ) );
        const char *sys = strstr(ns, "system.");
        if ( sys ) {
            uassert( 10095 , "attempt to insert in reserved database name 'system'", sys != ns);
@@ -1411,7 +1497,7 @@ namespace mongo {
           also if this is an addIndex, those checks should happen before this!
        */
        // This may create first file in the database.
-        cc().database()->allocExtent(ns, initialExtentSize(len), false);
+        cc().database()->allocExtent(ns, Extent::initialSize(len), false);
        d = nsdetails(ns);
        if ( !god )
            ensureIdIndexForNewNs(ns);
@@ -1421,17 +1507,24 @@ namespace mongo {
        NamespaceDetails *tableToIndex = 0;
 
        string tabletoidxns;
+        BSONObj fixedIndexObject;
        if ( addIndex ) {
            assert( obuf );
            BSONObj io((const char *) obuf);
-            if( !prepareToBuildIndex(io, god, tabletoidxns, tableToIndex) )
+            if( !prepareToBuildIndex(io, god, tabletoidxns, tableToIndex, fixedIndexObject ) )
                return DiskLoc();
+
+            if ( ! fixedIndexObject.isEmpty() ) {
+                obuf = fixedIndexObject.objdata();
+                len = fixedIndexObject.objsize();
+            }
+
        }
 
        const BSONElement *newId = &writeId;
        int addID = 0;
        if( !god ) {
-            /* Check if we have an _id field. If we don't, we'll add it.
+            /* Check if we have an _id field. If we don't, we'll add it.
               Note that btree buckets which we insert aren't BSONObj's, but in that case god==true.
            */
            BSONObj io((const char *) obuf);
@@ -1446,7 +1539,7 @@ namespace mongo {
            }
            len += newId->size();
        }
-
+
        BSONElementManipulator::lookForTimestamps( io );
    }
@@ -1456,28 +1549,28 @@ namespace mongo {
        if ( lenWHdr == 0 ) {
            // old datafiles, backward compatible here.
            assert( d->paddingFactor == 0 );
-            d->paddingFactor = 1.0;
+            *getDur().writing(&d->paddingFactor) = 1.0;
            lenWHdr = len + Record::HeaderSize;
        }
-
+
        // If the collection is capped, check if the new object will violate a unique index
        // constraint before allocating space.
        if ( d->nIndexes && d->capped && !god ) {
            checkNoIndexConflicts( d, BSONObj( reinterpret_cast<const char *>( obuf ) ) );
        }
-
+
        DiskLoc loc = d->alloc(ns, lenWHdr, extentLoc);
        if ( loc.isNull() ) {
            // out of space
            if ( d->capped == 0 ) { // size capped doesn't grow
                log(1) << "allocating new extent for " << ns << " padding:" << d->paddingFactor << " lenWHdr: " << lenWHdr << endl;
-                cc().database()->allocExtent(ns, followupExtentSize(lenWHdr, d->lastExtentSize), false);
+                cc().database()->allocExtent(ns, Extent::followupSize(lenWHdr, d->lastExtentSize), false);
                loc = d->alloc(ns, lenWHdr, extentLoc);
-                if ( loc.isNull() ){
+                if ( loc.isNull() ) {
                    log() << "WARNING: alloc() failed after allocating new extent. lenWHdr: " << lenWHdr << " last extent size:" << d->lastExtentSize << "; trying again\n";
-                    for ( int zzz=0; zzz<10 && lenWHdr > d->lastExtentSize; zzz++ ){
+                    for ( int zzz=0; zzz<10 && lenWHdr > d->lastExtentSize; zzz++ ) {
                        log() << "try #" << zzz << endl;
-                        cc().database()->allocExtent(ns, followupExtentSize(len, d->lastExtentSize), false);
+                        cc().database()->allocExtent(ns, Extent::followupSize(len, d->lastExtentSize), false);
                        loc = d->alloc(ns, lenWHdr, extentLoc);
                        if ( ! loc.isNull() )
                            break;
@@ -1492,45 +1585,55 @@ namespace mongo {
        }
 
        Record *r = loc.rec();
-        assert( r->lengthWithHeaders >= lenWHdr );
-        if( addID ) {
-            /* a little effort was made here to avoid a double copy when we add an ID */
-            ((int&)*r->data) = *((int*) obuf) + newId->size();
-            memcpy(r->data+4, newId->rawdata(), newId->size());
-            memcpy(r->data+4+newId->size(), ((char *)obuf)+4, addID-4);
-        }
-        else {
-            if( obuf )
-                memcpy(r->data, obuf, len);
-        }
-        Extent *e = r->myExtent(loc);
-        if ( e->lastRecord.isNull() ) {
-            e->firstRecord = e->lastRecord = loc;
-            r->prevOfs = r->nextOfs = DiskLoc::NullOfs;
+        {
+            assert( r->lengthWithHeaders >= lenWHdr );
+            r = (Record*) getDur().writingPtr(r, lenWHdr);
+            if( addID ) {
+                /* a little effort was made here to avoid a double copy when we add an ID */
+                ((int&)*r->data) = *((int*) obuf) + newId->size();
+                memcpy(r->data+4, newId->rawdata(), newId->size());
+                memcpy(r->data+4+newId->size(), ((char *)obuf)+4, addID-4);
+            }
+            else {
+                if( obuf )
+                    memcpy(r->data, obuf, len);
+            }
        }
-        else {
-            Record *oldlast = e->lastRecord.rec();
-            r->prevOfs = e->lastRecord.getOfs();
-            r->nextOfs = DiskLoc::NullOfs;
-            oldlast->nextOfs = loc.getOfs();
-            e->lastRecord = loc;
+        {
+            Extent *e = r->myExtent(loc);
+            if ( e->lastRecord.isNull() ) {
+                Extent::FL *fl = getDur().writing(e->fl());
+                fl->firstRecord = fl->lastRecord = loc;
+                r->prevOfs = r->nextOfs = DiskLoc::NullOfs;
+            }
+            else {
+                Record *oldlast = e->lastRecord.rec();
+                r->prevOfs = e->lastRecord.getOfs();
+                r->nextOfs = DiskLoc::NullOfs;
+                getDur().writingInt(oldlast->nextOfs) = loc.getOfs();
+                getDur().writingDiskLoc(e->lastRecord) = loc;
+            }
        }
 
-        d->nrecords++;
-        d->datasize += r->netLength();
+        /* durability todo : this could be a bit annoying / slow to record constantly */
+        {
+            NamespaceDetails::Stats *s = getDur().writing(&d->stats);
+            s->datasize += r->netLength();
+            s->nrecords++;
+        }
 
        // we don't bother clearing those stats for the god tables - also god is true when adidng a btree bucket
        if ( !god )
            NamespaceDetailsTransient::get_w( ns ).notifyOfWriteOp();
-
+
        if ( tableToIndex ) {
            uassert( 13143 , "can't create index on system.indexes" , tabletoidxns.find( ".system.indexes" ) == string::npos );
 
            BSONObj info = loc.obj();
            bool background = info["background"].trueValue();
-            if( background && cc().isSyncThread() ) {
-                /* don't do background indexing on slaves.  there are nuances.  this could be added later
+            if( background && cc().isSyncThread() ) {
+                /* don't do background indexing on slaves.  there are nuances.  this could be added later
                   but requires more code.
                */
                log() << "info: indexing in foreground on this replica; was a background index build on the primary" << endl;
@@ -1539,10 +1642,11 @@ namespace mongo {
            int idxNo = tableToIndex->nIndexes;
            IndexDetails& idx = tableToIndex->addIndex(tabletoidxns.c_str(), !background); // clear transient info caches so they refresh; increments nIndexes
-            idx.info = loc;
+            getDur().writingDiskLoc(idx.info) = loc;
            try {
                buildAnIndex(tabletoidxns, tableToIndex, idx, idxNo, background);
-            } catch( DBException& e ) {
+            }
+            catch( DBException& e ) {
                // save our error msg string as an exception or dropIndexes will overwrite our message
                LastError *le = lastError.get();
                int savecode = 0;
@@ -1564,7 +1668,7 @@ namespace mongo {
                if( !ok ) {
                    log() << "failed to drop index after a unique key error building it: " << errmsg << ' ' << tabletoidxns << ' ' << name << endl;
                }
-
+
                assert( le && !saveerrmsg.empty() );
                raiseError(savecode,saveerrmsg.c_str());
                throw;
@@ -1573,20 +1677,20 @@ namespace mongo {
 
        /* add this record to our indexes */
        if ( d->nIndexes ) {
-            try {
+            try {
                BSONObj obj(r->data);
                indexRecord(d, obj, loc);
-            }
-            catch( AssertionException& e ) {
+            }
+            catch( AssertionException& e ) {
                // should be a dup key error on _id index
                if( tableToIndex || d->capped ) {
                    massert( 12583, "unexpected index insertion failure on capped collection", !d->capped );
                    string s = e.toString();
                    s += " : on addIndex/capped - collection and its index will not match";
                    uassert_nothrow(s.c_str());
-                    log() << s << '\n';
+                    error() << s << endl;
                }
-                else {
+                else {
                    // normal case -- we can roll back
                    _deleteRecord(d, ns, r, loc);
                    throw;
@@ -1594,7 +1698,7 @@ namespace mongo {
            }
        }
 
-        //  out() << "   inserted at loc:" << hex << loc.getOfs() << " lenwhdr:" << hex << lenWHdr << dec << ' ' << ns << endl;
+        //  out() << "   inserted at loc:" << hex << loc.getOfs() << " lenwhdr:" << hex << lenWHdr << dec << ' ' << ns << endl;
        return loc;
    }
@@ -1619,18 +1723,27 @@ namespace mongo {
        Extent *e = r->myExtent(loc);
        if ( e->lastRecord.isNull() ) {
-            e->firstRecord = e->lastRecord = loc;
-            r->prevOfs = r->nextOfs = DiskLoc::NullOfs;
+            Extent::FL *fl = getDur().writing( e->fl() );
+            fl->firstRecord = fl->lastRecord = loc;
+
+            Record::NP *np = getDur().writing(r->np());
+            np->nextOfs = np->prevOfs = DiskLoc::NullOfs;
        }
        else {
            Record *oldlast = e->lastRecord.rec();
-            r->prevOfs = e->lastRecord.getOfs();
-            r->nextOfs = DiskLoc::NullOfs;
-            oldlast->nextOfs = loc.getOfs();
-            e->lastRecord = loc;
+            Record::NP *np = getDur().writing(r->np());
+            np->prevOfs = e->lastRecord.getOfs();
+            np->nextOfs = DiskLoc::NullOfs;
+            getDur().writingInt( oldlast->nextOfs ) = loc.getOfs();
+            e->lastRecord.writing() = loc;
        }
 
-        d->nrecords++;
+        /* todo: don't update for oplog? seems wasteful. */
+        {
+            NamespaceDetails::Stats *s = getDur().writing(&d->stats);
+            s->datasize += r->netLength();
+            s->nrecords++;
+        }
 
        return r;
    }
@@ -1641,7 +1754,7 @@
 
 namespace mongo {
 
-    void dropAllDatabasesExceptLocal() {
+    void dropAllDatabasesExceptLocal() {
        writelock lk("");
        vector<string> n;
@@ -1658,14 +1771,17 @@ namespace mongo {
    void dropDatabase(string db) {
        log(1) << "dropDatabase " << db << endl;
-        assert( cc().database() );
-        assert( cc().database()->name == db );
+        Database *d = cc().database();
+        assert( d );
+        assert( d->name == db );
 
-        BackgroundOperation::assertNoBgOpInProgForDb(db.c_str());
+        BackgroundOperation::assertNoBgOpInProgForDb(d->name.c_str());
 
-        Client::invalidateDB( db );
+        getDur().syncDataAndTruncateJournal();
+
+        Database::closeDatabase( d->name.c_str(), d->path );
+        d = 0; // d is now deleted
 
-        closeDatabase( db.c_str() );
        _deleteDataFiles( db.c_str() );
    }
@@ -1674,13 +1790,14 @@ namespace mongo {
    void boostRenameWrapper( const Path &from, const Path &to ) {
        try {
            boost::filesystem::rename( from, to );
-        } catch ( const boost::filesystem::filesystem_error & ) {
+        }
+        catch ( const boost::filesystem::filesystem_error & ) {
            // boost rename doesn't work across partitions
            boost::filesystem::copy_file( from, to);
            boost::filesystem::remove( from );
        }
    }
-
+
    // back up original database files to 'temp' dir
    void _renameForBackup( const char *database, const Path &reservedPath ) {
        Path newPath( reservedPath );
@@ -1738,7 +1855,8 @@ namespace mongo {
            ss << prefix << "_repairDatabase_" << i++;
            reservedPath = repairPath / ss.str();
            BOOST_CHECK_EXCEPTION( exists = boost::filesystem::exists( reservedPath ) );
-        } while ( exists );
+        }
+        while ( exists );
        return reservedPath;
    }
@@ -1790,12 +1908,15 @@ namespace mongo {
        stringstream ss;
        ss << "localhost:" << cmdLine.port;
        string localhost = ss.str();
-
+
        problem() << "repairDatabase " << dbName << endl;
        assert( cc().database()->name == dbName );
+        assert( cc().database()->path == dbpath );
 
        BackgroundOperation::assertNoBgOpInProgForDb(dbName);
 
+        getDur().syncDataAndTruncateJournal(); // Must be done before and after repair
+
        boost::intmax_t totalSize = dbSize( dbName );
        boost::intmax_t freeSize = freeSpace( repairpath );
        if ( freeSize > -1 && freeSize < totalSize ) {
@@ -1812,30 +1933,37 @@ namespace mongo {
                                          "backup" : "$tmp" );
        BOOST_CHECK_EXCEPTION( boost::filesystem::create_directory( reservedPath ) );
        string reservedPathString = reservedPath.native_directory_string();
-
+
        bool res;
-        { // clone to temp location, which effectively does repair
+        {
+            // clone to temp location, which effectively does repair
            Client::Context ctx( dbName, reservedPathString );
            assert( ctx.justCreated() );
-
-            res = cloneFrom(localhost.c_str(), errmsg, dbName,
-                            /*logForReplication=*/false, /*slaveok*/false, /*replauth*/false, /*snapshot*/false);
-            closeDatabase( dbName, reservedPathString.c_str() );
+
+            res = cloneFrom(localhost.c_str(), errmsg, dbName,
+                            /*logForReplication=*/false, /*slaveok*/false, /*replauth*/false, /*snapshot*/false);
+            Database::closeDatabase( dbName, reservedPathString.c_str() );
        }
 
        if ( !res ) {
            problem() << "clone failed for " << dbName << " with error: " << errmsg << endl;
            if ( !preserveClonedFilesOnFailure )
                BOOST_CHECK_EXCEPTION( boost::filesystem::remove_all( reservedPath ) );
+
+            getDur().syncDataAndTruncateJournal(); // Must be done before and after repair
+
            return false;
        }
 
+        MongoFile::flushAll(true);
+
        Client::Context ctx( dbName );
-        closeDatabase( dbName );
+        Database::closeDatabase( dbName, dbpath );
 
        if ( backupOriginalFiles ) {
            _renameForBackup( dbName, reservedPath );
-        } else {
+        }
+        else {
            _deleteDataFiles( dbName );
            BOOST_CHECK_EXCEPTION( boost::filesystem::create_directory( Path( dbpath ) / dbName ) );
        }
@@ -1845,12 +1973,14 @@ namespace mongo {
        if ( !backupOriginalFiles )
            BOOST_CHECK_EXCEPTION( boost::filesystem::remove_all( reservedPath ) );
 
+        getDur().syncDataAndTruncateJournal(); // Must be done before and after repair
+
        return true;
    }
 
    void _applyOpToDataFiles( const char *database, FileOp &fo, bool afterAllocator, const string& path ) {
        if ( afterAllocator )
-            theFileAllocator().waitUntilFinished();
+            FileAllocator::get()->waitUntilFinished();
        string c = database;
        c += '.';
        boost::filesystem::path p(path);
@@ -1871,8 +2001,8 @@ namespace mongo {
                q = p / ss.str();
                BOOST_CHECK_EXCEPTION( ok = fo.apply(q) );
                if ( ok ) {
-                    if ( extra != 10 ){
-                        log(1) << fo.op() << " file " << q.string() << '\n';
+                    if ( extra != 10 ) {
+                        log(1) << fo.op() << " file " << q.string() << endl;
                        log() << "  _applyOpToDataFiles() warning: extra == " << extra << endl;
                    }
                }
@@ -1883,19 +2013,20 @@ namespace mongo {
    }
 
    NamespaceDetails* nsdetails_notinline(const char *ns) { return nsdetails(ns); }
-
-    bool DatabaseHolder::closeAll( const string& path , BSONObjBuilder& result , bool force ){
+
+    bool DatabaseHolder::closeAll( const string& path , BSONObjBuilder& result , bool force ) {
        log() << "DatabaseHolder::closeAll path:" << path << endl;
        dbMutex.assertWriteLocked();
-
+
        map<string,Database*>& m = _paths[path];
        _size -= m.size();
-
+
        set< string > dbs;
        for ( map<string,Database*>::iterator i = m.begin(); i != m.end(); i++ ) {
+            wassert( i->second->path == path );
            dbs.insert( i->first );
        }
-
+
        currentClient.get()->getContext()->clear();
 
        BSONObjBuilder bb( result.subarrayStart( "dbs" ) );
@@ -1910,7 +2041,7 @@ namespace mongo {
                nNotClosed++;
            }
            else {
-                closeDatabase( name.c_str() , path );
+                Database::closeDatabase( name.c_str() , path );
                bb.append( bb.numStr( n++ ) , name );
            }
        }
@@ -1923,6 +2054,17 @@ namespace mongo {
        return true;
    }
-
+
+    bool isValidNS( const StringData& ns ) {
+        // TODO: should check for invalid characters
+
+        const char * x = strchr( ns.data() , '.' );
+        if ( ! x )
+            return false;
+
+        x++;
+        return *x > 0;
+    }
+
} // namespace mongo
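
A note on the pattern that dominates this diff: direct stores into memory-mapped file data (d->max = mx, e->xprev = ..., memcpy into records) are replaced by stores routed through getDur().writing() / writingInt() / writingDiskLoc() / writingPtr(), which declare a write intent so the durability layer can journal the modified range before (or as) it changes. The following is a minimal sketch of that idea; DurLayer, Intent, and the journaling strategy are illustrative assumptions, and only the call shapes are taken from the diff itself:

    // Sketch of a write-intent layer like the one this diff introduces.
    // DurLayer and Intent are hypothetical stand-ins, not mongo's dur API.
    #include <cstring>
    #include <vector>

    class DurLayer {
    public:
        // Declare intent to modify [p, p+len), then hand back a writable pointer.
        // A real implementation would log the range so recovery can redo it.
        void* writingPtr(void* p, unsigned len) {
            _intents.push_back(Intent(p, len)); // remember the range for the journal
            return p;                           // writable view == shared view in this sketch
        }
        template <typename T>
        T* writing(T* p) { return static_cast<T*>(writingPtr(p, sizeof(T))); }
        int& writingInt(int& x) { return *writing(&x); }
    private:
        struct Intent {
            void* addr; unsigned len;
            Intent(void* a, unsigned l) : addr(a), len(l) {}
        };
        std::vector<Intent> _intents;
    };

    DurLayer& getDur() { static DurLayer d; return d; }

    int main() {
        int maxDocs = 0;
        getDur().writingInt(maxDocs) = 42;  // same shape as getDur().writingInt( d->max ) = mx;
        char record[64];
        memset(getDur().writingPtr(record, sizeof(record)), 0, sizeof(record));
        return 0;
    }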
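The extent-sizing change is easier to see outside the diff: initialSize() scales the record length by 16x (64x for small records) and rounds down to a 256-byte boundary, while followupSize() grows from the last extent (4x, or 1.2x past 4 MB), treating sz < lastExtentLen as a signed-overflow signal and clamping to the maximum. A standalone sketch of that arithmetic; kMaxExtent stands in for Extent::maxSize(), whose exact value is an assumption here:

    #include <cassert>

    const int kMaxExtent = 0x7ff00000;  // assumed stand-in for Extent::maxSize()

    int initialSize(int len) {
        long long sz = len * 16;
        if ( len < 1000 ) sz = len * 64;       // small records get a bigger factor
        if ( sz > 1000000000 ) sz = 1000000000;
        int z = ((int)sz) & 0xffffff00;        // round down to a 256-byte boundary
        assert( z > len );
        return z;
    }

    int followupSize(int len, int lastExtentLen) {
        int x = initialSize(len);
        int y = (int) (lastExtentLen < 4000000 ? lastExtentLen * 4.0 : lastExtentLen * 1.2);
        int sz = y > x ? y : x;
        if ( sz < lastExtentLen )  // int overflow in the growth computation
            sz = kMaxExtent;
        else if ( sz > kMaxExtent )
            sz = kMaxExtent;
        sz &= 0xffffff00;
        assert( sz > len );
        return sz;
    }

    int main() {
        int first = initialSize(500);          // 500 * 64, rounded down: 31744
        int next  = followupSize(500, first);  // grows roughly 4x from the last extent
        assert( next >= first );
        return 0;
    }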
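The RecoverableIndexState class added in this diff is a plain RAII guard: the constructor publishes the in-progress flag and hides the newest index slot (nIndexes--), and the destructor reverses both even when the build throws. A stand-alone restatement of that shape, with a hypothetical Details struct in place of NamespaceDetails; the real class additionally routes both fields through getDur().writingInt() and a DESTRUCTOR_GUARD macro:

    #include <cassert>

    struct Details {               // hypothetical stand-in for NamespaceDetails
        int nIndexes;
        int indexBuildInProgress;
    };

    // RAII guard: while alive, the newest index slot is hidden from readers
    // and the in-progress flag is visible; both are restored on any exit path.
    class IndexBuildGuard {
    public:
        explicit IndexBuildGuard(Details* d) : _d(d) {
            _d->indexBuildInProgress = 1;
            _d->nIndexes--;
        }
        ~IndexBuildGuard() {
            _d->nIndexes++;
            _d->indexBuildInProgress = 0;
        }
    private:
        Details* _d;
    };

    void buildIndex(Details& d, bool fail) {
        IndexBuildGuard guard(&d);
        if (fail) throw 1;         // guard still restores the counters
        // ... index build work would go here ...
    }

    int main() {
        Details d = { 3, 0 };
        try { buildIndex(d, true); } catch (int) {}
        assert( d.nIndexes == 3 && d.indexBuildInProgress == 0 );
        return 0;
    }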
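Finally, the new isValidNS() helper that backs the reworded uassert 10094 only checks that the namespace contains a dot with at least one character after it; its own TODO notes that character-level validation is still missing. The same logic restated over a plain C string instead of mongo's StringData:

    #include <cassert>
    #include <cstring>

    // Mirrors the isValidNS() added at the end of the diff.
    bool isValidNS(const char* ns) {
        const char* x = strchr(ns, '.');
        if (!x)
            return false;
        x++;
        return *x > 0;   // something must follow the dot
    }

    int main() {
        assert( isValidNS("test.foo") );
        assert( !isValidNS("test") );    // no dot
        assert( !isValidNS("test.") );   // nothing after the dot
        return 0;
    }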