Diffstat (limited to 'db/pdfile.cpp')
-rw-r--r--  db/pdfile.cpp | 800
1 file changed, 471 insertions(+), 329 deletions(-)
diff --git a/db/pdfile.cpp b/db/pdfile.cpp
index 216f21a..20a7423 100644
--- a/db/pdfile.cpp
+++ b/db/pdfile.cpp
@@ -20,7 +20,6 @@
todo:
_ table scans must be sequential, not next/prev pointers
_ coalesce deleted
-
_ disallow system* manipulations from the database.
*/
@@ -37,21 +36,21 @@ _ disallow system* manipulations from the database.
#include "query.h"
#include "repl.h"
#include "dbhelpers.h"
-#include "namespace.h"
+#include "namespace-inl.h"
#include "queryutil.h"
#include "extsort.h"
-#include "curop.h"
+#include "curop-inl.h"
#include "background.h"
namespace mongo {
bool inDBRepair = false;
struct doingRepair {
- doingRepair(){
+ doingRepair() {
assert( ! inDBRepair );
inDBRepair = true;
}
- ~doingRepair(){
+ ~doingRepair() {
inDBRepair = false;
}
};
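
doingRepair is a scope guard: the constructor asserts repair is not already running and sets the global inDBRepair flag, and the destructor clears it on every exit path, exceptions included. A minimal usage sketch (hypothetical caller; the real user is repairDatabase further down):

    // Hypothetical caller of the doingRepair guard above.
    void repairScopeSketch() {
        doingRepair guard;   // sets inDBRepair, asserts no nested repair
        // ... repair work; e.g. buildAnIndex() consults inDBRepair to
        // force the foreground fastBuildIndex path ...
    }                        // destructor resets inDBRepair to false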
@@ -64,42 +63,42 @@ namespace mongo {
return dbsInProg[db] != 0;
}
- bool BackgroundOperation::inProgForNs(const char *ns) {
+ bool BackgroundOperation::inProgForNs(const char *ns) {
assertInWriteLock();
return nsInProg.count(ns) != 0;
}
- void BackgroundOperation::assertNoBgOpInProgForDb(const char *db) {
+ void BackgroundOperation::assertNoBgOpInProgForDb(const char *db) {
uassert(12586, "cannot perform operation: a background operation is currently running for this database",
- !inProgForDb(db));
+ !inProgForDb(db));
}
- void BackgroundOperation::assertNoBgOpInProgForNs(const char *ns) {
+ void BackgroundOperation::assertNoBgOpInProgForNs(const char *ns) {
uassert(12587, "cannot perform operation: a background operation is currently running for this collection",
- !inProgForNs(ns));
- }
+ !inProgForNs(ns));
+ }
- BackgroundOperation::BackgroundOperation(const char *ns) : _ns(ns) {
+ BackgroundOperation::BackgroundOperation(const char *ns) : _ns(ns) {
assertInWriteLock();
dbsInProg[_ns.db]++;
assert( nsInProg.count(_ns.ns()) == 0 );
nsInProg.insert(_ns.ns());
}
- BackgroundOperation::~BackgroundOperation() {
+ BackgroundOperation::~BackgroundOperation() {
assertInWriteLock();
dbsInProg[_ns.db]--;
nsInProg.erase(_ns.ns());
}
void BackgroundOperation::dump(stringstream& ss) {
- if( nsInProg.size() ) {
+ if( nsInProg.size() ) {
ss << "\n<b>Background Jobs in Progress</b>\n";
for( set<string>::iterator i = nsInProg.begin(); i != nsInProg.end(); i++ )
ss << " " << *i << '\n';
}
- for( map<string,unsigned>::iterator i = dbsInProg.begin(); i != dbsInProg.end(); i++ ) {
- if( i->second )
+ for( map<string,unsigned>::iterator i = dbsInProg.begin(); i != dbsInProg.end(); i++ ) {
+ if( i->second )
ss << "database " << i->first << ": " << i->second << '\n';
}
}
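
BackgroundOperation keeps two bookkeeping structures under the write lock: a per-database counter (dbsInProg) and a set of active namespaces (nsInProg), balanced by the constructor/destructor pair above. A sketch of how a job and a conflicting DDL operation interact, assuming the declarations in background.h:

    // Sketch: a background job registers itself so conflicting DDL
    // (drop, reindex) refuses to run concurrently.
    void backgroundJobSketch( const char *ns ) {
        assertInWriteLock();
        BackgroundOperation op( ns );   // registers ns, bumps its db's count
        // ... do work in chunks, yielding the lock between chunks ...
        // meanwhile dropCollection() calls assertNoBgOpInProgForNs(ns),
        // which uasserts(12587) for as long as 'op' is alive.
    }                                   // destructor deregisters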
@@ -114,24 +113,23 @@ namespace mongo {
DataFileMgr theDataFileMgr;
DatabaseHolder dbHolder;
int MAGIC = 0x1000;
-// int curOp = -2;
extern int otherTraceLevel;
void addNewNamespaceToCatalog(const char *ns, const BSONObj *options = 0);
void ensureIdIndexForNewNs(const char *ns) {
if ( ( strstr( ns, ".system." ) == 0 || legalClientSystemNS( ns , false ) ) &&
- strstr( ns, ".$freelist" ) == 0 ){
+ strstr( ns, ".$freelist" ) == 0 ) {
log( 1 ) << "adding _id index for collection " << ns << endl;
ensureHaveIdIndex( ns );
- }
+ }
}
string getDbContext() {
stringstream ss;
Client * c = currentClient.get();
- if ( c ){
+ if ( c ) {
Client::Context * cx = c->getContext();
- if ( cx ){
+ if ( cx ) {
Database *database = cx->db();
if ( database ) {
ss << database->name << ' ';
@@ -142,20 +140,44 @@ namespace mongo {
return ss.str();
}
- BSONObj::BSONObj(const Record *r) {
- init(r->data, false);
- }
-
/*---------------------------------------------------------------------*/
- int initialExtentSize(int len) {
+ // inheritable class to implement an operation that may be applied to all
+ // files in a database using _applyOpToDataFiles()
+ class FileOp {
+ public:
+ virtual ~FileOp() {}
+ // Return true if file exists and operation successful
+ virtual bool apply( const boost::filesystem::path &p ) = 0;
+ virtual const char * op() const = 0;
+ };
+
+ void _applyOpToDataFiles( const char *database, FileOp &fo, bool afterAllocator = false, const string& path = dbpath );
+
+ void _deleteDataFiles(const char *database) {
+ if ( directoryperdb ) {
+ FileAllocator::get()->waitUntilFinished();
+ BOOST_CHECK_EXCEPTION( boost::filesystem::remove_all( boost::filesystem::path( dbpath ) / database ) );
+ return;
+ }
+ class : public FileOp {
+ virtual bool apply( const boost::filesystem::path &p ) {
+ return boost::filesystem::remove( p );
+ }
+ virtual const char * op() const {
+ return "remove";
+ }
+ } deleter;
+ _applyOpToDataFiles( database, deleter, true );
+ }
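
_applyOpToDataFiles visits a database's files in sequence (database.0, database.1, ...) and applies a FileOp to each until apply() reports a missing file; the anonymous local class above is the idiom for one-off operations. A hypothetical second FileOp in the same style, totaling file sizes instead of removing them:

    // Hypothetical FileOp in the style of 'deleter' above: sum the sizes
    // of a database's data files rather than deleting them.
    class SizeSummer : public FileOp {
    public:
        SizeSummer() : total( 0 ) {}
        virtual bool apply( const boost::filesystem::path &p ) {
            if ( !boost::filesystem::exists( p ) )
                return false;   // per the contract: false if the file is absent
            total += boost::filesystem::file_size( p );
            return true;
        }
        virtual const char * op() const { return "size"; }
        boost::intmax_t total;
    };
    // SizeSummer s; _applyOpToDataFiles( "test", s ); then read s.total.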
+
+ int Extent::initialSize(int len) {
long long sz = len * 16;
if ( len < 1000 ) sz = len * 64;
if ( sz > 1000000000 )
sz = 1000000000;
int z = ((int)sz) & 0xffffff00;
assert( z > len );
- //DEV tlog() << "initialExtentSize(" << len << ") returns " << z << endl;
return z;
}
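
Worked numbers for Extent::initialSize above: the allocation is 64x the record length for records under 1000 bytes, 16x otherwise, capped at roughly 1GB, and rounded down to a 256-byte boundary. A standalone spot check mirroring the function (plain C++):

    #include <cassert>
    // Mirrors Extent::initialSize above for a few inputs.
    int initialSizeSketch( int len ) {
        long long sz = len < 1000 ? len * 64LL : len * 16LL;
        if ( sz > 1000000000 ) sz = 1000000000;
        return ((int)sz) & 0xffffff00;
    }
    int main() {
        assert( initialSizeSketch( 500 )  == 32000 );  // 500*64, already 256-aligned
        assert( initialSizeSketch( 5000 ) == 79872 );  // 5000*16 = 80000, rounded down
        assert( initialSizeSketch( 128 )  == 8192 );   // the default in _userCreateNS
        return 0;
    }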
@@ -165,7 +187,7 @@ namespace mongo {
return false;
}
- log(1) << "create collection " << ns << ' ' << options << '\n';
+ log(1) << "create collection " << ns << ' ' << options << endl;
/* todo: do this only when we have allocated space successfully? or we could insert with a { ok: 0 } field
and then go back and set to ok : 1 after we are done.
@@ -174,33 +196,48 @@ namespace mongo {
if( !isFreeList )
addNewNamespaceToCatalog(ns, options.isEmpty() ? 0 : &options);
- long long size = initialExtentSize(128);
- BSONElement e = options.getField("size");
- if ( e.isNumber() ) {
- size = e.numberLong();
- size += 256;
- size &= 0xffffffffffffff00LL;
+ long long size = Extent::initialSize(128);
+ {
+ BSONElement e = options.getField("size");
+ if ( e.isNumber() ) {
+ size = e.numberLong();
+ size += 256;
+ size &= 0xffffffffffffff00LL;
+ }
}
-
+
uassert( 10083 , "invalid size spec", size > 0 );
bool newCapped = false;
int mx = 0;
- e = options.getField("capped");
- if ( e.type() == Bool && e.boolean() ) {
+ if( options.getBoolField("capped") ) {
newCapped = true;
- e = options.getField("max");
+ BSONElement e = options.getField("max");
if ( e.isNumber() ) {
mx = e.numberInt();
}
}
- // $nExtents just for debug/testing. We create '$nExtents' extents,
- // each of size 'size'.
- e = options.getField( "$nExtents" );
- int nExtents = int( e.number() );
+ // $nExtents just for debug/testing.
+ BSONElement e = options.getField( "$nExtents" );
Database *database = cc().database();
- if ( nExtents > 0 ) {
+ if ( e.type() == Array ) {
+ // We create one extent per array entry, with size specified
+ // by the array value.
+ BSONObjIterator i( e.embeddedObject() );
+ while( i.more() ) {
+ BSONElement e = i.next();
+ int size = int( e.number() );
+ assert( size <= 0x7fffffff );
+ // $nExtents is just for testing - always allocate new extents
+ // rather than reuse existing extents so we have some predictability
+ // in the extent size used by our tests
+ database->suitableFile( (int) size, false )->createExtent( ns, (int) size, newCapped );
+ }
+ }
+ else if ( int( e.number() ) > 0 ) {
+ // We create '$nExtents' extents, each of size 'size'.
+ int nExtents = int( e.number() );
assert( size <= 0x7fffffff );
for ( int i = 0; i < nExtents; ++i ) {
assert( size <= 0x7fffffff );
@@ -209,10 +246,16 @@ namespace mongo {
// in the extent size used by our tests
database->suitableFile( (int) size, false )->createExtent( ns, (int) size, newCapped );
}
- } else {
+ }
+ else {
+ // This is the non test case, where we don't have a $nExtents spec.
while ( size > 0 ) {
int max = MongoDataFile::maxSize() - DataFileHeader::HeaderSize;
int desiredExtentSize = (int) (size > max ? max : size);
+ if ( desiredExtentSize < Extent::minSize() ) {
+ desiredExtentSize = Extent::minSize();
+ }
+ desiredExtentSize &= 0xffffff00;
Extent *e = database->allocExtent( ns, desiredExtentSize, newCapped );
size -= e->length;
}
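
For reference, the $nExtents option handled above is test/debug-only and now has two forms: a number (that many extents, each of 'size') or, new in this diff, an array giving each extent's size explicitly. A hypothetical call using the array form, with the BSON macros used elsewhere in the codebase:

    // Hypothetical test helper: a collection with three explicitly sized
    // extents via the array form of $nExtents.
    bool createFixedExtentsCollection() {
        BSONObj opts = BSON( "$nExtents" << BSON_ARRAY( 8192 << 16384 << 32768 ) );
        string err;
        return userCreateNS( "test.extents", opts, err,
                             /*logForReplication*/ false, /*deferIdIndex*/ 0 );
    }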
@@ -223,15 +266,16 @@ namespace mongo {
bool ensure = false;
if ( options.getField( "autoIndexId" ).type() ) {
- if ( options["autoIndexId"].trueValue() ){
+ if ( options["autoIndexId"].trueValue() ) {
ensure = true;
}
- } else {
+ }
+ else {
if ( !newCapped ) {
ensure=true;
}
}
- if( ensure ) {
+ if( ensure ) {
if( deferIdIndex )
*deferIdIndex = true;
else
@@ -239,7 +283,7 @@ namespace mongo {
}
if ( mx > 0 )
- d->max = mx;
+ getDur().writingInt( d->max ) = mx;
return true;
}
@@ -250,7 +294,7 @@ namespace mongo {
*/
bool userCreateNS(const char *ns, BSONObj options, string& err, bool logForReplication, bool *deferIdIndex) {
const char *coll = strchr( ns, '.' ) + 1;
- massert( 10356 , "invalid ns", coll && *coll );
+ massert( 10356 , str::stream() << "invalid ns: " << ns , coll && *coll );
char cl[ 256 ];
nsToDatabase( ns, cl );
bool ok = _userCreateNS(ns, options, err, deferIdIndex);
@@ -272,14 +316,22 @@ namespace mongo {
int MongoDataFile::maxSize() {
if ( sizeof( int* ) == 4 ) {
return 512 * 1024 * 1024;
- } else if ( cmdLine.smallfiles ) {
+ }
+ else if ( cmdLine.smallfiles ) {
return 0x7ff00000 >> 2;
- } else {
+ }
+ else {
return 0x7ff00000;
}
}
- void MongoDataFile::badOfs(int ofs) const {
+ void MongoDataFile::badOfs2(int ofs) const {
+ stringstream ss;
+ ss << "bad offset:" << ofs << " accessing file: " << mmf.filename() << " - consider repairing database";
+ uasserted(13441, ss.str());
+ }
+
+ void MongoDataFile::badOfs(int ofs) const {
stringstream ss;
ss << "bad offset:" << ofs << " accessing file: " << mmf.filename() << " - consider repairing database";
uasserted(13440, ss.str());
@@ -293,26 +345,18 @@ namespace mongo {
else
size = 0x7ff00000;
- if ( strstr(filename, "_hudsonSmall") ) {
- int mult = 1;
- if ( fileNo > 1 && fileNo < 1000 )
- mult = fileNo;
- size = 1024 * 512 * mult;
- log() << "Warning : using small files for _hudsonSmall" << endl;
- }
- else if ( cmdLine.smallfiles ){
+ if ( cmdLine.smallfiles ) {
size = size >> 2;
}
-
-
+
+
return size;
}
void MongoDataFile::open( const char *filename, int minSize, bool preallocateOnly ) {
{
/* check quotas
- very simple temporary implementation - we will in future look up
- the quota from the grid database
+ very simple temporary implementation for now
*/
if ( cmdLine.quota && fileNo > cmdLine.quotaFiles && !MMF::exists(filename) ) {
/* todo: if we were adding / changing keys in an index did we do some
@@ -340,58 +384,66 @@ namespace mongo {
if ( size > maxSize() )
size = maxSize();
- assert( ( size >= 64*1024*1024 ) || cmdLine.smallfiles || ( strstr( filename, "_hudsonSmall" ) ) );
+ assert( size >= 64*1024*1024 || cmdLine.smallfiles );
assert( size % 4096 == 0 );
if ( preallocateOnly ) {
if ( cmdLine.prealloc ) {
- theFileAllocator().requestAllocation( filename, size );
+ FileAllocator::get()->requestAllocation( filename, size );
}
return;
}
-
- _p = mmf.map(filename, size);
- header = (DataFileHeader *) _p.at(0, DataFileHeader::HeaderSize);
- if( sizeof(char *) == 4 )
- uassert( 10084 , "can't map file memory - mongo requires 64 bit build for larger datasets", header);
+
+ {
+ assert( _mb == 0 );
+ unsigned long long sz = size;
+ if( mmf.create(filename, sz, false) )
+ _mb = mmf.getView();
+ assert( sz <= 0x7fffffff );
+ size = (int) sz;
+ }
+ //header = (DataFileHeader *) _p;
+ if( sizeof(char *) == 4 )
+ uassert( 10084 , "can't map file memory - mongo requires 64 bit build for larger datasets", _mb != 0);
else
- uassert( 10085 , "can't map file memory", header);
- header->init(fileNo, size);
+ uassert( 10085 , "can't map file memory", _mb != 0);
+ header()->init(fileNo, size, filename);
}
- void MongoDataFile::flush( bool sync ){
+ void MongoDataFile::flush( bool sync ) {
mmf.flush( sync );
}
- void addNewExtentToNamespace(const char *ns, Extent *e, DiskLoc eloc, DiskLoc emptyLoc, bool capped) {
- DiskLoc oldExtentLoc;
+ void addNewExtentToNamespace(const char *ns, Extent *e, DiskLoc eloc, DiskLoc emptyLoc, bool capped) {
NamespaceIndex *ni = nsindex(ns);
NamespaceDetails *details = ni->details(ns);
if ( details ) {
assert( !details->lastExtent.isNull() );
assert( !details->firstExtent.isNull() );
- e->xprev = details->lastExtent;
- details->lastExtent.ext()->xnext = eloc;
+ getDur().writingDiskLoc(e->xprev) = details->lastExtent;
+ getDur().writingDiskLoc(details->lastExtent.ext()->xnext) = eloc;
assert( !eloc.isNull() );
- details->lastExtent = eloc;
+ getDur().writingDiskLoc(details->lastExtent) = eloc;
}
else {
ni->add_ns(ns, eloc, capped);
details = ni->details(ns);
}
- details->lastExtentSize = e->length;
- DEBUGGING out() << "temp: newextent adddelrec " << ns << endl;
+ {
+ NamespaceDetails *dw = details->writingWithoutExtra();
+ dw->lastExtentSize = e->length;
+ }
details->addDeletedRec(emptyLoc.drec(), emptyLoc);
}
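
The recurring change in this diff: plain stores into memory-mapped structures become write-intent declarations through getDur(), so each mutation is captured for the journal before it reaches the shared view. A condensed before/after sketch of the pattern, using the lines above:

    // Before (direct store into the mapped file):
    //     details->lastExtent = eloc;
    // After (declare intent, then store through the returned reference):
    DiskLoc &le = getDur().writingDiskLoc( details->lastExtent );
    le = eloc;
    // writingInt(), writing(T*), and writingPtr(p, len) are the same idea
    // for ints, whole structs, and arbitrary byte ranges respectively.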
Extent* MongoDataFile::createExtent(const char *ns, int approxSize, bool newCapped, int loops) {
- massert( 10357 , "shutdown in progress", !goingAway );
- massert( 10358 , "bad new extent size", approxSize >= 0 && approxSize <= Extent::maxSize() );
- massert( 10359 , "header==0 on new extent: 32 bit mmap space exceeded?", header ); // null if file open failed
- int ExtentSize = approxSize <= header->unusedLength ? approxSize : header->unusedLength;
+ massert( 10357 , "shutdown in progress", ! inShutdown() );
+ massert( 10358 , "bad new extent size", approxSize >= Extent::minSize() && approxSize <= Extent::maxSize() );
+ massert( 10359 , "header==0 on new extent: 32 bit mmap space exceeded?", header() ); // null if file open failed
+ int ExtentSize = approxSize <= header()->unusedLength ? approxSize : header()->unusedLength;
DiskLoc loc;
- if ( ExtentSize <= 0 ) {
+ if ( ExtentSize < Extent::minSize() ) {
/* note: there could be a lot of looping here if db just started and
no files are open yet. we might want to do something about that. */
if ( loops > 8 ) {
@@ -401,12 +453,14 @@ namespace mongo {
log() << "newExtent: " << ns << " file " << fileNo << " full, adding a new file\n";
return cc().database()->addAFile( 0, true )->createExtent(ns, approxSize, newCapped, loops+1);
}
- int offset = header->unused.getOfs();
- header->unused.setOfs( fileNo, offset + ExtentSize );
- header->unusedLength -= ExtentSize;
- loc.setOfs(fileNo, offset);
+ int offset = header()->unused.getOfs();
+
+ DataFileHeader *h = getDur().writing(header());
+ h->unused.set( fileNo, offset + ExtentSize );
+ h->unusedLength -= ExtentSize;
+ loc.set(fileNo, offset);
Extent *e = _getExtent(loc);
- DiskLoc emptyLoc = e->init(ns, ExtentSize, fileNo, offset);
+ DiskLoc emptyLoc = getDur().writing(e)->init(ns, ExtentSize, fileNo, offset);
addNewExtentToNamespace(ns, e, loc, emptyLoc, newCapped);
@@ -415,7 +469,7 @@ namespace mongo {
return e;
}
- Extent* DataFileMgr::allocFromFreeList(const char *ns, int approxSize, bool capped) {
+ Extent* DataFileMgr::allocFromFreeList(const char *ns, int approxSize, bool capped) {
string s = cc().database()->name + ".$freelist";
NamespaceDetails *f = nsdetails(s.c_str());
if( f ) {
@@ -426,7 +480,7 @@ namespace mongo {
if( low > 2048 ) low -= 256;
high = (int) (approxSize * 1.05) + 256;
}
- else {
+ else {
low = (int) (approxSize * 0.8);
high = (int) (approxSize * 1.4);
}
@@ -436,20 +490,20 @@ namespace mongo {
int bestDiff = 0x7fffffff;
{
DiskLoc L = f->firstExtent;
- while( !L.isNull() ) {
+ while( !L.isNull() ) {
Extent * e = L.ext();
- if( e->length >= low && e->length <= high ) {
+ if( e->length >= low && e->length <= high ) {
int diff = abs(e->length - approxSize);
- if( diff < bestDiff ) {
+ if( diff < bestDiff ) {
bestDiff = diff;
best = e;
- if( diff == 0 )
+ if( diff == 0 )
break;
}
}
L = e->xnext;
++n;
-
+
}
}
OCCASIONALLY if( n > 512 ) log() << "warning: newExtent " << n << " scanned\n";
@@ -457,13 +511,13 @@ namespace mongo {
Extent *e = best;
// remove from the free list
if( !e->xprev.isNull() )
- e->xprev.ext()->xnext = e->xnext;
+ e->xprev.ext()->xnext.writing() = e->xnext;
if( !e->xnext.isNull() )
- e->xnext.ext()->xprev = e->xprev;
+ e->xnext.ext()->xprev.writing() = e->xprev;
if( f->firstExtent == e->myLoc )
- f->firstExtent = e->xnext;
+ f->firstExtent.writing() = e->xnext;
if( f->lastExtent == e->myLoc )
- f->lastExtent = e->xprev;
+ f->lastExtent.writing() = e->xprev;
// use it
OCCASIONALLY if( n > 512 ) log() << "warning: newExtent " << n << " scanned\n";
@@ -479,9 +533,11 @@ namespace mongo {
/*---------------------------------------------------------------------*/
- DiskLoc Extent::reuse(const char *nsname) {
- /*TODOMMF - work to do when extent is freed. */
- log(3) << "reset extent was:" << nsDiagnostic.buf << " now:" << nsname << '\n';
+ DiskLoc Extent::reuse(const char *nsname) {
+ return getDur().writing(this)->_reuse(nsname);
+ }
+ DiskLoc Extent::_reuse(const char *nsname) {
+ log(3) << "reset extent was:" << nsDiagnostic.toString() << " now:" << nsname << '\n';
massert( 10360 , "Extent::reset bad magic value", magic == 0x41424344 );
xnext.Null();
xprev.Null();
@@ -493,12 +549,9 @@ namespace mongo {
emptyLoc.inc( (int) (_extentData-(char*)this) );
int delRecLength = length - (_extentData - (char *) this);
- //DeletedRecord *empty1 = (DeletedRecord *) extentData;
- DeletedRecord *empty = DataFileMgr::makeDeletedRecord(emptyLoc, delRecLength);//(DeletedRecord *) getRecord(emptyLoc);
- //assert( empty == empty1 );
-
- // do we want to zero the record? memset(empty, ...)
+ DeletedRecord *empty = DataFileMgr::makeDeletedRecord(emptyLoc, delRecLength);//(DeletedRecord *) getRecord(emptyLoc);
+ empty = getDur().writing(empty);
empty->lengthWithHeaders = delRecLength;
empty->extentOfs = myLoc.getOfs();
empty->nextDeleted.Null();
@@ -509,7 +562,7 @@ namespace mongo {
/* assumes already zeroed -- insufficient for block 'reuse' perhaps */
DiskLoc Extent::init(const char *nsname, int _length, int _fileNo, int _offset) {
magic = 0x41424344;
- myLoc.setOfs(_fileNo, _offset);
+ myLoc.set(_fileNo, _offset);
xnext.Null();
xprev.Null();
nsDiagnostic = nsname;
@@ -521,9 +574,7 @@ namespace mongo {
emptyLoc.inc( (int) (_extentData-(char*)this) );
int l = _length - (_extentData - (char *) this);
- //DeletedRecord *empty1 = (DeletedRecord *) extentData;
- DeletedRecord *empty = DataFileMgr::makeDeletedRecord(emptyLoc, l);
- //assert( empty == empty1 );
+ DeletedRecord *empty = getDur().writing( DataFileMgr::makeDeletedRecord(emptyLoc, l) );
empty->lengthWithHeaders = l;
empty->extentOfs = myLoc.getOfs();
return emptyLoc;
@@ -582,7 +633,7 @@ namespace mongo {
}
return maxExtentSize;
}
-
+
/*---------------------------------------------------------------------*/
shared_ptr<Cursor> DataFileMgr::findAll(const char *ns, const DiskLoc &startLoc) {
@@ -612,12 +663,12 @@ namespace mongo {
d->dumpDeleted(&extents);
}
- if ( d->capped )
+ if ( d->capped )
return shared_ptr<Cursor>( new ForwardCappedCursor( d , startLoc ) );
-
+
if ( !startLoc.isNull() )
- return shared_ptr<Cursor>(new BasicCursor( startLoc ));
-
+ return shared_ptr<Cursor>(new BasicCursor( startLoc ));
+
while ( e->firstRecord.isNull() && !e->xnext.isNull() ) {
/* todo: if extent is empty, free it for reuse elsewhere.
that is a bit complicated have to clean up the freelists.
@@ -638,37 +689,38 @@ namespace mongo {
if ( el.number() >= 0 )
return DataFileMgr::findAll(ns, startLoc);
-
+
// "reverse natural order"
NamespaceDetails *d = nsdetails(ns);
-
+
if ( !d )
return shared_ptr<Cursor>(new BasicCursor(DiskLoc()));
-
+
if ( !d->capped ) {
if ( !startLoc.isNull() )
- return shared_ptr<Cursor>(new ReverseCursor( startLoc ));
+ return shared_ptr<Cursor>(new ReverseCursor( startLoc ));
Extent *e = d->lastExtent.ext();
while ( e->lastRecord.isNull() && !e->xprev.isNull() ) {
OCCASIONALLY out() << " findTableScan: extent empty, skipping ahead" << endl;
e = e->getPrevExtent();
}
return shared_ptr<Cursor>(new ReverseCursor( e->lastRecord ));
- } else {
+ }
+ else {
return shared_ptr<Cursor>( new ReverseCappedCursor( d, startLoc ) );
}
}
- void printFreeList() {
+ void printFreeList() {
string s = cc().database()->name + ".$freelist";
log() << "dump freelist " << s << '\n';
NamespaceDetails *freeExtents = nsdetails(s.c_str());
- if( freeExtents == 0 ) {
+ if( freeExtents == 0 ) {
log() << " freeExtents==0" << endl;
return;
}
DiskLoc a = freeExtents->firstExtent;
- while( !a.isNull() ) {
+ while( !a.isNull() ) {
Extent *e = a.ext();
log() << " " << a.toString() << " len:" << e->length << " prev:" << e->xprev.toString() << '\n';
a = e->xnext;
@@ -687,7 +739,7 @@ namespace mongo {
NamespaceString s(nsToDrop);
assert( s.db == cc().database()->name );
if( s.isSystem() ) {
- if( s.coll == "system.profile" )
+ if( s.coll == "system.profile" )
uassert( 10087 , "turn off profiling before dropping system.profile collection", cc().database()->profile == 0 );
else
uasserted( 12502, "can't drop system ns" );
@@ -698,32 +750,31 @@ namespace mongo {
BSONObj cond = BSON( "name" << nsToDrop ); // { name: "colltodropname" }
string system_namespaces = cc().database()->name + ".system.namespaces";
/*int n = */ deleteObjects(system_namespaces.c_str(), cond, false, false, true);
- // no check of return code as this ns won't exist for some of the new storage engines
+ // no check of return code as this ns won't exist for some of the new storage engines
}
// free extents
if( !d->firstExtent.isNull() ) {
string s = cc().database()->name + ".$freelist";
NamespaceDetails *freeExtents = nsdetails(s.c_str());
- if( freeExtents == 0 ) {
+ if( freeExtents == 0 ) {
string err;
_userCreateNS(s.c_str(), BSONObj(), err, 0);
freeExtents = nsdetails(s.c_str());
massert( 10361 , "can't create .$freelist", freeExtents);
}
- if( freeExtents->firstExtent.isNull() ) {
- freeExtents->firstExtent = d->firstExtent;
- freeExtents->lastExtent = d->lastExtent;
+ if( freeExtents->firstExtent.isNull() ) {
+ freeExtents->firstExtent.writing() = d->firstExtent;
+ freeExtents->lastExtent.writing() = d->lastExtent;
}
- else {
+ else {
DiskLoc a = freeExtents->firstExtent;
assert( a.ext()->xprev.isNull() );
- a.ext()->xprev = d->lastExtent;
- d->lastExtent.ext()->xnext = a;
- freeExtents->firstExtent = d->firstExtent;
-
- d->firstExtent.setInvalid();
- d->lastExtent.setInvalid();
+ getDur().writingDiskLoc( a.ext()->xprev ) = d->lastExtent;
+ getDur().writingDiskLoc( d->lastExtent.ext()->xnext ) = a;
+ getDur().writingDiskLoc( freeExtents->firstExtent ) = d->firstExtent;
+ getDur().writingDiskLoc( d->firstExtent ).setInvalid();
+ getDur().writingDiskLoc( d->lastExtent ).setInvalid();
}
}
@@ -740,7 +791,7 @@ namespace mongo {
BackgroundOperation::assertNoBgOpInProgForNs(name.c_str());
if ( d->nIndexes != 0 ) {
- try {
+ try {
assert( dropIndexes(d, name.c_str(), "*", errmsg, result, true) );
}
catch( DBException& e ) {
@@ -754,11 +805,10 @@ namespace mongo {
log(1) << "\t dropIndexes done" << endl;
result.append("ns", name.c_str());
ClientCursor::invalidate(name.c_str());
- Client::invalidateNS( name );
Top::global.collectionDropped( name );
- dropNS(name);
+ dropNS(name);
}
-
+
int nUnindexes = 0;
/* unindex all keys in index for this record. */
@@ -797,63 +847,69 @@ namespace mongo {
int n = d->nIndexes;
for ( int i = 0; i < n; i++ )
_unindexRecord(d->idx(i), obj, dl, !noWarn);
- if( d->backgroundIndexBuildInProgress ) {
+ if( d->indexBuildInProgress ) { // background index
// always pass nowarn here, as this one may be missing for valid reasons as we are concurrently building it
- _unindexRecord(d->idx(n), obj, dl, false);
+ _unindexRecord(d->idx(n), obj, dl, false);
}
}
- /* deletes a record, just the pdfile portion -- no index cleanup, no cursor cleanup, etc.
+ /* deletes a record, just the pdfile portion -- no index cleanup, no cursor cleanup, etc.
caller must check if capped
*/
- void DataFileMgr::_deleteRecord(NamespaceDetails *d, const char *ns, Record *todelete, const DiskLoc& dl)
- {
+ void DataFileMgr::_deleteRecord(NamespaceDetails *d, const char *ns, Record *todelete, const DiskLoc& dl) {
/* remove ourself from the record next/prev chain */
{
if ( todelete->prevOfs != DiskLoc::NullOfs )
- todelete->getPrev(dl).rec()->nextOfs = todelete->nextOfs;
+ getDur().writingInt( todelete->getPrev(dl).rec()->nextOfs ) = todelete->nextOfs;
if ( todelete->nextOfs != DiskLoc::NullOfs )
- todelete->getNext(dl).rec()->prevOfs = todelete->prevOfs;
+ getDur().writingInt( todelete->getNext(dl).rec()->prevOfs ) = todelete->prevOfs;
}
/* remove ourself from extent pointers */
{
- Extent *e = todelete->myExtent(dl);
+ Extent *e = getDur().writing( todelete->myExtent(dl) );
if ( e->firstRecord == dl ) {
if ( todelete->nextOfs == DiskLoc::NullOfs )
e->firstRecord.Null();
else
- e->firstRecord.setOfs(dl.a(), todelete->nextOfs);
+ e->firstRecord.set(dl.a(), todelete->nextOfs);
}
if ( e->lastRecord == dl ) {
if ( todelete->prevOfs == DiskLoc::NullOfs )
e->lastRecord.Null();
else
- e->lastRecord.setOfs(dl.a(), todelete->prevOfs);
+ e->lastRecord.set(dl.a(), todelete->prevOfs);
}
}
/* add to the free list */
{
- d->nrecords--;
- d->datasize -= todelete->netLength();
- /* temp: if in system.indexes, don't reuse, and zero out: we want to be
- careful until validated more, as IndexDetails has pointers
- to this disk location. so an incorrectly done remove would cause
- a lot of problems.
- */
+ {
+ NamespaceDetails::Stats *s = getDur().writing(&d->stats);
+ s->datasize -= todelete->netLength();
+ s->nrecords--;
+ }
+
if ( strstr(ns, ".system.indexes") ) {
- memset(todelete, 0, todelete->lengthWithHeaders);
+ /* temp: if in system.indexes, don't reuse, and zero out: we want to be
+ careful until validated more, as IndexDetails has pointers
+ to this disk location. so an incorrectly done remove would cause
+ a lot of problems.
+ */
+ memset(getDur().writingPtr(todelete, todelete->lengthWithHeaders), 0, todelete->lengthWithHeaders);
}
else {
- DEV memset(todelete->data, 0, todelete->netLength()); // attempt to notice invalid reuse.
+ DEV {
+ unsigned long long *p = (unsigned long long *) todelete->data;
+ *getDur().writing(p) = 0;
+ //DEV memset(todelete->data, 0, todelete->netLength()); // attempt to notice invalid reuse.
+ }
d->addDeletedRec((DeletedRecord*)todelete, dl);
}
}
}
- void DataFileMgr::deleteRecord(const char *ns, Record *todelete, const DiskLoc& dl, bool cappedOK, bool noWarn)
- {
+ void DataFileMgr::deleteRecord(const char *ns, Record *todelete, const DiskLoc& dl, bool cappedOK, bool noWarn) {
dassert( todelete == dl.rec() );
NamespaceDetails* d = nsdetails(ns);
@@ -880,8 +936,7 @@ namespace mongo {
NamespaceDetails *d,
NamespaceDetailsTransient *nsdt,
Record *toupdate, const DiskLoc& dl,
- const char *_buf, int _len, OpDebug& debug, bool &changedId, bool god)
- {
+ const char *_buf, int _len, OpDebug& debug, bool god) {
StringBuilder& ss = debug.str;
dassert( toupdate == dl.rec() );
@@ -891,7 +946,7 @@ namespace mongo {
DEV assert( objNew.objdata() == _buf );
if( !objNew.hasElement("_id") && objOld.hasElement("_id") ) {
- /* add back the old _id value if the update removes it. Note this implementation is slow
+ /* add back the old _id value if the update removes it. Note this implementation is slow
(copies entire object multiple times), but this shouldn't happen often, so going for simple
code, not speed.
*/
@@ -903,11 +958,13 @@ namespace mongo {
objNew = b.obj();
}
- /* duplicate key check. we descend the btree twice - once for this check, and once for the actual inserts, further
+ /* duplicate key check. we descend the btree twice - once for this check, and once for the actual inserts, further
below. that is suboptimal, but it's pretty complicated to do it the other way without rollbacks...
*/
vector<IndexChanges> changes;
+ bool changedId = false;
getIndexChanges(changes, *d, objNew, objOld, changedId);
+ uassert( 13596 , str::stream() << "cannot change _id of a document old:" << objOld << " new:" << objNew , ! changedId );
dupCheck(changes, *d, dl);
if ( toupdate->netLength() < objNew.objsize() ) {
@@ -946,8 +1003,8 @@ namespace mongo {
try {
/* we did the dupCheck() above. so we don't have to worry about it here. */
idx.head.btree()->bt_insert(
- idx.head,
- dl, *changes[x].added[i], ordering, /*dupsAllowed*/true, idx);
+ idx.head,
+ dl, *changes[x].added[i], ordering, /*dupsAllowed*/true, idx);
}
catch (AssertionException& e) {
ss << " exception update index ";
@@ -959,25 +1016,30 @@ namespace mongo {
ss << '\n' << keyUpdates << " key updates ";
}
- // update in place
- memcpy(toupdate->data, objNew.objdata(), objNew.objsize());
+ // update in place
+ int sz = objNew.objsize();
+ memcpy(getDur().writingPtr(toupdate->data, sz), objNew.objdata(), sz);
return dl;
}
- int followupExtentSize(int len, int lastExtentLen) {
+ int Extent::followupSize(int len, int lastExtentLen) {
assert( len < Extent::maxSize() );
- int x = initialExtentSize(len);
+ int x = initialSize(len);
int y = (int) (lastExtentLen < 4000000 ? lastExtentLen * 4.0 : lastExtentLen * 1.2);
int sz = y > x ? y : x;
- if ( sz < lastExtentLen )
- sz = lastExtentLen;
- else if ( sz > Extent::maxSize() )
+ if ( sz < lastExtentLen ) {
+ // this means there was an int overflow
+ // so we should turn it into maxSize
+ sz = Extent::maxSize();
+ }
+ else if ( sz > Extent::maxSize() ) {
sz = Extent::maxSize();
-
+ }
+
sz = ((int)sz) & 0xffffff00;
assert( sz > len );
-
+
return sz;
}
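
Worked numbers for Extent::followupSize above: extents grow 4x while the previous extent is under 4MB and 1.2x after that, floored by initialSize(len); the rewritten sz < lastExtentLen branch exists because the 1.2x product can overflow int for extents near 2GB, and the result is then pinned to maxSize(). A standalone illustration of the overflow case (assumes the usual x86 double-to-int conversion):

    #include <cstdio>
    int main() {
        std::printf( "%d -> %d\n", 1000000,   (int)( 1000000 * 4.0 ) );   // 4x while < 4MB
        std::printf( "%d -> %d\n", 100000000, (int)( 100000000 * 1.2 ) ); // 1.2x above that
        int last = 2000000000;           // ~2GB: 1.2x exceeds INT_MAX
        int y = (int)( last * 1.2 );     // overflows; typically comes out negative
        std::printf( "%d -> %d (< last, so sz is pinned to maxSize())\n", last, y );
        return 0;
    }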
@@ -990,7 +1052,7 @@ namespace mongo {
Ordering ordering = Ordering::make(order);
int n = 0;
for ( BSONObjSetDefaultOrder::iterator i=keys.begin(); i != keys.end(); i++ ) {
- if( ++n == 2 ) {
+ if( ++n == 2 ) {
d->setIndexIsMultikey(idxNo);
}
assert( !recordLoc.isNull() );
@@ -999,7 +1061,7 @@ namespace mongo {
*i, ordering, dupsAllowed, idx);
}
catch (AssertionException& e) {
- if( e.getCode() == 10287 && idxNo == d->nIndexes ) {
+ if( e.getCode() == 10287 && idxNo == d->nIndexes ) {
DEV log() << "info: caught key already in index on bg indexing (ok)" << endl;
continue;
}
@@ -1012,8 +1074,7 @@ namespace mongo {
}
}
- void testSorting()
- {
+ void testSorting() {
BSONObjBuilder b;
b.appendNull("");
BSONObj x = b.obj();
@@ -1027,9 +1088,9 @@ namespace mongo {
sorter.add(x, DiskLoc(3,77));
sorter.sort();
-
+
auto_ptr<BSONObjExternalSorter::Iterator> i = sorter.iterator();
- while( i->more() ) {
+ while( i->more() ) {
BSONObjExternalSorter::Data d = i->next();
/*cout << d.second.toString() << endl;
cout << d.first.objsize() << endl;
@@ -1039,7 +1100,6 @@ namespace mongo {
// throws DBException
unsigned long long fastBuildIndex(const char *ns, NamespaceDetails *d, IndexDetails& idx, int idxNo) {
- assert( d->backgroundIndexBuildInProgress == 0 );
CurOp * op = cc().curop();
Timer t;
@@ -1050,17 +1110,17 @@ namespace mongo {
bool dropDups = idx.dropDups() || inDBRepair;
BSONObj order = idx.keyPattern();
- idx.head.Null();
-
+ getDur().writingDiskLoc(idx.head).Null();
+
if ( logLevel > 1 ) printMemInfo( "before index start" );
/* get and sort all the keys ----- */
unsigned long long n = 0;
shared_ptr<Cursor> c = theDataFileMgr.findAll(ns);
BSONObjExternalSorter sorter(order);
- sorter.hintNumObjects( d->nrecords );
+ sorter.hintNumObjects( d->stats.nrecords );
unsigned long long nkeys = 0;
- ProgressMeterHolder pm( op->setMessage( "index: (1/3) external sort" , d->nrecords , 10 ) );
+ ProgressMeterHolder pm( op->setMessage( "index: (1/3) external sort" , d->stats.nrecords , 10 ) );
while ( c->ok() ) {
BSONObj o = c->current();
DiskLoc loc = c->currLoc();
@@ -1069,17 +1129,17 @@ namespace mongo {
idx.getKeysFromObject(o, keys);
int k = 0;
for ( BSONObjSetDefaultOrder::iterator i=keys.begin(); i != keys.end(); i++ ) {
- if( ++k == 2 )
+ if( ++k == 2 ) {
d->setIndexIsMultikey(idxNo);
- //cout<<"SORTER ADD " << i->toString() << ' ' << loc.toString() << endl;
+ }
sorter.add(*i, loc);
nkeys++;
}
-
+
c->advance();
n++;
pm.hit();
- if ( logLevel > 1 && n % 10000 == 0 ){
+ if ( logLevel > 1 && n % 10000 == 0 ) {
printMemInfo( "\t iterating objects" );
}
@@ -1089,37 +1149,37 @@ namespace mongo {
if ( logLevel > 1 ) printMemInfo( "before final sort" );
sorter.sort();
if ( logLevel > 1 ) printMemInfo( "after final sort" );
-
+
log(t.seconds() > 5 ? 0 : 1) << "\t external sort used : " << sorter.numFiles() << " files " << " in " << t.seconds() << " secs" << endl;
list<DiskLoc> dupsToDrop;
- /* build index --- */
+ /* build index --- */
{
BtreeBuilder btBuilder(dupsAllowed, idx);
BSONObj keyLast;
auto_ptr<BSONObjExternalSorter::Iterator> i = sorter.iterator();
assert( pm == op->setMessage( "index: (2/3) btree bottom up" , nkeys , 10 ) );
- while( i->more() ) {
+ while( i->more() ) {
RARELY killCurrentOp.checkForInterrupt();
BSONObjExternalSorter::Data d = i->next();
- try {
+ try {
btBuilder.addKey(d.first, d.second);
}
- catch( AssertionException& e ) {
- if ( dupsAllowed ){
+ catch( AssertionException& e ) {
+ if ( dupsAllowed ) {
// unknown exception??
throw;
}
-
+
if( e.interrupted() )
throw;
if ( ! dropDups )
throw;
- /* we could queue these on disk, but normally there are very few dups, so instead we
+ /* we could queue these on disk, but normally there are very few dups, so instead we
keep in ram and have a limit.
*/
dupsToDrop.push_back(d.second);
@@ -1131,9 +1191,11 @@ namespace mongo {
op->setMessage( "index: (3/3) btree-middle" );
log(t.seconds() > 10 ? 0 : 1 ) << "\t done building bottom layer, going to commit" << endl;
btBuilder.commit();
- wassert( btBuilder.getn() == nkeys || dropDups );
+ if ( btBuilder.getn() != nkeys && ! dropDups ) {
+ warning() << "not all entries were added to the index, probably some keys were too large" << endl;
+ }
}
-
+
log(1) << "\t fastBuildIndex dupsToDrop:" << dupsToDrop.size() << endl;
for( list<DiskLoc>::iterator i = dupsToDrop.begin(); i != dupsToDrop.end(); i++ )
@@ -1142,13 +1204,13 @@ namespace mongo {
return n;
}
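
fastBuildIndex above is the foreground build: one pass feeds every key through the external sorter, the sorted stream drives a bottom-up btree construction, and the commit happens once at the end, with dropDups casualties deleted afterwards. A commented outline of the three phases as implemented above:

    // Outline of fastBuildIndex (error handling elided):
    // (1/3) external sort: for each record,
    //         idx.getKeysFromObject(o, keys); sorter.add(key, loc);
    //       then sorter.sort();
    // (2/3) btree bottom up: BtreeBuilder btBuilder(dupsAllowed, idx);
    //       while ( i->more() ) btBuilder.addKey(d.first, d.second);
    //       duplicates land in dupsToDrop when dropDups is set.
    // (3/3) btree-middle: btBuilder.commit(); then
    //       theDataFileMgr.deleteRecord(...) for each entry in dupsToDrop.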
- class BackgroundIndexBuildJob : public BackgroundOperation {
+ class BackgroundIndexBuildJob : public BackgroundOperation {
unsigned long long addExistingToIndex(const char *ns, NamespaceDetails *d, IndexDetails& idx, int idxNo) {
bool dupsAllowed = !idx.unique();
bool dropDups = idx.dropDups();
- ProgressMeter& progress = cc().curop()->setMessage( "bg index build" , d->nrecords );
+ ProgressMeter& progress = cc().curop()->setMessage( "bg index build" , d->stats.nrecords );
unsigned long long n = 0;
auto_ptr<ClientCursor> cc;
@@ -1156,25 +1218,26 @@ namespace mongo {
shared_ptr<Cursor> c = theDataFileMgr.findAll(ns);
cc.reset( new ClientCursor(QueryOption_NoCursorTimeout, c, ns) );
}
- CursorId id = cc->cursorid;
+ CursorId id = cc->cursorid();
- while ( cc->c->ok() ) {
- BSONObj js = cc->c->current();
- try {
- _indexRecord(d, idxNo, js, cc->c->currLoc(), dupsAllowed);
- cc->c->advance();
- } catch( AssertionException& e ) {
+ while ( cc->ok() ) {
+ BSONObj js = cc->current();
+ try {
+ _indexRecord(d, idxNo, js, cc->currLoc(), dupsAllowed);
+ cc->advance();
+ }
+ catch( AssertionException& e ) {
if( e.interrupted() )
throw;
if ( dropDups ) {
- DiskLoc toDelete = cc->c->currLoc();
- bool ok = cc->c->advance();
+ DiskLoc toDelete = cc->currLoc();
+ bool ok = cc->advance();
cc->updateLocation();
theDataFileMgr.deleteRecord( ns, toDelete.rec(), toDelete, false, true );
if( ClientCursor::find(id, false) == 0 ) {
cc.release();
- if( !ok ) {
+ if( !ok ) {
/* we were already at the end. normal. */
}
else {
@@ -1182,7 +1245,8 @@ namespace mongo {
}
break;
}
- } else {
+ }
+ else {
log() << "background addExistingToIndex exception " << e.what() << endl;
throw;
}
@@ -1200,7 +1264,7 @@ namespace mongo {
return n;
}
- /* we do set a flag in the namespace for quick checking, but this is our authoritative info -
+ /* we do set a flag in the namespace for quick checking, but this is our authoritative info -
that way on a crash/restart, we don't think we are still building one. */
set<NamespaceDetails*> bgJobsInProgress;
@@ -1208,12 +1272,8 @@ namespace mongo {
assertInWriteLock();
uassert( 13130 , "can't start bg index b/c in recursive lock (db.eval?)" , dbMutex.getState() == 1 );
bgJobsInProgress.insert(d);
- d->backgroundIndexBuildInProgress = 1;
- d->nIndexes--;
}
void done(const char *ns, NamespaceDetails *d) {
- d->nIndexes++;
- d->backgroundIndexBuildInProgress = 0;
NamespaceDetailsTransient::get_w(ns).addedIndex(); // clear query optimizer cache
assertInWriteLock();
}
@@ -1221,16 +1281,16 @@ namespace mongo {
public:
BackgroundIndexBuildJob(const char *ns) : BackgroundOperation(ns) { }
- unsigned long long go(string ns, NamespaceDetails *d, IndexDetails& idx, int idxNo) {
+ unsigned long long go(string ns, NamespaceDetails *d, IndexDetails& idx, int idxNo) {
unsigned long long n = 0;
prep(ns.c_str(), d);
assert( idxNo == d->nIndexes );
- try {
+ try {
idx.head = BtreeBucket::addBucket(idx);
n = addExistingToIndex(ns.c_str(), d, idx, idxNo);
}
- catch(...) {
+ catch(...) {
if( cc().database() && nsdetails(ns.c_str()) == d ) {
assert( idxNo == d->nIndexes );
done(ns.c_str(), d);
@@ -1246,25 +1306,51 @@ namespace mongo {
}
};
+ /**
+ * For the lifetime of this object, an index build is indicated on the specified
+ * namespace and the newest index is marked as absent. This simplifies
+ * the cleanup required on recovery.
+ */
+ class RecoverableIndexState {
+ public:
+ RecoverableIndexState( NamespaceDetails *d ) : _d( d ) {
+ indexBuildInProgress() = 1;
+ nIndexes()--;
+ }
+ ~RecoverableIndexState() {
+ DESTRUCTOR_GUARD (
+ nIndexes()++;
+ indexBuildInProgress() = 0;
+ )
+ }
+ private:
+ int &nIndexes() { return getDur().writingInt( _d->nIndexes ); }
+ int &indexBuildInProgress() { return getDur().writingInt( _d->indexBuildInProgress ); }
+ NamespaceDetails *_d;
+ };
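
While a RecoverableIndexState is alive, d->nIndexes counts only complete indexes and the one under construction sits at idx(nIndexes); code that must include it, like indexRecord below, goes through nIndexesBeingBuilt(). A sketch of the assumed shape of that accessor (the real definition lives in the namespace headers):

    // Assumed shape of NamespaceDetails::nIndexesBeingBuilt():
    int nIndexesBeingBuilt() const {
        return nIndexes + ( indexBuildInProgress ? 1 : 0 );
    }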
+
// throws DBException
- static void buildAnIndex(string ns, NamespaceDetails *d, IndexDetails& idx, int idxNo, bool background) {
+ static void buildAnIndex(string ns, NamespaceDetails *d, IndexDetails& idx, int idxNo, bool background) {
tlog() << "building new index on " << idx.keyPattern() << " for " << ns << ( background ? " background" : "" ) << endl;
Timer t;
- unsigned long long n;
+ unsigned long long n;
if( background ) {
log(2) << "buildAnIndex: background=true\n";
}
assert( !BackgroundOperation::inProgForNs(ns.c_str()) ); // should have been checked earlier, better not be...
+ assert( d->indexBuildInProgress == 0 );
+ assertInWriteLock();
+ RecoverableIndexState recoverable( d );
if( inDBRepair || !background ) {
- n = fastBuildIndex(ns.c_str(), d, idx, idxNo);
- assert( !idx.head.isNull() );
- }
- else {
+ n = fastBuildIndex(ns.c_str(), d, idx, idxNo);
+ assert( !idx.head.isNull() );
+ }
+ else {
BackgroundIndexBuildJob j(ns.c_str());
n = j.go(ns, d, idx, idxNo);
- }
+ }
tlog() << "done for " << n << " records " << t.millis() / 1000.0 << "secs" << endl;
}
@@ -1272,20 +1358,20 @@ namespace mongo {
static void indexRecord(NamespaceDetails *d, BSONObj obj, DiskLoc loc) {
int n = d->nIndexesBeingBuilt();
for ( int i = 0; i < n; i++ ) {
- try {
+ try {
bool unique = d->idx(i).unique();
_indexRecord(d, i, obj, loc, /*dupsAllowed*/!unique);
}
- catch( DBException& ) {
+ catch( DBException& ) {
/* try to roll back previously added index entries
note <= i (not < i) is important here as the index we were just attempted
may be multikey and require some cleanup.
*/
- for( int j = 0; j <= i; j++ ) {
+ for( int j = 0; j <= i; j++ ) {
try {
_unindexRecord(d->idx(j), obj, loc, false);
}
- catch(...) {
+ catch(...) {
log(3) << "unindex fails on rollback after unique failure\n";
}
}
@@ -1301,7 +1387,7 @@ namespace mongo {
if ( d == 0 || (d->flags & NamespaceDetails::Flag_HaveIdIndex) )
return;
- d->flags |= NamespaceDetails::Flag_HaveIdIndex;
+ *getDur().writing(&d->flags) |= NamespaceDetails::Flag_HaveIdIndex;
{
NamespaceDetails::IndexIterator i = d->ii();
@@ -1324,7 +1410,7 @@ namespace mongo {
}
#pragma pack(1)
- struct IDToInsert_ {
+ struct IDToInsert_ {
char type;
char _id[4];
OID oid;
@@ -1338,13 +1424,13 @@ namespace mongo {
IDToInsert() : BSONElement( ( char * )( &idToInsert_ ) ) {}
} idToInsert;
#pragma pack()
-
+
void DataFileMgr::insertAndLog( const char *ns, const BSONObj &o, bool god ) {
BSONObj tmp = o;
insertWithObjMod( ns, tmp, god );
logOp( "i", ns, tmp );
}
-
+
DiskLoc DataFileMgr::insertWithObjMod(const char *ns, BSONObj &o, bool god) {
DiskLoc loc = insert( ns, o.objdata(), o.objsize(), god );
if ( !loc.isNull() )
@@ -1356,12 +1442,12 @@ namespace mongo {
insert( ns, o.objdata(), o.objsize(), god );
}
- bool prepareToBuildIndex(const BSONObj& io, bool god, string& sourceNS, NamespaceDetails *&sourceCollection);
+ bool prepareToBuildIndex(const BSONObj& io, bool god, string& sourceNS, NamespaceDetails *&sourceCollection, BSONObj& fixedIndexObject );
// We are now doing two btree scans for all unique indexes (one here, and one when we've
// written the record to the collection. This could be made more efficient inserting
// dummy data here, keeping pointers to the btree nodes holding the dummy data and then
- // updating the dummy data with the DiskLoc of the real record.
+ // updating the dummy data with the DiskLoc of the real record.
void checkNoIndexConflicts( NamespaceDetails *d, const BSONObj &obj ) {
for ( int idxNo = 0; idxNo < d->nIndexes; idxNo++ ) {
if( d->idx(idxNo).unique() ) {
@@ -1371,19 +1457,19 @@ namespace mongo {
BSONObj order = idx.keyPattern();
for ( BSONObjSetDefaultOrder::iterator i=keys.begin(); i != keys.end(); i++ ) {
uassert( 12582, "duplicate key insert for unique index of capped collection",
- idx.head.btree()->findSingle(idx, idx.head, *i ).isNull() );
+ idx.head.btree()->findSingle(idx, idx.head, *i ).isNull() );
}
}
- }
+ }
}
- /* note: if god==true, you may pass in obuf of NULL and then populate the returned DiskLoc
+ /* note: if god==true, you may pass in obuf of NULL and then populate the returned DiskLoc
after the call -- that will prevent a double buffer copy in some cases (btree.cpp).
*/
DiskLoc DataFileMgr::insert(const char *ns, const void *obuf, int len, bool god, const BSONElement &writeId, bool mayAddIndex) {
bool wouldAddIndex = false;
- massert( 10093 , "cannot insert into reserved $ collection", god || nsDollarCheck( ns ) );
- uassert( 10094 , "invalid ns", strchr( ns , '.' ) > 0 );
+ massert( 10093 , "cannot insert into reserved $ collection", god || isANormalNSName( ns ) );
+ uassert( 10094 , str::stream() << "invalid ns: " << ns , isValidNS( ns ) );
const char *sys = strstr(ns, "system.");
if ( sys ) {
uassert( 10095 , "attempt to insert in reserved database name 'system'", sys != ns);
@@ -1411,7 +1497,7 @@ namespace mongo {
also if this is an addIndex, those checks should happen before this!
*/
// This may create first file in the database.
- cc().database()->allocExtent(ns, initialExtentSize(len), false);
+ cc().database()->allocExtent(ns, Extent::initialSize(len), false);
d = nsdetails(ns);
if ( !god )
ensureIdIndexForNewNs(ns);
@@ -1421,17 +1507,24 @@ namespace mongo {
NamespaceDetails *tableToIndex = 0;
string tabletoidxns;
+ BSONObj fixedIndexObject;
if ( addIndex ) {
assert( obuf );
BSONObj io((const char *) obuf);
- if( !prepareToBuildIndex(io, god, tabletoidxns, tableToIndex) )
+ if( !prepareToBuildIndex(io, god, tabletoidxns, tableToIndex, fixedIndexObject ) )
return DiskLoc();
+
+ if ( ! fixedIndexObject.isEmpty() ) {
+ obuf = fixedIndexObject.objdata();
+ len = fixedIndexObject.objsize();
+ }
+
}
const BSONElement *newId = &writeId;
int addID = 0;
if( !god ) {
- /* Check if we have an _id field. If we don't, we'll add it.
+ /* Check if we have an _id field. If we don't, we'll add it.
Note that btree buckets which we insert aren't BSONObj's, but in that case god==true.
*/
BSONObj io((const char *) obuf);
@@ -1446,7 +1539,7 @@ namespace mongo {
}
len += newId->size();
}
-
+
BSONElementManipulator::lookForTimestamps( io );
}
@@ -1456,28 +1549,28 @@ namespace mongo {
if ( lenWHdr == 0 ) {
// old datafiles, backward compatible here.
assert( d->paddingFactor == 0 );
- d->paddingFactor = 1.0;
+ *getDur().writing(&d->paddingFactor) = 1.0;
lenWHdr = len + Record::HeaderSize;
}
-
+
// If the collection is capped, check if the new object will violate a unique index
// constraint before allocating space.
if ( d->nIndexes && d->capped && !god ) {
checkNoIndexConflicts( d, BSONObj( reinterpret_cast<const char *>( obuf ) ) );
}
-
+
DiskLoc loc = d->alloc(ns, lenWHdr, extentLoc);
if ( loc.isNull() ) {
// out of space
if ( d->capped == 0 ) { // size capped doesn't grow
log(1) << "allocating new extent for " << ns << " padding:" << d->paddingFactor << " lenWHdr: " << lenWHdr << endl;
- cc().database()->allocExtent(ns, followupExtentSize(lenWHdr, d->lastExtentSize), false);
+ cc().database()->allocExtent(ns, Extent::followupSize(lenWHdr, d->lastExtentSize), false);
loc = d->alloc(ns, lenWHdr, extentLoc);
- if ( loc.isNull() ){
+ if ( loc.isNull() ) {
log() << "WARNING: alloc() failed after allocating new extent. lenWHdr: " << lenWHdr << " last extent size:" << d->lastExtentSize << "; trying again\n";
- for ( int zzz=0; zzz<10 && lenWHdr > d->lastExtentSize; zzz++ ){
+ for ( int zzz=0; zzz<10 && lenWHdr > d->lastExtentSize; zzz++ ) {
log() << "try #" << zzz << endl;
- cc().database()->allocExtent(ns, followupExtentSize(len, d->lastExtentSize), false);
+ cc().database()->allocExtent(ns, Extent::followupSize(len, d->lastExtentSize), false);
loc = d->alloc(ns, lenWHdr, extentLoc);
if ( ! loc.isNull() )
break;
@@ -1492,45 +1585,55 @@ namespace mongo {
}
Record *r = loc.rec();
- assert( r->lengthWithHeaders >= lenWHdr );
- if( addID ) {
- /* a little effort was made here to avoid a double copy when we add an ID */
- ((int&)*r->data) = *((int*) obuf) + newId->size();
- memcpy(r->data+4, newId->rawdata(), newId->size());
- memcpy(r->data+4+newId->size(), ((char *)obuf)+4, addID-4);
- }
- else {
- if( obuf )
- memcpy(r->data, obuf, len);
- }
- Extent *e = r->myExtent(loc);
- if ( e->lastRecord.isNull() ) {
- e->firstRecord = e->lastRecord = loc;
- r->prevOfs = r->nextOfs = DiskLoc::NullOfs;
+ {
+ assert( r->lengthWithHeaders >= lenWHdr );
+ r = (Record*) getDur().writingPtr(r, lenWHdr);
+ if( addID ) {
+ /* a little effort was made here to avoid a double copy when we add an ID */
+ ((int&)*r->data) = *((int*) obuf) + newId->size();
+ memcpy(r->data+4, newId->rawdata(), newId->size());
+ memcpy(r->data+4+newId->size(), ((char *)obuf)+4, addID-4);
+ }
+ else {
+ if( obuf )
+ memcpy(r->data, obuf, len);
+ }
}
- else {
- Record *oldlast = e->lastRecord.rec();
- r->prevOfs = e->lastRecord.getOfs();
- r->nextOfs = DiskLoc::NullOfs;
- oldlast->nextOfs = loc.getOfs();
- e->lastRecord = loc;
+ {
+ Extent *e = r->myExtent(loc);
+ if ( e->lastRecord.isNull() ) {
+ Extent::FL *fl = getDur().writing(e->fl());
+ fl->firstRecord = fl->lastRecord = loc;
+ r->prevOfs = r->nextOfs = DiskLoc::NullOfs;
+ }
+ else {
+ Record *oldlast = e->lastRecord.rec();
+ r->prevOfs = e->lastRecord.getOfs();
+ r->nextOfs = DiskLoc::NullOfs;
+ getDur().writingInt(oldlast->nextOfs) = loc.getOfs();
+ getDur().writingDiskLoc(e->lastRecord) = loc;
+ }
}
- d->nrecords++;
- d->datasize += r->netLength();
+ /* durability todo : this could be a bit annoying / slow to record constantly */
+ {
+ NamespaceDetails::Stats *s = getDur().writing(&d->stats);
+ s->datasize += r->netLength();
+ s->nrecords++;
+ }
// we don't bother clearing those stats for the god tables - also god is true when adding a btree bucket
if ( !god )
NamespaceDetailsTransient::get_w( ns ).notifyOfWriteOp();
-
+
if ( tableToIndex ) {
uassert( 13143 , "can't create index on system.indexes" , tabletoidxns.find( ".system.indexes" ) == string::npos );
BSONObj info = loc.obj();
bool background = info["background"].trueValue();
- if( background && cc().isSyncThread() ) {
- /* don't do background indexing on slaves. there are nuances. this could be added later
+ if( background && cc().isSyncThread() ) {
+ /* don't do background indexing on slaves. there are nuances. this could be added later
but requires more code.
*/
log() << "info: indexing in foreground on this replica; was a background index build on the primary" << endl;
@@ -1539,10 +1642,11 @@ namespace mongo {
int idxNo = tableToIndex->nIndexes;
IndexDetails& idx = tableToIndex->addIndex(tabletoidxns.c_str(), !background); // clear transient info caches so they refresh; increments nIndexes
- idx.info = loc;
+ getDur().writingDiskLoc(idx.info) = loc;
try {
buildAnIndex(tabletoidxns, tableToIndex, idx, idxNo, background);
- } catch( DBException& e ) {
+ }
+ catch( DBException& e ) {
// save our error msg string as an exception or dropIndexes will overwrite our message
LastError *le = lastError.get();
int savecode = 0;
@@ -1564,7 +1668,7 @@ namespace mongo {
if( !ok ) {
log() << "failed to drop index after a unique key error building it: " << errmsg << ' ' << tabletoidxns << ' ' << name << endl;
}
-
+
assert( le && !saveerrmsg.empty() );
raiseError(savecode,saveerrmsg.c_str());
throw;
@@ -1573,20 +1677,20 @@ namespace mongo {
/* add this record to our indexes */
if ( d->nIndexes ) {
- try {
+ try {
BSONObj obj(r->data);
indexRecord(d, obj, loc);
- }
- catch( AssertionException& e ) {
+ }
+ catch( AssertionException& e ) {
// should be a dup key error on _id index
if( tableToIndex || d->capped ) {
massert( 12583, "unexpected index insertion failure on capped collection", !d->capped );
string s = e.toString();
s += " : on addIndex/capped - collection and its index will not match";
uassert_nothrow(s.c_str());
- log() << s << '\n';
+ error() << s << endl;
}
- else {
+ else {
// normal case -- we can roll back
_deleteRecord(d, ns, r, loc);
throw;
@@ -1594,7 +1698,7 @@ namespace mongo {
}
}
- // out() << " inserted at loc:" << hex << loc.getOfs() << " lenwhdr:" << hex << lenWHdr << dec << ' ' << ns << endl;
+ // out() << " inserted at loc:" << hex << loc.getOfs() << " lenwhdr:" << hex << lenWHdr << dec << ' ' << ns << endl;
return loc;
}
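
Records within an extent form a doubly linked list keyed by file offset: Extent::firstRecord/lastRecord bound it, Record::prevOfs/nextOfs chain it, and DiskLoc::NullOfs terminates it; the block above appends at the tail. A minimal forward walk under those conventions (sketch, using the accessors seen in this file):

    // Sketch: forward traversal of an extent's record chain, using the
    // same accessors as the insertion path above.
    void walkExtentRecords( Extent *e ) {
        DiskLoc cur = e->firstRecord;
        while ( !cur.isNull() ) {
            Record *r = cur.rec();
            // ... r->data holds the object, r->netLength() bytes ...
            if ( r->nextOfs == DiskLoc::NullOfs )
                break;
            cur.set( cur.a(), r->nextOfs );  // same file, next record's offset
        }
    }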
@@ -1619,18 +1723,27 @@ namespace mongo {
Extent *e = r->myExtent(loc);
if ( e->lastRecord.isNull() ) {
- e->firstRecord = e->lastRecord = loc;
- r->prevOfs = r->nextOfs = DiskLoc::NullOfs;
+ Extent::FL *fl = getDur().writing( e->fl() );
+ fl->firstRecord = fl->lastRecord = loc;
+
+ Record::NP *np = getDur().writing(r->np());
+ np->nextOfs = np->prevOfs = DiskLoc::NullOfs;
}
else {
Record *oldlast = e->lastRecord.rec();
- r->prevOfs = e->lastRecord.getOfs();
- r->nextOfs = DiskLoc::NullOfs;
- oldlast->nextOfs = loc.getOfs();
- e->lastRecord = loc;
+ Record::NP *np = getDur().writing(r->np());
+ np->prevOfs = e->lastRecord.getOfs();
+ np->nextOfs = DiskLoc::NullOfs;
+ getDur().writingInt( oldlast->nextOfs ) = loc.getOfs();
+ e->lastRecord.writing() = loc;
}
- d->nrecords++;
+ /* todo: don't update for oplog? seems wasteful. */
+ {
+ NamespaceDetails::Stats *s = getDur().writing(&d->stats);
+ s->datasize += r->netLength();
+ s->nrecords++;
+ }
return r;
}
@@ -1641,7 +1754,7 @@ namespace mongo {
namespace mongo {
- void dropAllDatabasesExceptLocal() {
+ void dropAllDatabasesExceptLocal() {
writelock lk("");
vector<string> n;
@@ -1658,14 +1771,17 @@ namespace mongo {
void dropDatabase(string db) {
log(1) << "dropDatabase " << db << endl;
- assert( cc().database() );
- assert( cc().database()->name == db );
+ Database *d = cc().database();
+ assert( d );
+ assert( d->name == db );
- BackgroundOperation::assertNoBgOpInProgForDb(db.c_str());
+ BackgroundOperation::assertNoBgOpInProgForDb(d->name.c_str());
- Client::invalidateDB( db );
+ getDur().syncDataAndTruncateJournal();
+
+ Database::closeDatabase( d->name.c_str(), d->path );
+ d = 0; // d is now deleted
- closeDatabase( db.c_str() );
_deleteDataFiles( db.c_str() );
}
@@ -1674,13 +1790,14 @@ namespace mongo {
void boostRenameWrapper( const Path &from, const Path &to ) {
try {
boost::filesystem::rename( from, to );
- } catch ( const boost::filesystem::filesystem_error & ) {
+ }
+ catch ( const boost::filesystem::filesystem_error & ) {
// boost rename doesn't work across partitions
boost::filesystem::copy_file( from, to);
boost::filesystem::remove( from );
}
}
-
+
// back up original database files to 'temp' dir
void _renameForBackup( const char *database, const Path &reservedPath ) {
Path newPath( reservedPath );
@@ -1738,7 +1855,8 @@ namespace mongo {
ss << prefix << "_repairDatabase_" << i++;
reservedPath = repairPath / ss.str();
BOOST_CHECK_EXCEPTION( exists = boost::filesystem::exists( reservedPath ) );
- } while ( exists );
+ }
+ while ( exists );
return reservedPath;
}
@@ -1790,12 +1908,15 @@ namespace mongo {
stringstream ss;
ss << "localhost:" << cmdLine.port;
string localhost = ss.str();
-
+
problem() << "repairDatabase " << dbName << endl;
assert( cc().database()->name == dbName );
+ assert( cc().database()->path == dbpath );
BackgroundOperation::assertNoBgOpInProgForDb(dbName);
+ getDur().syncDataAndTruncateJournal(); // Must be done before and after repair
+
boost::intmax_t totalSize = dbSize( dbName );
boost::intmax_t freeSize = freeSpace( repairpath );
if ( freeSize > -1 && freeSize < totalSize ) {
@@ -1812,30 +1933,37 @@ namespace mongo {
"backup" : "$tmp" );
BOOST_CHECK_EXCEPTION( boost::filesystem::create_directory( reservedPath ) );
string reservedPathString = reservedPath.native_directory_string();
-
+
bool res;
- { // clone to temp location, which effectively does repair
+ {
+ // clone to temp location, which effectively does repair
Client::Context ctx( dbName, reservedPathString );
assert( ctx.justCreated() );
-
- res = cloneFrom(localhost.c_str(), errmsg, dbName,
- /*logForReplication=*/false, /*slaveok*/false, /*replauth*/false, /*snapshot*/false);
- closeDatabase( dbName, reservedPathString.c_str() );
+
+ res = cloneFrom(localhost.c_str(), errmsg, dbName,
+ /*logForReplication=*/false, /*slaveok*/false, /*replauth*/false, /*snapshot*/false);
+ Database::closeDatabase( dbName, reservedPathString.c_str() );
}
if ( !res ) {
problem() << "clone failed for " << dbName << " with error: " << errmsg << endl;
if ( !preserveClonedFilesOnFailure )
BOOST_CHECK_EXCEPTION( boost::filesystem::remove_all( reservedPath ) );
+
+ getDur().syncDataAndTruncateJournal(); // Must be done before and after repair
+
return false;
}
+ MongoFile::flushAll(true);
+
Client::Context ctx( dbName );
- closeDatabase( dbName );
+ Database::closeDatabase( dbName, dbpath );
if ( backupOriginalFiles ) {
_renameForBackup( dbName, reservedPath );
- } else {
+ }
+ else {
_deleteDataFiles( dbName );
BOOST_CHECK_EXCEPTION( boost::filesystem::create_directory( Path( dbpath ) / dbName ) );
}
@@ -1845,12 +1973,14 @@ namespace mongo {
if ( !backupOriginalFiles )
BOOST_CHECK_EXCEPTION( boost::filesystem::remove_all( reservedPath ) );
+ getDur().syncDataAndTruncateJournal(); // Must be done before and after repair
+
return true;
}
void _applyOpToDataFiles( const char *database, FileOp &fo, bool afterAllocator, const string& path ) {
if ( afterAllocator )
- theFileAllocator().waitUntilFinished();
+ FileAllocator::get()->waitUntilFinished();
string c = database;
c += '.';
boost::filesystem::path p(path);
@@ -1871,8 +2001,8 @@ namespace mongo {
q = p / ss.str();
BOOST_CHECK_EXCEPTION( ok = fo.apply(q) );
if ( ok ) {
- if ( extra != 10 ){
- log(1) << fo.op() << " file " << q.string() << '\n';
+ if ( extra != 10 ) {
+ log(1) << fo.op() << " file " << q.string() << endl;
log() << " _applyOpToDataFiles() warning: extra == " << extra << endl;
}
}
@@ -1883,19 +2013,20 @@ namespace mongo {
}
NamespaceDetails* nsdetails_notinline(const char *ns) { return nsdetails(ns); }
-
- bool DatabaseHolder::closeAll( const string& path , BSONObjBuilder& result , bool force ){
+
+ bool DatabaseHolder::closeAll( const string& path , BSONObjBuilder& result , bool force ) {
log() << "DatabaseHolder::closeAll path:" << path << endl;
dbMutex.assertWriteLocked();
-
+
map<string,Database*>& m = _paths[path];
_size -= m.size();
-
+
set< string > dbs;
for ( map<string,Database*>::iterator i = m.begin(); i != m.end(); i++ ) {
+ wassert( i->second->path == path );
dbs.insert( i->first );
}
-
+
currentClient.get()->getContext()->clear();
BSONObjBuilder bb( result.subarrayStart( "dbs" ) );
@@ -1910,7 +2041,7 @@ namespace mongo {
nNotClosed++;
}
else {
- closeDatabase( name.c_str() , path );
+ Database::closeDatabase( name.c_str() , path );
bb.append( bb.numStr( n++ ) , name );
}
}
@@ -1923,6 +2054,17 @@ namespace mongo {
return true;
}
-
+
+ bool isValidNS( const StringData& ns ) {
+ // TODO: should check for invalid characters
+
+ const char * x = strchr( ns.data() , '.' );
+ if ( ! x )
+ return false;
+
+ x++;
+ return *x > 0;
+ }
+
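
isValidNS accepts any name containing a '.' followed by at least one character. A few spot checks of the boundary cases (sketch):

    // Spot checks for isValidNS above:
    assert( isValidNS( "test.foo" ) );      // db + '.' + collection
    assert( isValidNS( "test.foo.bar" ) );  // only the first '.' matters
    assert( !isValidNS( "test" ) );         // no '.' at all
    assert( !isValidNS( "test." ) );        // nothing after the '.'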
} // namespace mongo