Diffstat (limited to 'db/pdfile.cpp')
-rw-r--r-- | db/pdfile.cpp | 800 |
1 file changed, 471 insertions, 329 deletions
diff --git a/db/pdfile.cpp b/db/pdfile.cpp
index 216f21a..20a7423 100644
--- a/db/pdfile.cpp
+++ b/db/pdfile.cpp
@@ -20,7 +20,6 @@ todo:
 _ table scans must be sequential, not next/prev pointers
 _ coalesce deleted
-_ disallow system* manipulations from the database.
 */
@@ -37,21 +36,21 @@ _ disallow system* manipulations from the database.
 #include "query.h"
 #include "repl.h"
 #include "dbhelpers.h"
-#include "namespace.h"
+#include "namespace-inl.h"
 #include "queryutil.h"
 #include "extsort.h"
-#include "curop.h"
+#include "curop-inl.h"
 #include "background.h"
 
 namespace mongo {
 
     bool inDBRepair = false;
 
     struct doingRepair {
-        doingRepair(){
+        doingRepair() {
             assert( ! inDBRepair );
             inDBRepair = true;
         }
-        ~doingRepair(){
+        ~doingRepair() {
             inDBRepair = false;
         }
     };
@@ -64,42 +63,42 @@ namespace mongo {
         return dbsInProg[db] != 0;
     }
 
-    bool BackgroundOperation::inProgForNs(const char *ns) {
+    bool BackgroundOperation::inProgForNs(const char *ns) {
         assertInWriteLock();
         return nsInProg.count(ns) != 0;
     }
 
-    void BackgroundOperation::assertNoBgOpInProgForDb(const char *db) {
+    void BackgroundOperation::assertNoBgOpInProgForDb(const char *db) {
         uassert(12586,
                 "cannot perform operation: a background operation is currently running for this database",
-                !inProgForDb(db));
+                !inProgForDb(db));
     }
 
-    void BackgroundOperation::assertNoBgOpInProgForNs(const char *ns) {
+    void BackgroundOperation::assertNoBgOpInProgForNs(const char *ns) {
         uassert(12587,
                 "cannot perform operation: a background operation is currently running for this collection",
-                !inProgForNs(ns));
-    }
+                !inProgForNs(ns));
+    }
 
-    BackgroundOperation::BackgroundOperation(const char *ns) : _ns(ns) {
+    BackgroundOperation::BackgroundOperation(const char *ns) : _ns(ns) {
         assertInWriteLock();
         dbsInProg[_ns.db]++;
         assert( nsInProg.count(_ns.ns()) == 0 );
         nsInProg.insert(_ns.ns());
     }
 
-    BackgroundOperation::~BackgroundOperation() {
+    BackgroundOperation::~BackgroundOperation() {
         assertInWriteLock();
         dbsInProg[_ns.db]--;
         nsInProg.erase(_ns.ns());
     }
 
     void BackgroundOperation::dump(stringstream& ss) {
-        if( nsInProg.size() ) {
+        if( nsInProg.size() ) {
             ss << "\n<b>Background Jobs in Progress</b>\n";
             for( set<string>::iterator i = nsInProg.begin(); i != nsInProg.end(); i++ )
                 ss << "  " << *i << '\n';
         }
-        for( map<string,unsigned>::iterator i = dbsInProg.begin(); i != dbsInProg.end(); i++ ) {
-            if( i->second )
+        for( map<string,unsigned>::iterator i = dbsInProg.begin(); i != dbsInProg.end(); i++ ) {
+            if( i->second )
                 ss << "database " << i->first << ": " << i->second << '\n';
         }
     }
@@ -114,24 +113,23 @@ namespace mongo {
 
     DataFileMgr theDataFileMgr;
     DatabaseHolder dbHolder;
     int MAGIC = 0x1000;
-//    int curOp = -2;
 
     extern int otherTraceLevel;
 
     void addNewNamespaceToCatalog(const char *ns, const BSONObj *options = 0);
 
     void ensureIdIndexForNewNs(const char *ns) {
         if ( ( strstr( ns, ".system." ) == 0 || legalClientSystemNS( ns , false ) ) &&
-             strstr( ns, ".$freelist" ) == 0 ){
+                strstr( ns, ".$freelist" ) == 0 ) {
             log( 1 ) << "adding _id index for collection " << ns << endl;
             ensureHaveIdIndex( ns );
-        }
+        }
     }
 
     string getDbContext() {
         stringstream ss;
         Client * c = currentClient.get();
-        if ( c ){
+        if ( c ) {
             Client::Context * cx = c->getContext();
-            if ( cx ){
+            if ( cx ) {
                 Database *database = cx->db();
                 if ( database ) {
                     ss << database->name << ' ';
@@ -142,20 +140,44 @@ namespace mongo {
         return ss.str();
     }
 
-    BSONObj::BSONObj(const Record *r) {
-        init(r->data, false);
-    }
-
     /*---------------------------------------------------------------------*/
 
-    int initialExtentSize(int len) {
+    // inheritable class to implement an operation that may be applied to all
+    // files in a database using _applyOpToDataFiles()
+    class FileOp {
+    public:
+        virtual ~FileOp() {}
+        // Return true if file exists and operation successful
+        virtual bool apply( const boost::filesystem::path &p ) = 0;
+        virtual const char * op() const = 0;
+    };
+
+    void _applyOpToDataFiles( const char *database, FileOp &fo, bool afterAllocator = false, const string& path = dbpath );
+
+    void _deleteDataFiles(const char *database) {
+        if ( directoryperdb ) {
+            FileAllocator::get()->waitUntilFinished();
+            BOOST_CHECK_EXCEPTION( boost::filesystem::remove_all( boost::filesystem::path( dbpath ) / database ) );
+            return;
+        }
+        class : public FileOp {
+            virtual bool apply( const boost::filesystem::path &p ) {
+                return boost::filesystem::remove( p );
+            }
+            virtual const char * op() const {
+                return "remove";
+            }
+        } deleter;
+        _applyOpToDataFiles( database, deleter, true );
+    }
+
+    int Extent::initialSize(int len) {
         long long sz = len * 16;
         if ( len < 1000 ) sz = len * 64;
         if ( sz > 1000000000 )
             sz = 1000000000;
         int z = ((int)sz) & 0xffffff00;
         assert( z > len );
-        //DEV tlog() << "initialExtentSize(" << len << ") returns " << z << endl;
         return z;
     }
@@ -165,7 +187,7 @@ namespace mongo {
             return false;
         }
 
-        log(1) << "create collection " << ns << ' ' << options << '\n';
+        log(1) << "create collection " << ns << ' ' << options << endl;
 
         /* todo: do this only when we have allocated space successfully? or we could insert with a { ok: 0 } field
                  and then go back and set to ok : 1 after we are done.
         */
@@ -174,33 +196,48 @@ namespace mongo {
         if( !isFreeList )
             addNewNamespaceToCatalog(ns, options.isEmpty() ? 0 : &options);
 
-        long long size = initialExtentSize(128);
-        BSONElement e = options.getField("size");
-        if ( e.isNumber() ) {
-            size = e.numberLong();
-            size += 256;
-            size &= 0xffffffffffffff00LL;
+        long long size = Extent::initialSize(128);
+        {
+            BSONElement e = options.getField("size");
+            if ( e.isNumber() ) {
+                size = e.numberLong();
+                size += 256;
+                size &= 0xffffffffffffff00LL;
+            }
         }
-
+
         uassert( 10083 , "invalid size spec", size > 0 );
 
         bool newCapped = false;
         int mx = 0;
-        e = options.getField("capped");
-        if ( e.type() == Bool && e.boolean() ) {
+        if( options.getBoolField("capped") ) {
             newCapped = true;
-            e = options.getField("max");
+            BSONElement e = options.getField("max");
             if ( e.isNumber() ) {
                 mx = e.numberInt();
             }
         }
 
-        // $nExtents just for debug/testing. We create '$nExtents' extents,
-        // each of size 'size'.
-        e = options.getField( "$nExtents" );
-        int nExtents = int( e.number() );
+        // $nExtents just for debug/testing.
+        BSONElement e = options.getField( "$nExtents" );
         Database *database = cc().database();
-        if ( nExtents > 0 ) {
+        if ( e.type() == Array ) {
+            // We create one extent per array entry, with size specified
+            // by the array value.
+            BSONObjIterator i( e.embeddedObject() );
+            while( i.more() ) {
+                BSONElement e = i.next();
+                int size = int( e.number() );
+                assert( size <= 0x7fffffff );
+                // $nExtents is just for testing - always allocate new extents
+                // rather than reuse existing extents so we have some predictibility
+                // in the extent size used by our tests
+                database->suitableFile( (int) size, false )->createExtent( ns, (int) size, newCapped );
+            }
+        }
+        else if ( int( e.number() ) > 0 ) {
+            // We create '$nExtents' extents, each of size 'size'.
+            int nExtents = int( e.number() );
             assert( size <= 0x7fffffff );
             for ( int i = 0; i < nExtents; ++i ) {
                 assert( size <= 0x7fffffff );
@@ -209,10 +246,16 @@ namespace mongo {
                 // in the extent size used by our tests
                 database->suitableFile( (int) size, false )->createExtent( ns, (int) size, newCapped );
             }
-        } else {
+        }
+        else {
+            // This is the non test case, where we don't have a $nExtents spec.
             while ( size > 0 ) {
                 int max = MongoDataFile::maxSize() - DataFileHeader::HeaderSize;
                 int desiredExtentSize = (int) (size > max ? max : size);
+                if ( desiredExtentSize < Extent::minSize() ) {
+                    desiredExtentSize = Extent::minSize();
+                }
+                desiredExtentSize &= 0xffffff00;
                 Extent *e = database->allocExtent( ns, desiredExtentSize, newCapped );
                 size -= e->length;
             }
@@ -223,15 +266,16 @@ namespace mongo {
 
         bool ensure = false;
         if ( options.getField( "autoIndexId" ).type() ) {
-            if ( options["autoIndexId"].trueValue() ){
+            if ( options["autoIndexId"].trueValue() ) {
                 ensure = true;
             }
-        } else {
+        }
+        else {
             if ( !newCapped ) {
                 ensure=true;
             }
         }
-        if( ensure ) {
+        if( ensure ) {
             if( deferIdIndex )
                 *deferIdIndex = true;
             else
@@ -239,7 +283,7 @@ namespace mongo {
         }
 
         if ( mx > 0 )
-            d->max = mx;
+            getDur().writingInt( d->max ) = mx;
 
         return true;
     }
@@ -250,7 +294,7 @@ namespace mongo {
     */
     bool userCreateNS(const char *ns, BSONObj options, string& err, bool logForReplication, bool *deferIdIndex) {
         const char *coll = strchr( ns, '.' ) + 1;
-        massert( 10356 , "invalid ns", coll && *coll );
+        massert( 10356 , str::stream() << "invalid ns: " << ns , coll && *coll );
         char cl[ 256 ];
         nsToDatabase( ns, cl );
         bool ok = _userCreateNS(ns, options, err, deferIdIndex);
@@ -272,14 +316,22 @@ namespace mongo {
     int MongoDataFile::maxSize() {
         if ( sizeof( int* ) == 4 ) {
             return 512 * 1024 * 1024;
-        } else if ( cmdLine.smallfiles ) {
+        }
+        else if ( cmdLine.smallfiles ) {
             return 0x7ff00000 >> 2;
-        } else {
+        }
+        else {
             return 0x7ff00000;
         }
     }
 
-    void MongoDataFile::badOfs(int ofs) const {
+    void MongoDataFile::badOfs2(int ofs) const {
+        stringstream ss;
+        ss << "bad offset:" << ofs << " accessing file: " << mmf.filename() << " - consider repairing database";
+        uasserted(13441, ss.str());
+    }
+
+    void MongoDataFile::badOfs(int ofs) const {
         stringstream ss;
         ss << "bad offset:" << ofs << " accessing file: " << mmf.filename() << " - consider repairing database";
         uasserted(13440, ss.str());
@@ -293,26 +345,18 @@ namespace mongo {
         else
             size = 0x7ff00000;
 
-        if ( strstr(filename, "_hudsonSmall") ) {
-            int mult = 1;
-            if ( fileNo > 1 && fileNo < 1000 )
-                mult = fileNo;
-            size = 1024 * 512 * mult;
-            log() << "Warning : using small files for _hudsonSmall" << endl;
-        }
-        else if ( cmdLine.smallfiles ){
+        if ( cmdLine.smallfiles ) {
             size = size >> 2;
         }
-
-
+
+
         return size;
     }
 
     void MongoDataFile::open( const char *filename, int minSize, bool preallocateOnly ) {
         {
             /* check quotas
-               very simple temporary implementation - we will in future look up
-               the quota from the grid database
+               very simple temporary implementation for now
             */
             if ( cmdLine.quota && fileNo > cmdLine.quotaFiles && !MMF::exists(filename) ) {
                 /* todo: if we were adding / changing keys in an index did we do some
@@ -340,58 +384,66 @@ namespace mongo {
         if ( size > maxSize() )
             size = maxSize();
 
-        assert( ( size >= 64*1024*1024 ) || cmdLine.smallfiles || ( strstr( filename, "_hudsonSmall" ) ) );
+        assert( size >= 64*1024*1024 || cmdLine.smallfiles );
         assert( size % 4096 == 0 );
 
         if ( preallocateOnly ) {
             if ( cmdLine.prealloc ) {
-                theFileAllocator().requestAllocation( filename, size );
+                FileAllocator::get()->requestAllocation( filename, size );
             }
             return;
         }
-
-        _p = mmf.map(filename, size);
-        header = (DataFileHeader *) _p.at(0, DataFileHeader::HeaderSize);
-        if( sizeof(char *) == 4 )
-            uassert( 10084 , "can't map file memory - mongo requires 64 bit build for larger datasets", header);
+
+        {
+            assert( _mb == 0 );
+            unsigned long long sz = size;
+            if( mmf.create(filename, sz, false) )
+                _mb = mmf.getView();
+            assert( sz <= 0x7fffffff );
+            size = (int) sz;
+        }
+        //header = (DataFileHeader *) _p;
+        if( sizeof(char *) == 4 )
+            uassert( 10084 , "can't map file memory - mongo requires 64 bit build for larger datasets", _mb != 0);
         else
-            uassert( 10085 , "can't map file memory", header);
-        header->init(fileNo, size);
+            uassert( 10085 , "can't map file memory", _mb != 0);
+        header()->init(fileNo, size, filename);
     }
 
-    void MongoDataFile::flush( bool sync ){
+    void MongoDataFile::flush( bool sync ) {
        mmf.flush( sync );
     }
 
-    void addNewExtentToNamespace(const char *ns, Extent *e, DiskLoc eloc, DiskLoc emptyLoc, bool capped) {
-        DiskLoc oldExtentLoc;
+    void addNewExtentToNamespace(const char *ns, Extent *e, DiskLoc eloc, DiskLoc emptyLoc, bool capped) {
         NamespaceIndex *ni = nsindex(ns);
         NamespaceDetails *details = ni->details(ns);
         if ( details ) {
             assert( !details->lastExtent.isNull() );
             assert( !details->firstExtent.isNull() );
-            e->xprev = details->lastExtent;
-            details->lastExtent.ext()->xnext = eloc;
+            getDur().writingDiskLoc(e->xprev) = details->lastExtent;
+            getDur().writingDiskLoc(details->lastExtent.ext()->xnext) = eloc;
             assert( !eloc.isNull() );
-            details->lastExtent = eloc;
+            getDur().writingDiskLoc(details->lastExtent) = eloc;
         }
         else {
             ni->add_ns(ns, eloc, capped);
             details = ni->details(ns);
         }
 
-        details->lastExtentSize = e->length;
-        DEBUGGING out() << "temp: newextent adddelrec " << ns << endl;
+        {
+            NamespaceDetails *dw = details->writingWithoutExtra();
+            dw->lastExtentSize = e->length;
+        }
         details->addDeletedRec(emptyLoc.drec(), emptyLoc);
     }
 
     Extent* MongoDataFile::createExtent(const char *ns, int approxSize, bool newCapped, int loops) {
-        massert( 10357 , "shutdown in progress", !goingAway );
-        massert( 10358 , "bad new extent size", approxSize >= 0 && approxSize <= Extent::maxSize() );
-        massert( 10359 , "header==0 on new extent: 32 bit mmap space exceeded?", header ); // null if file open failed
-        int ExtentSize = approxSize <= header->unusedLength ? approxSize : header->unusedLength;
+        massert( 10357 , "shutdown in progress", ! inShutdown() );
+        massert( 10358 , "bad new extent size", approxSize >= Extent::minSize() && approxSize <= Extent::maxSize() );
+        massert( 10359 , "header==0 on new extent: 32 bit mmap space exceeded?", header() ); // null if file open failed
+        int ExtentSize = approxSize <= header()->unusedLength ? approxSize : header()->unusedLength;
         DiskLoc loc;
-        if ( ExtentSize <= 0 ) {
+        if ( ExtentSize < Extent::minSize() ) {
            /* not there could be a lot of looping here is db just started and
               no files are open yet.  we might want to do something about that. */
            if ( loops > 8 ) {
@@ -401,12 +453,14 @@ namespace mongo {
             log() << "newExtent: " << ns << " file " << fileNo << " full, adding a new file\n";
             return cc().database()->addAFile( 0, true )->createExtent(ns, approxSize, newCapped, loops+1);
         }
-        int offset = header->unused.getOfs();
-        header->unused.setOfs( fileNo, offset + ExtentSize );
-        header->unusedLength -= ExtentSize;
-        loc.setOfs(fileNo, offset);
+        int offset = header()->unused.getOfs();
+
+        DataFileHeader *h = getDur().writing(header());
+        h->unused.set( fileNo, offset + ExtentSize );
+        h->unusedLength -= ExtentSize;
+        loc.set(fileNo, offset);
         Extent *e = _getExtent(loc);
-        DiskLoc emptyLoc = e->init(ns, ExtentSize, fileNo, offset);
+        DiskLoc emptyLoc = getDur().writing(e)->init(ns, ExtentSize, fileNo, offset);
 
         addNewExtentToNamespace(ns, e, loc, emptyLoc, newCapped);
@@ -415,7 +469,7 @@ namespace mongo {
         return e;
     }
 
-    Extent* DataFileMgr::allocFromFreeList(const char *ns, int approxSize, bool capped) {
+    Extent* DataFileMgr::allocFromFreeList(const char *ns, int approxSize, bool capped) {
         string s = cc().database()->name + ".$freelist";
         NamespaceDetails *f = nsdetails(s.c_str());
         if( f ) {
@@ -426,7 +480,7 @@ namespace mongo {
                 if( low > 2048 ) low -= 256;
                 high = (int) (approxSize * 1.05) + 256;
             }
-            else {
+            else {
                 low = (int) (approxSize * 0.8);
                 high = (int) (approxSize * 1.4);
             }
@@ -436,20 +490,20 @@ namespace mongo {
             int bestDiff = 0x7fffffff;
             {
                 DiskLoc L = f->firstExtent;
-                while( !L.isNull() ) {
+                while( !L.isNull() ) {
                     Extent * e = L.ext();
-                    if( e->length >= low && e->length <= high ) {
+                    if( e->length >= low && e->length <= high ) {
                         int diff = abs(e->length - approxSize);
-                        if( diff < bestDiff ) {
+                        if( diff < bestDiff ) {
                             bestDiff = diff;
                             best = e;
-                            if( diff == 0 )
+                            if( diff == 0 )
                                 break;
                         }
                     }
                     L = e->xnext;
                     ++n;
-
+
                 }
             }
             OCCASIONALLY if( n > 512 ) log() << "warning: newExtent " << n << " scanned\n";
@@ -457,13 +511,13 @@ namespace mongo {
                 Extent *e = best;
                 // remove from the free list
                 if( !e->xprev.isNull() )
-                    e->xprev.ext()->xnext = e->xnext;
+                    e->xprev.ext()->xnext.writing() = e->xnext;
                 if( !e->xnext.isNull() )
-                    e->xnext.ext()->xprev = e->xprev;
+                    e->xnext.ext()->xprev.writing() = e->xprev;
                 if( f->firstExtent == e->myLoc )
-                    f->firstExtent = e->xnext;
+                    f->firstExtent.writing() = e->xnext;
                 if( f->lastExtent == e->myLoc )
-                    f->lastExtent = e->xprev;
+                    f->lastExtent.writing() = e->xprev;
 
                 // use it
                 OCCASIONALLY if( n > 512 ) log() << "warning: newExtent " << n << " scanned\n";
@@ -479,9 +533,11 @@ namespace mongo {
 
     /*---------------------------------------------------------------------*/
 
-    DiskLoc Extent::reuse(const char *nsname) {
-        /*TODOMMF - work to do when extent is freed. */
-        log(3) << "reset extent was:" << nsDiagnostic.buf << " now:" << nsname << '\n';
+    DiskLoc Extent::reuse(const char *nsname) {
+        return getDur().writing(this)->_reuse(nsname);
+    }
+    DiskLoc Extent::_reuse(const char *nsname) {
+        log(3) << "reset extent was:" << nsDiagnostic.toString() << " now:" << nsname << '\n';
         massert( 10360 , "Extent::reset bad magic value", magic == 0x41424344 );
         xnext.Null();
         xprev.Null();
@@ -493,12 +549,9 @@ namespace mongo {
         emptyLoc.inc( (int) (_extentData-(char*)this) );
 
         int delRecLength = length - (_extentData - (char *) this);
-        //DeletedRecord *empty1 = (DeletedRecord *) extentData;
-        DeletedRecord *empty = DataFileMgr::makeDeletedRecord(emptyLoc, delRecLength);//(DeletedRecord *) getRecord(emptyLoc);
-        //assert( empty == empty1 );
-
-        // do we want to zero the record? memset(empty, ...)
+
+        DeletedRecord *empty = DataFileMgr::makeDeletedRecord(emptyLoc, delRecLength);//(DeletedRecord *) getRecord(emptyLoc);
+        empty = getDur().writing(empty);
         empty->lengthWithHeaders = delRecLength;
         empty->extentOfs = myLoc.getOfs();
         empty->nextDeleted.Null();
@@ -509,7 +562,7 @@ namespace mongo {
     /* assumes already zeroed -- insufficient for block 'reuse' perhaps */
     DiskLoc Extent::init(const char *nsname, int _length, int _fileNo, int _offset) {
         magic = 0x41424344;
-        myLoc.setOfs(_fileNo, _offset);
+        myLoc.set(_fileNo, _offset);
         xnext.Null();
         xprev.Null();
         nsDiagnostic = nsname;
@@ -521,9 +574,7 @@ namespace mongo {
         emptyLoc.inc( (int) (_extentData-(char*)this) );
 
         int l = _length - (_extentData - (char *) this);
-        //DeletedRecord *empty1 = (DeletedRecord *) extentData;
-        DeletedRecord *empty = DataFileMgr::makeDeletedRecord(emptyLoc, l);
-        //assert( empty == empty1 );
+        DeletedRecord *empty = getDur().writing( DataFileMgr::makeDeletedRecord(emptyLoc, l) );
         empty->lengthWithHeaders = l;
         empty->extentOfs = myLoc.getOfs();
         return emptyLoc;
@@ -582,7 +633,7 @@ namespace mongo {
         }
         return maxExtentSize;
     }
-
+
     /*---------------------------------------------------------------------*/
 
     shared_ptr<Cursor> DataFileMgr::findAll(const char *ns, const DiskLoc &startLoc) {
@@ -612,12 +663,12 @@ namespace mongo {
             d->dumpDeleted(&extents);
         }
 
-        if ( d->capped )
+        if ( d->capped )
             return shared_ptr<Cursor>( new ForwardCappedCursor( d , startLoc ) );
-
+
         if ( !startLoc.isNull() )
-            return shared_ptr<Cursor>(new BasicCursor( startLoc ));
-
+            return shared_ptr<Cursor>(new BasicCursor( startLoc ));
+
         while ( e->firstRecord.isNull() && !e->xnext.isNull() ) {
             /* todo: if extent is empty, free it for reuse elsewhere.
                that is a bit complicated have to clean up the freelists.
@@ -638,37 +689,38 @@ namespace mongo {
         if ( el.number() >= 0 )
             return DataFileMgr::findAll(ns, startLoc);
-
+
         // "reverse natural order"
         NamespaceDetails *d = nsdetails(ns);
-
+
         if ( !d )
             return shared_ptr<Cursor>(new BasicCursor(DiskLoc()));
-
+
         if ( !d->capped ) {
             if ( !startLoc.isNull() )
-                return shared_ptr<Cursor>(new ReverseCursor( startLoc ));
+                return shared_ptr<Cursor>(new ReverseCursor( startLoc ));
             Extent *e = d->lastExtent.ext();
             while ( e->lastRecord.isNull() && !e->xprev.isNull() ) {
                 OCCASIONALLY out() << "  findTableScan: extent empty, skipping ahead" << endl;
                 e = e->getPrevExtent();
             }
             return shared_ptr<Cursor>(new ReverseCursor( e->lastRecord ));
-        } else {
+        }
+        else {
             return shared_ptr<Cursor>( new ReverseCappedCursor( d, startLoc ) );
         }
     }
 
-    void printFreeList() {
+    void printFreeList() {
         string s = cc().database()->name + ".$freelist";
         log() << "dump freelist " << s << '\n';
         NamespaceDetails *freeExtents = nsdetails(s.c_str());
-        if( freeExtents == 0 ) {
+        if( freeExtents == 0 ) {
             log() << "  freeExtents==0" << endl;
             return;
         }
         DiskLoc a = freeExtents->firstExtent;
-        while( !a.isNull() ) {
+        while( !a.isNull() ) {
             Extent *e = a.ext();
             log() << "  " << a.toString() << " len:" << e->length << " prev:" << e->xprev.toString() << '\n';
             a = e->xnext;
@@ -687,7 +739,7 @@ namespace mongo {
         NamespaceString s(nsToDrop);
         assert( s.db == cc().database()->name );
         if( s.isSystem() ) {
-            if( s.coll == "system.profile" )
+            if( s.coll == "system.profile" )
                 uassert( 10087 , "turn off profiling before dropping system.profile collection", cc().database()->profile == 0 );
             else
                 uasserted( 12502, "can't drop system ns" );
@@ -698,32 +750,31 @@ namespace mongo {
             BSONObj cond = BSON( "name" << nsToDrop );   // { name: "colltodropname" }
             string system_namespaces = cc().database()->name + ".system.namespaces";
             /*int n = */ deleteObjects(system_namespaces.c_str(), cond, false, false, true);
-            // no check of return code as this ns won't exist for some of the new storage engines
+            // no check of return code as this ns won't exist for some of the new storage engines
         }
 
         // free extents
         if( !d->firstExtent.isNull() ) {
             string s = cc().database()->name + ".$freelist";
             NamespaceDetails *freeExtents = nsdetails(s.c_str());
-            if( freeExtents == 0 ) {
+            if( freeExtents == 0 ) {
                 string err;
                 _userCreateNS(s.c_str(), BSONObj(), err, 0);
                 freeExtents = nsdetails(s.c_str());
                 massert( 10361 , "can't create .$freelist", freeExtents);
             }
-            if( freeExtents->firstExtent.isNull() ) {
-                freeExtents->firstExtent = d->firstExtent;
-                freeExtents->lastExtent = d->lastExtent;
+            if( freeExtents->firstExtent.isNull() ) {
+                freeExtents->firstExtent.writing() = d->firstExtent;
+                freeExtents->lastExtent.writing() = d->lastExtent;
             }
-            else {
+            else {
                 DiskLoc a = freeExtents->firstExtent;
                 assert( a.ext()->xprev.isNull() );
-                a.ext()->xprev = d->lastExtent;
-                d->lastExtent.ext()->xnext = a;
-                freeExtents->firstExtent = d->firstExtent;
-
-                d->firstExtent.setInvalid();
-                d->lastExtent.setInvalid();
+                getDur().writingDiskLoc( a.ext()->xprev ) = d->lastExtent;
+                getDur().writingDiskLoc( d->lastExtent.ext()->xnext ) = a;
+                getDur().writingDiskLoc( freeExtents->firstExtent ) = d->firstExtent;
+                getDur().writingDiskLoc( d->firstExtent ).setInvalid();
+                getDur().writingDiskLoc( d->lastExtent ).setInvalid();
             }
         }
@@ -740,7 +791,7 @@ namespace mongo {
         BackgroundOperation::assertNoBgOpInProgForNs(name.c_str());
 
         if ( d->nIndexes != 0 ) {
-            try {
+            try {
                 assert( dropIndexes(d, name.c_str(), "*", errmsg, result, true) );
             }
             catch( DBException& e ) {
@@ -754,11 +805,10 @@ namespace mongo {
         log(1) << "\t dropIndexes done" << endl;
         result.append("ns", name.c_str());
         ClientCursor::invalidate(name.c_str());
-        Client::invalidateNS( name );
         Top::global.collectionDropped( name );
-        dropNS(name);
+        dropNS(name);
     }
-
+
     int nUnindexes = 0;
 
     /* unindex all keys in index for this record. */
@@ -797,63 +847,69 @@ namespace mongo {
         int n = d->nIndexes;
         for ( int i = 0; i < n; i++ )
             _unindexRecord(d->idx(i), obj, dl, !noWarn);
-        if( d->backgroundIndexBuildInProgress ) {
+        if( d->indexBuildInProgress ) { // background index
            // always pass nowarn here, as this one may be missing for valid reasons as we are concurrently building it
-            _unindexRecord(d->idx(n), obj, dl, false);
+            _unindexRecord(d->idx(n), obj, dl, false);
        }
    }
 
-    /* deletes a record, just the pdfile portion -- no index cleanup, no cursor cleanup, etc.
+    /* deletes a record, just the pdfile portion -- no index cleanup, no cursor cleanup, etc.
       caller must check if capped
    */
-    void DataFileMgr::_deleteRecord(NamespaceDetails *d, const char *ns, Record *todelete, const DiskLoc& dl)
-    {
+    void DataFileMgr::_deleteRecord(NamespaceDetails *d, const char *ns, Record *todelete, const DiskLoc& dl) {
        /* remove ourself from the record next/prev chain */
        {
            if ( todelete->prevOfs != DiskLoc::NullOfs )
-                todelete->getPrev(dl).rec()->nextOfs = todelete->nextOfs;
+                getDur().writingInt( todelete->getPrev(dl).rec()->nextOfs ) = todelete->nextOfs;
            if ( todelete->nextOfs != DiskLoc::NullOfs )
-                todelete->getNext(dl).rec()->prevOfs = todelete->prevOfs;
+                getDur().writingInt( todelete->getNext(dl).rec()->prevOfs ) = todelete->prevOfs;
        }
 
        /* remove ourself from extent pointers */
        {
-            Extent *e = todelete->myExtent(dl);
+            Extent *e = getDur().writing( todelete->myExtent(dl) );
            if ( e->firstRecord == dl ) {
                if ( todelete->nextOfs == DiskLoc::NullOfs )
                    e->firstRecord.Null();
                else
-                    e->firstRecord.setOfs(dl.a(), todelete->nextOfs);
+                    e->firstRecord.set(dl.a(), todelete->nextOfs);
            }
            if ( e->lastRecord == dl ) {
                if ( todelete->prevOfs == DiskLoc::NullOfs )
                    e->lastRecord.Null();
                else
-                    e->lastRecord.setOfs(dl.a(), todelete->prevOfs);
+                    e->lastRecord.set(dl.a(), todelete->prevOfs);
            }
        }
 
        /* add to the free list */
        {
-            d->nrecords--;
-            d->datasize -= todelete->netLength();
-            /* temp: if in system.indexes, don't reuse, and zero out: we want to be
-               careful until validated more, as IndexDetails has pointers
-               to this disk location.  so an incorrectly done remove would cause
-               a lot of problems.
-            */
+            {
+                NamespaceDetails::Stats *s = getDur().writing(&d->stats);
+                s->datasize -= todelete->netLength();
+                s->nrecords--;
+            }
+
            if ( strstr(ns, ".system.indexes") ) {
-                memset(todelete, 0, todelete->lengthWithHeaders);
+                /* temp: if in system.indexes, don't reuse, and zero out: we want to be
+                   careful until validated more, as IndexDetails has pointers
+                   to this disk location.  so an incorrectly done remove would cause
+                   a lot of problems.
+                */
+                memset(getDur().writingPtr(todelete, todelete->lengthWithHeaders), 0, todelete->lengthWithHeaders);
            }
            else {
-                DEV memset(todelete->data, 0, todelete->netLength()); // attempt to notice invalid reuse.
+                DEV {
+                    unsigned long long *p = (unsigned long long *) todelete->data;
+                    *getDur().writing(p) = 0;
+                    //DEV memset(todelete->data, 0, todelete->netLength()); // attempt to notice invalid reuse.
+                }
                d->addDeletedRec((DeletedRecord*)todelete, dl);
            }
        }
    }
 
-    void DataFileMgr::deleteRecord(const char *ns, Record *todelete, const DiskLoc& dl, bool cappedOK, bool noWarn)
-    {
+    void DataFileMgr::deleteRecord(const char *ns, Record *todelete, const DiskLoc& dl, bool cappedOK, bool noWarn) {
        dassert( todelete == dl.rec() );
 
        NamespaceDetails* d = nsdetails(ns);
@@ -880,8 +936,7 @@ namespace mongo {
                                  NamespaceDetails *d,
                                  NamespaceDetailsTransient *nsdt,
                                  Record *toupdate, const DiskLoc& dl,
-                                  const char *_buf, int _len, OpDebug& debug, bool &changedId, bool god)
-    {
+                                  const char *_buf, int _len, OpDebug& debug, bool god) {
        StringBuilder& ss = debug.str;
        dassert( toupdate == dl.rec() );
@@ -891,7 +946,7 @@ namespace mongo {
        DEV assert( objNew.objdata() == _buf );
 
        if( !objNew.hasElement("_id") && objOld.hasElement("_id") ) {
-            /* add back the old _id value if the update removes it.  Note this implementation is slow
+            /* add back the old _id value if the update removes it.  Note this implementation is slow
               (copies entire object multiple times), but this shouldn't happen often, so going for simple
               code, not speed.
            */
@@ -903,11 +958,13 @@ namespace mongo {
            objNew = b.obj();
        }
 
-        /* duplicate key check. we descend the btree twice - once for this check, and once for the actual inserts, further
+        /* duplicate key check. we descend the btree twice - once for this check, and once for the actual inserts, further
           below.  that is suboptimal, but it's pretty complicated to do it the other way without rollbacks...
        */
        vector<IndexChanges> changes;
+        bool changedId = false;
        getIndexChanges(changes, *d, objNew, objOld, changedId);
+        uassert( 13596 , str::stream() << "cannot change _id of a document old:" << objOld << " new:" << objNew , ! changedId );
        dupCheck(changes, *d, dl);
 
        if ( toupdate->netLength() < objNew.objsize() ) {
@@ -946,8 +1003,8 @@ namespace mongo {
                try {
                    /* we did the dupCheck() above.  so we don't have to worry about it here. */
                    idx.head.btree()->bt_insert(
-                        idx.head,
-                        dl, *changes[x].added[i], ordering, /*dupsAllowed*/true, idx);
+                        idx.head,
+                        dl, *changes[x].added[i], ordering, /*dupsAllowed*/true, idx);
                }
                catch (AssertionException& e) {
                    ss << " exception update index ";
@@ -959,25 +1016,30 @@ namespace mongo {
            ss << '\n' << keyUpdates << " key updates ";
        }
 
-        //  update in place
-        memcpy(toupdate->data, objNew.objdata(), objNew.objsize());
+        //  update in place
+        int sz = objNew.objsize();
+        memcpy(getDur().writingPtr(toupdate->data, sz), objNew.objdata(), sz);
        return dl;
    }
 
-    int followupExtentSize(int len, int lastExtentLen) {
+    int Extent::followupSize(int len, int lastExtentLen) {
        assert( len < Extent::maxSize() );
-        int x = initialExtentSize(len);
+        int x = initialSize(len);
        int y = (int) (lastExtentLen < 4000000 ? lastExtentLen * 4.0 : lastExtentLen * 1.2);
        int sz = y > x ? y : x;
-        if ( sz < lastExtentLen )
-            sz = lastExtentLen;
-        else if ( sz > Extent::maxSize() )
+
+        if ( sz < lastExtentLen ) {
+            // this means there was an int overflow
+            // so we should turn it into maxSize
+            sz = Extent::maxSize();
+        }
+        else if ( sz > Extent::maxSize() ) {
            sz = Extent::maxSize();
-
+        }
+
        sz = ((int)sz) & 0xffffff00;
        assert( sz > len );
-
+
        return sz;
    }
@@ -990,7 +1052,7 @@ namespace mongo {
        Ordering ordering = Ordering::make(order);
        int n = 0;
        for ( BSONObjSetDefaultOrder::iterator i=keys.begin(); i != keys.end(); i++ ) {
-            if( ++n == 2 ) {
+            if( ++n == 2 ) {
                d->setIndexIsMultikey(idxNo);
            }
            assert( !recordLoc.isNull() );
@@ -999,7 +1061,7 @@ namespace mongo {
                                                      *i, ordering, dupsAllowed, idx);
            }
            catch (AssertionException& e) {
-                if( e.getCode() == 10287 && idxNo == d->nIndexes ) {
+                if( e.getCode() == 10287 && idxNo == d->nIndexes ) {
                    DEV log() << "info: caught key already in index on bg indexing (ok)" << endl;
                    continue;
                }
@@ -1012,8 +1074,7 @@ namespace mongo {
        }
    }
 
-    void testSorting()
-    {
+    void testSorting() {
        BSONObjBuilder b;
        b.appendNull("");
        BSONObj x = b.obj();
@@ -1027,9 +1088,9 @@ namespace mongo {
        sorter.add(x, DiskLoc(3,77));
 
        sorter.sort();
-
+
        auto_ptr<BSONObjExternalSorter::Iterator> i = sorter.iterator();
-        while( i->more() ) {
+        while( i->more() ) {
            BSONObjExternalSorter::Data d = i->next();
            /*cout << d.second.toString() << endl;
            cout << d.first.objsize() << endl;
@@ -1039,7 +1100,6 @@ namespace mongo {
 
    // throws DBException
    unsigned long long fastBuildIndex(const char *ns, NamespaceDetails *d, IndexDetails& idx, int idxNo) {
-        assert( d->backgroundIndexBuildInProgress == 0 );
        CurOp * op = cc().curop();
 
        Timer t;
@@ -1050,17 +1110,17 @@ namespace mongo {
        bool dropDups = idx.dropDups() || inDBRepair;
        BSONObj order = idx.keyPattern();
 
-        idx.head.Null();
-
+        getDur().writingDiskLoc(idx.head).Null();
+
        if ( logLevel > 1 ) printMemInfo( "before index start" );
 
        /* get and sort all the keys ----- */
        unsigned long long n = 0;
        shared_ptr<Cursor> c = theDataFileMgr.findAll(ns);
        BSONObjExternalSorter sorter(order);
-        sorter.hintNumObjects( d->nrecords );
+        sorter.hintNumObjects( d->stats.nrecords );
        unsigned long long nkeys = 0;
-        ProgressMeterHolder pm( op->setMessage( "index: (1/3) external sort" , d->nrecords , 10 ) );
+        ProgressMeterHolder pm( op->setMessage( "index: (1/3) external sort" , d->stats.nrecords , 10 ) );
        while ( c->ok() ) {
            BSONObj o = c->current();
            DiskLoc loc = c->currLoc();
@@ -1069,17 +1129,17 @@ namespace mongo {
            idx.getKeysFromObject(o, keys);
            int k = 0;
            for ( BSONObjSetDefaultOrder::iterator i=keys.begin(); i != keys.end(); i++ ) {
-                if( ++k == 2 )
+                if( ++k == 2 ) {
                    d->setIndexIsMultikey(idxNo);
-                //cout<<"SORTER ADD " << i->toString() << ' ' << loc.toString() << endl;
+                }
                sorter.add(*i, loc);
                nkeys++;
            }
-
+
            c->advance();
            n++;
            pm.hit();
-            if ( logLevel > 1 && n % 10000 == 0 ){
+            if ( logLevel > 1 && n % 10000 == 0 ) {
                printMemInfo( "\t iterating objects" );
            }
@@ -1089,37 +1149,37 @@ namespace mongo {
        if ( logLevel > 1 ) printMemInfo( "before final sort" );
        sorter.sort();
        if ( logLevel > 1 ) printMemInfo( "after final sort" );
-
+
        log(t.seconds() > 5 ? 0 : 1) << "\t external sort used : " << sorter.numFiles() << " files " << " in " << t.seconds() << " secs" << endl;
 
        list<DiskLoc> dupsToDrop;
 
-        /* build index --- */
+        /* build index --- */
        {
            BtreeBuilder btBuilder(dupsAllowed, idx);
            BSONObj keyLast;
            auto_ptr<BSONObjExternalSorter::Iterator> i = sorter.iterator();
            assert( pm == op->setMessage( "index: (2/3) btree bottom up" , nkeys , 10 ) );
-            while( i->more() ) {
+            while( i->more() ) {
                RARELY killCurrentOp.checkForInterrupt();
                BSONObjExternalSorter::Data d = i->next();
 
-                try {
+                try {
                    btBuilder.addKey(d.first, d.second);
                }
-                catch( AssertionException& e ) {
-                    if ( dupsAllowed ){
+                catch( AssertionException& e ) {
+                    if ( dupsAllowed ) {
                        // unknow exception??
                        throw;
                    }
-
+
                    if( e.interrupted() )
                        throw;
 
                    if ( ! dropDups )
                        throw;
 
-                    /* we could queue these on disk, but normally there are very few dups, so instead we
+                    /* we could queue these on disk, but normally there are very few dups, so instead we
                       keep in ram and have a limit.
                    */
                    dupsToDrop.push_back(d.second);
@@ -1131,9 +1191,11 @@ namespace mongo {
            op->setMessage( "index: (3/3) btree-middle" );
            log(t.seconds() > 10 ? 0 : 1 ) << "\t done building bottom layer, going to commit" << endl;
            btBuilder.commit();
-            wassert( btBuilder.getn() == nkeys || dropDups );
+            if ( btBuilder.getn() != nkeys && ! dropDups ) {
+                warning() << "not all entries were added to the index, probably some keys were too large" << endl;
+            }
        }
-
+
        log(1) << "\t fastBuildIndex dupsToDrop:" << dupsToDrop.size() << endl;
 
        for( list<DiskLoc>::iterator i = dupsToDrop.begin(); i != dupsToDrop.end(); i++ )
@@ -1142,13 +1204,13 @@ namespace mongo {
        return n;
    }
 
-    class BackgroundIndexBuildJob : public BackgroundOperation {
+    class BackgroundIndexBuildJob : public BackgroundOperation {
 
        unsigned long long addExistingToIndex(const char *ns, NamespaceDetails *d, IndexDetails& idx, int idxNo) {
            bool dupsAllowed = !idx.unique();
            bool dropDups = idx.dropDups();
 
-            ProgressMeter& progress = cc().curop()->setMessage( "bg index build" , d->nrecords );
+            ProgressMeter& progress = cc().curop()->setMessage( "bg index build" , d->stats.nrecords );
 
            unsigned long long n = 0;
            auto_ptr<ClientCursor> cc;
@@ -1156,25 +1218,26 @@ namespace mongo {
                shared_ptr<Cursor> c = theDataFileMgr.findAll(ns);
                cc.reset( new ClientCursor(QueryOption_NoCursorTimeout, c, ns) );
            }
-            CursorId id = cc->cursorid;
+            CursorId id = cc->cursorid();
 
-            while ( cc->c->ok() ) {
-                BSONObj js = cc->c->current();
-                try {
-                    _indexRecord(d, idxNo, js, cc->c->currLoc(), dupsAllowed);
-                    cc->c->advance();
-                } catch( AssertionException& e ) {
+            while ( cc->ok() ) {
+                BSONObj js = cc->current();
+                try {
+                    _indexRecord(d, idxNo, js, cc->currLoc(), dupsAllowed);
+                    cc->advance();
+                }
+                catch( AssertionException& e ) {
                    if( e.interrupted() )
                        throw;
 
                    if ( dropDups ) {
-                        DiskLoc toDelete = cc->c->currLoc();
-                        bool ok = cc->c->advance();
+                        DiskLoc toDelete = cc->currLoc();
+                        bool ok = cc->advance();
                        cc->updateLocation();
                        theDataFileMgr.deleteRecord( ns, toDelete.rec(), toDelete, false, true );
                        if( ClientCursor::find(id, false) == 0 ) {
                            cc.release();
-                            if( !ok ) {
+                            if( !ok ) {
                                /* we were already at the end. normal. */
                            }
                            else {
@@ -1182,7 +1245,8 @@ namespace mongo {
                            }
                            break;
                        }
-                    } else {
+                    }
+                    else {
                        log() << "background addExistingToIndex exception " << e.what() << endl;
                        throw;
                    }
@@ -1200,7 +1264,7 @@ namespace mongo {
            return n;
        }
 
-        /* we do set a flag in the namespace for quick checking, but this is our authoritative info -
+        /* we do set a flag in the namespace for quick checking, but this is our authoritative info -
           that way on a crash/restart, we don't think we are still building one. */
        set<NamespaceDetails*> bgJobsInProgress;
@@ -1208,12 +1272,8 @@ namespace mongo {
            assertInWriteLock();
            uassert( 13130 , "can't start bg index b/c in recursive lock (db.eval?)" , dbMutex.getState() == 1 );
            bgJobsInProgress.insert(d);
-            d->backgroundIndexBuildInProgress = 1;
-            d->nIndexes--;
        }
        void done(const char *ns, NamespaceDetails *d) {
-            d->nIndexes++;
-            d->backgroundIndexBuildInProgress = 0;
            NamespaceDetailsTransient::get_w(ns).addedIndex(); // clear query optimizer cache
            assertInWriteLock();
        }
@@ -1221,16 +1281,16 @@ namespace mongo {
    public:
        BackgroundIndexBuildJob(const char *ns) : BackgroundOperation(ns) { }
 
-        unsigned long long go(string ns, NamespaceDetails *d, IndexDetails& idx, int idxNo) {
+        unsigned long long go(string ns, NamespaceDetails *d, IndexDetails& idx, int idxNo) {
            unsigned long long n = 0;
 
            prep(ns.c_str(), d);
            assert( idxNo == d->nIndexes );
-            try {
+            try {
                idx.head = BtreeBucket::addBucket(idx);
                n = addExistingToIndex(ns.c_str(), d, idx, idxNo);
            }
-            catch(...) {
+            catch(...) {
                if( cc().database() && nsdetails(ns.c_str()) == d ) {
                    assert( idxNo == d->nIndexes );
                    done(ns.c_str(), d);
@@ -1246,25 +1306,51 @@ namespace mongo {
        }
    };
 
+    /**
+     * For the lifetime of this object, an index build is indicated on the specified
+     * namespace and the newest index is marked as absent.  This simplifies
+     * the cleanup required on recovery.
+     */
+    class RecoverableIndexState {
+    public:
+        RecoverableIndexState( NamespaceDetails *d ) : _d( d ) {
+            indexBuildInProgress() = 1;
+            nIndexes()--;
+        }
+        ~RecoverableIndexState() {
+            DESTRUCTOR_GUARD (
+                nIndexes()++;
+                indexBuildInProgress() = 0;
+            )
+        }
+    private:
+        int &nIndexes() { return getDur().writingInt( _d->nIndexes ); }
+        int &indexBuildInProgress() { return getDur().writingInt( _d->indexBuildInProgress ); }
+        NamespaceDetails *_d;
+    };
+
    // throws DBException
-    static void buildAnIndex(string ns, NamespaceDetails *d, IndexDetails& idx, int idxNo, bool background) {
+    static void buildAnIndex(string ns, NamespaceDetails *d, IndexDetails& idx, int idxNo, bool background) {
        tlog() << "building new index on " << idx.keyPattern() << " for " << ns << ( background ? " background" : "" ) << endl;
        Timer t;
-        unsigned long long n;
+        unsigned long long n;
 
        if( background ) {
            log(2) << "buildAnIndex: background=true\n";
        }
 
        assert( !BackgroundOperation::inProgForNs(ns.c_str()) ); // should have been checked earlier, better not be...
+        assert( d->indexBuildInProgress == 0 );
+        assertInWriteLock();
+        RecoverableIndexState recoverable( d );
 
        if( inDBRepair || !background ) {
-            n = fastBuildIndex(ns.c_str(), d, idx, idxNo);
-            assert( !idx.head.isNull() );
-        }
-        else {
+            n = fastBuildIndex(ns.c_str(), d, idx, idxNo);
+            assert( !idx.head.isNull() );
+        }
+        else {
            BackgroundIndexBuildJob j(ns.c_str());
            n = j.go(ns, d, idx, idxNo);
-        }
+        }
        tlog() << "done for " << n << " records " << t.millis() / 1000.0 << "secs" << endl;
    }
@@ -1272,20 +1358,20 @@ namespace mongo {
    static void indexRecord(NamespaceDetails *d, BSONObj obj, DiskLoc loc) {
        int n = d->nIndexesBeingBuilt();
        for ( int i = 0; i < n; i++ ) {
-            try {
+            try {
                bool unique = d->idx(i).unique();
                _indexRecord(d, i, obj, loc, /*dupsAllowed*/!unique);
            }
-            catch( DBException& ) {
+            catch( DBException& ) {
                /* try to roll back previously added index entries
                   note <= i (not < i) is important here as the index we were just attempted
                   may be multikey and require some cleanup.
                */
-                for( int j = 0; j <= i; j++ ) {
+                for( int j = 0; j <= i; j++ ) {
                    try {
                        _unindexRecord(d->idx(j), obj, loc, false);
                    }
-                    catch(...) {
+                    catch(...) {
                        log(3) << "unindex fails on rollback after unique failure\n";
                    }
                }
@@ -1301,7 +1387,7 @@ namespace mongo {
        if ( d == 0 || (d->flags & NamespaceDetails::Flag_HaveIdIndex) )
            return;
 
-        d->flags |= NamespaceDetails::Flag_HaveIdIndex;
+        *getDur().writing(&d->flags) |= NamespaceDetails::Flag_HaveIdIndex;
 
        {
            NamespaceDetails::IndexIterator i = d->ii();
@@ -1324,7 +1410,7 @@ namespace mongo {
    }
 
 #pragma pack(1)
-    struct IDToInsert_ {
+    struct IDToInsert_ {
        char type;
        char _id[4];
        OID oid;
@@ -1338,13 +1424,13 @@ namespace mongo {
        IDToInsert() : BSONElement( ( char * )( &idToInsert_ ) ) {}
    } idToInsert;
 #pragma pack()
-
+
    void DataFileMgr::insertAndLog( const char *ns, const BSONObj &o, bool god ) {
        BSONObj tmp = o;
        insertWithObjMod( ns, tmp, god );
        logOp( "i", ns, tmp );
    }
-
+
    DiskLoc DataFileMgr::insertWithObjMod(const char *ns, BSONObj &o, bool god) {
        DiskLoc loc = insert( ns, o.objdata(), o.objsize(), god );
        if ( !loc.isNull() )
@@ -1356,12 +1442,12 @@ namespace mongo {
        insert( ns, o.objdata(), o.objsize(), god );
    }
 
-    bool prepareToBuildIndex(const BSONObj& io, bool god, string& sourceNS, NamespaceDetails *&sourceCollection);
+    bool prepareToBuildIndex(const BSONObj& io, bool god, string& sourceNS, NamespaceDetails *&sourceCollection, BSONObj& fixedIndexObject );
 
    // We are now doing two btree scans for all unique indexes (one here, and one when we've
    // written the record to the collection.  This could be made more efficient inserting
    // dummy data here, keeping pointers to the btree nodes holding the dummy data and then
-    // updating the dummy data with the DiskLoc of the real record.
+    // updating the dummy data with the DiskLoc of the real record.
    void checkNoIndexConflicts( NamespaceDetails *d, const BSONObj &obj ) {
        for ( int idxNo = 0; idxNo < d->nIndexes; idxNo++ ) {
            if( d->idx(idxNo).unique() ) {
                IndexDetails& idx = d->idx(idxNo);
                BSONObjSetDefaultOrder keys;
                idx.getKeysFromObject(obj, keys);
                BSONObj order = idx.keyPattern();
                for ( BSONObjSetDefaultOrder::iterator i=keys.begin(); i != keys.end(); i++ ) {
                    uassert( 12582, "duplicate key insert for unique index of capped collection",
-                             idx.head.btree()->findSingle(idx, idx.head, *i ).isNull() );
+                             idx.head.btree()->findSingle(idx, idx.head, *i ).isNull() );
                }
            }
-        }
+        }
    }
 
-    /* note: if god==true, you may pass in obuf of NULL and then populate the returned DiskLoc
+    /* note: if god==true, you may pass in obuf of NULL and then populate the returned DiskLoc
       after the call -- that will prevent a double buffer copy in some cases (btree.cpp).
    */
    DiskLoc DataFileMgr::insert(const char *ns, const void *obuf, int len, bool god, const BSONElement &writeId, bool mayAddIndex) {
        bool wouldAddIndex = false;
-        massert( 10093 , "cannot insert into reserved $ collection", god || nsDollarCheck( ns ) );
-        uassert( 10094 , "invalid ns", strchr( ns , '.' ) > 0 );
+        massert( 10093 , "cannot insert into reserved $ collection", god || isANormalNSName( ns ) );
+        uassert( 10094 , str::stream() << "invalid ns: " << ns , isValidNS( ns ) );
        const char *sys = strstr(ns, "system.");
        if ( sys ) {
            uassert( 10095 , "attempt to insert in reserved database name 'system'", sys != ns);
@@ -1411,7 +1497,7 @@ namespace mongo {
           also if this is an addIndex, those checks should happen before this!
        */
        // This may create first file in the database.
-        cc().database()->allocExtent(ns, initialExtentSize(len), false);
+        cc().database()->allocExtent(ns, Extent::initialSize(len), false);
        d = nsdetails(ns);
        if ( !god )
            ensureIdIndexForNewNs(ns);
@@ -1421,17 +1507,24 @@ namespace mongo {
        NamespaceDetails *tableToIndex = 0;
 
        string tabletoidxns;
+        BSONObj fixedIndexObject;
        if ( addIndex ) {
            assert( obuf );
            BSONObj io((const char *) obuf);
-            if( !prepareToBuildIndex(io, god, tabletoidxns, tableToIndex) )
+            if( !prepareToBuildIndex(io, god, tabletoidxns, tableToIndex, fixedIndexObject ) )
                return DiskLoc();
+
+            if ( ! fixedIndexObject.isEmpty() ) {
+                obuf = fixedIndexObject.objdata();
+                len = fixedIndexObject.objsize();
+            }
+
        }
 
        const BSONElement *newId = &writeId;
        int addID = 0;
        if( !god ) {
-            /* Check if we have an _id field. If we don't, we'll add it.
+            /* Check if we have an _id field. If we don't, we'll add it.
               Note that btree buckets which we insert aren't BSONObj's, but in that case god==true.
            */
            BSONObj io((const char *) obuf);
@@ -1446,7 +1539,7 @@ namespace mongo {
            }
            len += newId->size();
        }
-
+
        BSONElementManipulator::lookForTimestamps( io );
    }
@@ -1456,28 +1549,28 @@ namespace mongo {
        if ( lenWHdr == 0 ) {
            // old datafiles, backward compatible here.
            assert( d->paddingFactor == 0 );
-            d->paddingFactor = 1.0;
+            *getDur().writing(&d->paddingFactor) = 1.0;
            lenWHdr = len + Record::HeaderSize;
        }
-
+
        // If the collection is capped, check if the new object will violate a unique index
        // constraint before allocating space.
        if ( d->nIndexes && d->capped && !god ) {
            checkNoIndexConflicts( d, BSONObj( reinterpret_cast<const char *>( obuf ) ) );
        }
-
+
        DiskLoc loc = d->alloc(ns, lenWHdr, extentLoc);
        if ( loc.isNull() ) {
            // out of space
            if ( d->capped == 0 ) { // size capped doesn't grow
                log(1) << "allocating new extent for " << ns << " padding:" << d->paddingFactor << " lenWHdr: " << lenWHdr << endl;
-                cc().database()->allocExtent(ns, followupExtentSize(lenWHdr, d->lastExtentSize), false);
+                cc().database()->allocExtent(ns, Extent::followupSize(lenWHdr, d->lastExtentSize), false);
                loc = d->alloc(ns, lenWHdr, extentLoc);
-                if ( loc.isNull() ){
+                if ( loc.isNull() ) {
                    log() << "WARNING: alloc() failed after allocating new extent. lenWHdr: " << lenWHdr << " last extent size:" << d->lastExtentSize << "; trying again\n";
-                    for ( int zzz=0; zzz<10 && lenWHdr > d->lastExtentSize; zzz++ ){
+                    for ( int zzz=0; zzz<10 && lenWHdr > d->lastExtentSize; zzz++ ) {
                        log() << "try #" << zzz << endl;
-                        cc().database()->allocExtent(ns, followupExtentSize(len, d->lastExtentSize), false);
+                        cc().database()->allocExtent(ns, Extent::followupSize(len, d->lastExtentSize), false);
                        loc = d->alloc(ns, lenWHdr, extentLoc);
                        if ( ! loc.isNull() )
                            break;
@@ -1492,45 +1585,55 @@ namespace mongo {
        }
 
        Record *r = loc.rec();
-        assert( r->lengthWithHeaders >= lenWHdr );
-        if( addID ) {
-            /* a little effort was made here to avoid a double copy when we add an ID */
-            ((int&)*r->data) = *((int*) obuf) + newId->size();
-            memcpy(r->data+4, newId->rawdata(), newId->size());
-            memcpy(r->data+4+newId->size(), ((char *)obuf)+4, addID-4);
-        }
-        else {
-            if( obuf )
-                memcpy(r->data, obuf, len);
-        }
-        Extent *e = r->myExtent(loc);
-        if ( e->lastRecord.isNull() ) {
-            e->firstRecord = e->lastRecord = loc;
-            r->prevOfs = r->nextOfs = DiskLoc::NullOfs;
+        {
+            assert( r->lengthWithHeaders >= lenWHdr );
+            r = (Record*) getDur().writingPtr(r, lenWHdr);
+            if( addID ) {
+                /* a little effort was made here to avoid a double copy when we add an ID */
+                ((int&)*r->data) = *((int*) obuf) + newId->size();
+                memcpy(r->data+4, newId->rawdata(), newId->size());
+                memcpy(r->data+4+newId->size(), ((char *)obuf)+4, addID-4);
+            }
+            else {
+                if( obuf )
+                    memcpy(r->data, obuf, len);
+            }
        }
-        else {
-            Record *oldlast = e->lastRecord.rec();
-            r->prevOfs = e->lastRecord.getOfs();
-            r->nextOfs = DiskLoc::NullOfs;
-            oldlast->nextOfs = loc.getOfs();
-            e->lastRecord = loc;
+        {
+            Extent *e = r->myExtent(loc);
+            if ( e->lastRecord.isNull() ) {
+                Extent::FL *fl = getDur().writing(e->fl());
+                fl->firstRecord = fl->lastRecord = loc;
+                r->prevOfs = r->nextOfs = DiskLoc::NullOfs;
+            }
+            else {
+                Record *oldlast = e->lastRecord.rec();
+                r->prevOfs = e->lastRecord.getOfs();
+                r->nextOfs = DiskLoc::NullOfs;
+                getDur().writingInt(oldlast->nextOfs) = loc.getOfs();
+                getDur().writingDiskLoc(e->lastRecord) = loc;
+            }
        }
 
-        d->nrecords++;
-        d->datasize += r->netLength();
+        /* durability todo : this could be a bit annoying / slow to record constantly */
+        {
+            NamespaceDetails::Stats *s = getDur().writing(&d->stats);
+            s->datasize += r->netLength();
+            s->nrecords++;
+        }
 
        // we don't bother clearing those stats for the god tables - also god is true when adidng a btree bucket
        if ( !god )
            NamespaceDetailsTransient::get_w( ns ).notifyOfWriteOp();
-
+
        if ( tableToIndex ) {
            uassert( 13143 , "can't create index on system.indexes" , tabletoidxns.find( ".system.indexes" ) == string::npos );
 
            BSONObj info = loc.obj();
            bool background = info["background"].trueValue();
-            if( background && cc().isSyncThread() ) {
-                /* don't do background indexing on slaves.  there are nuances.  this could be added later
+            if( background && cc().isSyncThread() ) {
+                /* don't do background indexing on slaves.  there are nuances.  this could be added later
                   but requires more code.
                */
                log() << "info: indexing in foreground on this replica; was a background index build on the primary" << endl;
@@ -1539,10 +1642,11 @@ namespace mongo {
            int idxNo = tableToIndex->nIndexes;
            IndexDetails& idx = tableToIndex->addIndex(tabletoidxns.c_str(), !background); // clear transient info caches so they refresh; increments nIndexes
-            idx.info = loc;
+            getDur().writingDiskLoc(idx.info) = loc;
            try {
                buildAnIndex(tabletoidxns, tableToIndex, idx, idxNo, background);
-            } catch( DBException& e ) {
+            }
+            catch( DBException& e ) {
                // save our error msg string as an exception or dropIndexes will overwrite our message
                LastError *le = lastError.get();
                int savecode = 0;
@@ -1564,7 +1668,7 @@ namespace mongo {
                if( !ok ) {
                    log() << "failed to drop index after a unique key error building it: " << errmsg << ' ' << tabletoidxns << ' ' << name << endl;
                }
-
+
                assert( le && !saveerrmsg.empty() );
                raiseError(savecode,saveerrmsg.c_str());
                throw;
@@ -1573,20 +1677,20 @@ namespace mongo {
 
        /* add this record to our indexes */
        if ( d->nIndexes ) {
-            try {
+            try {
                BSONObj obj(r->data);
                indexRecord(d, obj, loc);
-            }
-            catch( AssertionException& e ) {
+            }
+            catch( AssertionException& e ) {
                // should be a dup key error on _id index
                if( tableToIndex || d->capped ) {
                    massert( 12583, "unexpected index insertion failure on capped collection", !d->capped );
                    string s = e.toString();
                    s += " : on addIndex/capped - collection and its index will not match";
                    uassert_nothrow(s.c_str());
-                    log() << s << '\n';
+                    error() << s << endl;
                }
-                else {
+                else {
                    // normal case -- we can roll back
                    _deleteRecord(d, ns, r, loc);
                    throw;
@@ -1594,7 +1698,7 @@ namespace mongo {
            }
        }
 
-        //  out() << "   inserted at loc:" << hex << loc.getOfs() << " lenwhdr:" << hex << lenWHdr << dec << ' ' << ns << endl;
+        //  out() << "   inserted at loc:" << hex << loc.getOfs() << " lenwhdr:" << hex << lenWHdr << dec << ' ' << ns << endl;
        return loc;
    }
@@ -1619,18 +1723,27 @@ namespace mongo {
        Extent *e = r->myExtent(loc);
        if ( e->lastRecord.isNull() ) {
-            e->firstRecord = e->lastRecord = loc;
-            r->prevOfs = r->nextOfs = DiskLoc::NullOfs;
+            Extent::FL *fl = getDur().writing( e->fl() );
+            fl->firstRecord = fl->lastRecord = loc;
+
+            Record::NP *np = getDur().writing(r->np());
+            np->nextOfs = np->prevOfs = DiskLoc::NullOfs;
        }
        else {
            Record *oldlast = e->lastRecord.rec();
-            r->prevOfs = e->lastRecord.getOfs();
-            r->nextOfs = DiskLoc::NullOfs;
-            oldlast->nextOfs = loc.getOfs();
-            e->lastRecord = loc;
+            Record::NP *np = getDur().writing(r->np());
+            np->prevOfs = e->lastRecord.getOfs();
+            np->nextOfs = DiskLoc::NullOfs;
+            getDur().writingInt( oldlast->nextOfs ) = loc.getOfs();
+            e->lastRecord.writing() = loc;
        }
 
-        d->nrecords++;
+        /* todo: don't update for oplog? seems wasteful. */
+        {
+            NamespaceDetails::Stats *s = getDur().writing(&d->stats);
+            s->datasize += r->netLength();
+            s->nrecords++;
+        }
 
        return r;
    }
@@ -1641,7 +1754,7 @@
 
 namespace mongo {
 
-    void dropAllDatabasesExceptLocal() {
+    void dropAllDatabasesExceptLocal() {
        writelock lk("");
        vector<string> n;
@@ -1658,14 +1771,17 @@ namespace mongo {
    void dropDatabase(string db) {
        log(1) << "dropDatabase " << db << endl;
-        assert( cc().database() );
-        assert( cc().database()->name == db );
+        Database *d = cc().database();
+        assert( d );
+        assert( d->name == db );
 
-        BackgroundOperation::assertNoBgOpInProgForDb(db.c_str());
+        BackgroundOperation::assertNoBgOpInProgForDb(d->name.c_str());
 
-        Client::invalidateDB( db );
+        getDur().syncDataAndTruncateJournal();
+
+        Database::closeDatabase( d->name.c_str(), d->path );
+        d = 0; // d is now deleted
 
-        closeDatabase( db.c_str() );
        _deleteDataFiles( db.c_str() );
    }
@@ -1674,13 +1790,14 @@ namespace mongo {
    void boostRenameWrapper( const Path &from, const Path &to ) {
        try {
            boost::filesystem::rename( from, to );
-        } catch ( const boost::filesystem::filesystem_error & ) {
+        }
+        catch ( const boost::filesystem::filesystem_error & ) {
            // boost rename doesn't work across partitions
            boost::filesystem::copy_file( from, to);
            boost::filesystem::remove( from );
        }
    }
-
+
    // back up original database files to 'temp' dir
    void _renameForBackup( const char *database, const Path &reservedPath ) {
        Path newPath( reservedPath );
@@ -1738,7 +1855,8 @@ namespace mongo {
            ss << prefix << "_repairDatabase_" << i++;
            reservedPath = repairPath / ss.str();
            BOOST_CHECK_EXCEPTION( exists = boost::filesystem::exists( reservedPath ) );
-        } while ( exists );
+        }
+        while ( exists );
        return reservedPath;
    }
@@ -1790,12 +1908,15 @@ namespace mongo {
        stringstream ss;
        ss << "localhost:" << cmdLine.port;
        string localhost = ss.str();
-
+
        problem() << "repairDatabase " << dbName << endl;
        assert( cc().database()->name == dbName );
+        assert( cc().database()->path == dbpath );
 
        BackgroundOperation::assertNoBgOpInProgForDb(dbName);
 
+        getDur().syncDataAndTruncateJournal(); // Must be done before and after repair
+
        boost::intmax_t totalSize = dbSize( dbName );
        boost::intmax_t freeSize = freeSpace( repairpath );
        if ( freeSize > -1 && freeSize < totalSize ) {
@@ -1812,30 +1933,37 @@ namespace mongo {
                                          "backup" : "$tmp" );
        BOOST_CHECK_EXCEPTION( boost::filesystem::create_directory( reservedPath ) );
        string reservedPathString = reservedPath.native_directory_string();
-
+
        bool res;
-        { // clone to temp location, which effectively does repair
+        {
+            // clone to temp location, which effectively does repair
            Client::Context ctx( dbName, reservedPathString );
            assert( ctx.justCreated() );
-
-            res = cloneFrom(localhost.c_str(), errmsg, dbName,
-                            /*logForReplication=*/false, /*slaveok*/false, /*replauth*/false, /*snapshot*/false);
-            closeDatabase( dbName, reservedPathString.c_str() );
+
+            res = cloneFrom(localhost.c_str(), errmsg, dbName,
+                            /*logForReplication=*/false, /*slaveok*/false, /*replauth*/false, /*snapshot*/false);
+            Database::closeDatabase( dbName, reservedPathString.c_str() );
        }
 
        if ( !res ) {
            problem() << "clone failed for " << dbName << " with error: " << errmsg << endl;
            if ( !preserveClonedFilesOnFailure )
                BOOST_CHECK_EXCEPTION( boost::filesystem::remove_all( reservedPath ) );
+
+            getDur().syncDataAndTruncateJournal(); // Must be done before and after repair
+
            return false;
        }
 
+        MongoFile::flushAll(true);
+
        Client::Context ctx( dbName );
-        closeDatabase( dbName );
+        Database::closeDatabase( dbName, dbpath );
 
        if ( backupOriginalFiles ) {
            _renameForBackup( dbName, reservedPath );
-        } else {
+        }
+        else {
            _deleteDataFiles( dbName );
            BOOST_CHECK_EXCEPTION( boost::filesystem::create_directory( Path( dbpath ) / dbName ) );
        }
@@ -1845,12 +1973,14 @@ namespace mongo {
        if ( !backupOriginalFiles )
            BOOST_CHECK_EXCEPTION( boost::filesystem::remove_all( reservedPath ) );
 
+        getDur().syncDataAndTruncateJournal(); // Must be done before and after repair
+
        return true;
    }
 
    void _applyOpToDataFiles( const char *database, FileOp &fo, bool afterAllocator, const string& path ) {
        if ( afterAllocator )
-            theFileAllocator().waitUntilFinished();
+            FileAllocator::get()->waitUntilFinished();
        string c = database;
        c += '.';
        boost::filesystem::path p(path);
@@ -1871,8 +2001,8 @@ namespace mongo {
                q = p / ss.str();
                BOOST_CHECK_EXCEPTION( ok = fo.apply(q) );
                if ( ok ) {
-                    if ( extra != 10 ){
-                        log(1) << fo.op() << " file " << q.string() << '\n';
+                    if ( extra != 10 ) {
+                        log(1) << fo.op() << " file " << q.string() << endl;
                        log() << "  _applyOpToDataFiles() warning: extra == " << extra << endl;
                    }
                }
@@ -1883,19 +2013,20 @@ namespace mongo {
    }
 
    NamespaceDetails* nsdetails_notinline(const char *ns) { return nsdetails(ns); }
-
-    bool DatabaseHolder::closeAll( const string& path , BSONObjBuilder& result , bool force ){
+
+    bool DatabaseHolder::closeAll( const string& path , BSONObjBuilder& result , bool force ) {
        log() << "DatabaseHolder::closeAll path:" << path << endl;
        dbMutex.assertWriteLocked();
-
+
        map<string,Database*>& m = _paths[path];
        _size -= m.size();
-
+
        set< string > dbs;
        for ( map<string,Database*>::iterator i = m.begin(); i != m.end(); i++ ) {
+            wassert( i->second->path == path );
            dbs.insert( i->first );
        }
-
+
        currentClient.get()->getContext()->clear();
 
        BSONObjBuilder bb( result.subarrayStart( "dbs" ) );
@@ -1910,7 +2041,7 @@ namespace mongo {
                nNotClosed++;
            }
            else {
-                closeDatabase( name.c_str() , path );
+                Database::closeDatabase( name.c_str() , path );
                bb.append( bb.numStr( n++ ) , name );
            }
        }
@@ -1923,6 +2054,17 @@ namespace mongo {
        return true;
    }
-
+
+    bool isValidNS( const StringData& ns ) {
+        // TODO: should check for invalid characters
+
+        const char * x = strchr( ns.data() , '.' );
+        if ( ! x )
+            return false;
+
+        x++;
+        return *x > 0;
+    }
+
} // namespace mongo
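
A note on the pattern that dominates this diff: direct stores into memory-mapped file data (d->max = mx, e->xprev = ..., memcpy into records) are replaced by stores routed through getDur().writing() / writingInt() / writingDiskLoc() / writingPtr(), which declare a write intent so the durability layer can journal the modified range before (or as) it changes. The following is a minimal sketch of that idea; DurLayer, Intent, and the journaling strategy are illustrative assumptions, and only the call shapes are taken from the diff itself:

    // Sketch of a write-intent layer like the one this diff introduces.
    // DurLayer and Intent are hypothetical stand-ins, not mongo's dur API.
    #include <cstring>
    #include <vector>

    class DurLayer {
    public:
        // Declare intent to modify [p, p+len), then hand back a writable pointer.
        // A real implementation would log the range so recovery can redo it.
        void* writingPtr(void* p, unsigned len) {
            _intents.push_back(Intent(p, len)); // remember the range for the journal
            return p;                           // writable view == shared view in this sketch
        }
        template <typename T>
        T* writing(T* p) { return static_cast<T*>(writingPtr(p, sizeof(T))); }
        int& writingInt(int& x) { return *writing(&x); }
    private:
        struct Intent {
            void* addr; unsigned len;
            Intent(void* a, unsigned l) : addr(a), len(l) {}
        };
        std::vector<Intent> _intents;
    };

    DurLayer& getDur() { static DurLayer d; return d; }

    int main() {
        int maxDocs = 0;
        getDur().writingInt(maxDocs) = 42;  // same shape as getDur().writingInt( d->max ) = mx;
        char record[64];
        memset(getDur().writingPtr(record, sizeof(record)), 0, sizeof(record));
        return 0;
    }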
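The extent-sizing change is easier to see outside the diff: initialSize() scales the record length by 16x (64x for small records) and rounds down to a 256-byte boundary, while followupSize() grows from the last extent (4x, or 1.2x past 4 MB), treating sz < lastExtentLen as a signed-overflow signal and clamping to the maximum. A standalone sketch of that arithmetic; kMaxExtent stands in for Extent::maxSize(), whose exact value is an assumption here:

    #include <cassert>

    const int kMaxExtent = 0x7ff00000;  // assumed stand-in for Extent::maxSize()

    int initialSize(int len) {
        long long sz = len * 16;
        if ( len < 1000 ) sz = len * 64;       // small records get a bigger factor
        if ( sz > 1000000000 ) sz = 1000000000;
        int z = ((int)sz) & 0xffffff00;        // round down to a 256-byte boundary
        assert( z > len );
        return z;
    }

    int followupSize(int len, int lastExtentLen) {
        int x = initialSize(len);
        int y = (int) (lastExtentLen < 4000000 ? lastExtentLen * 4.0 : lastExtentLen * 1.2);
        int sz = y > x ? y : x;
        if ( sz < lastExtentLen )  // int overflow in the growth computation
            sz = kMaxExtent;
        else if ( sz > kMaxExtent )
            sz = kMaxExtent;
        sz &= 0xffffff00;
        assert( sz > len );
        return sz;
    }

    int main() {
        int first = initialSize(500);          // 500 * 64, rounded down: 31744
        int next  = followupSize(500, first);  // grows roughly 4x from the last extent
        assert( next >= first );
        return 0;
    }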
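The RecoverableIndexState class added in this diff is a plain RAII guard: the constructor publishes the in-progress flag and hides the newest index slot (nIndexes--), and the destructor reverses both even when the build throws. A stand-alone restatement of that shape, with a hypothetical Details struct in place of NamespaceDetails; the real class additionally routes both fields through getDur().writingInt() and a DESTRUCTOR_GUARD macro:

    #include <cassert>

    struct Details {               // hypothetical stand-in for NamespaceDetails
        int nIndexes;
        int indexBuildInProgress;
    };

    // RAII guard: while alive, the newest index slot is hidden from readers
    // and the in-progress flag is visible; both are restored on any exit path.
    class IndexBuildGuard {
    public:
        explicit IndexBuildGuard(Details* d) : _d(d) {
            _d->indexBuildInProgress = 1;
            _d->nIndexes--;
        }
        ~IndexBuildGuard() {
            _d->nIndexes++;
            _d->indexBuildInProgress = 0;
        }
    private:
        Details* _d;
    };

    void buildIndex(Details& d, bool fail) {
        IndexBuildGuard guard(&d);
        if (fail) throw 1;         // guard still restores the counters
        // ... index build work would go here ...
    }

    int main() {
        Details d = { 3, 0 };
        try { buildIndex(d, true); } catch (int) {}
        assert( d.nIndexes == 3 && d.indexBuildInProgress == 0 );
        return 0;
    }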
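Finally, the new isValidNS() helper that backs the reworded uassert 10094 only checks that the namespace contains a dot with at least one character after it; its own TODO notes that character-level validation is still missing. The same logic restated over a plain C string instead of mongo's StringData:

    #include <cassert>
    #include <cstring>

    // Mirrors the isValidNS() added at the end of the diff.
    bool isValidNS(const char* ns) {
        const char* x = strchr(ns, '.');
        if (!x)
            return false;
        x++;
        return *x > 0;   // something must follow the dot
    }

    int main() {
        assert( isValidNS("test.foo") );
        assert( !isValidNS("test") );    // no dot
        assert( !isValidNS("test.") );   // nothing after the dot
        return 0;
    }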