diff options
Diffstat (limited to 'db/btree.cpp')
-rw-r--r-- | db/btree.cpp | 1198 |
1 files changed, 614 insertions, 584 deletions
diff --git a/db/btree.cpp b/db/btree.cpp index 299c212..bf9926e 100644 --- a/db/btree.cpp +++ b/db/btree.cpp @@ -27,33 +27,23 @@ #include "curop-inl.h" #include "stats/counters.h" #include "dur_commitjob.h" +#include "btreebuilder.h" +#include "../util/unittest.h" namespace mongo { -#define VERIFYTHISLOC dassert( thisLoc.btree() == this ); + BOOST_STATIC_ASSERT( Record::HeaderSize == 16 ); + BOOST_STATIC_ASSERT( Record::HeaderSize + BtreeData_V1::BucketSize == 8192 ); - /** - * give us a writable version of the btree bucket (declares write intent). - * note it is likely more efficient to declare write intent on something smaller when you can. - */ - BtreeBucket* DiskLoc::btreemod() const { - assert( _a != -1 ); - BtreeBucket *b = const_cast< BtreeBucket * >( btree() ); - return static_cast< BtreeBucket* >( getDur().writingPtr( b, BucketSize ) ); - } +#define VERIFYTHISLOC dassert( thisLoc.btree<V>() == this ); - _KeyNode& _KeyNode::writing() const { - return *getDur().writing( const_cast< _KeyNode* >( this ) ); + template< class Loc > + __KeyNode<Loc> & __KeyNode<Loc>::writing() const { + return *getDur().writing( const_cast< __KeyNode<Loc> * >( this ) ); } - KeyNode::KeyNode(const BucketBasics& bb, const _KeyNode &k) : - prevChildBucket(k.prevChildBucket), - recordLoc(k.recordLoc), key(bb.data+k.keyDataOfs()) - { } - - // largest key size we allow. note we very much need to support bigger keys (somehow) in the future. - static const int KeyMax = BucketSize / 10; - + // BucketBasics::lowWaterMark() + // // We define this value as the maximum number of bytes such that, if we have // fewer than this many bytes, we must be able to either merge with or receive // keys from any neighboring node. If our utilization goes below this value we @@ -65,18 +55,15 @@ namespace mongo { // rebalancedSeparatorPos(). The conditions for lowWaterMark - 1 are as // follows: We know we cannot merge with the neighbor, so the total data size // for us, the neighbor, and the separator must be at least - // BtreeBucket::bodySize() + 1. We must be able to accept one key of any + // BtreeBucket<V>::bodySize() + 1. We must be able to accept one key of any // allowed size, so our size plus storage for that additional key must be - // <= BtreeBucket::bodySize() / 2. This way, with the extra key we'll have a + // <= BtreeBucket<V>::bodySize() / 2. This way, with the extra key we'll have a // new bucket data size < half the total data size and by the implementation // of rebalancedSeparatorPos() the key must be added. - static const int lowWaterMark = BtreeBucket::bodySize() / 2 - KeyMax - sizeof( _KeyNode ) + 1; static const int split_debug = 0; static const int insert_debug = 0; - extern int otherTraceLevel; - /** * this error is ok/benign when doing a background indexing -- that logic in pdfile checks explicitly * for the 10287 error code. @@ -88,47 +75,57 @@ namespace mongo { /* BucketBasics --------------------------------------------------- */ - void BucketBasics::assertWritable() { + template< class V > + void BucketBasics<V>::assertWritable() { if( cmdLine.dur ) - dur::assertAlreadyDeclared(this, sizeof(*this)); + dur::assertAlreadyDeclared(this, V::BucketSize); } - string BtreeBucket::bucketSummary() const { + template< class V > + string BtreeBucket<V>::bucketSummary() const { stringstream ss; ss << " Bucket info:" << endl; - ss << " n: " << n << endl; - ss << " parent: " << parent.toString() << endl; - ss << " nextChild: " << parent.toString() << endl; - ss << " flags:" << flags << endl; - ss << " emptySize: " << emptySize << " topSize: " << topSize << endl; + ss << " n: " << this->n << endl; + ss << " parent: " << this->parent.toString() << endl; + ss << " nextChild: " << this->parent.toString() << endl; + ss << " flags:" << this->flags << endl; + ss << " emptySize: " << this->emptySize << " topSize: " << this->topSize << endl; return ss.str(); } - int BucketBasics::Size() const { - assert( _wasSize == BucketSize ); - return BucketSize; + template< class V > + int BucketBasics<V>::Size() const { + return V::BucketSize; } - void BucketBasics::_shape(int level, stringstream& ss) const { + template< class V > + void BucketBasics<V>::_shape(int level, stringstream& ss) const { for ( int i = 0; i < level; i++ ) ss << ' '; ss << "*\n"; - for ( int i = 0; i < n; i++ ) - if ( !k(i).prevChildBucket.isNull() ) - k(i).prevChildBucket.btree()->_shape(level+1,ss); - if ( !nextChild.isNull() ) - nextChild.btree()->_shape(level+1,ss); + for ( int i = 0; i < this->n; i++ ) { + if ( !k(i).prevChildBucket.isNull() ) { + DiskLoc ll = k(i).prevChildBucket; + ll.btree<V>()->_shape(level+1,ss); + } + } + if ( !this->nextChild.isNull() ) { + DiskLoc ll = this->nextChild; + ll.btree<V>()->_shape(level+1,ss); + } } int bt_fv=0; int bt_dmp=0; - void BtreeBucket::dumpTree(const DiskLoc &thisLoc, const BSONObj &order) const { + template< class V > + void BtreeBucket<V>::dumpTree(const DiskLoc &thisLoc, const BSONObj &order) const { bt_dmp=1; fullValidate(thisLoc, order); bt_dmp=0; } - int BtreeBucket::fullValidate(const DiskLoc& thisLoc, const BSONObj &order, int *unusedCount, bool strict) const { + template< class V > + long long BtreeBucket<V>::fullValidate(const DiskLoc& thisLoc, const BSONObj &order, long long *unusedCount, bool strict, unsigned depth) const { { bool f = false; assert( f = true ); @@ -136,18 +133,18 @@ namespace mongo { } killCurrentOp.checkForInterrupt(); - assertValid(order, true); + this->assertValid(order, true); if ( bt_dmp ) { - out() << thisLoc.toString() << ' '; - ((BtreeBucket *) this)->dump(); + _log() << thisLoc.toString() << ' '; + ((BtreeBucket *) this)->dump(depth); } // keycount - int kc = 0; + long long kc = 0; - for ( int i = 0; i < n; i++ ) { - const _KeyNode& kn = k(i); + for ( int i = 0; i < this->n; i++ ) { + const _KeyNode& kn = this->k(i); if ( kn.isUsed() ) { kc++; @@ -159,25 +156,26 @@ namespace mongo { } if ( !kn.prevChildBucket.isNull() ) { DiskLoc left = kn.prevChildBucket; - const BtreeBucket *b = left.btree(); + const BtreeBucket *b = left.btree<V>(); if ( strict ) { assert( b->parent == thisLoc ); } else { wassert( b->parent == thisLoc ); } - kc += b->fullValidate(kn.prevChildBucket, order, unusedCount, strict); + kc += b->fullValidate(kn.prevChildBucket, order, unusedCount, strict, depth+1); } } - if ( !nextChild.isNull() ) { - const BtreeBucket *b = nextChild.btree(); + if ( !this->nextChild.isNull() ) { + DiskLoc ll = this->nextChild; + const BtreeBucket *b = ll.btree<V>(); if ( strict ) { assert( b->parent == thisLoc ); } else { wassert( b->parent == thisLoc ); } - kc += b->fullValidate(nextChild, order, unusedCount, strict); + kc += b->fullValidate(this->nextChild, order, unusedCount, strict, depth+1); } return kc; @@ -185,12 +183,17 @@ namespace mongo { int nDumped = 0; - void BucketBasics::assertValid(const Ordering &order, bool force) const { + template< class V > + void BucketBasics<V>::assertValid(const Ordering &order, bool force) const { if ( !debug && !force ) return; - wassert( n >= 0 && n < Size() ); - wassert( emptySize >= 0 && emptySize < BucketSize ); - wassert( topSize >= n && topSize <= BucketSize ); + { + int foo = this->n; + wassert( foo >= 0 && this->n < Size() ); + foo = this->emptySize; + wassert( foo >= 0 && this->emptySize < V::BucketSize ); + wassert( this->topSize >= this->n && this->topSize <= V::BucketSize ); + } // this is very slow so don't do often { @@ -201,26 +204,26 @@ namespace mongo { DEV { // slow: - for ( int i = 0; i < n-1; i++ ) { - BSONObj k1 = keyNode(i).key; - BSONObj k2 = keyNode(i+1).key; + for ( int i = 0; i < this->n-1; i++ ) { + Key k1 = keyNode(i).key; + Key k2 = keyNode(i+1).key; int z = k1.woCompare(k2, order); //OK if ( z > 0 ) { out() << "ERROR: btree key order corrupt. Keys:" << endl; if ( ++nDumped < 5 ) { - for ( int j = 0; j < n; j++ ) { + for ( int j = 0; j < this->n; j++ ) { out() << " " << keyNode(j).key.toString() << endl; } - ((BtreeBucket *) this)->dump(); + ((BtreeBucket<V> *) this)->dump(); } wassert(false); break; } else if ( z == 0 ) { if ( !(k(i).recordLoc < k(i+1).recordLoc) ) { - out() << "ERROR: btree key order corrupt (recordloc's wrong). Keys:" << endl; - out() << " k(" << i << "):" << keyNode(i).key.toString() << " RL:" << k(i).recordLoc.toString() << endl; - out() << " k(" << i+1 << "):" << keyNode(i+1).key.toString() << " RL:" << k(i+1).recordLoc.toString() << endl; + out() << "ERROR: btree key order corrupt (recordloc's wrong):" << endl; + out() << " k(" << i << ")" << keyNode(i).key.toString() << " RL:" << k(i).recordLoc.toString() << endl; + out() << " k(" << i+1 << ")" << keyNode(i+1).key.toString() << " RL:" << k(i+1).recordLoc.toString() << endl; wassert( k(i).recordLoc < k(i+1).recordLoc ); } } @@ -228,15 +231,15 @@ namespace mongo { } else { //faster: - if ( n > 1 ) { - BSONObj k1 = keyNode(0).key; - BSONObj k2 = keyNode(n-1).key; + if ( this->n > 1 ) { + Key k1 = keyNode(0).key; + Key k2 = keyNode(this->n-1).key; int z = k1.woCompare(k2, order); //wassert( z <= 0 ); if ( z > 0 ) { problem() << "btree keys out of order" << '\n'; ONCE { - ((BtreeBucket *) this)->dump(); + ((BtreeBucket<V> *) this)->dump(); } assert(false); } @@ -244,53 +247,59 @@ namespace mongo { } } - inline void BucketBasics::markUnused(int keypos) { - assert( keypos >= 0 && keypos < n ); + template< class V > + inline void BucketBasics<V>::markUnused(int keypos) { + assert( keypos >= 0 && keypos < this->n ); k(keypos).setUnused(); } - inline int BucketBasics::totalDataSize() const { - return (int) (Size() - (data-(char*)this)); + template< class V > + inline int BucketBasics<V>::totalDataSize() const { + return (int) (Size() - (this->data-(char*)this)); } - void BucketBasics::init() { - parent.Null(); - nextChild.Null(); - _wasSize = BucketSize; - _reserved1 = 0; - flags = Packed; - n = 0; - emptySize = totalDataSize(); - topSize = 0; - reserved = 0; + template< class V > + void BucketBasics<V>::init() { + this->_init(); + this->parent.Null(); + this->nextChild.Null(); + this->flags = Packed; + this->n = 0; + this->emptySize = totalDataSize(); + this->topSize = 0; } /** see _alloc */ - inline void BucketBasics::_unalloc(int bytes) { - topSize -= bytes; - emptySize += bytes; + template< class V > + inline void BucketBasics<V>::_unalloc(int bytes) { + this->topSize -= bytes; + this->emptySize += bytes; } /** * we allocate space from the end of the buffer for data. * the keynodes grow from the front. */ - inline int BucketBasics::_alloc(int bytes) { - topSize += bytes; - emptySize -= bytes; - int ofs = totalDataSize() - topSize; + template< class V > + inline int BucketBasics<V>::_alloc(int bytes) { + assert( this->emptySize >= bytes ); + this->topSize += bytes; + this->emptySize -= bytes; + int ofs = totalDataSize() - this->topSize; assert( ofs > 0 ); return ofs; } - void BucketBasics::_delKeyAtPos(int keypos, bool mayEmpty) { - assert( keypos >= 0 && keypos <= n ); + template< class V > + void BucketBasics<V>::_delKeyAtPos(int keypos, bool mayEmpty) { + // TODO This should be keypos < n + assert( keypos >= 0 && keypos <= this->n ); assert( childForPos(keypos).isNull() ); // TODO audit cases where nextChild is null - assert( ( mayEmpty && n > 0 ) || n > 1 || nextChild.isNull() ); - emptySize += sizeof(_KeyNode); - n--; - for ( int j = keypos; j < n; j++ ) + assert( ( mayEmpty && this->n > 0 ) || this->n > 1 || this->nextChild.isNull() ); + this->emptySize += sizeof(_KeyNode); + this->n--; + for ( int j = keypos; j < this->n; j++ ) k(j) = k(j+1); setNotPacked(); } @@ -299,38 +308,54 @@ namespace mongo { * pull rightmost key from the bucket. this version requires its right child to be null so it * does not bother returning that value. */ - void BucketBasics::popBack(DiskLoc& recLoc, BSONObj& key) { - massert( 10282 , "n==0 in btree popBack()", n > 0 ); - assert( k(n-1).isUsed() ); // no unused skipping in this function at this point - btreebuilder doesn't require that - KeyNode kn = keyNode(n-1); + template< class V > + void BucketBasics<V>::popBack(DiskLoc& recLoc, Key &key) { + massert( 10282 , "n==0 in btree popBack()", this->n > 0 ); + assert( k(this->n-1).isUsed() ); // no unused skipping in this function at this point - btreebuilder doesn't require that + KeyNode kn = keyNode(this->n-1); recLoc = kn.recordLoc; - key = kn.key; - int keysize = kn.key.objsize(); + key.assign(kn.key); + int keysize = kn.key.dataSize(); - massert( 10283 , "rchild not null in btree popBack()", nextChild.isNull()); + massert( 10283 , "rchild not null in btree popBack()", this->nextChild.isNull()); // weirdly, we also put the rightmost down pointer in nextchild, even when bucket isn't full. - nextChild = kn.prevChildBucket; + this->nextChild = kn.prevChildBucket; - n--; - emptySize += sizeof(_KeyNode); + this->n--; + // This is risky because the key we are returning points to this unalloc'ed memory, + // and we are assuming that the last key points to the last allocated + // bson region. + this->emptySize += sizeof(_KeyNode); _unalloc(keysize); } /** add a key. must be > all existing. be careful to set next ptr right. */ - bool BucketBasics::_pushBack(const DiskLoc recordLoc, const BSONObj& key, const Ordering &order, const DiskLoc prevChild) { - int bytesNeeded = key.objsize() + sizeof(_KeyNode); - if ( bytesNeeded > emptySize ) + template< class V > + bool BucketBasics<V>::_pushBack(const DiskLoc recordLoc, const Key& key, const Ordering &order, const DiskLoc prevChild) { + int bytesNeeded = key.dataSize() + sizeof(_KeyNode); + if ( bytesNeeded > this->emptySize ) return false; - assert( bytesNeeded <= emptySize ); - assert( n == 0 || keyNode(n-1).key.woCompare(key, order) <= 0 ); - emptySize -= sizeof(_KeyNode); - _KeyNode& kn = k(n++); + assert( bytesNeeded <= this->emptySize ); + if( this->n ) { + const KeyNode klast = keyNode(this->n-1); + if( klast.key.woCompare(key, order) > 0 ) { + log() << "btree bucket corrupt? consider reindexing or running validate command" << endl; + log() << " klast: " << keyNode(this->n-1).key.toString() << endl; + log() << " key: " << key.toString() << endl; + DEV klast.key.woCompare(key, order); + assert(false); + } + } + this->emptySize -= sizeof(_KeyNode); + _KeyNode& kn = k(this->n++); kn.prevChildBucket = prevChild; kn.recordLoc = recordLoc; - kn.setKeyDataOfs( (short) _alloc(key.objsize()) ); - char *p = dataAt(kn.keyDataOfs()); - memcpy(p, key.objdata(), key.objsize()); + kn.setKeyDataOfs( (short) _alloc(key.dataSize()) ); + short ofs = kn.keyDataOfs(); + char *p = dataAt(ofs); + memcpy(p, key.data(), key.dataSize()); + return true; } @@ -342,19 +367,20 @@ namespace mongo { /** insert a key in a bucket with no complexity -- no splits required @return false if a split is required. */ - bool BucketBasics::basicInsert(const DiskLoc thisLoc, int &keypos, const DiskLoc recordLoc, const BSONObj& key, const Ordering &order) const { - assert( keypos >= 0 && keypos <= n ); - int bytesNeeded = key.objsize() + sizeof(_KeyNode); - if ( bytesNeeded > emptySize ) { + template< class V > + bool BucketBasics<V>::basicInsert(const DiskLoc thisLoc, int &keypos, const DiskLoc recordLoc, const Key& key, const Ordering &order) const { + assert( keypos >= 0 && keypos <= this->n ); + int bytesNeeded = key.dataSize() + sizeof(_KeyNode); + if ( bytesNeeded > this->emptySize ) { _pack(thisLoc, order, keypos); - if ( bytesNeeded > emptySize ) + if ( bytesNeeded > this->emptySize ) return false; } BucketBasics *b; { const char *p = (const char *) &k(keypos); - const char *q = (const char *) &k(n+1); + const char *q = (const char *) &k(this->n+1); // declare that we will write to [k(keypos),k(n)] // todo: this writes a medium amount to the journal. we may want to add a verb "shift" to the redo log so // we can log a very small amount. @@ -364,39 +390,45 @@ namespace mongo { // 1 4 9 // -> // 1 4 _ 9 - for ( int j = n; j > keypos; j-- ) // make room + for ( int j = this->n; j > keypos; j-- ) // make room b->k(j) = b->k(j-1); } - getDur().declareWriteIntent(&b->emptySize, 12); // [b->emptySize..b->n] is 12 bytes and we are going to write those + getDur().declareWriteIntent(&b->emptySize, sizeof(this->emptySize)+sizeof(this->topSize)+sizeof(this->n)); b->emptySize -= sizeof(_KeyNode); b->n++; + // This _KeyNode was marked for writing above. _KeyNode& kn = b->k(keypos); kn.prevChildBucket.Null(); kn.recordLoc = recordLoc; - kn.setKeyDataOfs((short) b->_alloc(key.objsize()) ); + kn.setKeyDataOfs((short) b->_alloc(key.dataSize()) ); char *p = b->dataAt(kn.keyDataOfs()); - getDur().declareWriteIntent(p, key.objsize()); - memcpy(p, key.objdata(), key.objsize()); + getDur().declareWriteIntent(p, key.dataSize()); + memcpy(p, key.data(), key.dataSize()); return true; } - /** with this implementation, refPos == 0 disregards effect of refPos */ - bool BucketBasics::mayDropKey( int index, int refPos ) const { + /** + * With this implementation, refPos == 0 disregards effect of refPos. + * index > 0 prevents creation of an empty bucket. + */ + template< class V > + bool BucketBasics<V>::mayDropKey( int index, int refPos ) const { return index > 0 && ( index != refPos ) && k( index ).isUnused() && k( index ).prevChildBucket.isNull(); } - int BucketBasics::packedDataSize( int refPos ) const { - if ( flags & Packed ) { - return BucketSize - emptySize - headerSize(); + template< class V > + int BucketBasics<V>::packedDataSize( int refPos ) const { + if ( this->flags & Packed ) { + return V::BucketSize - this->emptySize - headerSize(); } int size = 0; - for( int j = 0; j < n; ++j ) { + for( int j = 0; j < this->n; ++j ) { if ( mayDropKey( j, refPos ) ) { continue; } - size += keyNode( j ).key.objsize() + sizeof( _KeyNode ); + size += keyNode( j ).key.dataSize() + sizeof( _KeyNode ); } return size; } @@ -405,8 +437,9 @@ namespace mongo { * when we delete things we just leave empty space until the node is * full and then we repack it. */ - void BucketBasics::_pack(const DiskLoc thisLoc, const Ordering &order, int &refPos) const { - if ( flags & Packed ) + template< class V > + void BucketBasics<V>::_pack(const DiskLoc thisLoc, const Ordering &order, int &refPos) const { + if ( this->flags & Packed ) return; VERIFYTHISLOC @@ -416,22 +449,23 @@ namespace mongo { declaration anyway within the group commit interval, in which case we would just be adding code and complexity without benefit. */ - thisLoc.btreemod()->_packReadyForMod(order, refPos); + thisLoc.btreemod<V>()->_packReadyForMod(order, refPos); } /** version when write intent already declared */ - void BucketBasics::_packReadyForMod( const Ordering &order, int &refPos ) { + template< class V > + void BucketBasics<V>::_packReadyForMod( const Ordering &order, int &refPos ) { assertWritable(); - if ( flags & Packed ) + if ( this->flags & Packed ) return; int tdz = totalDataSize(); - char temp[BucketSize]; + char temp[V::BucketSize]; int ofs = tdz; - topSize = 0; + this->topSize = 0; int i = 0; - for ( int j = 0; j < n; j++ ) { + for ( int j = 0; j < this->n; j++ ) { if( mayDropKey( j, refPos ) ) { continue; // key is unused and has no children - drop it } @@ -442,36 +476,40 @@ namespace mongo { k( i ) = k( j ); } short ofsold = k(i).keyDataOfs(); - int sz = keyNode(i).key.objsize(); + int sz = keyNode(i).key.dataSize(); ofs -= sz; - topSize += sz; + this->topSize += sz; memcpy(temp+ofs, dataAt(ofsold), sz); k(i).setKeyDataOfsSavingUse( ofs ); ++i; } - if ( refPos == n ) { + if ( refPos == this->n ) { refPos = i; } - n = i; + this->n = i; int dataUsed = tdz - ofs; - memcpy(data + ofs, temp + ofs, dataUsed); + memcpy(this->data + ofs, temp + ofs, dataUsed); // assertWritable(); // TEMP TEST getDur().declareWriteIntent(this, sizeof(*this)); - emptySize = tdz - dataUsed - n * sizeof(_KeyNode); - assert( emptySize >= 0 ); + this->emptySize = tdz - dataUsed - this->n * sizeof(_KeyNode); + { + int foo = this->emptySize; + assert( foo >= 0 ); + } setPacked(); assertValid( order ); } - inline void BucketBasics::truncateTo(int N, const Ordering &order, int &refPos) { + template< class V > + inline void BucketBasics<V>::truncateTo(int N, const Ordering &order, int &refPos) { dbMutex.assertWriteLocked(); assertWritable(); - n = N; + this->n = N; setNotPacked(); _packReadyForMod( order, refPos ); } @@ -489,19 +527,21 @@ namespace mongo { * We just have a simple algorithm right now: if a key includes the * halfway point (or 10% way point) in terms of bytes, split on that key; * otherwise split on the key immediately to the left of the halfway - * point. + * point (or 10% point). * * This function is expected to be called on a packed bucket. */ - int BucketBasics::splitPos( int keypos ) const { - assert( n > 2 ); + template< class V > + int BucketBasics<V>::splitPos( int keypos ) const { + assert( this->n > 2 ); int split = 0; int rightSize = 0; // when splitting a btree node, if the new key is greater than all the other keys, we should not do an even split, but a 90/10 split. // see SERVER-983 - int rightSizeLimit = ( topSize + sizeof( _KeyNode ) * n ) / ( keypos == n ? 10 : 2 ); - for( int i = n - 1; i > -1; --i ) { - rightSize += keyNode( i ).key.objsize() + sizeof( _KeyNode ); + // TODO I think we only want to do the 90% split on the rhs node of the tree. + int rightSizeLimit = ( this->topSize + sizeof( _KeyNode ) * this->n ) / ( keypos == this->n ? 10 : 2 ); + for( int i = this->n - 1; i > -1; --i ) { + rightSize += keyNode( i ).key.dataSize() + sizeof( _KeyNode ); if ( rightSize > rightSizeLimit ) { split = i; break; @@ -511,37 +551,40 @@ namespace mongo { if ( split < 1 ) { split = 1; } - else if ( split > n - 2 ) { - split = n - 2; + else if ( split > this->n - 2 ) { + split = this->n - 2; } return split; } - void BucketBasics::reserveKeysFront( int nAdd ) { - assert( emptySize >= int( sizeof( _KeyNode ) * nAdd ) ); - emptySize -= sizeof( _KeyNode ) * nAdd; - for( int i = n - 1; i > -1; --i ) { + template< class V > + void BucketBasics<V>::reserveKeysFront( int nAdd ) { + assert( this->emptySize >= int( sizeof( _KeyNode ) * nAdd ) ); + this->emptySize -= sizeof( _KeyNode ) * nAdd; + for( int i = this->n - 1; i > -1; --i ) { k( i + nAdd ) = k( i ); } - n += nAdd; + this->n += nAdd; } - void BucketBasics::setKey( int i, const DiskLoc recordLoc, const BSONObj &key, const DiskLoc prevChildBucket ) { + template< class V > + void BucketBasics<V>::setKey( int i, const DiskLoc recordLoc, const Key &key, const DiskLoc prevChildBucket ) { _KeyNode &kn = k( i ); kn.recordLoc = recordLoc; kn.prevChildBucket = prevChildBucket; - short ofs = (short) _alloc( key.objsize() ); + short ofs = (short) _alloc( key.dataSize() ); kn.setKeyDataOfs( ofs ); char *p = dataAt( ofs ); - memcpy( p, key.objdata(), key.objsize() ); + memcpy( p, key.data(), key.dataSize() ); } - void BucketBasics::dropFront( int nDrop, const Ordering &order, int &refpos ) { - for( int i = nDrop; i < n; ++i ) { + template< class V > + void BucketBasics<V>::dropFront( int nDrop, const Ordering &order, int &refpos ) { + for( int i = nDrop; i < this->n; ++i ) { k( i - nDrop ) = k( i ); } - n -= nDrop; + this->n -= nDrop; setNotPacked(); _packReadyForMod( order, refpos ); } @@ -549,10 +592,11 @@ namespace mongo { /* - BtreeBucket --------------------------------------------------- */ /** @return largest key in the subtree. */ - void BtreeBucket::findLargestKey(const DiskLoc& thisLoc, DiskLoc& largestLoc, int& largestKey) { + template< class V > + void BtreeBucket<V>::findLargestKey(const DiskLoc& thisLoc, DiskLoc& largestLoc, int& largestKey) { DiskLoc loc = thisLoc; while ( 1 ) { - const BtreeBucket *b = loc.btree(); + const BtreeBucket *b = loc.btree<V>(); if ( !b->nextChild.isNull() ) { loc = b->nextChild; continue; @@ -571,8 +615,16 @@ namespace mongo { * not have more keys than an unsigned variable has bits. The same * assumption is used in the implementation below with respect to the 'mask' * variable. + * + * @param l a regular bsonobj + * @param rBegin composed partly of an existing bsonobj, and the remaining keys are taken from a vector of elements that frequently changes + * + * see + * jstests/index_check6.js + * https://jira.mongodb.org/browse/SERVER-371 */ - int BtreeBucket::customBSONCmp( const BSONObj &l, const BSONObj &rBegin, int rBeginLen, bool rSup, const vector< const BSONElement * > &rEnd, const vector< bool > &rEndInclusive, const Ordering &o, int direction ) { + template< class V > + int BtreeBucket<V>::customBSONCmp( const BSONObj &l, const BSONObj &rBegin, int rBeginLen, bool rSup, const vector< const BSONElement * > &rEnd, const vector< bool > &rEndInclusive, const Ordering &o, int direction ) { BSONObjIterator ll( l ); BSONObjIterator rr( rBegin ); vector< const BSONElement * >::const_iterator rr2 = rEnd.begin(); @@ -610,31 +662,29 @@ namespace mongo { return 0; } - bool BtreeBucket::exists(const IndexDetails& idx, const DiskLoc &thisLoc, const BSONObj& key, const Ordering& order) const { - int pos; - bool found; - DiskLoc b = locate(idx, thisLoc, key, order, pos, found, minDiskLoc); + template< class V > + bool BtreeBucket<V>::exists(const IndexDetails& idx, const DiskLoc &thisLoc, const Key& key, const Ordering& order) const { + int pos; + bool found; + DiskLoc b = locate(idx, thisLoc, key, order, pos, found, minDiskLoc); - // skip unused keys - while ( 1 ) { - if( b.isNull() ) - break; - const BtreeBucket *bucket = b.btree(); - const _KeyNode& kn = bucket->k(pos); - if ( kn.isUsed() ) - return bucket->keyAt(pos).woEqual(key); - b = bucket->advance(b, pos, 1, "BtreeBucket::exists"); + // skip unused keys + while ( 1 ) { + if( b.isNull() ) + break; + const BtreeBucket *bucket = b.btree<V>(); + const _KeyNode& kn = bucket->k(pos); + if ( kn.isUsed() ) + return bucket->keyAt(pos).woEqual(key); + b = bucket->advance(b, pos, 1, "BtreeBucket<V>::exists"); } return false; } - /** - * @param self - don't complain about ourself already being in the index case. - * @return true = there is a duplicate. - */ - bool BtreeBucket::wouldCreateDup( + template< class V > + bool BtreeBucket<V>::wouldCreateDup( const IndexDetails& idx, const DiskLoc &thisLoc, - const BSONObj& key, const Ordering& order, + const Key& key, const Ordering& order, const DiskLoc &self) const { int pos; bool found; @@ -642,24 +692,25 @@ namespace mongo { while ( !b.isNull() ) { // we skip unused keys - const BtreeBucket *bucket = b.btree(); + const BtreeBucket *bucket = b.btree<V>(); const _KeyNode& kn = bucket->k(pos); if ( kn.isUsed() ) { if( bucket->keyAt(pos).woEqual(key) ) return kn.recordLoc != self; break; } - b = bucket->advance(b, pos, 1, "BtreeBucket::dupCheck"); + b = bucket->advance(b, pos, 1, "BtreeBucket<V>::dupCheck"); } return false; } - string BtreeBucket::dupKeyError( const IndexDetails& idx , const BSONObj& key ) { + template< class V > + string BtreeBucket<V>::dupKeyError( const IndexDetails& idx , const Key& key ) { stringstream ss; ss << "E11000 duplicate key error "; ss << "index: " << idx.indexNamespace() << " "; - ss << "dup key: " << key; + ss << "dup key: " << key.toString(); return ss.str(); } @@ -677,30 +728,20 @@ namespace mongo { * returns n if it goes after the last existing key. * note result might be an Unused location! */ - char foo; - bool BtreeBucket::find(const IndexDetails& idx, const BSONObj& key, const DiskLoc &recordLoc, const Ordering &order, int& pos, bool assertIfDup) const { -#if defined(_EXPERIMENT1) - { - char *z = (char *) this; - int i = 0; - while( 1 ) { - i += 4096; - if( i >= BucketSize ) - break; - foo += z[i]; - } - } -#endif - + template< class V > + bool BtreeBucket<V>::find(const IndexDetails& idx, const Key& key, const DiskLoc &rl, + const Ordering &order, int& pos, bool assertIfDup) const { + Loc recordLoc; + recordLoc = rl; globalIndexCounters.btree( (char*)this ); // binary search for this key bool dupsChecked = false; int l=0; - int h=n-1; + int h=this->n-1; while ( l <= h ) { int m = (l+h)/2; - KeyNode M = keyNode(m); + KeyNode M = this->keyNode(m); int x = key.woCompare(M.key, order); if ( x == 0 ) { if( assertIfDup ) { @@ -710,8 +751,8 @@ namespace mongo { // coding effort in here to make this particularly fast if( !dupsChecked ) { dupsChecked = true; - if( idx.head.btree()->exists(idx, idx.head, key, order) ) { - if( idx.head.btree()->wouldCreateDup(idx, idx.head, key, order, recordLoc) ) + if( idx.head.btree<V>()->exists(idx, idx.head, key, order) ) { + if( idx.head.btree<V>()->wouldCreateDup(idx, idx.head, key, order, recordLoc) ) uasserted( ASSERT_ID_DUPKEY , dupKeyError( idx , key ) ); else alreadyInIndex(); @@ -726,7 +767,7 @@ namespace mongo { } // dup keys allowed. use recordLoc as if it is part of the key - DiskLoc unusedRL = M.recordLoc; + Loc unusedRL = M.recordLoc; unusedRL.GETOFS() &= ~1; // so we can test equality without the used bit messing us up x = recordLoc.compare(unusedRL); } @@ -742,49 +783,59 @@ namespace mongo { } // not found pos = l; - if ( pos != n ) { - BSONObj keyatpos = keyNode(pos).key; + if ( pos != this->n ) { + Key keyatpos = keyNode(pos).key; wassert( key.woCompare(keyatpos, order) <= 0 ); if ( pos > 0 ) { - wassert( keyNode(pos-1).key.woCompare(key, order) <= 0 ); + if( !( keyNode(pos-1).key.woCompare(key, order) <= 0 ) ) { + DEV { + log() << key.toString() << endl; + log() << keyNode(pos-1).key.toString() << endl; + } + wassert(false); + } } } return false; } - void BtreeBucket::delBucket(const DiskLoc thisLoc, const IndexDetails& id) { + template< class V > + void BtreeBucket<V>::delBucket(const DiskLoc thisLoc, const IndexDetails& id) { ClientCursor::informAboutToDeleteBucket(thisLoc); // slow... assert( !isHead() ); - const BtreeBucket *p = parent.btree(); + DiskLoc ll = this->parent; + const BtreeBucket *p = ll.btree<V>(); int parentIdx = indexInParent( thisLoc ); p->childForPos( parentIdx ).writing().Null(); deallocBucket( thisLoc, id ); } - void BtreeBucket::deallocBucket(const DiskLoc thisLoc, const IndexDetails &id) { + template< class V > + void BtreeBucket<V>::deallocBucket(const DiskLoc thisLoc, const IndexDetails &id) { #if 0 // as a temporary defensive measure, we zap the whole bucket, AND don't truly delete // it (meaning it is ineligible for reuse). memset(this, 0, Size()); #else // defensive: - n = -1; - parent.Null(); + this->n = -1; + this->parent.Null(); string ns = id.indexNamespace(); theDataFileMgr._deleteRecord(nsdetails(ns.c_str()), ns.c_str(), thisLoc.rec(), thisLoc); #endif } /** note: may delete the entire bucket! this invalid upon return sometimes. */ - void BtreeBucket::delKeyAtPos( const DiskLoc thisLoc, IndexDetails& id, int p, const Ordering &order) { - assert(n>0); - DiskLoc left = childForPos(p); - - if ( n == 1 ) { - if ( left.isNull() && nextChild.isNull() ) { - _delKeyAtPos(p); + template< class V > + void BtreeBucket<V>::delKeyAtPos( const DiskLoc thisLoc, IndexDetails& id, int p, const Ordering &order) { + assert(this->n>0); + DiskLoc left = this->childForPos(p); + + if ( this->n == 1 ) { + if ( left.isNull() && this->nextChild.isNull() ) { + this->_delKeyAtPos(p); if ( isHead() ) { // we don't delete the top bucket ever } @@ -803,7 +854,7 @@ namespace mongo { } if ( left.isNull() ) { - _delKeyAtPos(p); + this->_delKeyAtPos(p); mayBalanceWithNeighbors( thisLoc, id, order ); } else { @@ -833,53 +884,71 @@ namespace mongo { * k by k', preserving the key's unused marking. This function is only * expected to mark a key as unused when handling a legacy btree. */ - void BtreeBucket::deleteInternalKey( const DiskLoc thisLoc, int keypos, IndexDetails &id, const Ordering &order ) { - DiskLoc lchild = childForPos( keypos ); - DiskLoc rchild = childForPos( keypos + 1 ); + template< class V > + void BtreeBucket<V>::deleteInternalKey( const DiskLoc thisLoc, int keypos, IndexDetails &id, const Ordering &order ) { + DiskLoc lchild = this->childForPos( keypos ); + DiskLoc rchild = this->childForPos( keypos + 1 ); assert( !lchild.isNull() || !rchild.isNull() ); int advanceDirection = lchild.isNull() ? 1 : -1; int advanceKeyOfs = keypos; DiskLoc advanceLoc = advance( thisLoc, advanceKeyOfs, advanceDirection, __FUNCTION__ ); - - if ( !advanceLoc.btree()->childForPos( advanceKeyOfs ).isNull() || - !advanceLoc.btree()->childForPos( advanceKeyOfs + 1 ).isNull() ) { + // advanceLoc must be a descentant of thisLoc, because thisLoc has a + // child in the proper direction and all descendants of thisLoc must be + // nonempty because they are not the root. + + if ( !advanceLoc.btree<V>()->childForPos( advanceKeyOfs ).isNull() || + !advanceLoc.btree<V>()->childForPos( advanceKeyOfs + 1 ).isNull() ) { // only expected with legacy btrees, see note above - markUnused( keypos ); + this->markUnused( keypos ); return; } - KeyNode kn = advanceLoc.btree()->keyNode( advanceKeyOfs ); - setInternalKey( thisLoc, keypos, kn.recordLoc, kn.key, order, childForPos( keypos ), childForPos( keypos + 1 ), id ); - advanceLoc.btreemod()->delKeyAtPos( advanceLoc, id, advanceKeyOfs, order ); + KeyNode kn = advanceLoc.btree<V>()->keyNode( advanceKeyOfs ); + // Because advanceLoc is a descendant of thisLoc, updating thisLoc will + // not affect packing or keys of advanceLoc and kn will be stable + // during the following setInternalKey() + setInternalKey( thisLoc, keypos, kn.recordLoc, kn.key, order, this->childForPos( keypos ), this->childForPos( keypos + 1 ), id ); + advanceLoc.btreemod<V>()->delKeyAtPos( advanceLoc, id, advanceKeyOfs, order ); } - void BtreeBucket::replaceWithNextChild( const DiskLoc thisLoc, IndexDetails &id ) { - assert( n == 0 && !nextChild.isNull() ); - if ( parent.isNull() ) { +//#define BTREE(loc) (static_cast<DiskLoc>(loc).btree<V>()) +#define BTREE(loc) (loc.template btree<V>()) +//#define BTREEMOD(loc) (static_cast<DiskLoc>(loc).btreemod<V>()) +#define BTREEMOD(loc) (loc.template btreemod<V>()) + + template< class V > + void BtreeBucket<V>::replaceWithNextChild( const DiskLoc thisLoc, IndexDetails &id ) { + assert( this->n == 0 && !this->nextChild.isNull() ); + if ( this->parent.isNull() ) { assert( id.head == thisLoc ); - id.head.writing() = nextChild; + id.head.writing() = this->nextChild; } else { - parent.btree()->childForPos( indexInParent( thisLoc ) ).writing() = nextChild; + DiskLoc ll = this->parent; + ll.btree<V>()->childForPos( indexInParent( thisLoc ) ).writing() = this->nextChild; } - nextChild.btree()->parent.writing() = parent; + BTREE(this->nextChild)->parent.writing() = this->parent; + + BTREE(this->nextChild)->parent.writing() = this->parent; + //(static_cast<DiskLoc>(this->nextChild).btree<V>())->parent.writing() = this->parent; ClientCursor::informAboutToDeleteBucket( thisLoc ); deallocBucket( thisLoc, id ); } - bool BtreeBucket::canMergeChildren( const DiskLoc &thisLoc, int leftIndex ) const { - assert( leftIndex >= 0 && leftIndex < n ); - DiskLoc leftNodeLoc = childForPos( leftIndex ); - DiskLoc rightNodeLoc = childForPos( leftIndex + 1 ); + template< class V > + bool BtreeBucket<V>::canMergeChildren( const DiskLoc &thisLoc, int leftIndex ) const { + assert( leftIndex >= 0 && leftIndex < this->n ); + DiskLoc leftNodeLoc = this->childForPos( leftIndex ); + DiskLoc rightNodeLoc = this->childForPos( leftIndex + 1 ); if ( leftNodeLoc.isNull() || rightNodeLoc.isNull() ) { // TODO if this situation is possible in long term implementation, maybe we should compact somehow anyway return false; } int pos = 0; { - const BtreeBucket *l = leftNodeLoc.btree(); - const BtreeBucket *r = rightNodeLoc.btree(); - if ( ( headerSize() + l->packedDataSize( pos ) + r->packedDataSize( pos ) + keyNode( leftIndex ).key.objsize() + sizeof(_KeyNode) > unsigned( BucketSize ) ) ) { + const BtreeBucket *l = leftNodeLoc.btree<V>(); + const BtreeBucket *r = rightNodeLoc.btree<V>(); + if ( ( this->headerSize() + l->packedDataSize( pos ) + r->packedDataSize( pos ) + keyNode( leftIndex ).key.dataSize() + sizeof(_KeyNode) > unsigned( V::BucketSize ) ) ) { return false; } } @@ -890,33 +959,34 @@ namespace mongo { * This implementation must respect the meaning and value of lowWaterMark. * Also see comments in splitPos(). */ - int BtreeBucket::rebalancedSeparatorPos( const DiskLoc &thisLoc, int leftIndex ) const { + template< class V > + int BtreeBucket<V>::rebalancedSeparatorPos( const DiskLoc &thisLoc, int leftIndex ) const { int split = -1; int rightSize = 0; - const BtreeBucket *l = childForPos( leftIndex ).btree(); - const BtreeBucket *r = childForPos( leftIndex + 1 ).btree(); + const BtreeBucket *l = BTREE(this->childForPos( leftIndex )); + const BtreeBucket *r = BTREE(this->childForPos( leftIndex + 1 )); int KNS = sizeof( _KeyNode ); - int rightSizeLimit = ( l->topSize + l->n * KNS + keyNode( leftIndex ).key.objsize() + KNS + r->topSize + r->n * KNS ) / 2; + int rightSizeLimit = ( l->topSize + l->n * KNS + keyNode( leftIndex ).key.dataSize() + KNS + r->topSize + r->n * KNS ) / 2; // This constraint should be ensured by only calling this function // if we go below the low water mark. - assert( rightSizeLimit < BtreeBucket::bodySize() ); + assert( rightSizeLimit < BtreeBucket<V>::bodySize() ); for( int i = r->n - 1; i > -1; --i ) { - rightSize += r->keyNode( i ).key.objsize() + KNS; + rightSize += r->keyNode( i ).key.dataSize() + KNS; if ( rightSize > rightSizeLimit ) { split = l->n + 1 + i; break; } } if ( split == -1 ) { - rightSize += keyNode( leftIndex ).key.objsize() + KNS; + rightSize += keyNode( leftIndex ).key.dataSize() + KNS; if ( rightSize > rightSizeLimit ) { split = l->n; } } if ( split == -1 ) { for( int i = l->n - 1; i > -1; --i ) { - rightSize += l->keyNode( i ).key.objsize() + KNS; + rightSize += l->keyNode( i ).key.dataSize() + KNS; if ( rightSize > rightSizeLimit ) { split = i; break; @@ -934,15 +1004,18 @@ namespace mongo { return split; } - void BtreeBucket::doMergeChildren( const DiskLoc thisLoc, int leftIndex, IndexDetails &id, const Ordering &order ) { - DiskLoc leftNodeLoc = childForPos( leftIndex ); - DiskLoc rightNodeLoc = childForPos( leftIndex + 1 ); - BtreeBucket *l = leftNodeLoc.btreemod(); - BtreeBucket *r = rightNodeLoc.btreemod(); + template< class V > + void BtreeBucket<V>::doMergeChildren( const DiskLoc thisLoc, int leftIndex, IndexDetails &id, const Ordering &order ) { + DiskLoc leftNodeLoc = this->childForPos( leftIndex ); + DiskLoc rightNodeLoc = this->childForPos( leftIndex + 1 ); + BtreeBucket *l = leftNodeLoc.btreemod<V>(); + BtreeBucket *r = rightNodeLoc.btreemod<V>(); int pos = 0; l->_packReadyForMod( order, pos ); r->_packReadyForMod( order, pos ); // pack r in case there are droppable keys + // We know the additional keys below will fit in l because canMergeChildren() + // must be true. int oldLNum = l->n; { KeyNode kn = keyNode( leftIndex ); @@ -955,10 +1028,10 @@ namespace mongo { l->nextChild = r->nextChild; l->fixParentPtrs( leftNodeLoc, oldLNum ); r->delBucket( rightNodeLoc, id ); - childForPos( leftIndex + 1 ) = leftNodeLoc; - childForPos( leftIndex ) = DiskLoc(); - _delKeyAtPos( leftIndex, true ); - if ( n == 0 ) { + this->childForPos( leftIndex + 1 ) = leftNodeLoc; + this->childForPos( leftIndex ) = DiskLoc(); + this->_delKeyAtPos( leftIndex, true ); + if ( this->n == 0 ) { // will trash this and thisLoc // TODO To ensure all leaves are of equal height, we should ensure // this is only called on the root. @@ -970,9 +1043,10 @@ namespace mongo { } } - int BtreeBucket::indexInParent( const DiskLoc &thisLoc ) const { - assert( !parent.isNull() ); - const BtreeBucket *p = parent.btree(); + template< class V > + int BtreeBucket<V>::indexInParent( const DiskLoc &thisLoc ) const { + assert( !this->parent.isNull() ); + const BtreeBucket *p = BTREE(this->parent); if ( p->nextChild == thisLoc ) { return p->n; } @@ -986,27 +1060,33 @@ namespace mongo { out() << "ERROR: can't find ref to child bucket.\n"; out() << "child: " << thisLoc << "\n"; dump(); - out() << "Parent: " << parent << "\n"; + out() << "Parent: " << this->parent << "\n"; p->dump(); assert(false); return -1; // just to compile } - bool BtreeBucket::tryBalanceChildren( const DiskLoc thisLoc, int leftIndex, IndexDetails &id, const Ordering &order ) const { + template< class V > + bool BtreeBucket<V>::tryBalanceChildren( const DiskLoc thisLoc, int leftIndex, IndexDetails &id, const Ordering &order ) const { // If we can merge, then we must merge rather than balance to preserve // bucket utilization constraints. if ( canMergeChildren( thisLoc, leftIndex ) ) { return false; } - thisLoc.btreemod()->doBalanceChildren( thisLoc, leftIndex, id, order ); + thisLoc.btreemod<V>()->doBalanceChildren( thisLoc, leftIndex, id, order ); return true; } - void BtreeBucket::doBalanceLeftToRight( const DiskLoc thisLoc, int leftIndex, int split, + template< class V > + void BtreeBucket<V>::doBalanceLeftToRight( const DiskLoc thisLoc, int leftIndex, int split, BtreeBucket *l, const DiskLoc lchild, BtreeBucket *r, const DiskLoc rchild, IndexDetails &id, const Ordering &order ) { // TODO maybe do some audits the same way pushBack() does? + // As a precondition, rchild + the old separator are <= half a body size, + // and lchild is at most completely full. Based on the value of split, + // rchild will get <= half of the total bytes which is at most 75% + // of a full body. So rchild will have room for the following keys: int rAdd = l->n - split; r->reserveKeysFront( rAdd ); for( int i = split + 1, j = 0; i < l->n; ++i, ++j ) { @@ -1021,16 +1101,26 @@ namespace mongo { { KeyNode kn = l->keyNode( split ); l->nextChild = kn.prevChildBucket; + // Because lchild is a descendant of thisLoc, updating thisLoc will + // not not affect packing or keys of lchild and kn will be stable + // during the following setInternalKey() setInternalKey( thisLoc, leftIndex, kn.recordLoc, kn.key, order, lchild, rchild, id ); } int zeropos = 0; + // lchild and rchild cannot be merged, so there must be >0 (actually more) + // keys to the left of split. l->truncateTo( split, order, zeropos ); } - void BtreeBucket::doBalanceRightToLeft( const DiskLoc thisLoc, int leftIndex, int split, + template< class V > + void BtreeBucket<V>::doBalanceRightToLeft( const DiskLoc thisLoc, int leftIndex, int split, BtreeBucket *l, const DiskLoc lchild, BtreeBucket *r, const DiskLoc rchild, IndexDetails &id, const Ordering &order ) { + // As a precondition, lchild + the old separator are <= half a body size, + // and rchild is at most completely full. Based on the value of split, + // lchild will get less than half of the total bytes which is at most 75% + // of a full body. So lchild will have room for the following keys: int lN = l->n; { KeyNode kn = keyNode( leftIndex ); @@ -1043,20 +1133,27 @@ namespace mongo { { KeyNode kn = r->keyNode( split - lN - 1 ); l->nextChild = kn.prevChildBucket; + // Child lN was lchild's old nextChild, and don't need to fix that one. l->fixParentPtrs( lchild, lN + 1, l->n ); + // Because rchild is a descendant of thisLoc, updating thisLoc will + // not affect packing or keys of rchild and kn will be stable + // during the following setInternalKey() setInternalKey( thisLoc, leftIndex, kn.recordLoc, kn.key, order, lchild, rchild, id ); } int zeropos = 0; + // lchild and rchild cannot be merged, so there must be >0 (actually more) + // keys to the right of split. r->dropFront( split - lN, order, zeropos ); } - void BtreeBucket::doBalanceChildren( const DiskLoc thisLoc, int leftIndex, IndexDetails &id, const Ordering &order ) { - DiskLoc lchild = childForPos( leftIndex ); - DiskLoc rchild = childForPos( leftIndex + 1 ); + template< class V > + void BtreeBucket<V>::doBalanceChildren( const DiskLoc thisLoc, int leftIndex, IndexDetails &id, const Ordering &order ) { + DiskLoc lchild = this->childForPos( leftIndex ); + DiskLoc rchild = this->childForPos( leftIndex + 1 ); int zeropos = 0; - BtreeBucket *l = lchild.btreemod(); + BtreeBucket *l = lchild.btreemod<V>(); l->_packReadyForMod( order, zeropos ); - BtreeBucket *r = rchild.btreemod(); + BtreeBucket *r = rchild.btreemod<V>(); r->_packReadyForMod( order, zeropos ); int split = rebalancedSeparatorPos( thisLoc, leftIndex ); @@ -1071,16 +1168,17 @@ namespace mongo { } } - bool BtreeBucket::mayBalanceWithNeighbors( const DiskLoc thisLoc, IndexDetails &id, const Ordering &order ) const { - if ( parent.isNull() ) { // we are root, there are no neighbors + template< class V > + bool BtreeBucket<V>::mayBalanceWithNeighbors( const DiskLoc thisLoc, IndexDetails &id, const Ordering &order ) const { + if ( this->parent.isNull() ) { // we are root, there are no neighbors return false; } - if ( packedDataSize( 0 ) >= lowWaterMark ) { + if ( this->packedDataSize( 0 ) >= this->lowWaterMark() ) { return false; } - const BtreeBucket *p = parent.btree(); + const BtreeBucket *p = BTREE(this->parent); int parentIdx = indexInParent( thisLoc ); // TODO will missing neighbor case be possible long term? Should we try to merge/balance somehow in that case if so? @@ -1091,21 +1189,21 @@ namespace mongo { // to preserve btree bucket utilization constraints since that's a more // heavy duty operation (especially if we must re-split later). if ( mayBalanceRight && - p->tryBalanceChildren( parent, parentIdx, id, order ) ) { + p->tryBalanceChildren( this->parent, parentIdx, id, order ) ) { return true; } if ( mayBalanceLeft && - p->tryBalanceChildren( parent, parentIdx - 1, id, order ) ) { + p->tryBalanceChildren( this->parent, parentIdx - 1, id, order ) ) { return true; } - BtreeBucket *pm = parent.btreemod(); + BtreeBucket *pm = BTREEMOD(this->parent); if ( mayBalanceRight ) { - pm->doMergeChildren( parent, parentIdx, id, order ); + pm->doMergeChildren( this->parent, parentIdx, id, order ); return true; } else if ( mayBalanceLeft ) { - pm->doMergeChildren( parent, parentIdx - 1, id, order ); + pm->doMergeChildren( this->parent, parentIdx - 1, id, order ); return true; } @@ -1113,64 +1211,70 @@ namespace mongo { } /** remove a key from the index */ - bool BtreeBucket::unindex(const DiskLoc thisLoc, IndexDetails& id, const BSONObj& key, const DiskLoc recordLoc ) const { + template< class V > + bool BtreeBucket<V>::unindex(const DiskLoc thisLoc, IndexDetails& id, const BSONObj& key, const DiskLoc recordLoc ) const { int pos; bool found; - DiskLoc loc = locate(id, thisLoc, key, Ordering::make(id.keyPattern()), pos, found, recordLoc, 1); + const Ordering ord = Ordering::make(id.keyPattern()); + DiskLoc loc = locate(id, thisLoc, key, ord, pos, found, recordLoc, 1); if ( found ) { - - if ( key.objsize() > KeyMax ) { + if ( key.objsize() > this->KeyMax ) { OCCASIONALLY problem() << "unindex: key too large to index but was found for " << id.indexNamespace() << " reIndex suggested" << endl; - } - - loc.btreemod()->delKeyAtPos(loc, id, pos, Ordering::make(id.keyPattern())); - + } + loc.btreemod<V>()->delKeyAtPos(loc, id, pos, ord); return true; } return false; } - BtreeBucket* BtreeBucket::allocTemp() { - BtreeBucket *b = (BtreeBucket*) malloc(BucketSize); + template< class V > + BtreeBucket<V> * BtreeBucket<V>::allocTemp() { + BtreeBucket *b = (BtreeBucket*) malloc(V::BucketSize); b->init(); return b; } - inline void BtreeBucket::fix(const DiskLoc thisLoc, const DiskLoc child) { + template< class V > + inline void BtreeBucket<V>::fix(const DiskLoc thisLoc, const DiskLoc child) { if ( !child.isNull() ) { if ( insert_debug ) - out() << " " << child.toString() << ".parent=" << thisLoc.toString() << endl; - child.btree()->parent.writing() = thisLoc; + out() << " fix " << child.toString() << ".parent=" << thisLoc.toString() << endl; + child.btree<V>()->parent.writing() = thisLoc; } } - /** this sucks. maybe get rid of parent ptrs. */ - void BtreeBucket::fixParentPtrs(const DiskLoc thisLoc, int firstIndex, int lastIndex) const { + /** + * This can cause a lot of additional page writes when we assign buckets to + * different parents. Maybe get rid of parent ptrs? + */ + template< class V > + void BtreeBucket<V>::fixParentPtrs(const DiskLoc thisLoc, int firstIndex, int lastIndex) const { VERIFYTHISLOC if ( lastIndex == -1 ) { - lastIndex = n; + lastIndex = this->n; } for ( int i = firstIndex; i <= lastIndex; i++ ) { - fix(thisLoc, childForPos(i)); + fix(thisLoc, this->childForPos(i)); } } - void BtreeBucket::setInternalKey( const DiskLoc thisLoc, int keypos, - const DiskLoc recordLoc, const BSONObj &key, const Ordering &order, + template< class V > + void BtreeBucket<V>::setInternalKey( const DiskLoc thisLoc, int keypos, + const DiskLoc recordLoc, const Key &key, const Ordering &order, const DiskLoc lchild, const DiskLoc rchild, IndexDetails &idx ) { - childForPos( keypos ).Null(); + this->childForPos( keypos ).Null(); // This may leave the bucket empty (n == 0) which is ok only as a // transient state. In the instant case, the implementation of // insertHere behaves correctly when n == 0 and as a side effect // increments n. - _delKeyAtPos( keypos, true ); + this->_delKeyAtPos( keypos, true ); // Ensure we do not orphan neighbor's old child. - assert( childForPos( keypos ) == rchild ); + assert( this->childForPos( keypos ) == rchild ); // Just set temporarily - required to pass validation in insertHere() - childForPos( keypos ) = lchild; + this->childForPos( keypos ) = lchild; insertHere( thisLoc, keypos, recordLoc, key, order, lchild, rchild, idx ); } @@ -1180,127 +1284,137 @@ namespace mongo { * @keypos - where to insert the key in range 0..n. 0=make leftmost, n=make rightmost. * NOTE this function may free some data, and as a result the value passed for keypos may * be invalid after calling insertHere() + * + * Some of the write intent signaling below relies on the implementation of + * the optimized write intent code in basicInsert(). */ - void BtreeBucket::insertHere( const DiskLoc thisLoc, int keypos, - const DiskLoc recordLoc, const BSONObj& key, const Ordering& order, + template< class V > + void BtreeBucket<V>::insertHere( const DiskLoc thisLoc, int keypos, + const DiskLoc recordLoc, const Key& key, const Ordering& order, const DiskLoc lchild, const DiskLoc rchild, IndexDetails& idx) const { if ( insert_debug ) out() << " " << thisLoc.toString() << ".insertHere " << key.toString() << '/' << recordLoc.toString() << ' ' << lchild.toString() << ' ' << rchild.toString() << " keypos:" << keypos << endl; - if ( !basicInsert(thisLoc, keypos, recordLoc, key, order) ) { - thisLoc.btreemod()->split(thisLoc, keypos, recordLoc, key, order, lchild, rchild, idx); + if ( !this->basicInsert(thisLoc, keypos, recordLoc, key, order) ) { + // If basicInsert() fails, the bucket will be packed as required by split(). + thisLoc.btreemod<V>()->split(thisLoc, keypos, recordLoc, key, order, lchild, rchild, idx); return; } { const _KeyNode *_kn = &k(keypos); _KeyNode *kn = (_KeyNode *) getDur().alreadyDeclared((_KeyNode*) _kn); // already declared intent in basicInsert() - if ( keypos+1 == n ) { // last key - if ( nextChild != lchild ) { + if ( keypos+1 == this->n ) { // last key + if ( this->nextChild != lchild ) { out() << "ERROR nextChild != lchild" << endl; out() << " thisLoc: " << thisLoc.toString() << ' ' << idx.indexNamespace() << endl; - out() << " keyPos: " << keypos << " n:" << n << endl; - out() << " nextChild: " << nextChild.toString() << " lchild: " << lchild.toString() << endl; + out() << " keyPos: " << keypos << " n:" << this->n << endl; + out() << " nextChild: " << this->nextChild.toString() << " lchild: " << lchild.toString() << endl; out() << " recordLoc: " << recordLoc.toString() << " rchild: " << rchild.toString() << endl; out() << " key: " << key.toString() << endl; dump(); assert(false); } - kn->prevChildBucket = nextChild; + kn->prevChildBucket = this->nextChild; assert( kn->prevChildBucket == lchild ); - nextChild.writing() = rchild; + this->nextChild.writing() = rchild; if ( !rchild.isNull() ) - rchild.btree()->parent.writing() = thisLoc; + BTREE(rchild)->parent.writing() = thisLoc; } else { kn->prevChildBucket = lchild; if ( k(keypos+1).prevChildBucket != lchild ) { out() << "ERROR k(keypos+1).prevChildBucket != lchild" << endl; out() << " thisLoc: " << thisLoc.toString() << ' ' << idx.indexNamespace() << endl; - out() << " keyPos: " << keypos << " n:" << n << endl; + out() << " keyPos: " << keypos << " n:" << this->n << endl; out() << " k(keypos+1).pcb: " << k(keypos+1).prevChildBucket.toString() << " lchild: " << lchild.toString() << endl; out() << " recordLoc: " << recordLoc.toString() << " rchild: " << rchild.toString() << endl; out() << " key: " << key.toString() << endl; dump(); assert(false); } - const DiskLoc *pc = &k(keypos+1).prevChildBucket; - *getDur().alreadyDeclared((DiskLoc*) pc) = rchild; // declared in basicInsert() + const Loc *pc = &k(keypos+1).prevChildBucket; + *getDur().alreadyDeclared( const_cast<Loc*>(pc) ) = rchild; // declared in basicInsert() if ( !rchild.isNull() ) - rchild.btree()->parent.writing() = thisLoc; + rchild.btree<V>()->parent.writing() = thisLoc; } return; } } - void BtreeBucket::split(const DiskLoc thisLoc, int keypos, const DiskLoc recordLoc, const BSONObj& key, const Ordering& order, const DiskLoc lchild, const DiskLoc rchild, IndexDetails& idx) { - assertWritable(); + template< class V > + void BtreeBucket<V>::split(const DiskLoc thisLoc, int keypos, const DiskLoc recordLoc, const Key& key, const Ordering& order, const DiskLoc lchild, const DiskLoc rchild, IndexDetails& idx) { + this->assertWritable(); if ( split_debug ) out() << " " << thisLoc.toString() << ".split" << endl; - int split = splitPos( keypos ); + int split = this->splitPos( keypos ); DiskLoc rLoc = addBucket(idx); - BtreeBucket *r = rLoc.btreemod(); + BtreeBucket *r = rLoc.btreemod<V>(); if ( split_debug ) - out() << " split:" << split << ' ' << keyNode(split).key.toString() << " n:" << n << endl; - for ( int i = split+1; i < n; i++ ) { + out() << " split:" << split << ' ' << keyNode(split).key.toString() << " this->n:" << this->n << endl; + for ( int i = split+1; i < this->n; i++ ) { KeyNode kn = keyNode(i); r->pushBack(kn.recordLoc, kn.key, order, kn.prevChildBucket); } - r->nextChild = nextChild; + r->nextChild = this->nextChild; r->assertValid( order ); if ( split_debug ) - out() << " new rLoc:" << rLoc.toString() << endl; + out() << " this->new rLoc:" << rLoc.toString() << endl; r = 0; - rLoc.btree()->fixParentPtrs(rLoc); + rLoc.btree<V>()->fixParentPtrs(rLoc); { KeyNode splitkey = keyNode(split); - nextChild = splitkey.prevChildBucket; // splitkey key gets promoted, its children will be thisLoc (l) and rLoc (r) + this->nextChild = splitkey.prevChildBucket; // splitkey key gets promoted, its children will be thisLoc (l) and rLoc (r) if ( split_debug ) { out() << " splitkey key:" << splitkey.key.toString() << endl; } - // promote splitkey to a parent node - if ( parent.isNull() ) { - // make a new parent if we were the root + // Because thisLoc is a descendant of parent, updating parent will + // not affect packing or keys of thisLoc and splitkey will be stable + // during the following: + + // promote splitkey to a parent this->node + if ( this->parent.isNull() ) { + // make a this->new this->parent if we were the root DiskLoc L = addBucket(idx); - BtreeBucket *p = L.btreemod(); + BtreeBucket *p = L.btreemod<V>(); p->pushBack(splitkey.recordLoc, splitkey.key, order, thisLoc); p->nextChild = rLoc; p->assertValid( order ); - parent = idx.head.writing() = L; + this->parent = idx.head.writing() = L; if ( split_debug ) - out() << " we were root, making new root:" << hex << parent.getOfs() << dec << endl; - rLoc.btree()->parent.writing() = parent; + out() << " we were root, making this->new root:" << hex << this->parent.getOfs() << dec << endl; + rLoc.btree<V>()->parent.writing() = this->parent; } else { // set this before calling _insert - if it splits it will do fixParent() logic and change the value. - rLoc.btree()->parent.writing() = parent; + rLoc.btree<V>()->parent.writing() = this->parent; if ( split_debug ) out() << " promoting splitkey key " << splitkey.key.toString() << endl; - parent.btree()->_insert(parent, splitkey.recordLoc, splitkey.key, order, /*dupsallowed*/true, thisLoc, rLoc, idx); + BTREE(this->parent)->_insert(this->parent, splitkey.recordLoc, splitkey.key, order, /*dupsallowed*/true, thisLoc, rLoc, idx); } } int newpos = keypos; // note this may trash splitkey.key. thus we had to promote it before finishing up here. - truncateTo(split, order, newpos); // note this may trash splitkey.key. thus we had to promote it before finishing up here. + this->truncateTo(split, order, newpos); - // add our new key, there is room now + // add our this->new key, there is room this->now { if ( keypos <= split ) { if ( split_debug ) - out() << " keypos<split, insertHere() the new key" << endl; + out() << " keypos<split, insertHere() the this->new key" << endl; insertHere(thisLoc, newpos, recordLoc, key, order, lchild, rchild, idx); } else { int kp = keypos-split-1; assert(kp>=0); - rLoc.btree()->insertHere(rLoc, kp, recordLoc, key, order, lchild, rchild, idx); + BTREE(rLoc)->insertHere(rLoc, kp, recordLoc, key, order, lchild, rchild, idx); } } @@ -1308,41 +1422,44 @@ namespace mongo { out() << " split end " << hex << thisLoc.getOfs() << dec << endl; } - /** start a new index off, empty */ - DiskLoc BtreeBucket::addBucket(const IndexDetails& id) { + /** start a this->new index off, empty */ + template< class V > + DiskLoc BtreeBucket<V>::addBucket(const IndexDetails& id) { string ns = id.indexNamespace(); - DiskLoc loc = theDataFileMgr.insert(ns.c_str(), 0, BucketSize, true); - BtreeBucket *b = loc.btreemod(); + DiskLoc loc = theDataFileMgr.insert(ns.c_str(), 0, V::BucketSize, true); + BtreeBucket *b = BTREEMOD(loc); b->init(); return loc; } - void BtreeBucket::renameIndexNamespace(const char *oldNs, const char *newNs) { + void renameIndexNamespace(const char *oldNs, const char *newNs) { renameNamespace( oldNs, newNs ); } - const DiskLoc BtreeBucket::getHead(const DiskLoc& thisLoc) const { + template< class V > + const DiskLoc BtreeBucket<V>::getHead(const DiskLoc& thisLoc) const { DiskLoc p = thisLoc; - while ( !p.btree()->isHead() ) - p = p.btree()->parent; + while ( !BTREE(p)->isHead() ) + p = BTREE(p)->parent; return p; } - DiskLoc BtreeBucket::advance(const DiskLoc& thisLoc, int& keyOfs, int direction, const char *caller) const { - if ( keyOfs < 0 || keyOfs >= n ) { - out() << "ASSERT failure BtreeBucket::advance, caller: " << caller << endl; + template< class V > + DiskLoc BtreeBucket<V>::advance(const DiskLoc& thisLoc, int& keyOfs, int direction, const char *caller) const { + if ( keyOfs < 0 || keyOfs >= this->n ) { + out() << "ASSERT failure BtreeBucket<V>::advance, caller: " << caller << endl; out() << " thisLoc: " << thisLoc.toString() << endl; - out() << " keyOfs: " << keyOfs << " n:" << n << " direction: " << direction << endl; + out() << " keyOfs: " << keyOfs << " this->n:" << this->n << " direction: " << direction << endl; out() << bucketSummary() << endl; assert(false); } int adj = direction < 0 ? 1 : 0; int ko = keyOfs + direction; - DiskLoc nextDown = childForPos(ko+adj); + DiskLoc nextDown = this->childForPos(ko+adj); if ( !nextDown.isNull() ) { while ( 1 ) { - keyOfs = direction>0 ? 0 : nextDown.btree()->n - 1; - DiskLoc loc = nextDown.btree()->childForPos(keyOfs + adj); + keyOfs = direction>0 ? 0 : BTREE(nextDown)->n - 1; + DiskLoc loc = BTREE(nextDown)->childForPos(keyOfs + adj); if ( loc.isNull() ) break; nextDown = loc; @@ -1350,18 +1467,18 @@ namespace mongo { return nextDown; } - if ( ko < n && ko >= 0 ) { + if ( ko < this->n && ko >= 0 ) { keyOfs = ko; return thisLoc; } // end of bucket. traverse back up. DiskLoc childLoc = thisLoc; - DiskLoc ancestor = parent; + DiskLoc ancestor = this->parent; while ( 1 ) { if ( ancestor.isNull() ) break; - const BtreeBucket *an = ancestor.btree(); + const BtreeBucket *an = BTREE(ancestor); for ( int i = 0; i < an->n; i++ ) { if ( an->childForPos(i+adj) == childLoc ) { keyOfs = i; @@ -1369,7 +1486,7 @@ namespace mongo { } } assert( direction<0 || an->nextChild == childLoc ); - // parent exhausted also, keep going up + // this->parent exhausted also, keep going up childLoc = ancestor; ancestor = an->parent; } @@ -1377,7 +1494,14 @@ namespace mongo { return DiskLoc(); } - DiskLoc BtreeBucket::locate(const IndexDetails& idx, const DiskLoc& thisLoc, const BSONObj& key, const Ordering &order, int& pos, bool& found, const DiskLoc &recordLoc, int direction) const { + template< class V > + DiskLoc BtreeBucket<V>::locate(const IndexDetails& idx, const DiskLoc& thisLoc, const BSONObj& key, const Ordering &order, int& pos, bool& found, const DiskLoc &recordLoc, int direction) const { + KeyOwned k(key); + return locate(idx, thisLoc, k, order, pos, found, recordLoc, direction); + } + + template< class V > + DiskLoc BtreeBucket<V>::locate(const IndexDetails& idx, const DiskLoc& thisLoc, const Key& key, const Ordering &order, int& pos, bool& found, const DiskLoc &recordLoc, int direction) const { int p; found = find(idx, key, recordLoc, order, p, /*assertIfDup*/ false); if ( found ) { @@ -1385,10 +1509,10 @@ namespace mongo { return thisLoc; } - DiskLoc child = childForPos(p); + DiskLoc child = this->childForPos(p); if ( !child.isNull() ) { - DiskLoc l = child.btree()->locate(idx, child, key, order, pos, found, recordLoc, direction); + DiskLoc l = BTREE(child)->locate(idx, child, key, order, pos, found, recordLoc, direction); if ( !l.isNull() ) return l; } @@ -1397,14 +1521,15 @@ namespace mongo { if ( direction < 0 ) return --pos == -1 ? DiskLoc() /*theend*/ : thisLoc; else - return pos == n ? DiskLoc() /*theend*/ : thisLoc; + return pos == this->n ? DiskLoc() /*theend*/ : thisLoc; } - bool BtreeBucket::customFind( int l, int h, const BSONObj &keyBegin, int keyBeginLen, bool afterKey, const vector< const BSONElement * > &keyEnd, const vector< bool > &keyEndInclusive, const Ordering &order, int direction, DiskLoc &thisLoc, int &keyOfs, pair< DiskLoc, int > &bestParent ) const { + template< class V > + bool BtreeBucket<V>::customFind( int l, int h, const BSONObj &keyBegin, int keyBeginLen, bool afterKey, const vector< const BSONElement * > &keyEnd, const vector< bool > &keyEndInclusive, const Ordering &order, int direction, DiskLoc &thisLoc, int &keyOfs, pair< DiskLoc, int > &bestParent ) const { while( 1 ) { if ( l + 1 == h ) { keyOfs = ( direction > 0 ) ? h : l; - DiskLoc next = thisLoc.btree()->k( h ).prevChildBucket; + DiskLoc next = BTREE(thisLoc)->k( h ).prevChildBucket; if ( !next.isNull() ) { bestParent = make_pair( thisLoc, keyOfs ); thisLoc = next; @@ -1415,7 +1540,7 @@ namespace mongo { } } int m = l + ( h - l ) / 2; - int cmp = customBSONCmp( thisLoc.btree()->keyNode( m ).key, keyBegin, keyBeginLen, afterKey, keyEnd, keyEndInclusive, order, direction ); + int cmp = customBSONCmp( BTREE(thisLoc)->keyNode( m ).key.toBson(), keyBegin, keyBeginLen, afterKey, keyEnd, keyEndInclusive, order, direction ); if ( cmp < 0 ) { l = m; } @@ -1438,18 +1563,19 @@ namespace mongo { * starting thisLoc + keyOfs will be strictly less than/strictly greater than keyBegin/keyBeginLen/keyEnd * All the direction checks below allowed me to refactor the code, but possibly separate forward and reverse implementations would be more efficient */ - void BtreeBucket::advanceTo(DiskLoc &thisLoc, int &keyOfs, const BSONObj &keyBegin, int keyBeginLen, bool afterKey, const vector< const BSONElement * > &keyEnd, const vector< bool > &keyEndInclusive, const Ordering &order, int direction ) const { + template< class V > + void BtreeBucket<V>::advanceTo(DiskLoc &thisLoc, int &keyOfs, const BSONObj &keyBegin, int keyBeginLen, bool afterKey, const vector< const BSONElement * > &keyEnd, const vector< bool > &keyEndInclusive, const Ordering &order, int direction ) const { int l,h; bool dontGoUp; if ( direction > 0 ) { l = keyOfs; - h = n - 1; - dontGoUp = ( customBSONCmp( keyNode( h ).key, keyBegin, keyBeginLen, afterKey, keyEnd, keyEndInclusive, order, direction ) >= 0 ); + h = this->n - 1; + dontGoUp = ( customBSONCmp( keyNode( h ).key.toBson(), keyBegin, keyBeginLen, afterKey, keyEnd, keyEndInclusive, order, direction ) >= 0 ); } else { l = 0; h = keyOfs; - dontGoUp = ( customBSONCmp( keyNode( l ).key, keyBegin, keyBeginLen, afterKey, keyEnd, keyEndInclusive, order, direction ) <= 0 ); + dontGoUp = ( customBSONCmp( keyNode( l ).key.toBson(), keyBegin, keyBeginLen, afterKey, keyEnd, keyEndInclusive, order, direction ) <= 0 ); } pair< DiskLoc, int > bestParent; if ( dontGoUp ) { @@ -1459,16 +1585,16 @@ namespace mongo { } } else { - // go up parents until rightmost/leftmost node is >=/<= target or at top - while( !thisLoc.btree()->parent.isNull() ) { - thisLoc = thisLoc.btree()->parent; + // go up this->parents until rightmost/leftmost node is >=/<= target or at top + while( !BTREE(thisLoc)->parent.isNull() ) { + thisLoc = BTREE(thisLoc)->parent; if ( direction > 0 ) { - if ( customBSONCmp( thisLoc.btree()->keyNode( thisLoc.btree()->n - 1 ).key, keyBegin, keyBeginLen, afterKey, keyEnd, keyEndInclusive, order, direction ) >= 0 ) { + if ( customBSONCmp( BTREE(thisLoc)->keyNode( BTREE(thisLoc)->n - 1 ).key.toBson(), keyBegin, keyBeginLen, afterKey, keyEnd, keyEndInclusive, order, direction ) >= 0 ) { break; } } else { - if ( customBSONCmp( thisLoc.btree()->keyNode( 0 ).key, keyBegin, keyBeginLen, afterKey, keyEnd, keyEndInclusive, order, direction ) <= 0 ) { + if ( customBSONCmp( BTREE(thisLoc)->keyNode( 0 ).key.toBson(), keyBegin, keyBeginLen, afterKey, keyEnd, keyEndInclusive, order, direction ) <= 0 ) { break; } } @@ -1477,31 +1603,32 @@ namespace mongo { customLocate( thisLoc, keyOfs, keyBegin, keyBeginLen, afterKey, keyEnd, keyEndInclusive, order, direction, bestParent ); } - void BtreeBucket::customLocate(DiskLoc &thisLoc, int &keyOfs, const BSONObj &keyBegin, int keyBeginLen, bool afterKey, const vector< const BSONElement * > &keyEnd, const vector< bool > &keyEndInclusive, const Ordering &order, int direction, pair< DiskLoc, int > &bestParent ) const { - if ( thisLoc.btree()->n == 0 ) { + template< class V > + void BtreeBucket<V>::customLocate(DiskLoc &thisLoc, int &keyOfs, const BSONObj &keyBegin, int keyBeginLen, bool afterKey, const vector< const BSONElement * > &keyEnd, const vector< bool > &keyEndInclusive, const Ordering &order, int direction, pair< DiskLoc, int > &bestParent ) const { + if ( BTREE(thisLoc)->n == 0 ) { thisLoc = DiskLoc(); return; } // go down until find smallest/biggest >=/<= target while( 1 ) { int l = 0; - int h = thisLoc.btree()->n - 1; + int h = BTREE(thisLoc)->n - 1; // leftmost/rightmost key may possibly be >=/<= search key bool firstCheck; if ( direction > 0 ) { - firstCheck = ( customBSONCmp( thisLoc.btree()->keyNode( 0 ).key, keyBegin, keyBeginLen, afterKey, keyEnd, keyEndInclusive, order, direction ) >= 0 ); + firstCheck = ( customBSONCmp( BTREE(thisLoc)->keyNode( 0 ).key.toBson(), keyBegin, keyBeginLen, afterKey, keyEnd, keyEndInclusive, order, direction ) >= 0 ); } else { - firstCheck = ( customBSONCmp( thisLoc.btree()->keyNode( h ).key, keyBegin, keyBeginLen, afterKey, keyEnd, keyEndInclusive, order, direction ) <= 0 ); + firstCheck = ( customBSONCmp( BTREE(thisLoc)->keyNode( h ).key.toBson(), keyBegin, keyBeginLen, afterKey, keyEnd, keyEndInclusive, order, direction ) <= 0 ); } if ( firstCheck ) { DiskLoc next; if ( direction > 0 ) { - next = thisLoc.btree()->k( 0 ).prevChildBucket; + next = BTREE(thisLoc)->k( 0 ).prevChildBucket; keyOfs = 0; } else { - next = thisLoc.btree()->nextChild; + next = BTREE(thisLoc)->nextChild; keyOfs = h; } if ( !next.isNull() ) { @@ -1515,21 +1642,21 @@ namespace mongo { } bool secondCheck; if ( direction > 0 ) { - secondCheck = ( customBSONCmp( thisLoc.btree()->keyNode( h ).key, keyBegin, keyBeginLen, afterKey, keyEnd, keyEndInclusive, order, direction ) < 0 ); + secondCheck = ( customBSONCmp( BTREE(thisLoc)->keyNode( h ).key.toBson(), keyBegin, keyBeginLen, afterKey, keyEnd, keyEndInclusive, order, direction ) < 0 ); } else { - secondCheck = ( customBSONCmp( thisLoc.btree()->keyNode( 0 ).key, keyBegin, keyBeginLen, afterKey, keyEnd, keyEndInclusive, order, direction ) > 0 ); + secondCheck = ( customBSONCmp( BTREE(thisLoc)->keyNode( 0 ).key.toBson(), keyBegin, keyBeginLen, afterKey, keyEnd, keyEndInclusive, order, direction ) > 0 ); } if ( secondCheck ) { DiskLoc next; if ( direction > 0 ) { - next = thisLoc.btree()->nextChild; + next = BTREE(thisLoc)->nextChild; } else { - next = thisLoc.btree()->k( 0 ).prevChildBucket; + next = BTREE(thisLoc)->k( 0 ).prevChildBucket; } if ( next.isNull() ) { - // if bestParent is null, we've hit the end and thisLoc gets set to DiskLoc() + // if bestParent is this->null, we've hit the end and thisLoc gets set to DiskLoc() thisLoc = bestParent.first; keyOfs = bestParent.second; return; @@ -1547,14 +1674,15 @@ namespace mongo { /** @thisLoc disk location of *this */ - int BtreeBucket::_insert(const DiskLoc thisLoc, const DiskLoc recordLoc, - const BSONObj& key, const Ordering &order, bool dupsAllowed, + template< class V > + int BtreeBucket<V>::_insert(const DiskLoc thisLoc, const DiskLoc recordLoc, + const Key& key, const Ordering &order, bool dupsAllowed, const DiskLoc lChild, const DiskLoc rChild, IndexDetails& idx) const { - if ( key.objsize() > KeyMax ) { - problem() << "ERROR: key too large len:" << key.objsize() << " max:" << KeyMax << ' ' << key.objsize() << ' ' << idx.indexNamespace() << endl; + if ( key.dataSize() > this->KeyMax ) { + problem() << "ERROR: key too large len:" << key.dataSize() << " max:" << this->KeyMax << ' ' << key.dataSize() << ' ' << idx.indexNamespace() << endl; return 2; } - assert( key.objsize() > 0 ); + assert( key.dataSize() > 0 ); int pos; bool found = find(idx, key, recordLoc, order, pos, !dupsAllowed); @@ -1562,15 +1690,15 @@ namespace mongo { out() << " " << thisLoc.toString() << '.' << "_insert " << key.toString() << '/' << recordLoc.toString() << " l:" << lChild.toString() << " r:" << rChild.toString() << endl; - out() << " found:" << found << " pos:" << pos << " n:" << n << endl; + out() << " found:" << found << " pos:" << pos << " this->n:" << this->n << endl; } if ( found ) { const _KeyNode& kn = k(pos); if ( kn.isUnused() ) { log(4) << "btree _insert: reusing unused key" << endl; - massert( 10285 , "_insert: reuse key but lchild is not null", lChild.isNull()); - massert( 10286 , "_insert: reuse key but rchild is not null", rChild.isNull()); + massert( 10285 , "_insert: reuse key but lchild is not this->null", lChild.isNull()); + massert( 10286 , "_insert: reuse key but rchild is not this->null", rChild.isNull()); kn.writing().setUsed(); return 0; } @@ -1580,78 +1708,89 @@ namespace mongo { log() << " " << idx.indexNamespace() << " thisLoc:" << thisLoc.toString() << '\n'; log() << " " << key.toString() << '\n'; log() << " " << "recordLoc:" << recordLoc.toString() << " pos:" << pos << endl; - log() << " old l r: " << childForPos(pos).toString() << ' ' << childForPos(pos+1).toString() << endl; - log() << " new l r: " << lChild.toString() << ' ' << rChild.toString() << endl; + log() << " old l r: " << this->childForPos(pos).toString() << ' ' << this->childForPos(pos+1).toString() << endl; + log() << " this->new l r: " << lChild.toString() << ' ' << rChild.toString() << endl; } alreadyInIndex(); } DEBUGGING out() << "TEMP: key: " << key.toString() << endl; - DiskLoc child = childForPos(pos); + Loc ch = this->childForPos(pos); + DiskLoc child = ch; if ( insert_debug ) out() << " getChild(" << pos << "): " << child.toString() << endl; - if ( child.isNull() || !rChild.isNull() /* means an 'internal' insert */ ) { + // In current usage, rChild isNull() for a this->new key and false when we are + // promoting a split key. These are the only two cases where _insert() + // is called currently. + if ( child.isNull() || !rChild.isNull() ) { + // A this->new key will be inserted at the same tree height as an adjacent existing key. insertHere(thisLoc, pos, recordLoc, key, order, lChild, rChild, idx); return 0; } - return child.btree()->bt_insert(child, recordLoc, key, order, dupsAllowed, idx, /*toplevel*/false); + return child.btree<V>()->_insert(child, recordLoc, key, order, dupsAllowed, /*lchild*/DiskLoc(), /*rchild*/DiskLoc(), idx); } - void BtreeBucket::dump() const { - out() << "DUMP btreebucket n:" << n; - out() << " parent:" << hex << parent.getOfs() << dec; - for ( int i = 0; i < n; i++ ) { - out() << '\n'; + template< class V > + void BtreeBucket<V>::dump(unsigned depth) const { + string indent = string(depth, ' '); + _log() << "BUCKET n:" << this->n; + _log() << " parent:" << hex << this->parent.getOfs() << dec; + for ( int i = 0; i < this->n; i++ ) { + _log() << '\n' << indent; KeyNode k = keyNode(i); - out() << '\t' << i << '\t' << k.key.toString() << "\tleft:" << hex << - k.prevChildBucket.getOfs() << "\tRecLoc:" << k.recordLoc.toString() << dec; + string ks = k.key.toString(); + _log() << " " << hex << k.prevChildBucket.getOfs() << '\n'; + _log() << indent << " " << i << ' ' << ks.substr(0, 30) << " Loc:" << k.recordLoc.toString() << dec; if ( this->k(i).isUnused() ) - out() << " UNUSED"; + _log() << " UNUSED"; } - out() << " right:" << hex << nextChild.getOfs() << dec << endl; + _log() << "\n" << indent << " " << hex << this->nextChild.getOfs() << dec << endl; } /** todo: meaning of return code unclear clean up */ - int BtreeBucket::bt_insert(const DiskLoc thisLoc, const DiskLoc recordLoc, - const BSONObj& key, const Ordering &order, bool dupsAllowed, - IndexDetails& idx, bool toplevel) const { + template< class V > + int BtreeBucket<V>::bt_insert(const DiskLoc thisLoc, const DiskLoc recordLoc, + const BSONObj& _key, const Ordering &order, bool dupsAllowed, + IndexDetails& idx, bool toplevel) const + { + KeyOwned key(_key); + if ( toplevel ) { - if ( key.objsize() > KeyMax ) { - problem() << "Btree::insert: key too large to index, skipping " << idx.indexNamespace() << ' ' << key.objsize() << ' ' << key.toString() << endl; + if ( key.dataSize() > this->KeyMax ) { + problem() << "Btree::insert: key too large to index, skipping " << idx.indexNamespace() << ' ' << key.dataSize() << ' ' << key.toString() << endl; return 3; } } int x = _insert(thisLoc, recordLoc, key, order, dupsAllowed, DiskLoc(), DiskLoc(), idx); - assertValid( order ); + this->assertValid( order ); return x; } - void BtreeBucket::shape(stringstream& ss) const { - _shape(0, ss); + template< class V > + void BtreeBucket<V>::shape(stringstream& ss) const { + this->_shape(0, ss); } - int BtreeBucket::getLowWaterMark() { - return lowWaterMark; + template< class V > + int BtreeBucket<V>::getKeyMax() { + return V::KeyMax; } - int BtreeBucket::getKeyMax() { - return KeyMax; - } - - DiskLoc BtreeBucket::findSingle( const IndexDetails& indexdetails , const DiskLoc& thisLoc, const BSONObj& key ) const { - indexdetails.checkVersion(); + template< class V > + DiskLoc BtreeBucket<V>::findSingle( const IndexDetails& indexdetails , const DiskLoc& thisLoc, const BSONObj& key ) const { int pos; bool found; - // TODO: is it really ok here that the order is a default? + // TODO: is it really ok here that the order is a default? + // for findById() use, yes. for checkNoIndexConflicts, this->no? Ordering o = Ordering::make(BSONObj()); DiskLoc bucket = locate( indexdetails , indexdetails.head , key , o , pos , found , minDiskLoc ); if ( bucket.isNull() ) return bucket; - const BtreeBucket *b = bucket.btree(); + const BtreeBucket<V> *b = bucket.btree<V>(); while ( 1 ) { const _KeyNode& knraw = b->k(pos); if ( knraw.isUsed() ) @@ -1659,23 +1798,24 @@ namespace mongo { bucket = b->advance( bucket , pos , 1 , "findSingle" ); if ( bucket.isNull() ) return bucket; - b = bucket.btree(); + b = bucket.btree<V>(); } KeyNode kn = b->keyNode( pos ); - if ( key.woCompare( kn.key ) != 0 ) + if ( KeyOwned(key).woCompare( kn.key, o ) != 0 ) return DiskLoc(); return kn.recordLoc; } -} // namespace mongo +} // this->namespace mongo #include "db.h" #include "dbhelpers.h" namespace mongo { - void BtreeBucket::a_test(IndexDetails& id) { - BtreeBucket *b = id.head.btreemod(); + template< class V > + void BtreeBucket<V>::a_test(IndexDetails& id) { + BtreeBucket *b = id.head.btreemod<V>(); // record locs for testing DiskLoc A(1, 20); @@ -1703,155 +1843,45 @@ namespace mongo { b->dumpTree(id.head, orderObj); - /* b->bt_insert(id.head, B, key, order, false, id); + /* b->bt_insert(id.head, B, key, order, false, id); b->k(1).setUnused(); - b->dumpTree(id.head, order); - b->bt_insert(id.head, A, key, order, false, id); - b->dumpTree(id.head, order); */ // this should assert. does it? (it might "accidentally" though, not asserting proves a problem, asserting proves nothing) b->bt_insert(id.head, C, key, order, false, id); -// b->dumpTree(id.head, order); + // b->dumpTree(id.head, order); } - /* --- BtreeBuilder --- */ + template class BucketBasics<V0>; + template class BucketBasics<V1>; + template class BtreeBucket<V0>; + template class BtreeBucket<V1>; - BtreeBuilder::BtreeBuilder(bool _dupsAllowed, IndexDetails& _idx) : - dupsAllowed(_dupsAllowed), - idx(_idx), - n(0), - order( idx.keyPattern() ), - ordering( Ordering::make(idx.keyPattern()) ) { - first = cur = BtreeBucket::addBucket(idx); - b = cur.btreemod(); - committed = false; - } - - void BtreeBuilder::newBucket() { - DiskLoc L = BtreeBucket::addBucket(idx); - b->tempNext() = L; - cur = L; - b = cur.btreemod(); - } - - void BtreeBuilder::mayCommitProgressDurably() { - if ( getDur().commitIfNeeded() ) { - b = cur.btreemod(); - } - } - - void BtreeBuilder::addKey(BSONObj& key, DiskLoc loc) { - if ( key.objsize() > KeyMax ) { - problem() << "Btree::insert: key too large to index, skipping " << idx.indexNamespace() - << ' ' << key.objsize() << ' ' << key.toString() << endl; - return; - } - - if( !dupsAllowed ) { - if( n > 0 ) { - int cmp = keyLast.woCompare(key, order); - massert( 10288 , "bad key order in BtreeBuilder - server internal error", cmp <= 0 ); - if( cmp == 0 ) { - //if( !dupsAllowed ) - uasserted( ASSERT_ID_DUPKEY , BtreeBucket::dupKeyError( idx , keyLast ) ); - } + struct BTUnitTest : public UnitTest { + void run() { + DiskLoc big(0xf12312, 0x70001234); + DiskLoc56Bit bigl; + { + bigl = big; + assert( big == bigl ); + DiskLoc e = bigl; + assert( big == e ); } - keyLast = key; - } - - if ( ! b->_pushBack(loc, key, ordering, DiskLoc()) ) { - // bucket was full - newBucket(); - b->pushBack(loc, key, ordering, DiskLoc()); - } - n++; - mayCommitProgressDurably(); - } - - void BtreeBuilder::buildNextLevel(DiskLoc loc) { - int levels = 1; - while( 1 ) { - if( loc.btree()->tempNext().isNull() ) { - // only 1 bucket at this level. we are done. - getDur().writingDiskLoc(idx.head) = loc; - break; - } - levels++; - - DiskLoc upLoc = BtreeBucket::addBucket(idx); - DiskLoc upStart = upLoc; - BtreeBucket *up = upLoc.btreemod(); - - DiskLoc xloc = loc; - while( !xloc.isNull() ) { - if ( getDur().commitIfNeeded() ) { - b = cur.btreemod(); - up = upLoc.btreemod(); - } - - BtreeBucket *x = xloc.btreemod(); - BSONObj k; - DiskLoc r; - x->popBack(r,k); - bool keepX = ( x->n != 0 ); - DiskLoc keepLoc = keepX ? xloc : x->nextChild; - - if ( ! up->_pushBack(r, k, ordering, keepLoc) ) { - // current bucket full - DiskLoc n = BtreeBucket::addBucket(idx); - up->tempNext() = n; - upLoc = n; - up = upLoc.btreemod(); - up->pushBack(r, k, ordering, keepLoc); - } - - DiskLoc nextLoc = x->tempNext(); // get next in chain at current level - if ( keepX ) { - x->parent = upLoc; - } - else { - if ( !x->nextChild.isNull() ) - x->nextChild.btreemod()->parent = upLoc; - x->deallocBucket( xloc, idx ); - } - xloc = nextLoc; + { + DiskLoc d; + assert( d.isNull() ); + DiskLoc56Bit l; + l = d; + assert( l.isNull() ); + d = l; + assert( d.isNull() ); + assert( l < bigl ); } - - loc = upStart; - mayCommitProgressDurably(); } - - if( levels > 1 ) - log(2) << "btree levels: " << levels << endl; - } - - /** when all addKeys are done, we then build the higher levels of the tree */ - void BtreeBuilder::commit() { - buildNextLevel(first); - committed = true; - } - - BtreeBuilder::~BtreeBuilder() { - DESTRUCTOR_GUARD( - if( !committed ) { - log(2) << "Rolling back partially built index space" << endl; - DiskLoc x = first; - while( !x.isNull() ) { - DiskLoc next = x.btree()->tempNext(); - string ns = idx.indexNamespace(); - theDataFileMgr._deleteRecord(nsdetails(ns.c_str()), ns.c_str(), x.rec(), x); - x = next; - getDur().commitIfNeeded(); - } - assert( idx.head.isNull() ); - log(2) << "done rollback" << endl; - } - ) - } + } btunittest; } |