// btree.h

/**
*    Copyright (C) 2008 10gen Inc.
*
*    This program is free software: you can redistribute it and/or  modify
*    it under the terms of the GNU Affero General Public License, version 3,
*    as published by the Free Software Foundation.
*
*    This program is distributed in the hope that it will be useful,
*    but WITHOUT ANY WARRANTY; without even the implied warranty of
*    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
*    GNU Affero General Public License for more details.
*
*    You should have received a copy of the GNU Affero General Public License
*    along with this program.  If not, see <http://www.gnu.org/licenses/>.
*/

#pragma once

#include "../pch.h"
#include "jsobj.h"
#include "diskloc.h"
#include "pdfile.h"

namespace mongo {

    const int BucketSize = 8192;

#pragma pack(1)
    struct _KeyNode {
        /** Signals that we are writing this _KeyNode and casts away const */
        _KeyNode& writing() const;
        DiskLoc prevChildBucket; // the lchild
        DiskLoc recordLoc; // location of the record associated with the key
        short keyDataOfs() const {
            return (short) _kdo;
        }
        unsigned short _kdo;
        void setKeyDataOfs(short s) {
            _kdo = s;
            assert(s>=0);
        }
        void setKeyDataOfsSavingUse(short s) {
            _kdo = s;
            assert(s>=0);
        }
        void setUsed() { recordLoc.GETOFS() &= ~1; }
        void setUnused() {
            // Setting the ofs to an odd value is the sentinel for unused, as real
            //  recordLoc offsets are always even.
            // Note we must keep the value essentially the same because the recordLoc is
            // used as part of the key in the index (to handle duplicate keys efficiently).
            // See the illustration following this struct.
            recordLoc.GETOFS() |= 1;
        }
        int isUnused() const {
            return recordLoc.getOfs() & 1;
        }
        int isUsed() const {
            return !isUnused();
        }
    };
#pragma pack()
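
    // Illustration of the unused-key sentinel (_KeyNode::setUnused / isUnused): a
    // sketch, not part of the interface.  Real recordLoc offsets are always even,
    // so the low bit is free to flag a slot as unused while keeping its sort
    // position essentially unchanged.
    //
    //     _KeyNode &kn = ...;        // some key slot within a bucket
    //     kn.setUnused();            // recordLoc offset becomes odd
    //     assert( kn.isUnused() );
    //     kn.setUsed();              // low bit cleared again
    //     assert( kn.isUsed() );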

    class BucketBasics;

    /**
     * Wrapper class: this is our in-memory representation of the key.
     * _KeyNode is the on-disk representation.
     *
     * This object and its bson key will become invalid if the key is moved.
     */
    class KeyNode {
    public:
        KeyNode(const BucketBasics& bb, const _KeyNode &k);
        const DiskLoc& prevChildBucket;
        const DiskLoc& recordLoc;
        BSONObj key;
    };

#pragma pack(1)
    class BtreeData {
    protected:
        DiskLoc parent;
        DiskLoc nextChild; // child bucket off and to the right of the highest key.
        unsigned short _wasSize; // can be reused, value is 8192 in current pdfile version Apr2010
        unsigned short _reserved1; // zero
        int flags;

        // basicInsert() assumes these three are together and in this order:
        int emptySize; // size of the empty region
        int topSize; // size of the data at the top of the bucket (keys are at the beginning or 'bottom')
        int n; // # of keys so far.

        int reserved;
        char data[4];
    };

    /**
     * This class is all about the storage management
     *
     * Const member functions of this class are those which may be called on
     * an object for which writing has not been signaled.  Non const member
     * functions may only be called on objects for which writing has been
     * signaled.  Note that currently some const functions write to the
     * underlying memory representation of this bucket using optimized methods
     * to signal write operations.
     *
     * DiskLoc parameters that may shadow references within the btree should
     * be passed by value rather than by reference to non const member
     * functions or const member functions which may perform writes.  This way
     * a callee need not worry that write operations will change or invalidate
     * its arguments.
     *
     * The current policy for dealing with bson arguments is the opposite of
     * what is described above for DiskLoc arguments.  We do not want to copy
     * bson into memory as an intermediate step for
     * btree changes, so if bson is to be moved it must be copied to the new
     * location before the old location is invalidated.
     */
    class BucketBasics : public BtreeData {
        friend class BtreeBuilder;
        friend class KeyNode;
    public:
        /** assert write intent declared for this bucket already */
        void assertWritable();

        void assertValid(const Ordering &order, bool force = false) const;
        void assertValid(const BSONObj &orderObj, bool force = false) const { return assertValid(Ordering::make(orderObj),force); }

        /**
         * @return KeyNode for key at index i.  The KeyNode will become invalid
         * if the key is moved or reassigned, or if the node is packed.
         */
        const KeyNode keyNode(int i) const {
            if ( i >= n ) {
                massert( 13000 , (string)"invalid keyNode: " +  BSON( "i" << i << "n" << n ).jsonString() , i < n );
            }
            return KeyNode(*this, k(i));
        }
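
        // Example (a sketch): iterating the keys currently stored in a const
        // BucketBasics 'b' via keyNode() and nKeys().  Each KeyNode remains valid
        // only until the bucket is modified or packed.
        //
        //     for( int i = 0; i < b.nKeys(); ++i ) {
        //         KeyNode kn = b.keyNode( i );
        //         // kn.key, kn.recordLoc and kn.prevChildBucket describe key i
        //     }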

        /** @return the size of the bucket header, i.e. the byte offset at which key data begins */
        static int headerSize() {
            const BucketBasics *d = 0;
            return (char*)&(d->data) - (char*)&(d->parent);
        }
        static int bodySize() { return BucketSize - headerSize(); }

        // for testing
        int nKeys() const { return n; }
        const DiskLoc getNextChild() const { return nextChild; }

    protected:
        char * dataAt(short ofs) { return data + ofs; }

        void init(); // initialize a new node

        /**
         * @return false if the node is full and must be split
         * @keypos is where to insert -- the new key goes before the existing key at that
         *  position, so keypos=0 is the leftmost.  keypos will be updated if keys are
         *  moved as a result of pack().
         * This function will modify the btree bucket memory representation even
         * though it is marked const.
         */
        bool basicInsert(const DiskLoc thisLoc, int &keypos, const DiskLoc recordLoc, const BSONObj& key, const Ordering &order) const;

        /** @return true if works, false if not enough space */
        bool _pushBack(const DiskLoc recordLoc, const BSONObj& key, const Ordering &order, const DiskLoc prevChild);
        void pushBack(const DiskLoc recordLoc, const BSONObj& key, const Ordering &order, const DiskLoc prevChild) {
            bool ok = _pushBack( recordLoc , key , order , prevChild );
            assert(ok);
        }

        /**
         * This is a special purpose function used by BtreeBuilder.  The
         * interface is quite dangerous if you're not careful.  The bson key
         * returned here points to bucket memory that has been invalidated but
         * not yet reclaimed.
         *
         * TODO Maybe this could be replaced with two functions, one which
         * returns the last key without deleting it and another which simply
         * deletes the last key.  Then the caller would have enough control to
         * ensure proper memory integrity.
         */
        void popBack(DiskLoc& recLoc, BSONObj& key);

        void _delKeyAtPos(int keypos, bool mayEmpty = false); // low level version that doesn't deal with child ptrs.

        /* !Packed means there is deleted fragment space within the bucket.
           We "repack" when we run out of space before considering the node
           to be full.
           */
        enum Flags { Packed=1 };

        const DiskLoc& childForPos(int p) const { return p == n ? nextChild : k(p).prevChildBucket; }
        DiskLoc& childForPos(int p) { return p == n ? nextChild : k(p).prevChildBucket; }
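
        // Layout note (illustrative): a bucket holding n keys has n+1 child pointers.
        // For p < n, childForPos(p) is k(p).prevChildBucket, the child to the left of
        // key p; childForPos(n) is nextChild, the child to the right of the highest key.
        //
        //     [ child0 ] key0 [ child1 ] key1 ... key(n-1) [ nextChild ]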

        int totalDataSize() const;
        /** @return true if the key may be dropped by pack() */
        bool mayDropKey( int index, int refPos ) const;

        /**
         * Pack the bucket to reclaim space from invalidated memory.
         * @refPos is an index in the bucket which may be updated if we
         *  delete keys from the bucket
         * This function may cast away const and perform a write.
         */
        void _pack(const DiskLoc thisLoc, const Ordering &order, int &refPos) const;
        /** Pack when already writable */
        void _packReadyForMod(const Ordering &order, int &refPos);

        /**
         * @return the size of non header data in this bucket if we were to
         * call pack().
         */
        int packedDataSize( int refPos ) const;
        void setNotPacked() { flags &= ~Packed; }
        void setPacked() { flags |= Packed; }
        int _alloc(int bytes);
        void _unalloc(int bytes);
        void truncateTo(int N, const Ordering &order, int &refPos);
        /** drop specified number of keys from beginning of key array, and pack */
        void dropFront(int nDrop, const Ordering &order, int &refPos);
        void markUnused(int keypos);

        /**
         * BtreeBuilder uses the parent var as a temporary place to maintain a linked list chain.
         *   We use tempNext() when we do that to be less confusing.  (One might have written a union in C.)
         */
        const DiskLoc& tempNext() const { return parent; }
        DiskLoc& tempNext() { return parent; }

        void _shape(int level, stringstream&) const;
        int Size() const;
        const _KeyNode& k(int i) const { return ((const _KeyNode*)data)[i]; }
        _KeyNode& k(int i) { return ((_KeyNode*)data)[i]; }

        /** @return the key position where a split should occur on insert */
        int splitPos( int keypos ) const;

        /**
         * Adds new entries to beginning of key array, shifting existing
         * entries to the right.  After this is called, setKey() must be called
         * on all the newly created entries in the key array.
         */
        void reserveKeysFront( int nAdd );

        /**
         * Sets an existing key using the given parameters.
         * @i index of key to set
         */
        void setKey( int i, const DiskLoc recordLoc, const BSONObj &key, const DiskLoc prevChildBucket );
    };

    /**
     * This class adds functionality for manipulating buckets that are assembled
     * in a tree.  The requirements for const and non const functions and
     * arguments are generally the same as in BtreeBucket.  Because this class
     * deals with tree structure, some functions that are marked const may
     * trigger modification of another node in the btree or potentially of the
     * current node.  In such cases, the function's implementation explicitly
     * casts away const when indicating an intent to write to the durability
     * layer.  The DiskLocs provided to such functions should be passed by
     * value if they shadow pointers within the btree.
     *
     * To clarify enforcement of referential integrity in this implementation,
     * we use the following pattern when deleting data we have a persistent
     * pointer to.  The pointer is cleared or removed explicitly, then the data
     * it pointed to is cleaned up with a helper function.
     *
     * TODO It might make sense to put some of these functions in a class
     * representing a full btree instead of a single btree bucket.  That would
     * allow us to use the const qualifier in a manner more consistent with
     * standard usage.  Right now the interface is for both a node and a tree,
     * so assignment of const is sometimes not ideal.
     *
     * TODO There are several cases in which the this pointer is invalidated
     * as a result of deallocation.  A separate class representing a btree would
     * alleviate some fragile cases where the implementation must currently
     * behave correctly if the this pointer is suddenly invalidated by a
     * callee.
     */
    class BtreeBucket : public BucketBasics {
        friend class BtreeCursor;
    public:
        bool isHead() const { return parent.isNull(); }
        void dumpTree(const DiskLoc &thisLoc, const BSONObj &order) const;
        int fullValidate(const DiskLoc& thisLoc, const BSONObj &order, int *unusedCount = 0, bool strict = false) const; /* traverses everything */

        bool isUsed( int i ) const { return k(i).isUsed(); }
        string bucketSummary() const;
        void dump() const;

        /**
         * @return true if key exists in index
         *
         * @order - indicates order of keys in the index.  this is basically the index's key pattern, e.g.:
         *    BSONObj order = ((IndexDetails&)idx).keyPattern();
         * likewise below in bt_insert() etc.
         */
        bool exists(const IndexDetails& idx, const DiskLoc &thisLoc, const BSONObj& key, const Ordering& order) const;

        bool wouldCreateDup(
            const IndexDetails& idx, const DiskLoc &thisLoc,
            const BSONObj& key, const Ordering& order,
            const DiskLoc &self) const;

        static DiskLoc addBucket(const IndexDetails&); /* start a new index off, empty */
        /** invalidates 'this' and thisLoc */
        void deallocBucket(const DiskLoc thisLoc, const IndexDetails &id);

        static void renameIndexNamespace(const char *oldNs, const char *newNs);

        /** This function may change the btree root */
        int bt_insert(const DiskLoc thisLoc, const DiskLoc recordLoc,
                      const BSONObj& key, const Ordering &order, bool dupsAllowed,
                      IndexDetails& idx, bool toplevel = true) const;
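
        // Insertion sketch (hedged; mirrors the inline IndexDetails helpers near the
        // bottom of this file, which reach the root bucket through IndexDetails::head):
        //
        //     idx.head.btree()->bt_insert( idx.head, recordLoc, key,
        //                                  Ordering::make( idx.keyPattern() ),
        //                                  dupsAllowed, idx );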

        /** This function may change the btree root */
        bool unindex(const DiskLoc thisLoc, IndexDetails& id, const BSONObj& key, const DiskLoc recordLoc) const;

        /**
         * locate may return an "unused" key that is just a marker, so be careful.
         *   It looks for a key:recordLoc pair.
         *
         * @found - returns true if an exact match is found.  Note you can get back a position
         *          result even if found is false.
         */
        DiskLoc locate(const IndexDetails &idx , const DiskLoc& thisLoc, const BSONObj& key, const Ordering &order,
                       int& pos, bool& found, const DiskLoc &recordLoc, int direction=1) const;

        /**
         * Find the first instance of the key.
         * Does not handle dups.
         * The returned DiskLoc isNull() if nothing matches the key.
         * @return the record location of the first match
         */
        DiskLoc findSingle( const IndexDetails &indexdetails , const DiskLoc& thisLoc, const BSONObj& key ) const;

        /** advance one key position in the index: */
        DiskLoc advance(const DiskLoc& thisLoc, int& keyOfs, int direction, const char *caller) const;

        void advanceTo(DiskLoc &thisLoc, int &keyOfs, const BSONObj &keyBegin, int keyBeginLen, bool afterKey, const vector< const BSONElement * > &keyEnd, const vector< bool > &keyEndInclusive, const Ordering &order, int direction ) const;
        void customLocate(DiskLoc &thisLoc, int &keyOfs, const BSONObj &keyBegin, int keyBeginLen, bool afterKey, const vector< const BSONElement * > &keyEnd, const vector< bool > &keyEndInclusive, const Ordering &order, int direction, pair< DiskLoc, int > &bestParent ) const;

        const DiskLoc getHead(const DiskLoc& thisLoc) const;

        /** get tree shape */
        void shape(stringstream&) const;

        static void a_test(IndexDetails&);

        static int getLowWaterMark();
        static int getKeyMax();

    protected:
        /**
         * Fix parent pointers for children
         * @firstIndex first index to modify
         * @lastIndex last index to modify (-1 means last index is n)
         */
        void fixParentPtrs(const DiskLoc thisLoc, int firstIndex = 0, int lastIndex = -1) const;

        /** invalidates this and thisLoc */
        void delBucket(const DiskLoc thisLoc, const IndexDetails&);
        /** may invalidate this and thisLoc */
        void delKeyAtPos(const DiskLoc thisLoc, IndexDetails& id, int p, const Ordering &order);

        /**
         * May balance utilization of this bucket with a neighbor, either by
         * merging the buckets or shifting nodes.
         * @return true iff balancing was performed.
         * NOTE This function may invalidate thisLoc.
         */
        bool mayBalanceWithNeighbors(const DiskLoc thisLoc, IndexDetails &id, const Ordering &order) const;

        /** @return true if balance succeeded */
        bool tryBalanceChildren( const DiskLoc thisLoc, int leftIndex, IndexDetails &id, const Ordering &order ) const;
        void doBalanceChildren( const DiskLoc thisLoc, int leftIndex, IndexDetails &id, const Ordering &order );
        void doBalanceLeftToRight( const DiskLoc thisLoc, int leftIndex, int split,
                                   BtreeBucket *l, const DiskLoc lchild,
                                   BtreeBucket *r, const DiskLoc rchild,
                                   IndexDetails &id, const Ordering &order );
        void doBalanceRightToLeft( const DiskLoc thisLoc, int leftIndex, int split,
                                   BtreeBucket *l, const DiskLoc lchild,
                                   BtreeBucket *r, const DiskLoc rchild,
                                   IndexDetails &id, const Ordering &order );

        /** may invalidate this and thisLoc */
        void doMergeChildren( const DiskLoc thisLoc, int leftIndex, IndexDetails &id, const Ordering &order);

        /** will invalidate this and thisLoc */
        void replaceWithNextChild( const DiskLoc thisLoc, IndexDetails &id );

        /** @return true iff left and right child can be merged into one node */
        bool canMergeChildren( const DiskLoc &thisLoc, int leftIndex ) const;

        /**
         * @return index of the rebalanced separator; the index value is
         * determined as if we had an array
         * <left bucket keys array>.push( <old separator> ).concat( <right bucket keys array> )
         * This is only expected to be called if the left and right child
         * cannot be merged.
         * This function is expected to be called on packed buckets, see also
         * comments for splitPos().
         */
        int rebalancedSeparatorPos( const DiskLoc &thisLoc, int leftIndex ) const;

        int indexInParent( const DiskLoc &thisLoc ) const;
        BSONObj keyAt(int keyOfs) const {
            return keyOfs >= n ? BSONObj() : keyNode(keyOfs).key;
        }
        static BtreeBucket* allocTemp(); /* caller must release with free() */

        /** split bucket */
        void split(const DiskLoc thisLoc, int keypos,
                   const DiskLoc recordLoc, const BSONObj& key,
                   const Ordering& order, const DiskLoc lchild, const DiskLoc rchild, IndexDetails& idx);

        void insertHere(const DiskLoc thisLoc, int keypos,
                        const DiskLoc recordLoc, const BSONObj& key, const Ordering &order,
                        const DiskLoc lchild, const DiskLoc rchild, IndexDetails &idx) const;

        int _insert(const DiskLoc thisLoc, const DiskLoc recordLoc,
                    const BSONObj& key, const Ordering &order, bool dupsAllowed,
                    const DiskLoc lChild, const DiskLoc rChild, IndexDetails &idx) const;
        bool find(const IndexDetails& idx, const BSONObj& key, const DiskLoc &recordLoc, const Ordering &order, int& pos, bool assertIfDup) const;
        bool customFind( int l, int h, const BSONObj &keyBegin, int keyBeginLen, bool afterKey, const vector< const BSONElement * > &keyEnd, const vector< bool > &keyEndInclusive, const Ordering &order, int direction, DiskLoc &thisLoc, int &keyOfs, pair< DiskLoc, int > &bestParent ) const;
        static void findLargestKey(const DiskLoc& thisLoc, DiskLoc& largestLoc, int& largestKey);
        static int customBSONCmp( const BSONObj &l, const BSONObj &rBegin, int rBeginLen, bool rSup, const vector< const BSONElement * > &rEnd, const vector< bool > &rEndInclusive, const Ordering &o, int direction );
        static void fix(const DiskLoc thisLoc, const DiskLoc child);

        /** Replaces an existing key with the new specified key, splitting if necessary */
        void setInternalKey( const DiskLoc thisLoc, int keypos,
                             const DiskLoc recordLoc, const BSONObj &key, const Ordering &order,
                             const DiskLoc lchild, const DiskLoc rchild, IndexDetails &idx);

        /**
         * Deletes the specified key, replacing it with the key immediately
         * preceding or succeeding it in the btree.  Either the left or right
         * child of the specified key must be non null.
         */
        void deleteInternalKey( const DiskLoc thisLoc, int keypos, IndexDetails &id, const Ordering &order );
    public:
        /** simply builds and returns a dup key error message string */
        static string dupKeyError( const IndexDetails& idx , const BSONObj& key );
    };
#pragma pack()

    class BtreeCursor : public Cursor {
    public:
        BtreeCursor( NamespaceDetails *_d, int _idxNo, const IndexDetails&, const BSONObj &startKey, const BSONObj &endKey, bool endKeyInclusive, int direction );
        BtreeCursor( NamespaceDetails *_d, int _idxNo, const IndexDetails& _id, const shared_ptr< FieldRangeVector > &_bounds, int _direction );
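
        // Construction sketch (hedged; 'd', 'idxNo' and 'id' identify the index, and
        // startKey/endKey are bson keys in the index's key format):
        //
        //     BtreeCursor c( d, idxNo, id, startKey, endKey,
        //                    true /*endKeyInclusive*/, 1 /*forward*/ );
        //     while( c.ok() ) {
        //         BSONObj k = c.currKey();
        //         DiskLoc loc = c.currLoc();
        //         c.advance();
        //     }
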
        virtual bool ok() { return !bucket.isNull(); }
        virtual bool advance();
        virtual void noteLocation(); // updates keyAtKeyOfs...
        virtual void checkLocation();
        virtual bool supportGetMore() { return true; }
        virtual bool supportYields() { return true; }

        /**
         * used for multikey index traversal to avoid sending back dups. see Matcher::matches().
         * if a multikey index traversal:
         *   if loc has already been sent, returns true.
         *   otherwise, marks loc as sent.
         * @return true if the loc has already been seen (i.e. it is a dup and should be skipped)
         */
        virtual bool getsetdup(DiskLoc loc) {
            if( _multikey ) {
                pair<set<DiskLoc>::iterator, bool> p = _dups.insert(loc);
                return !p.second;
            }
            return false;
        }
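
        // Usage sketch (hedged): a scan loop that skips record locations already
        // produced by a multikey index traversal.
        //
        //     while( c->ok() ) {
        //         if ( !c->getsetdup( c->currLoc() ) ) {
        //             // first time this record location has been seen by the scan
        //         }
        //         c->advance();
        //     }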

        virtual bool modifiedKeys() const { return _multikey; }
        virtual bool isMultiKey() const { return _multikey; }

        const _KeyNode& _currKeyNode() const {
            assert( !bucket.isNull() );
            const _KeyNode& kn = bucket.btree()->k(keyOfs);
            assert( kn.isUsed() );
            return kn;
        }
        const KeyNode currKeyNode() const {
            assert( !bucket.isNull() );
            return bucket.btree()->keyNode(keyOfs);
        }

        virtual BSONObj currKey() const { return currKeyNode().key; }
        virtual BSONObj indexKeyPattern() { return indexDetails.keyPattern(); }

        virtual void aboutToDeleteBucket(const DiskLoc& b) {
            if ( bucket == b )
                keyOfs = -1;
        }

        virtual DiskLoc currLoc()  { return !bucket.isNull() ? _currKeyNode().recordLoc : DiskLoc();  }
        virtual DiskLoc refLoc()   { return currLoc(); }
        virtual Record* _current() { return currLoc().rec(); }
        virtual BSONObj current()  { return BSONObj(_current()); }
        virtual string toString() {
            string s = string("BtreeCursor ") + indexDetails.indexName();
            if ( _direction < 0 ) s += " reverse";
            if ( _bounds.get() && _bounds->size() > 1 ) s += " multi";
            return s;
        }

        BSONObj prettyKey( const BSONObj &key ) const {
            return key.replaceFieldNames( indexDetails.keyPattern() ).clientReadable();
        }

        virtual BSONObj prettyIndexBounds() const {
            if ( !_independentFieldRanges ) {
                return BSON( "start" << prettyKey( startKey ) << "end" << prettyKey( endKey ) );
            }
            else {
                return _bounds->obj();
            }
        }

        void forgetEndKey() { endKey = BSONObj(); }

        virtual CoveredIndexMatcher *matcher() const { return _matcher.get(); }

        virtual void setMatcher( shared_ptr< CoveredIndexMatcher > matcher ) { _matcher = matcher;  }

        virtual long long nscanned() { return _nscanned; }

        /** for debugging only */
        const DiskLoc getBucket() const { return bucket; }

    private:
        /**
         * Our btrees may (rarely) have "unused" keys when items are deleted.
         * Skip past them.
         */
        bool skipUnusedKeys( bool mayJump );
        bool skipOutOfRangeKeysAndCheckEnd();
        void skipAndCheck();
        void checkEnd();

        /** selective audits on construction */
        void audit();

        /** set initial bucket */
        void init();

        /** if afterKey is true, we want the first key with values of the keyBegin fields greater than keyBegin */
        void advanceTo( const BSONObj &keyBegin, int keyBeginLen, bool afterKey, const vector< const BSONElement * > &keyEnd, const vector< bool > &keyEndInclusive );

        friend class BtreeBucket;

        set<DiskLoc> _dups;
        NamespaceDetails * const d;
        const int idxNo;
        BSONObj startKey;
        BSONObj endKey;
        bool _endKeyInclusive;
        bool _multikey; // this must be updated on every getMore batch in case the index has since become multikey
        const IndexDetails& indexDetails;
        const BSONObj _order;
        const Ordering _ordering;
        DiskLoc bucket;
        int keyOfs;
        const int _direction; // 1=fwd,-1=reverse
        BSONObj keyAtKeyOfs; // so we can tell if things moved around on us between the query and the getMore call
        DiskLoc locAtKeyOfs;
        const shared_ptr< FieldRangeVector > _bounds;
        auto_ptr< FieldRangeVector::Iterator > _boundsIterator;
        const IndexSpec& _spec;
        shared_ptr< CoveredIndexMatcher > _matcher;
        bool _independentFieldRanges;
        long long _nscanned;
    };


    inline bool IndexDetails::hasKey(const BSONObj& key) {
        return head.btree()->exists(*this, head, key, Ordering::make(keyPattern()));
    }
    inline bool IndexDetails::wouldCreateDup(const BSONObj& key, DiskLoc self) {
        return head.btree()->wouldCreateDup(*this, head, key, Ordering::make(keyPattern()), self);
    }

    /**
     * Build a btree from the bottom up.  A usage sketch follows the class.
     * TODO dropDups
     */
    class BtreeBuilder {
        bool dupsAllowed;
        IndexDetails& idx;
        unsigned long long n;
        BSONObj keyLast;
        BSONObj order;
        Ordering ordering;
        bool committed;

        DiskLoc cur, first;
        BtreeBucket *b;

        void newBucket();
        void buildNextLevel(DiskLoc);
        void mayCommitProgressDurably();

    public:
        ~BtreeBuilder();

        BtreeBuilder(bool _dupsAllowed, IndexDetails& _idx);

        /** keys must be added in order */
        void addKey(BSONObj& key, DiskLoc loc);

        /**
         * Commit the work.  If not called, the destructor will clean up partially completed
         *  work (in case an exception has occurred).
         */
        void commit();

        unsigned long long getn() { return n; }
    };
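
    // Usage sketch for BtreeBuilder (hedged; assumes the (key, recordLoc) pairs are
    // supplied already sorted in index order):
    //
    //     BtreeBuilder builder( dupsAllowed, idx );
    //     while( /* more sorted (key, loc) pairs */ ) {
    //         builder.addKey( key, loc );
    //     }
    //     builder.commit();   // otherwise the destructor cleans up the partial build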

} // namespace mongo;